fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Returns the fraction bits of the half-precision floating-point value `a'.
 100 *----------------------------------------------------------------------------*/
 101
 102 static inline uint32_t extractFloat16Frac(float16 a)
 103 {
 104     return float16_val(a) & 0x3ff;
 105 }
 106
 107 /*----------------------------------------------------------------------------
 108 | Returns the exponent bits of the half-precision floating-point value `a'.
 109 *----------------------------------------------------------------------------*/
 110
 111 static inline int extractFloat16Exp(float16 a)
 112 {
 113     return (float16_val(a) >> 10) & 0x1f;
 114 }
 115
 116 /*----------------------------------------------------------------------------
 117 | Returns the fraction bits of the single-precision floating-point value `a'.
 118 *----------------------------------------------------------------------------*/
 119
 120 static inline uint32_t extractFloat32Frac(float32 a)
 121 {
 122     return float32_val(a) & 0x007FFFFF;
 123 }
 124
 125 /*----------------------------------------------------------------------------
 126 | Returns the exponent bits of the single-precision floating-point value `a'.
 127 *----------------------------------------------------------------------------*/
 128
 129 static inline int extractFloat32Exp(float32 a)
 130 {
 131     return (float32_val(a) >> 23) & 0xFF;
 132 }
 133
 134 /*----------------------------------------------------------------------------
 135 | Returns the sign bit of the single-precision floating-point value `a'.
 136 *----------------------------------------------------------------------------*/
 137
 138 static inline flag extractFloat32Sign(float32 a)
 139 {
 140     return float32_val(a) >> 31;
 141 }
 142
 143 /*----------------------------------------------------------------------------
 144 | Returns the fraction bits of the double-precision floating-point value `a'.
 145 *----------------------------------------------------------------------------*/
 146
 147 static inline uint64_t extractFloat64Frac(float64 a)
 148 {
 149     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 150 }
 151
 152 /*----------------------------------------------------------------------------
 153 | Returns the exponent bits of the double-precision floating-point value `a'.
 154 *----------------------------------------------------------------------------*/
 155
 156 static inline int extractFloat64Exp(float64 a)
 157 {
 158     return (float64_val(a) >> 52) & 0x7FF;
 159 }
 160
 161 /*----------------------------------------------------------------------------
 162 | Returns the sign bit of the double-precision floating-point value `a'.
 163 *----------------------------------------------------------------------------*/
 164
 165 static inline flag extractFloat64Sign(float64 a)
 166 {
 167     return float64_val(a) >> 63;
 168 }
 169
 170 /*
 171  * Classify a floating point number. Everything above float_class_qnan
 172  * is a NaN so cls >= float_class_qnan is any NaN.
 173  */
 174
 175 typedef enum __attribute__ ((__packed__)) {
 176     float_class_unclassified,
 177     float_class_zero,
 178     float_class_normal,
 179     float_class_inf,
 180     float_class_qnan,  /* all NaNs from here */
 181     float_class_snan,
 182 } FloatClass;
 183
 184 /*
 185  * Structure holding all of the decomposed parts of a float. The
 186  * exponent is unbiased and the fraction is normalized. All
 187  * calculations are done with a 64 bit fraction and then rounded as
 188  * appropriate for the final format.
 189  *
 190  * Thanks to the packed FloatClass a decent compiler should be able to
 191  * fit the whole structure into registers and avoid using the stack
 192  * for parameter passing.
 193  */
 194
 195 typedef struct {
 196     uint64_t frac;
 197     int32_t  exp;
 198     FloatClass cls;
 199     bool sign;
 200 } FloatParts;
 201
 202 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 203 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 204 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 205
 206 /* Structure holding all of the relevant parameters for a format.
 207  *   exp_size: the size of the exponent field
 208  *   exp_bias: the offset applied to the exponent field
 209  *   exp_max: the maximum normalised exponent
 210  *   frac_size: the size of the fraction field
 211  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 212  * The following are computed based the size of fraction
 213  *   frac_lsb: least significant bit of fraction
 214  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 215  *   round_mask/roundeven_mask: masks used for rounding
 216  * The following optional modifiers are available:
 217  *   arm_althp: handle ARM Alternative Half Precision
 218  */
 219 typedef struct {
 220     int exp_size;
 221     int exp_bias;
 222     int exp_max;
 223     int frac_size;
 224     int frac_shift;
 225     uint64_t frac_lsb;
 226     uint64_t frac_lsbm1;
 227     uint64_t round_mask;
 228     uint64_t roundeven_mask;
 229     bool arm_althp;
 230 } FloatFmt;
 231
 232 /* Expand fields based on the size of exponent and fraction */
 233 #define FLOAT_PARAMS(E, F)                                           \
 234     .exp_size       = E,                                             \
 235     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 236     .exp_max        = (1 << E) - 1,                                  \
 237     .frac_size      = F,                                             \
 238     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 239     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 240     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 241     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 242     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 243
 244 static const FloatFmt float16_params = {
 245     FLOAT_PARAMS(5, 10)
 246 };
 247
 248 static const FloatFmt float16_params_ahp = {
 249     FLOAT_PARAMS(5, 10),
 250     .arm_althp = true
 251 };
 252
 253 static const FloatFmt float32_params = {
 254     FLOAT_PARAMS(8, 23)
 255 };
 256
 257 static const FloatFmt float64_params = {
 258     FLOAT_PARAMS(11, 52)
 259 };
 260
 261 /* Unpack a float to parts, but do not canonicalize.  */
 262 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 263 {
 264     const int sign_pos = fmt.frac_size + fmt.exp_size;
 265
 266     return (FloatParts) {
 267         .cls = float_class_unclassified,
 268         .sign = extract64(raw, sign_pos, 1),
 269         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 270         .frac = extract64(raw, 0, fmt.frac_size),
 271     };
 272 }
 273
 274 static inline FloatParts float16_unpack_raw(float16 f)
 275 {
 276     return unpack_raw(float16_params, f);
 277 }
 278
 279 static inline FloatParts float32_unpack_raw(float32 f)
 280 {
 281     return unpack_raw(float32_params, f);
 282 }
 283
 284 static inline FloatParts float64_unpack_raw(float64 f)
 285 {
 286     return unpack_raw(float64_params, f);
 287 }
 288
 289 /* Pack a float from parts, but do not canonicalize.  */
 290 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 291 {
 292     const int sign_pos = fmt.frac_size + fmt.exp_size;
 293     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 294     return deposit64(ret, sign_pos, 1, p.sign);
 295 }
 296
 297 static inline float16 float16_pack_raw(FloatParts p)
 298 {
 299     return make_float16(pack_raw(float16_params, p));
 300 }
 301
 302 static inline float32 float32_pack_raw(FloatParts p)
 303 {
 304     return make_float32(pack_raw(float32_params, p));
 305 }
 306
 307 static inline float64 float64_pack_raw(FloatParts p)
 308 {
 309     return make_float64(pack_raw(float64_params, p));
 310 }
 311
 312 /*----------------------------------------------------------------------------
 313 | Functions and definitions to determine:  (1) whether tininess for underflow
 314 | is detected before or after rounding by default, (2) what (if anything)
 315 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 316 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 317 | are propagated from function inputs to output.  These details are target-
 318 | specific.
 319 *----------------------------------------------------------------------------*/
 320 #include "softfloat-specialize.h"
 321
 322 /* Canonicalize EXP and FRAC, setting CLS.  */
 323 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 324                                float_status *status)
 325 {
 326     if (part.exp == parm->exp_max && !parm->arm_althp) {
 327         if (part.frac == 0) {
 328             part.cls = float_class_inf;
 329         } else {
 330             part.frac <<= parm->frac_shift;
 331             part.cls = (parts_is_snan_frac(part.frac, status)
 332                         ? float_class_snan : float_class_qnan);
 333         }
 334     } else if (part.exp == 0) {
 335         if (likely(part.frac == 0)) {
 336             part.cls = float_class_zero;
 337         } else if (status->flush_inputs_to_zero) {
 338             float_raise(float_flag_input_denormal, status);
 339             part.cls = float_class_zero;
 340             part.frac = 0;
 341         } else {
 342             int shift = clz64(part.frac) - 1;
 343             part.cls = float_class_normal;
 344             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 345             part.frac <<= shift;
 346         }
 347     } else {
 348         part.cls = float_class_normal;
 349         part.exp -= parm->exp_bias;
 350         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 351     }
 352     return part;
 353 }
 354
 355 /* Round and uncanonicalize a floating-point number by parts. There
 356  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 357  * fraction; these bits will be removed. The exponent will be biased
 358  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 359  */
 360
 361 static FloatParts round_canonical(FloatParts p, float_status *s,
 362                                   const FloatFmt *parm)
 363 {
 364     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 365     const uint64_t round_mask = parm->round_mask;
 366     const uint64_t roundeven_mask = parm->roundeven_mask;
 367     const int exp_max = parm->exp_max;
 368     const int frac_shift = parm->frac_shift;
 369     uint64_t frac, inc;
 370     int exp, flags = 0;
 371     bool overflow_norm;
 372
 373     frac = p.frac;
 374     exp = p.exp;
 375
 376     switch (p.cls) {
 377     case float_class_normal:
 378         switch (s->float_rounding_mode) {
 379         case float_round_nearest_even:
 380             overflow_norm = false;
 381             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 382             break;
 383         case float_round_ties_away:
 384             overflow_norm = false;
 385             inc = frac_lsbm1;
 386             break;
 387         case float_round_to_zero:
 388             overflow_norm = true;
 389             inc = 0;
 390             break;
 391         case float_round_up:
 392             inc = p.sign ? 0 : round_mask;
 393             overflow_norm = p.sign;
 394             break;
 395         case float_round_down:
 396             inc = p.sign ? round_mask : 0;
 397             overflow_norm = !p.sign;
 398             break;
 399         default:
 400             g_assert_not_reached();
 401         }
 402
 403         exp += parm->exp_bias;
 404         if (likely(exp > 0)) {
 405             if (frac & round_mask) {
 406                 flags |= float_flag_inexact;
 407                 frac += inc;
 408                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 409                     frac >>= 1;
 410                     exp++;
 411                 }
 412             }
 413             frac >>= frac_shift;
 414
 415             if (parm->arm_althp) {
 416                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 417                 if (unlikely(exp > exp_max)) {
 418                     /* Overflow.  Return the maximum normal.  */
 419                     flags = float_flag_invalid;
 420                     exp = exp_max;
 421                     frac = -1;
 422                 }
 423             } else if (unlikely(exp >= exp_max)) {
 424                 flags |= float_flag_overflow | float_flag_inexact;
 425                 if (overflow_norm) {
 426                     exp = exp_max - 1;
 427                     frac = -1;
 428                 } else {
 429                     p.cls = float_class_inf;
 430                     goto do_inf;
 431                 }
 432             }
 433         } else if (s->flush_to_zero) {
 434             flags |= float_flag_output_denormal;
 435             p.cls = float_class_zero;
 436             goto do_zero;
 437         } else {
 438             bool is_tiny = (s->float_detect_tininess
 439                             == float_tininess_before_rounding)
 440                         || (exp < 0)
 441                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 442
 443             shift64RightJamming(frac, 1 - exp, &frac);
 444             if (frac & round_mask) {
 445                 /* Need to recompute round-to-even.  */
 446                 if (s->float_rounding_mode == float_round_nearest_even) {
 447                     inc = ((frac & roundeven_mask) != frac_lsbm1
 448                            ? frac_lsbm1 : 0);
 449                 }
 450                 flags |= float_flag_inexact;
 451                 frac += inc;
 452             }
 453
 454             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 455             frac >>= frac_shift;
 456
 457             if (is_tiny && (flags & float_flag_inexact)) {
 458                 flags |= float_flag_underflow;
 459             }
 460             if (exp == 0 && frac == 0) {
 461                 p.cls = float_class_zero;
 462             }
 463         }
 464         break;
 465
 466     case float_class_zero:
 467     do_zero:
 468         exp = 0;
 469         frac = 0;
 470         break;
 471
 472     case float_class_inf:
 473     do_inf:
 474         assert(!parm->arm_althp);
 475         exp = exp_max;
 476         frac = 0;
 477         break;
 478
 479     case float_class_qnan:
 480     case float_class_snan:
 481         assert(!parm->arm_althp);
 482         exp = exp_max;
 483         frac >>= parm->frac_shift;
 484         break;
 485
 486     default:
 487         g_assert_not_reached();
 488     }
 489
 490     float_raise(flags, s);
 491     p.exp = exp;
 492     p.frac = frac;
 493     return p;
 494 }
 495
 496 /* Explicit FloatFmt version */
 497 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 498                                             const FloatFmt *params)
 499 {
 500     return canonicalize(float16_unpack_raw(f), params, s);
 501 }
 502
 503 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 504 {
 505     return float16a_unpack_canonical(f, s, &float16_params);
 506 }
 507
 508 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 509                                              const FloatFmt *params)
 510 {
 511     return float16_pack_raw(round_canonical(p, s, params));
 512 }
 513
 514 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 515 {
 516     return float16a_round_pack_canonical(p, s, &float16_params);
 517 }
 518
 519 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 520 {
 521     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 522 }
 523
 524 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 525 {
 526     return float32_pack_raw(round_canonical(p, s, &float32_params));
 527 }
 528
 529 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 530 {
 531     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 532 }
 533
 534 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 535 {
 536     return float64_pack_raw(round_canonical(p, s, &float64_params));
 537 }
 538
 539 /* Simple helpers for checking if what NaN we have */
 540 static bool is_nan(FloatClass c)
 541 {
 542     return unlikely(c >= float_class_qnan);
 543 }
 544 static bool is_snan(FloatClass c)
 545 {
 546     return c == float_class_snan;
 547 }
 548 static bool is_qnan(FloatClass c)
 549 {
 550     return c == float_class_qnan;
 551 }
 552
 553 static FloatParts return_nan(FloatParts a, float_status *s)
 554 {
 555     switch (a.cls) {
 556     case float_class_snan:
 557         s->float_exception_flags |= float_flag_invalid;
 558         a = parts_silence_nan(a, s);
 559         /* fall through */
 560     case float_class_qnan:
 561         if (s->default_nan_mode) {
 562             return parts_default_nan(s);
 563         }
 564         break;
 565
 566     default:
 567         g_assert_not_reached();
 568     }
 569     return a;
 570 }
 571
 572 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 573 {
 574     if (is_snan(a.cls) || is_snan(b.cls)) {
 575         s->float_exception_flags |= float_flag_invalid;
 576     }
 577
 578     if (s->default_nan_mode) {
 579         return parts_default_nan(s);
 580     } else {
 581         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 582                     is_qnan(b.cls), is_snan(b.cls),
 583                     a.frac > b.frac ||
 584                     (a.frac == b.frac && a.sign < b.sign))) {
 585             a = b;
 586         }
 587         if (is_snan(a.cls)) {
 588             return parts_silence_nan(a, s);
 589         }
 590     }
 591     return a;
 592 }
 593
 594 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 595                                   bool inf_zero, float_status *s)
 596 {
 597     int which;
 598
 599     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 600         s->float_exception_flags |= float_flag_invalid;
 601     }
 602
 603     which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 604                           is_qnan(b.cls), is_snan(b.cls),
 605                           is_qnan(c.cls), is_snan(c.cls),
 606                           inf_zero, s);
 607
 608     if (s->default_nan_mode) {
 609         /* Note that this check is after pickNaNMulAdd so that function
 610          * has an opportunity to set the Invalid flag.
 611          */
 612         which = 3;
 613     }
 614
 615     switch (which) {
 616     case 0:
 617         break;
 618     case 1:
 619         a = b;
 620         break;
 621     case 2:
 622         a = c;
 623         break;
 624     case 3:
 625         return parts_default_nan(s);
 626     default:
 627         g_assert_not_reached();
 628     }
 629
 630     if (is_snan(a.cls)) {
 631         return parts_silence_nan(a, s);
 632     }
 633     return a;
 634 }
 635
 636 /*
 637  * Returns the result of adding or subtracting the values of the
 638  * floating-point values `a' and `b'. The operation is performed
 639  * according to the IEC/IEEE Standard for Binary Floating-Point
 640  * Arithmetic.
 641  */
 642
 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 644                                 float_status *s)
 645 {
 646     bool a_sign = a.sign;
 647     bool b_sign = b.sign ^ subtract;
 648
 649     if (a_sign != b_sign) {
 650         /* Subtraction */
 651
 652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 655                 a.frac = a.frac - b.frac;
 656             } else {
 657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 658                 a.frac = b.frac - a.frac;
 659                 a.exp = b.exp;
 660                 a_sign ^= 1;
 661             }
 662
 663             if (a.frac == 0) {
 664                 a.cls = float_class_zero;
 665                 a.sign = s->float_rounding_mode == float_round_down;
 666             } else {
 667                 int shift = clz64(a.frac) - 1;
 668                 a.frac = a.frac << shift;
 669                 a.exp = a.exp - shift;
 670                 a.sign = a_sign;
 671             }
 672             return a;
 673         }
 674         if (is_nan(a.cls) || is_nan(b.cls)) {
 675             return pick_nan(a, b, s);
 676         }
 677         if (a.cls == float_class_inf) {
 678             if (b.cls == float_class_inf) {
 679                 float_raise(float_flag_invalid, s);
 680                 return parts_default_nan(s);
 681             }
 682             return a;
 683         }
 684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 685             a.sign = s->float_rounding_mode == float_round_down;
 686             return a;
 687         }
 688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 689             b.sign = a_sign ^ 1;
 690             return b;
 691         }
 692         if (b.cls == float_class_zero) {
 693             return a;
 694         }
 695     } else {
 696         /* Addition */
 697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 698             if (a.exp > b.exp) {
 699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 700             } else if (a.exp < b.exp) {
 701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 702                 a.exp = b.exp;
 703             }
 704             a.frac += b.frac;
 705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 706                 a.frac >>= 1;
 707                 a.exp += 1;
 708             }
 709             return a;
 710         }
 711         if (is_nan(a.cls) || is_nan(b.cls)) {
 712             return pick_nan(a, b, s);
 713         }
 714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 715             return a;
 716         }
 717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 718             b.sign = b_sign;
 719             return b;
 720         }
 721     }
 722     g_assert_not_reached();
 723 }
 724
 725 /*
 726  * Returns the result of adding or subtracting the floating-point
 727  * values `a' and `b'. The operation is performed according to the
 728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 729  */
 730
 731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 732                                               float_status *status)
 733 {
 734     FloatParts pa = float16_unpack_canonical(a, status);
 735     FloatParts pb = float16_unpack_canonical(b, status);
 736     FloatParts pr = addsub_floats(pa, pb, false, status);
 737
 738     return float16_round_pack_canonical(pr, status);
 739 }
 740
 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 742                                              float_status *status)
 743 {
 744     FloatParts pa = float32_unpack_canonical(a, status);
 745     FloatParts pb = float32_unpack_canonical(b, status);
 746     FloatParts pr = addsub_floats(pa, pb, false, status);
 747
 748     return float32_round_pack_canonical(pr, status);
 749 }
 750
 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 752                                              float_status *status)
 753 {
 754     FloatParts pa = float64_unpack_canonical(a, status);
 755     FloatParts pb = float64_unpack_canonical(b, status);
 756     FloatParts pr = addsub_floats(pa, pb, false, status);
 757
 758     return float64_round_pack_canonical(pr, status);
 759 }
 760
 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 762                                              float_status *status)
 763 {
 764     FloatParts pa = float16_unpack_canonical(a, status);
 765     FloatParts pb = float16_unpack_canonical(b, status);
 766     FloatParts pr = addsub_floats(pa, pb, true, status);
 767
 768     return float16_round_pack_canonical(pr, status);
 769 }
 770
 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 772                                              float_status *status)
 773 {
 774     FloatParts pa = float32_unpack_canonical(a, status);
 775     FloatParts pb = float32_unpack_canonical(b, status);
 776     FloatParts pr = addsub_floats(pa, pb, true, status);
 777
 778     return float32_round_pack_canonical(pr, status);
 779 }
 780
 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 782                                              float_status *status)
 783 {
 784     FloatParts pa = float64_unpack_canonical(a, status);
 785     FloatParts pb = float64_unpack_canonical(b, status);
 786     FloatParts pr = addsub_floats(pa, pb, true, status);
 787
 788     return float64_round_pack_canonical(pr, status);
 789 }
 790
 791 /*
 792  * Returns the result of multiplying the floating-point values `a' and
 793  * `b'. The operation is performed according to the IEC/IEEE Standard
 794  * for Binary Floating-Point Arithmetic.
 795  */
 796
 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 798 {
 799     bool sign = a.sign ^ b.sign;
 800
 801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 802         uint64_t hi, lo;
 803         int exp = a.exp + b.exp;
 804
 805         mul64To128(a.frac, b.frac, &hi, &lo);
 806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 808             shift64RightJamming(lo, 1, &lo);
 809             exp += 1;
 810         }
 811
 812         /* Re-use a */
 813         a.exp = exp;
 814         a.sign = sign;
 815         a.frac = lo;
 816         return a;
 817     }
 818     /* handle all the NaN cases */
 819     if (is_nan(a.cls) || is_nan(b.cls)) {
 820         return pick_nan(a, b, s);
 821     }
 822     /* Inf * Zero == NaN */
 823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 825         s->float_exception_flags |= float_flag_invalid;
 826         return parts_default_nan(s);
 827     }
 828     /* Multiply by 0 or Inf */
 829     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 830         a.sign = sign;
 831         return a;
 832     }
 833     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 834         b.sign = sign;
 835         return b;
 836     }
 837     g_assert_not_reached();
 838 }
 839
 840 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 841                                              float_status *status)
 842 {
 843     FloatParts pa = float16_unpack_canonical(a, status);
 844     FloatParts pb = float16_unpack_canonical(b, status);
 845     FloatParts pr = mul_floats(pa, pb, status);
 846
 847     return float16_round_pack_canonical(pr, status);
 848 }
 849
 850 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 851                                              float_status *status)
 852 {
 853     FloatParts pa = float32_unpack_canonical(a, status);
 854     FloatParts pb = float32_unpack_canonical(b, status);
 855     FloatParts pr = mul_floats(pa, pb, status);
 856
 857     return float32_round_pack_canonical(pr, status);
 858 }
 859
 860 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 861                                              float_status *status)
 862 {
 863     FloatParts pa = float64_unpack_canonical(a, status);
 864     FloatParts pb = float64_unpack_canonical(b, status);
 865     FloatParts pr = mul_floats(pa, pb, status);
 866
 867     return float64_round_pack_canonical(pr, status);
 868 }
 869
 870 /*
 871  * Returns the result of multiplying the floating-point values `a' and
 872  * `b' then adding 'c', with no intermediate rounding step after the
 873  * multiplication. The operation is performed according to the
 874  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 875  * The flags argument allows the caller to select negation of the
 876  * addend, the intermediate product, or the final result. (The
 877  * difference between this and having the caller do a separate
 878  * negation is that negating externally will flip the sign bit on
 879  * NaNs.)
 880  */
 881
 882 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 883                                 int flags, float_status *s)
 884 {
 885     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 886                     ((1 << float_class_inf) | (1 << float_class_zero));
 887     bool p_sign;
 888     bool sign_flip = flags & float_muladd_negate_result;
 889     FloatClass p_class;
 890     uint64_t hi, lo;
 891     int p_exp;
 892
 893     /* It is implementation-defined whether the cases of (0,inf,qnan)
 894      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 895      * they return if they do), so we have to hand this information
 896      * off to the target-specific pick-a-NaN routine.
 897      */
 898     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 899         return pick_nan_muladd(a, b, c, inf_zero, s);
 900     }
 901
 902     if (inf_zero) {
 903         s->float_exception_flags |= float_flag_invalid;
 904         return parts_default_nan(s);
 905     }
 906
 907     if (flags & float_muladd_negate_c) {
 908         c.sign ^= 1;
 909     }
 910
 911     p_sign = a.sign ^ b.sign;
 912
 913     if (flags & float_muladd_negate_product) {
 914         p_sign ^= 1;
 915     }
 916
 917     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 918         p_class = float_class_inf;
 919     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 920         p_class = float_class_zero;
 921     } else {
 922         p_class = float_class_normal;
 923     }
 924
 925     if (c.cls == float_class_inf) {
 926         if (p_class == float_class_inf && p_sign != c.sign) {
 927             s->float_exception_flags |= float_flag_invalid;
 928             return parts_default_nan(s);
 929         } else {
 930             a.cls = float_class_inf;
 931             a.sign = c.sign ^ sign_flip;
 932             return a;
 933         }
 934     }
 935
 936     if (p_class == float_class_inf) {
 937         a.cls = float_class_inf;
 938         a.sign = p_sign ^ sign_flip;
 939         return a;
 940     }
 941
 942     if (p_class == float_class_zero) {
 943         if (c.cls == float_class_zero) {
 944             if (p_sign != c.sign) {
 945                 p_sign = s->float_rounding_mode == float_round_down;
 946             }
 947             c.sign = p_sign;
 948         } else if (flags & float_muladd_halve_result) {
 949             c.exp -= 1;
 950         }
 951         c.sign ^= sign_flip;
 952         return c;
 953     }
 954
 955     /* a & b should be normals now... */
 956     assert(a.cls == float_class_normal &&
 957            b.cls == float_class_normal);
 958
 959     p_exp = a.exp + b.exp;
 960
 961     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 962      * result.
 963      */
 964     mul64To128(a.frac, b.frac, &hi, &lo);
 965     /* binary point now at bit 124 */
 966
 967     /* check for overflow */
 968     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 969         shift128RightJamming(hi, lo, 1, &hi, &lo);
 970         p_exp += 1;
 971     }
 972
 973     /* + add/sub */
 974     if (c.cls == float_class_zero) {
 975         /* move binary point back to 62 */
 976         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 977     } else {
 978         int exp_diff = p_exp - c.exp;
 979         if (p_sign == c.sign) {
 980             /* Addition */
 981             if (exp_diff <= 0) {
 982                 shift128RightJamming(hi, lo,
 983                                      DECOMPOSED_BINARY_POINT - exp_diff,
 984                                      &hi, &lo);
 985                 lo += c.frac;
 986                 p_exp = c.exp;
 987             } else {
 988                 uint64_t c_hi, c_lo;
 989                 /* shift c to the same binary point as the product (124) */
 990                 c_hi = c.frac >> 2;
 991                 c_lo = 0;
 992                 shift128RightJamming(c_hi, c_lo,
 993                                      exp_diff,
 994                                      &c_hi, &c_lo);
 995                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 996                 /* move binary point back to 62 */
 997                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 998             }
 999
1000             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1001                 shift64RightJamming(lo, 1, &lo);
1002                 p_exp += 1;
1003             }
1004
1005         } else {
1006             /* Subtraction */
1007             uint64_t c_hi, c_lo;
1008             /* make C binary point match product at bit 124 */
1009             c_hi = c.frac >> 2;
1010             c_lo = 0;
1011
1012             if (exp_diff <= 0) {
1013                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1014                 if (exp_diff == 0
1015                     &&
1016                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1017                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1018                 } else {
1019                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1020                     p_sign ^= 1;
1021                     p_exp = c.exp;
1022                 }
1023             } else {
1024                 shift128RightJamming(c_hi, c_lo,
1025                                      exp_diff,
1026                                      &c_hi, &c_lo);
1027                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1028             }
1029
1030             if (hi == 0 && lo == 0) {
1031                 a.cls = float_class_zero;
1032                 a.sign = s->float_rounding_mode == float_round_down;
1033                 a.sign ^= sign_flip;
1034                 return a;
1035             } else {
1036                 int shift;
1037                 if (hi != 0) {
1038                     shift = clz64(hi);
1039                 } else {
1040                     shift = clz64(lo) + 64;
1041                 }
1042                 /* Normalizing to a binary point of 124 is the
1043                    correct adjust for the exponent.  However since we're
1044                    shifting, we might as well put the binary point back
1045                    at 62 where we really want it.  Therefore shift as
1046                    if we're leaving 1 bit at the top of the word, but
1047                    adjust the exponent as if we're leaving 3 bits.  */
1048                 shift -= 1;
1049                 if (shift >= 64) {
1050                     lo = lo << (shift - 64);
1051                 } else {
1052                     hi = (hi << shift) | (lo >> (64 - shift));
1053                     lo = hi | ((lo << shift) != 0);
1054                 }
1055                 p_exp -= shift - 2;
1056             }
1057         }
1058     }
1059
1060     if (flags & float_muladd_halve_result) {
1061         p_exp -= 1;
1062     }
1063
1064     /* finally prepare our result */
1065     a.cls = float_class_normal;
1066     a.sign = p_sign ^ sign_flip;
1067     a.exp = p_exp;
1068     a.frac = lo;
1069
1070     return a;
1071 }
1072
1073 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1074                                                 int flags, float_status *status)
1075 {
1076     FloatParts pa = float16_unpack_canonical(a, status);
1077     FloatParts pb = float16_unpack_canonical(b, status);
1078     FloatParts pc = float16_unpack_canonical(c, status);
1079     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1080
1081     return float16_round_pack_canonical(pr, status);
1082 }
1083
1084 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1085                                                 int flags, float_status *status)
1086 {
1087     FloatParts pa = float32_unpack_canonical(a, status);
1088     FloatParts pb = float32_unpack_canonical(b, status);
1089     FloatParts pc = float32_unpack_canonical(c, status);
1090     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1091
1092     return float32_round_pack_canonical(pr, status);
1093 }
1094
1095 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1096                                                 int flags, float_status *status)
1097 {
1098     FloatParts pa = float64_unpack_canonical(a, status);
1099     FloatParts pb = float64_unpack_canonical(b, status);
1100     FloatParts pc = float64_unpack_canonical(c, status);
1101     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1102
1103     return float64_round_pack_canonical(pr, status);
1104 }
1105
1106 /*
1107  * Returns the result of dividing the floating-point value `a' by the
1108  * corresponding value `b'. The operation is performed according to
1109  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110  */
1111
1112 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1113 {
1114     bool sign = a.sign ^ b.sign;
1115
1116     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1117         uint64_t temp_lo, temp_hi;
1118         int exp = a.exp - b.exp;
1119         if (a.frac < b.frac) {
1120             exp -= 1;
1121             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1122                               &temp_hi, &temp_lo);
1123         } else {
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1125                               &temp_hi, &temp_lo);
1126         }
1127         /* LSB of quot is set if inexact which roundandpack will use
1128          * to set flags. Yet again we re-use a for the result */
1129         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1130         a.sign = sign;
1131         a.exp = exp;
1132         return a;
1133     }
1134     /* handle all the NaN cases */
1135     if (is_nan(a.cls) || is_nan(b.cls)) {
1136         return pick_nan(a, b, s);
1137     }
1138     /* 0/0 or Inf/Inf */
1139     if (a.cls == b.cls
1140         &&
1141         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1142         s->float_exception_flags |= float_flag_invalid;
1143         return parts_default_nan(s);
1144     }
1145     /* Inf / x or 0 / x */
1146     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1147         a.sign = sign;
1148         return a;
1149     }
1150     /* Div 0 => Inf */
1151     if (b.cls == float_class_zero) {
1152         s->float_exception_flags |= float_flag_divbyzero;
1153         a.cls = float_class_inf;
1154         a.sign = sign;
1155         return a;
1156     }
1157     /* Div by Inf */
1158     if (b.cls == float_class_inf) {
1159         a.cls = float_class_zero;
1160         a.sign = sign;
1161         return a;
1162     }
1163     g_assert_not_reached();
1164 }
1165
1166 float16 float16_div(float16 a, float16 b, float_status *status)
1167 {
1168     FloatParts pa = float16_unpack_canonical(a, status);
1169     FloatParts pb = float16_unpack_canonical(b, status);
1170     FloatParts pr = div_floats(pa, pb, status);
1171
1172     return float16_round_pack_canonical(pr, status);
1173 }
1174
1175 float32 float32_div(float32 a, float32 b, float_status *status)
1176 {
1177     FloatParts pa = float32_unpack_canonical(a, status);
1178     FloatParts pb = float32_unpack_canonical(b, status);
1179     FloatParts pr = div_floats(pa, pb, status);
1180
1181     return float32_round_pack_canonical(pr, status);
1182 }
1183
1184 float64 float64_div(float64 a, float64 b, float_status *status)
1185 {
1186     FloatParts pa = float64_unpack_canonical(a, status);
1187     FloatParts pb = float64_unpack_canonical(b, status);
1188     FloatParts pr = div_floats(pa, pb, status);
1189
1190     return float64_round_pack_canonical(pr, status);
1191 }
1192
1193 /*
1194  * Float to Float conversions
1195  *
1196  * Returns the result of converting one float format to another. The
1197  * conversion is performed according to the IEC/IEEE Standard for
1198  * Binary Floating-Point Arithmetic.
1199  *
1200  * The float_to_float helper only needs to take care of raising
1201  * invalid exceptions and handling the conversion on NaNs.
1202  */
1203
1204 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1205                                  float_status *s)
1206 {
1207     if (dstf->arm_althp) {
1208         switch (a.cls) {
1209         case float_class_qnan:
1210         case float_class_snan:
1211             /* There is no NaN in the destination format.  Raise Invalid
1212              * and return a zero with the sign of the input NaN.
1213              */
1214             s->float_exception_flags |= float_flag_invalid;
1215             a.cls = float_class_zero;
1216             a.frac = 0;
1217             a.exp = 0;
1218             break;
1219
1220         case float_class_inf:
1221             /* There is no Inf in the destination format.  Raise Invalid
1222              * and return the maximum normal with the correct sign.
1223              */
1224             s->float_exception_flags |= float_flag_invalid;
1225             a.cls = float_class_normal;
1226             a.exp = dstf->exp_max;
1227             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1228             break;
1229
1230         default:
1231             break;
1232         }
1233     } else if (is_nan(a.cls)) {
1234         if (is_snan(a.cls)) {
1235             s->float_exception_flags |= float_flag_invalid;
1236             a = parts_silence_nan(a, s);
1237         }
1238         if (s->default_nan_mode) {
1239             return parts_default_nan(s);
1240         }
1241     }
1242     return a;
1243 }
1244
1245 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1246 {
1247     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1248     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1249     FloatParts pr = float_to_float(p, &float32_params, s);
1250     return float32_round_pack_canonical(pr, s);
1251 }
1252
1253 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1254 {
1255     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1256     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1257     FloatParts pr = float_to_float(p, &float64_params, s);
1258     return float64_round_pack_canonical(pr, s);
1259 }
1260
1261 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1262 {
1263     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1264     FloatParts p = float32_unpack_canonical(a, s);
1265     FloatParts pr = float_to_float(p, fmt16, s);
1266     return float16a_round_pack_canonical(pr, s, fmt16);
1267 }
1268
1269 float64 float32_to_float64(float32 a, float_status *s)
1270 {
1271     FloatParts p = float32_unpack_canonical(a, s);
1272     FloatParts pr = float_to_float(p, &float64_params, s);
1273     return float64_round_pack_canonical(pr, s);
1274 }
1275
1276 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1277 {
1278     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1279     FloatParts p = float64_unpack_canonical(a, s);
1280     FloatParts pr = float_to_float(p, fmt16, s);
1281     return float16a_round_pack_canonical(pr, s, fmt16);
1282 }
1283
1284 float32 float64_to_float32(float64 a, float_status *s)
1285 {
1286     FloatParts p = float64_unpack_canonical(a, s);
1287     FloatParts pr = float_to_float(p, &float32_params, s);
1288     return float32_round_pack_canonical(pr, s);
1289 }
1290
1291 /*
1292  * Rounds the floating-point value `a' to an integer, and returns the
1293  * result as a floating-point value. The operation is performed
1294  * according to the IEC/IEEE Standard for Binary Floating-Point
1295  * Arithmetic.
1296  */
1297
1298 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1299 {
1300     if (is_nan(a.cls)) {
1301         return return_nan(a, s);
1302     }
1303
1304     switch (a.cls) {
1305     case float_class_zero:
1306     case float_class_inf:
1307     case float_class_qnan:
1308         /* already "integral" */
1309         break;
1310     case float_class_normal:
1311         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1312             /* already integral */
1313             break;
1314         }
1315         if (a.exp < 0) {
1316             bool one;
1317             /* all fractional */
1318             s->float_exception_flags |= float_flag_inexact;
1319             switch (rounding_mode) {
1320             case float_round_nearest_even:
1321                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1322                 break;
1323             case float_round_ties_away:
1324                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1325                 break;
1326             case float_round_to_zero:
1327                 one = false;
1328                 break;
1329             case float_round_up:
1330                 one = !a.sign;
1331                 break;
1332             case float_round_down:
1333                 one = a.sign;
1334                 break;
1335             default:
1336                 g_assert_not_reached();
1337             }
1338
1339             if (one) {
1340                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1341                 a.exp = 0;
1342             } else {
1343                 a.cls = float_class_zero;
1344             }
1345         } else {
1346             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1347             uint64_t frac_lsbm1 = frac_lsb >> 1;
1348             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1349             uint64_t rnd_mask = rnd_even_mask >> 1;
1350             uint64_t inc;
1351
1352             switch (rounding_mode) {
1353             case float_round_nearest_even:
1354                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1355                 break;
1356             case float_round_ties_away:
1357                 inc = frac_lsbm1;
1358                 break;
1359             case float_round_to_zero:
1360                 inc = 0;
1361                 break;
1362             case float_round_up:
1363                 inc = a.sign ? 0 : rnd_mask;
1364                 break;
1365             case float_round_down:
1366                 inc = a.sign ? rnd_mask : 0;
1367                 break;
1368             default:
1369                 g_assert_not_reached();
1370             }
1371
1372             if (a.frac & rnd_mask) {
1373                 s->float_exception_flags |= float_flag_inexact;
1374                 a.frac += inc;
1375                 a.frac &= ~rnd_mask;
1376                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1377                     a.frac >>= 1;
1378                     a.exp++;
1379                 }
1380             }
1381         }
1382         break;
1383     default:
1384         g_assert_not_reached();
1385     }
1386     return a;
1387 }
1388
1389 float16 float16_round_to_int(float16 a, float_status *s)
1390 {
1391     FloatParts pa = float16_unpack_canonical(a, s);
1392     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1393     return float16_round_pack_canonical(pr, s);
1394 }
1395
1396 float32 float32_round_to_int(float32 a, float_status *s)
1397 {
1398     FloatParts pa = float32_unpack_canonical(a, s);
1399     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1400     return float32_round_pack_canonical(pr, s);
1401 }
1402
1403 float64 float64_round_to_int(float64 a, float_status *s)
1404 {
1405     FloatParts pa = float64_unpack_canonical(a, s);
1406     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1407     return float64_round_pack_canonical(pr, s);
1408 }
1409
1410 float64 float64_trunc_to_int(float64 a, float_status *s)
1411 {
1412     FloatParts pa = float64_unpack_canonical(a, s);
1413     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1414     return float64_round_pack_canonical(pr, s);
1415 }
1416
1417 /*
1418  * Returns the result of converting the floating-point value `a' to
1419  * the two's complement integer format. The conversion is performed
1420  * according to the IEC/IEEE Standard for Binary Floating-Point
1421  * Arithmetic---which means in particular that the conversion is
1422  * rounded according to the current rounding mode. If `a' is a NaN,
1423  * the largest positive integer is returned. Otherwise, if the
1424  * conversion overflows, the largest integer with the same sign as `a'
1425  * is returned.
1426 */
1427
1428 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1429                                      int64_t min, int64_t max,
1430                                      float_status *s)
1431 {
1432     uint64_t r;
1433     int orig_flags = get_float_exception_flags(s);
1434     FloatParts p = round_to_int(in, rmode, s);
1435
1436     switch (p.cls) {
1437     case float_class_snan:
1438     case float_class_qnan:
1439         s->float_exception_flags = orig_flags | float_flag_invalid;
1440         return max;
1441     case float_class_inf:
1442         s->float_exception_flags = orig_flags | float_flag_invalid;
1443         return p.sign ? min : max;
1444     case float_class_zero:
1445         return 0;
1446     case float_class_normal:
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             r = UINT64_MAX;
1453         }
1454         if (p.sign) {
1455             if (r <= -(uint64_t) min) {
1456                 return -r;
1457             } else {
1458                 s->float_exception_flags = orig_flags | float_flag_invalid;
1459                 return min;
1460             }
1461         } else {
1462             if (r <= max) {
1463                 return r;
1464             } else {
1465                 s->float_exception_flags = orig_flags | float_flag_invalid;
1466                 return max;
1467             }
1468         }
1469     default:
1470         g_assert_not_reached();
1471     }
1472 }
1473
1474 #define FLOAT_TO_INT(fsz, isz)                                          \
1475 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1476                                                 float_status *s)        \
1477 {                                                                       \
1478     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1479     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1480                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1481                                  s);                                    \
1482 }                                                                       \
1483                                                                         \
1484 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1485  (float ## fsz a, float_status *s)                                      \
1486 {                                                                       \
1487     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1488     return round_to_int_and_pack(p, float_round_to_zero,                \
1489                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1490                                  s);                                    \
1491 }
1492
1493 FLOAT_TO_INT(16, 16)
1494 FLOAT_TO_INT(16, 32)
1495 FLOAT_TO_INT(16, 64)
1496
1497 FLOAT_TO_INT(32, 16)
1498 FLOAT_TO_INT(32, 32)
1499 FLOAT_TO_INT(32, 64)
1500
1501 FLOAT_TO_INT(64, 16)
1502 FLOAT_TO_INT(64, 32)
1503 FLOAT_TO_INT(64, 64)
1504
1505 #undef FLOAT_TO_INT
1506
1507 /*
1508  *  Returns the result of converting the floating-point value `a' to
1509  *  the unsigned integer format. The conversion is performed according
1510  *  to the IEC/IEEE Standard for Binary Floating-Point
1511  *  Arithmetic---which means in particular that the conversion is
1512  *  rounded according to the current rounding mode. If `a' is a NaN,
1513  *  the largest unsigned integer is returned. Otherwise, if the
1514  *  conversion overflows, the largest unsigned integer is returned. If
1515  *  the 'a' is negative, the result is rounded and zero is returned;
1516  *  values that do not round to zero will raise the inexact exception
1517  *  flag.
1518  */
1519
1520 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1521                                        float_status *s)
1522 {
1523     int orig_flags = get_float_exception_flags(s);
1524     FloatParts p = round_to_int(in, rmode, s);
1525
1526     switch (p.cls) {
1527     case float_class_snan:
1528     case float_class_qnan:
1529         s->float_exception_flags = orig_flags | float_flag_invalid;
1530         return max;
1531     case float_class_inf:
1532         s->float_exception_flags = orig_flags | float_flag_invalid;
1533         return p.sign ? 0 : max;
1534     case float_class_zero:
1535         return 0;
1536     case float_class_normal:
1537     {
1538         uint64_t r;
1539         if (p.sign) {
1540             s->float_exception_flags = orig_flags | float_flag_invalid;
1541             return 0;
1542         }
1543
1544         if (p.exp < DECOMPOSED_BINARY_POINT) {
1545             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1546         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1547             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1548         } else {
1549             s->float_exception_flags = orig_flags | float_flag_invalid;
1550             return max;
1551         }
1552
1553         /* For uint64 this will never trip, but if p.exp is too large
1554          * to shift a decomposed fraction we shall have exited via the
1555          * 3rd leg above.
1556          */
1557         if (r > max) {
1558             s->float_exception_flags = orig_flags | float_flag_invalid;
1559             return max;
1560         } else {
1561             return r;
1562         }
1563     }
1564     default:
1565         g_assert_not_reached();
1566     }
1567 }
1568
1569 #define FLOAT_TO_UINT(fsz, isz) \
1570 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1571                                                   float_status *s)      \
1572 {                                                                       \
1573     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1574     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1575                                  UINT ## isz ## _MAX, s);               \
1576 }                                                                       \
1577                                                                         \
1578 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1579  (float ## fsz a, float_status *s)                                      \
1580 {                                                                       \
1581     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1582     return round_to_uint_and_pack(p, float_round_to_zero,               \
1583                                   UINT ## isz ## _MAX, s);              \
1584 }
1585
1586 FLOAT_TO_UINT(16, 16)
1587 FLOAT_TO_UINT(16, 32)
1588 FLOAT_TO_UINT(16, 64)
1589
1590 FLOAT_TO_UINT(32, 16)
1591 FLOAT_TO_UINT(32, 32)
1592 FLOAT_TO_UINT(32, 64)
1593
1594 FLOAT_TO_UINT(64, 16)
1595 FLOAT_TO_UINT(64, 32)
1596 FLOAT_TO_UINT(64, 64)
1597
1598 #undef FLOAT_TO_UINT
1599
1600 /*
1601  * Integer to float conversions
1602  *
1603  * Returns the result of converting the two's complement integer `a'
1604  * to the floating-point format. The conversion is performed according
1605  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1606  */
1607
1608 static FloatParts int_to_float(int64_t a, float_status *status)
1609 {
1610     FloatParts r = {};
1611     if (a == 0) {
1612         r.cls = float_class_zero;
1613         r.sign = false;
1614     } else if (a == (1ULL << 63)) {
1615         r.cls = float_class_normal;
1616         r.sign = true;
1617         r.frac = DECOMPOSED_IMPLICIT_BIT;
1618         r.exp = 63;
1619     } else {
1620         uint64_t f;
1621         if (a < 0) {
1622             f = -a;
1623             r.sign = true;
1624         } else {
1625             f = a;
1626             r.sign = false;
1627         }
1628         int shift = clz64(f) - 1;
1629         r.cls = float_class_normal;
1630         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1631         r.frac = f << shift;
1632     }
1633
1634     return r;
1635 }
1636
1637 float16 int64_to_float16(int64_t a, float_status *status)
1638 {
1639     FloatParts pa = int_to_float(a, status);
1640     return float16_round_pack_canonical(pa, status);
1641 }
1642
1643 float16 int32_to_float16(int32_t a, float_status *status)
1644 {
1645     return int64_to_float16(a, status);
1646 }
1647
1648 float16 int16_to_float16(int16_t a, float_status *status)
1649 {
1650     return int64_to_float16(a, status);
1651 }
1652
1653 float32 int64_to_float32(int64_t a, float_status *status)
1654 {
1655     FloatParts pa = int_to_float(a, status);
1656     return float32_round_pack_canonical(pa, status);
1657 }
1658
1659 float32 int32_to_float32(int32_t a, float_status *status)
1660 {
1661     return int64_to_float32(a, status);
1662 }
1663
1664 float32 int16_to_float32(int16_t a, float_status *status)
1665 {
1666     return int64_to_float32(a, status);
1667 }
1668
1669 float64 int64_to_float64(int64_t a, float_status *status)
1670 {
1671     FloatParts pa = int_to_float(a, status);
1672     return float64_round_pack_canonical(pa, status);
1673 }
1674
1675 float64 int32_to_float64(int32_t a, float_status *status)
1676 {
1677     return int64_to_float64(a, status);
1678 }
1679
1680 float64 int16_to_float64(int16_t a, float_status *status)
1681 {
1682     return int64_to_float64(a, status);
1683 }
1684
1685
1686 /*
1687  * Unsigned Integer to float conversions
1688  *
1689  * Returns the result of converting the unsigned integer `a' to the
1690  * floating-point format. The conversion is performed according to the
1691  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1692  */
1693
1694 static FloatParts uint_to_float(uint64_t a, float_status *status)
1695 {
1696     FloatParts r = { .sign = false};
1697
1698     if (a == 0) {
1699         r.cls = float_class_zero;
1700     } else {
1701         int spare_bits = clz64(a) - 1;
1702         r.cls = float_class_normal;
1703         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1704         if (spare_bits < 0) {
1705             shift64RightJamming(a, -spare_bits, &a);
1706             r.frac = a;
1707         } else {
1708             r.frac = a << spare_bits;
1709         }
1710     }
1711
1712     return r;
1713 }
1714
1715 float16 uint64_to_float16(uint64_t a, float_status *status)
1716 {
1717     FloatParts pa = uint_to_float(a, status);
1718     return float16_round_pack_canonical(pa, status);
1719 }
1720
1721 float16 uint32_to_float16(uint32_t a, float_status *status)
1722 {
1723     return uint64_to_float16(a, status);
1724 }
1725
1726 float16 uint16_to_float16(uint16_t a, float_status *status)
1727 {
1728     return uint64_to_float16(a, status);
1729 }
1730
1731 float32 uint64_to_float32(uint64_t a, float_status *status)
1732 {
1733     FloatParts pa = uint_to_float(a, status);
1734     return float32_round_pack_canonical(pa, status);
1735 }
1736
1737 float32 uint32_to_float32(uint32_t a, float_status *status)
1738 {
1739     return uint64_to_float32(a, status);
1740 }
1741
1742 float32 uint16_to_float32(uint16_t a, float_status *status)
1743 {
1744     return uint64_to_float32(a, status);
1745 }
1746
1747 float64 uint64_to_float64(uint64_t a, float_status *status)
1748 {
1749     FloatParts pa = uint_to_float(a, status);
1750     return float64_round_pack_canonical(pa, status);
1751 }
1752
1753 float64 uint32_to_float64(uint32_t a, float_status *status)
1754 {
1755     return uint64_to_float64(a, status);
1756 }
1757
1758 float64 uint16_to_float64(uint16_t a, float_status *status)
1759 {
1760     return uint64_to_float64(a, status);
1761 }
1762
1763 /* Float Min/Max */
1764 /* min() and max() functions. These can't be implemented as
1765  * 'compare and pick one input' because that would mishandle
1766  * NaNs and +0 vs -0.
1767  *
1768  * minnum() and maxnum() functions. These are similar to the min()
1769  * and max() functions but if one of the arguments is a QNaN and
1770  * the other is numerical then the numerical argument is returned.
1771  * SNaNs will get quietened before being returned.
1772  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1773  * and maxNum() operations. min() and max() are the typical min/max
1774  * semantics provided by many CPUs which predate that specification.
1775  *
1776  * minnummag() and maxnummag() functions correspond to minNumMag()
1777  * and minNumMag() from the IEEE-754 2008.
1778  */
1779 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1780                                 bool ieee, bool ismag, float_status *s)
1781 {
1782     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1783         if (ieee) {
1784             /* Takes two floating-point values `a' and `b', one of
1785              * which is a NaN, and returns the appropriate NaN
1786              * result. If either `a' or `b' is a signaling NaN,
1787              * the invalid exception is raised.
1788              */
1789             if (is_snan(a.cls) || is_snan(b.cls)) {
1790                 return pick_nan(a, b, s);
1791             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1792                 return b;
1793             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1794                 return a;
1795             }
1796         }
1797         return pick_nan(a, b, s);
1798     } else {
1799         int a_exp, b_exp;
1800
1801         switch (a.cls) {
1802         case float_class_normal:
1803             a_exp = a.exp;
1804             break;
1805         case float_class_inf:
1806             a_exp = INT_MAX;
1807             break;
1808         case float_class_zero:
1809             a_exp = INT_MIN;
1810             break;
1811         default:
1812             g_assert_not_reached();
1813             break;
1814         }
1815         switch (b.cls) {
1816         case float_class_normal:
1817             b_exp = b.exp;
1818             break;
1819         case float_class_inf:
1820             b_exp = INT_MAX;
1821             break;
1822         case float_class_zero:
1823             b_exp = INT_MIN;
1824             break;
1825         default:
1826             g_assert_not_reached();
1827             break;
1828         }
1829
1830         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1831             bool a_less = a_exp < b_exp;
1832             if (a_exp == b_exp) {
1833                 a_less = a.frac < b.frac;
1834             }
1835             return a_less ^ ismin ? b : a;
1836         }
1837
1838         if (a.sign == b.sign) {
1839             bool a_less = a_exp < b_exp;
1840             if (a_exp == b_exp) {
1841                 a_less = a.frac < b.frac;
1842             }
1843             return a.sign ^ a_less ^ ismin ? b : a;
1844         } else {
1845             return a.sign ^ ismin ? b : a;
1846         }
1847     }
1848 }
1849
1850 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1851 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1852                                      float_status *s)                   \
1853 {                                                                       \
1854     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1855     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1856     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1857                                                                         \
1858     return float ## sz ## _round_pack_canonical(pr, s);                 \
1859 }
1860
1861 MINMAX(16, min, true, false, false)
1862 MINMAX(16, minnum, true, true, false)
1863 MINMAX(16, minnummag, true, true, true)
1864 MINMAX(16, max, false, false, false)
1865 MINMAX(16, maxnum, false, true, false)
1866 MINMAX(16, maxnummag, false, true, true)
1867
1868 MINMAX(32, min, true, false, false)
1869 MINMAX(32, minnum, true, true, false)
1870 MINMAX(32, minnummag, true, true, true)
1871 MINMAX(32, max, false, false, false)
1872 MINMAX(32, maxnum, false, true, false)
1873 MINMAX(32, maxnummag, false, true, true)
1874
1875 MINMAX(64, min, true, false, false)
1876 MINMAX(64, minnum, true, true, false)
1877 MINMAX(64, minnummag, true, true, true)
1878 MINMAX(64, max, false, false, false)
1879 MINMAX(64, maxnum, false, true, false)
1880 MINMAX(64, maxnummag, false, true, true)
1881
1882 #undef MINMAX
1883
1884 /* Floating point compare */
1885 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1886                           float_status *s)
1887 {
1888     if (is_nan(a.cls) || is_nan(b.cls)) {
1889         if (!is_quiet ||
1890             a.cls == float_class_snan ||
1891             b.cls == float_class_snan) {
1892             s->float_exception_flags |= float_flag_invalid;
1893         }
1894         return float_relation_unordered;
1895     }
1896
1897     if (a.cls == float_class_zero) {
1898         if (b.cls == float_class_zero) {
1899             return float_relation_equal;
1900         }
1901         return b.sign ? float_relation_greater : float_relation_less;
1902     } else if (b.cls == float_class_zero) {
1903         return a.sign ? float_relation_less : float_relation_greater;
1904     }
1905
1906     /* The only really important thing about infinity is its sign. If
1907      * both are infinities the sign marks the smallest of the two.
1908      */
1909     if (a.cls == float_class_inf) {
1910         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1911             return float_relation_equal;
1912         }
1913         return a.sign ? float_relation_less : float_relation_greater;
1914     } else if (b.cls == float_class_inf) {
1915         return b.sign ? float_relation_greater : float_relation_less;
1916     }
1917
1918     if (a.sign != b.sign) {
1919         return a.sign ? float_relation_less : float_relation_greater;
1920     }
1921
1922     if (a.exp == b.exp) {
1923         if (a.frac == b.frac) {
1924             return float_relation_equal;
1925         }
1926         if (a.sign) {
1927             return a.frac > b.frac ?
1928                 float_relation_less : float_relation_greater;
1929         } else {
1930             return a.frac > b.frac ?
1931                 float_relation_greater : float_relation_less;
1932         }
1933     } else {
1934         if (a.sign) {
1935             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1936         } else {
1937             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1938         }
1939     }
1940 }
1941
1942 #define COMPARE(sz)                                                     \
1943 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1944                             float_status *s)                            \
1945 {                                                                       \
1946     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1947     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1948     return compare_floats(pa, pb, false, s);                            \
1949 }                                                                       \
1950 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1951                                   float_status *s)                      \
1952 {                                                                       \
1953     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1954     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1955     return compare_floats(pa, pb, true, s);                             \
1956 }
1957
1958 COMPARE(16)
1959 COMPARE(32)
1960 COMPARE(64)
1961
1962 #undef COMPARE
1963
1964 /* Multiply A by 2 raised to the power N.  */
1965 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1966 {
1967     if (unlikely(is_nan(a.cls))) {
1968         return return_nan(a, s);
1969     }
1970     if (a.cls == float_class_normal) {
1971         /* The largest float type (even though not supported by FloatParts)
1972          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1973          * still allows rounding to infinity, without allowing overflow
1974          * within the int32_t that backs FloatParts.exp.
1975          */
1976         n = MIN(MAX(n, -0x10000), 0x10000);
1977         a.exp += n;
1978     }
1979     return a;
1980 }
1981
1982 float16 float16_scalbn(float16 a, int n, float_status *status)
1983 {
1984     FloatParts pa = float16_unpack_canonical(a, status);
1985     FloatParts pr = scalbn_decomposed(pa, n, status);
1986     return float16_round_pack_canonical(pr, status);
1987 }
1988
1989 float32 float32_scalbn(float32 a, int n, float_status *status)
1990 {
1991     FloatParts pa = float32_unpack_canonical(a, status);
1992     FloatParts pr = scalbn_decomposed(pa, n, status);
1993     return float32_round_pack_canonical(pr, status);
1994 }
1995
1996 float64 float64_scalbn(float64 a, int n, float_status *status)
1997 {
1998     FloatParts pa = float64_unpack_canonical(a, status);
1999     FloatParts pr = scalbn_decomposed(pa, n, status);
2000     return float64_round_pack_canonical(pr, status);
2001 }
2002
2003 /*
2004  * Square Root
2005  *
2006  * The old softfloat code did an approximation step before zeroing in
2007  * on the final result. However for simpleness we just compute the
2008  * square root by iterating down from the implicit bit to enough extra
2009  * bits to ensure we get a correctly rounded result.
2010  *
2011  * This does mean however the calculation is slower than before,
2012  * especially for 64 bit floats.
2013  */
2014
2015 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2016 {
2017     uint64_t a_frac, r_frac, s_frac;
2018     int bit, last_bit;
2019
2020     if (is_nan(a.cls)) {
2021         return return_nan(a, s);
2022     }
2023     if (a.cls == float_class_zero) {
2024         return a;  /* sqrt(+-0) = +-0 */
2025     }
2026     if (a.sign) {
2027         s->float_exception_flags |= float_flag_invalid;
2028         return parts_default_nan(s);
2029     }
2030     if (a.cls == float_class_inf) {
2031         return a;  /* sqrt(+inf) = +inf */
2032     }
2033
2034     assert(a.cls == float_class_normal);
2035
2036     /* We need two overflow bits at the top. Adding room for that is a
2037      * right shift. If the exponent is odd, we can discard the low bit
2038      * by multiplying the fraction by 2; that's a left shift. Combine
2039      * those and we shift right if the exponent is even.
2040      */
2041     a_frac = a.frac;
2042     if (!(a.exp & 1)) {
2043         a_frac >>= 1;
2044     }
2045     a.exp >>= 1;
2046
2047     /* Bit-by-bit computation of sqrt.  */
2048     r_frac = 0;
2049     s_frac = 0;
2050
2051     /* Iterate from implicit bit down to the 3 extra bits to compute a
2052      * properly rounded result. Remember we've inserted one more bit
2053      * at the top, so these positions are one less.
2054      */
2055     bit = DECOMPOSED_BINARY_POINT - 1;
2056     last_bit = MAX(p->frac_shift - 4, 0);
2057     do {
2058         uint64_t q = 1ULL << bit;
2059         uint64_t t_frac = s_frac + q;
2060         if (t_frac <= a_frac) {
2061             s_frac = t_frac + q;
2062             a_frac -= t_frac;
2063             r_frac += q;
2064         }
2065         a_frac <<= 1;
2066     } while (--bit >= last_bit);
2067
2068     /* Undo the right shift done above. If there is any remaining
2069      * fraction, the result is inexact. Set the sticky bit.
2070      */
2071     a.frac = (r_frac << 1) + (a_frac != 0);
2072
2073     return a;
2074 }
2075
2076 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2077 {
2078     FloatParts pa = float16_unpack_canonical(a, status);
2079     FloatParts pr = sqrt_float(pa, status, &float16_params);
2080     return float16_round_pack_canonical(pr, status);
2081 }
2082
2083 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2084 {
2085     FloatParts pa = float32_unpack_canonical(a, status);
2086     FloatParts pr = sqrt_float(pa, status, &float32_params);
2087     return float32_round_pack_canonical(pr, status);
2088 }
2089
2090 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2091 {
2092     FloatParts pa = float64_unpack_canonical(a, status);
2093     FloatParts pr = sqrt_float(pa, status, &float64_params);
2094     return float64_round_pack_canonical(pr, status);
2095 }
2096
2097
2098 /*----------------------------------------------------------------------------
2099 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2100 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2101 | input.  If `zSign' is 1, the input is negated before being converted to an
2102 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2103 | is simply rounded to an integer, with the inexact exception raised if the
2104 | input cannot be represented exactly as an integer.  However, if the fixed-
2105 | point input is too large, the invalid exception is raised and the largest
2106 | positive or negative integer is returned.
2107 *----------------------------------------------------------------------------*/
2108
2109 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2110 {
2111     int8_t roundingMode;
2112     flag roundNearestEven;
2113     int8_t roundIncrement, roundBits;
2114     int32_t z;
2115
2116     roundingMode = status->float_rounding_mode;
2117     roundNearestEven = ( roundingMode == float_round_nearest_even );
2118     switch (roundingMode) {
2119     case float_round_nearest_even:
2120     case float_round_ties_away:
2121         roundIncrement = 0x40;
2122         break;
2123     case float_round_to_zero:
2124         roundIncrement = 0;
2125         break;
2126     case float_round_up:
2127         roundIncrement = zSign ? 0 : 0x7f;
2128         break;
2129     case float_round_down:
2130         roundIncrement = zSign ? 0x7f : 0;
2131         break;
2132     default:
2133         abort();
2134     }
2135     roundBits = absZ & 0x7F;
2136     absZ = ( absZ + roundIncrement )>>7;
2137     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2138     z = absZ;
2139     if ( zSign ) z = - z;
2140     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2141         float_raise(float_flag_invalid, status);
2142         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2143     }
2144     if (roundBits) {
2145         status->float_exception_flags |= float_flag_inexact;
2146     }
2147     return z;
2148
2149 }
2150
2151 /*----------------------------------------------------------------------------
2152 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2153 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2154 | and returns the properly rounded 64-bit integer corresponding to the input.
2155 | If `zSign' is 1, the input is negated before being converted to an integer.
2156 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2157 | the inexact exception raised if the input cannot be represented exactly as
2158 | an integer.  However, if the fixed-point input is too large, the invalid
2159 | exception is raised and the largest positive or negative integer is
2160 | returned.
2161 *----------------------------------------------------------------------------*/
2162
2163 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2164                                float_status *status)
2165 {
2166     int8_t roundingMode;
2167     flag roundNearestEven, increment;
2168     int64_t z;
2169
2170     roundingMode = status->float_rounding_mode;
2171     roundNearestEven = ( roundingMode == float_round_nearest_even );
2172     switch (roundingMode) {
2173     case float_round_nearest_even:
2174     case float_round_ties_away:
2175         increment = ((int64_t) absZ1 < 0);
2176         break;
2177     case float_round_to_zero:
2178         increment = 0;
2179         break;
2180     case float_round_up:
2181         increment = !zSign && absZ1;
2182         break;
2183     case float_round_down:
2184         increment = zSign && absZ1;
2185         break;
2186     default:
2187         abort();
2188     }
2189     if ( increment ) {
2190         ++absZ0;
2191         if ( absZ0 == 0 ) goto overflow;
2192         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2193     }
2194     z = absZ0;
2195     if ( zSign ) z = - z;
2196     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2197  overflow:
2198         float_raise(float_flag_invalid, status);
2199         return
2200               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2201             : LIT64( 0x7FFFFFFFFFFFFFFF );
2202     }
2203     if (absZ1) {
2204         status->float_exception_flags |= float_flag_inexact;
2205     }
2206     return z;
2207
2208 }
2209
2210 /*----------------------------------------------------------------------------
2211 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2212 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2213 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2214 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2215 | with the inexact exception raised if the input cannot be represented exactly
2216 | as an integer.  However, if the fixed-point input is too large, the invalid
2217 | exception is raised and the largest unsigned integer is returned.
2218 *----------------------------------------------------------------------------*/
2219
2220 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2221                                 uint64_t absZ1, float_status *status)
2222 {
2223     int8_t roundingMode;
2224     flag roundNearestEven, increment;
2225
2226     roundingMode = status->float_rounding_mode;
2227     roundNearestEven = (roundingMode == float_round_nearest_even);
2228     switch (roundingMode) {
2229     case float_round_nearest_even:
2230     case float_round_ties_away:
2231         increment = ((int64_t)absZ1 < 0);
2232         break;
2233     case float_round_to_zero:
2234         increment = 0;
2235         break;
2236     case float_round_up:
2237         increment = !zSign && absZ1;
2238         break;
2239     case float_round_down:
2240         increment = zSign && absZ1;
2241         break;
2242     default:
2243         abort();
2244     }
2245     if (increment) {
2246         ++absZ0;
2247         if (absZ0 == 0) {
2248             float_raise(float_flag_invalid, status);
2249             return LIT64(0xFFFFFFFFFFFFFFFF);
2250         }
2251         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2252     }
2253
2254     if (zSign && absZ0) {
2255         float_raise(float_flag_invalid, status);
2256         return 0;
2257     }
2258
2259     if (absZ1) {
2260         status->float_exception_flags |= float_flag_inexact;
2261     }
2262     return absZ0;
2263 }
2264
2265 /*----------------------------------------------------------------------------
2266 | If `a' is denormal and we are in flush-to-zero mode then set the
2267 | input-denormal exception and return zero. Otherwise just return the value.
2268 *----------------------------------------------------------------------------*/
2269 float32 float32_squash_input_denormal(float32 a, float_status *status)
2270 {
2271     if (status->flush_inputs_to_zero) {
2272         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2273             float_raise(float_flag_input_denormal, status);
2274             return make_float32(float32_val(a) & 0x80000000);
2275         }
2276     }
2277     return a;
2278 }
2279
2280 /*----------------------------------------------------------------------------
2281 | Normalizes the subnormal single-precision floating-point value represented
2282 | by the denormalized significand `aSig'.  The normalized exponent and
2283 | significand are stored at the locations pointed to by `zExpPtr' and
2284 | `zSigPtr', respectively.
2285 *----------------------------------------------------------------------------*/
2286
2287 static void
2288  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2289 {
2290     int8_t shiftCount;
2291
2292     shiftCount = countLeadingZeros32( aSig ) - 8;
2293     *zSigPtr = aSig<<shiftCount;
2294     *zExpPtr = 1 - shiftCount;
2295
2296 }
2297
2298 /*----------------------------------------------------------------------------
2299 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2300 | and significand `zSig', and returns the proper single-precision floating-
2301 | point value corresponding to the abstract input.  Ordinarily, the abstract
2302 | value is simply rounded and packed into the single-precision format, with
2303 | the inexact exception raised if the abstract input cannot be represented
2304 | exactly.  However, if the abstract value is too large, the overflow and
2305 | inexact exceptions are raised and an infinity or maximal finite value is
2306 | returned.  If the abstract value is too small, the input value is rounded to
2307 | a subnormal number, and the underflow and inexact exceptions are raised if
2308 | the abstract input cannot be represented exactly as a subnormal single-
2309 | precision floating-point number.
2310 |     The input significand `zSig' has its binary point between bits 30
2311 | and 29, which is 7 bits to the left of the usual location.  This shifted
2312 | significand must be normalized or smaller.  If `zSig' is not normalized,
2313 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2314 | and it must not require rounding.  In the usual case that `zSig' is
2315 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2316 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2317 | Binary Floating-Point Arithmetic.
2318 *----------------------------------------------------------------------------*/
2319
2320 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2321                                    float_status *status)
2322 {
2323     int8_t roundingMode;
2324     flag roundNearestEven;
2325     int8_t roundIncrement, roundBits;
2326     flag isTiny;
2327
2328     roundingMode = status->float_rounding_mode;
2329     roundNearestEven = ( roundingMode == float_round_nearest_even );
2330     switch (roundingMode) {
2331     case float_round_nearest_even:
2332     case float_round_ties_away:
2333         roundIncrement = 0x40;
2334         break;
2335     case float_round_to_zero:
2336         roundIncrement = 0;
2337         break;
2338     case float_round_up:
2339         roundIncrement = zSign ? 0 : 0x7f;
2340         break;
2341     case float_round_down:
2342         roundIncrement = zSign ? 0x7f : 0;
2343         break;
2344     default:
2345         abort();
2346         break;
2347     }
2348     roundBits = zSig & 0x7F;
2349     if ( 0xFD <= (uint16_t) zExp ) {
2350         if (    ( 0xFD < zExp )
2351              || (    ( zExp == 0xFD )
2352                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2353            ) {
2354             float_raise(float_flag_overflow | float_flag_inexact, status);
2355             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2356         }
2357         if ( zExp < 0 ) {
2358             if (status->flush_to_zero) {
2359                 float_raise(float_flag_output_denormal, status);
2360                 return packFloat32(zSign, 0, 0);
2361             }
2362             isTiny =
2363                 (status->float_detect_tininess
2364                  == float_tininess_before_rounding)
2365                 || ( zExp < -1 )
2366                 || ( zSig + roundIncrement < 0x80000000 );
2367             shift32RightJamming( zSig, - zExp, &zSig );
2368             zExp = 0;
2369             roundBits = zSig & 0x7F;
2370             if (isTiny && roundBits) {
2371                 float_raise(float_flag_underflow, status);
2372             }
2373         }
2374     }
2375     if (roundBits) {
2376         status->float_exception_flags |= float_flag_inexact;
2377     }
2378     zSig = ( zSig + roundIncrement )>>7;
2379     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2380     if ( zSig == 0 ) zExp = 0;
2381     return packFloat32( zSign, zExp, zSig );
2382
2383 }
2384
2385 /*----------------------------------------------------------------------------
2386 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2387 | and significand `zSig', and returns the proper single-precision floating-
2388 | point value corresponding to the abstract input.  This routine is just like
2389 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2390 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2391 | floating-point exponent.
2392 *----------------------------------------------------------------------------*/
2393
2394 static float32
2395  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2396                               float_status *status)
2397 {
2398     int8_t shiftCount;
2399
2400     shiftCount = countLeadingZeros32( zSig ) - 1;
2401     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2402                                status);
2403
2404 }
2405
2406 /*----------------------------------------------------------------------------
2407 | If `a' is denormal and we are in flush-to-zero mode then set the
2408 | input-denormal exception and return zero. Otherwise just return the value.
2409 *----------------------------------------------------------------------------*/
2410 float64 float64_squash_input_denormal(float64 a, float_status *status)
2411 {
2412     if (status->flush_inputs_to_zero) {
2413         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2414             float_raise(float_flag_input_denormal, status);
2415             return make_float64(float64_val(a) & (1ULL << 63));
2416         }
2417     }
2418     return a;
2419 }
2420
2421 /*----------------------------------------------------------------------------
2422 | Normalizes the subnormal double-precision floating-point value represented
2423 | by the denormalized significand `aSig'.  The normalized exponent and
2424 | significand are stored at the locations pointed to by `zExpPtr' and
2425 | `zSigPtr', respectively.
2426 *----------------------------------------------------------------------------*/
2427
2428 static void
2429  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2430 {
2431     int8_t shiftCount;
2432
2433     shiftCount = countLeadingZeros64( aSig ) - 11;
2434     *zSigPtr = aSig<<shiftCount;
2435     *zExpPtr = 1 - shiftCount;
2436
2437 }
2438
2439 /*----------------------------------------------------------------------------
2440 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2441 | double-precision floating-point value, returning the result.  After being
2442 | shifted into the proper positions, the three fields are simply added
2443 | together to form the result.  This means that any integer portion of `zSig'
2444 | will be added into the exponent.  Since a properly normalized significand
2445 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2446 | than the desired result exponent whenever `zSig' is a complete, normalized
2447 | significand.
2448 *----------------------------------------------------------------------------*/
2449
2450 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2451 {
2452
2453     return make_float64(
2454         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2455
2456 }
2457
2458 /*----------------------------------------------------------------------------
2459 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2460 | and significand `zSig', and returns the proper double-precision floating-
2461 | point value corresponding to the abstract input.  Ordinarily, the abstract
2462 | value is simply rounded and packed into the double-precision format, with
2463 | the inexact exception raised if the abstract input cannot be represented
2464 | exactly.  However, if the abstract value is too large, the overflow and
2465 | inexact exceptions are raised and an infinity or maximal finite value is
2466 | returned.  If the abstract value is too small, the input value is rounded to
2467 | a subnormal number, and the underflow and inexact exceptions are raised if
2468 | the abstract input cannot be represented exactly as a subnormal double-
2469 | precision floating-point number.
2470 |     The input significand `zSig' has its binary point between bits 62
2471 | and 61, which is 10 bits to the left of the usual location.  This shifted
2472 | significand must be normalized or smaller.  If `zSig' is not normalized,
2473 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2474 | and it must not require rounding.  In the usual case that `zSig' is
2475 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2476 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2477 | Binary Floating-Point Arithmetic.
2478 *----------------------------------------------------------------------------*/
2479
2480 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2481                                    float_status *status)
2482 {
2483     int8_t roundingMode;
2484     flag roundNearestEven;
2485     int roundIncrement, roundBits;
2486     flag isTiny;
2487
2488     roundingMode = status->float_rounding_mode;
2489     roundNearestEven = ( roundingMode == float_round_nearest_even );
2490     switch (roundingMode) {
2491     case float_round_nearest_even:
2492     case float_round_ties_away:
2493         roundIncrement = 0x200;
2494         break;
2495     case float_round_to_zero:
2496         roundIncrement = 0;
2497         break;
2498     case float_round_up:
2499         roundIncrement = zSign ? 0 : 0x3ff;
2500         break;
2501     case float_round_down:
2502         roundIncrement = zSign ? 0x3ff : 0;
2503         break;
2504     case float_round_to_odd:
2505         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2506         break;
2507     default:
2508         abort();
2509     }
2510     roundBits = zSig & 0x3FF;
2511     if ( 0x7FD <= (uint16_t) zExp ) {
2512         if (    ( 0x7FD < zExp )
2513              || (    ( zExp == 0x7FD )
2514                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2515            ) {
2516             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2517                                    roundIncrement != 0;
2518             float_raise(float_flag_overflow | float_flag_inexact, status);
2519             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2520         }
2521         if ( zExp < 0 ) {
2522             if (status->flush_to_zero) {
2523                 float_raise(float_flag_output_denormal, status);
2524                 return packFloat64(zSign, 0, 0);
2525             }
2526             isTiny =
2527                    (status->float_detect_tininess
2528                     == float_tininess_before_rounding)
2529                 || ( zExp < -1 )
2530                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2531             shift64RightJamming( zSig, - zExp, &zSig );
2532             zExp = 0;
2533             roundBits = zSig & 0x3FF;
2534             if (isTiny && roundBits) {
2535                 float_raise(float_flag_underflow, status);
2536             }
2537             if (roundingMode == float_round_to_odd) {
2538                 /*
2539                  * For round-to-odd case, the roundIncrement depends on
2540                  * zSig which just changed.
2541                  */
2542                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2543             }
2544         }
2545     }
2546     if (roundBits) {
2547         status->float_exception_flags |= float_flag_inexact;
2548     }
2549     zSig = ( zSig + roundIncrement )>>10;
2550     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2551     if ( zSig == 0 ) zExp = 0;
2552     return packFloat64( zSign, zExp, zSig );
2553
2554 }
2555
2556 /*----------------------------------------------------------------------------
2557 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2558 | and significand `zSig', and returns the proper double-precision floating-
2559 | point value corresponding to the abstract input.  This routine is just like
2560 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2561 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2562 | floating-point exponent.
2563 *----------------------------------------------------------------------------*/
2564
2565 static float64
2566  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2567                               float_status *status)
2568 {
2569     int8_t shiftCount;
2570
2571     shiftCount = countLeadingZeros64( zSig ) - 1;
2572     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2573                                status);
2574
2575 }
2576
2577 /*----------------------------------------------------------------------------
2578 | Normalizes the subnormal extended double-precision floating-point value
2579 | represented by the denormalized significand `aSig'.  The normalized exponent
2580 | and significand are stored at the locations pointed to by `zExpPtr' and
2581 | `zSigPtr', respectively.
2582 *----------------------------------------------------------------------------*/
2583
2584 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2585                                 uint64_t *zSigPtr)
2586 {
2587     int8_t shiftCount;
2588
2589     shiftCount = countLeadingZeros64( aSig );
2590     *zSigPtr = aSig<<shiftCount;
2591     *zExpPtr = 1 - shiftCount;
2592 }
2593
2594 /*----------------------------------------------------------------------------
2595 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2596 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2597 | and returns the proper extended double-precision floating-point value
2598 | corresponding to the abstract input.  Ordinarily, the abstract value is
2599 | rounded and packed into the extended double-precision format, with the
2600 | inexact exception raised if the abstract input cannot be represented
2601 | exactly.  However, if the abstract value is too large, the overflow and
2602 | inexact exceptions are raised and an infinity or maximal finite value is
2603 | returned.  If the abstract value is too small, the input value is rounded to
2604 | a subnormal number, and the underflow and inexact exceptions are raised if
2605 | the abstract input cannot be represented exactly as a subnormal extended
2606 | double-precision floating-point number.
2607 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2608 | number of bits as single or double precision, respectively.  Otherwise, the
2609 | result is rounded to the full precision of the extended double-precision
2610 | format.
2611 |     The input significand must be normalized or smaller.  If the input
2612 | significand is not normalized, `zExp' must be 0; in that case, the result
2613 | returned is a subnormal number, and it must not require rounding.  The
2614 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2615 | Floating-Point Arithmetic.
2616 *----------------------------------------------------------------------------*/
2617
2618 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2619                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2620                               float_status *status)
2621 {
2622     int8_t roundingMode;
2623     flag roundNearestEven, increment, isTiny;
2624     int64_t roundIncrement, roundMask, roundBits;
2625
2626     roundingMode = status->float_rounding_mode;
2627     roundNearestEven = ( roundingMode == float_round_nearest_even );
2628     if ( roundingPrecision == 80 ) goto precision80;
2629     if ( roundingPrecision == 64 ) {
2630         roundIncrement = LIT64( 0x0000000000000400 );
2631         roundMask = LIT64( 0x00000000000007FF );
2632     }
2633     else if ( roundingPrecision == 32 ) {
2634         roundIncrement = LIT64( 0x0000008000000000 );
2635         roundMask = LIT64( 0x000000FFFFFFFFFF );
2636     }
2637     else {
2638         goto precision80;
2639     }
2640     zSig0 |= ( zSig1 != 0 );
2641     switch (roundingMode) {
2642     case float_round_nearest_even:
2643     case float_round_ties_away:
2644         break;
2645     case float_round_to_zero:
2646         roundIncrement = 0;
2647         break;
2648     case float_round_up:
2649         roundIncrement = zSign ? 0 : roundMask;
2650         break;
2651     case float_round_down:
2652         roundIncrement = zSign ? roundMask : 0;
2653         break;
2654     default:
2655         abort();
2656     }
2657     roundBits = zSig0 & roundMask;
2658     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2659         if (    ( 0x7FFE < zExp )
2660              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2661            ) {
2662             goto overflow;
2663         }
2664         if ( zExp <= 0 ) {
2665             if (status->flush_to_zero) {
2666                 float_raise(float_flag_output_denormal, status);
2667                 return packFloatx80(zSign, 0, 0);
2668             }
2669             isTiny =
2670                    (status->float_detect_tininess
2671                     == float_tininess_before_rounding)
2672                 || ( zExp < 0 )
2673                 || ( zSig0 <= zSig0 + roundIncrement );
2674             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2675             zExp = 0;
2676             roundBits = zSig0 & roundMask;
2677             if (isTiny && roundBits) {
2678                 float_raise(float_flag_underflow, status);
2679             }
2680             if (roundBits) {
2681                 status->float_exception_flags |= float_flag_inexact;
2682             }
2683             zSig0 += roundIncrement;
2684             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2685             roundIncrement = roundMask + 1;
2686             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2687                 roundMask |= roundIncrement;
2688             }
2689             zSig0 &= ~ roundMask;
2690             return packFloatx80( zSign, zExp, zSig0 );
2691         }
2692     }
2693     if (roundBits) {
2694         status->float_exception_flags |= float_flag_inexact;
2695     }
2696     zSig0 += roundIncrement;
2697     if ( zSig0 < roundIncrement ) {
2698         ++zExp;
2699         zSig0 = LIT64( 0x8000000000000000 );
2700     }
2701     roundIncrement = roundMask + 1;
2702     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2703         roundMask |= roundIncrement;
2704     }
2705     zSig0 &= ~ roundMask;
2706     if ( zSig0 == 0 ) zExp = 0;
2707     return packFloatx80( zSign, zExp, zSig0 );
2708  precision80:
2709     switch (roundingMode) {
2710     case float_round_nearest_even:
2711     case float_round_ties_away:
2712         increment = ((int64_t)zSig1 < 0);
2713         break;
2714     case float_round_to_zero:
2715         increment = 0;
2716         break;
2717     case float_round_up:
2718         increment = !zSign && zSig1;
2719         break;
2720     case float_round_down:
2721         increment = zSign && zSig1;
2722         break;
2723     default:
2724         abort();
2725     }
2726     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2727         if (    ( 0x7FFE < zExp )
2728              || (    ( zExp == 0x7FFE )
2729                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2730                   && increment
2731                 )
2732            ) {
2733             roundMask = 0;
2734  overflow:
2735             float_raise(float_flag_overflow | float_flag_inexact, status);
2736             if (    ( roundingMode == float_round_to_zero )
2737                  || ( zSign && ( roundingMode == float_round_up ) )
2738                  || ( ! zSign && ( roundingMode == float_round_down ) )
2739                ) {
2740                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2741             }
2742             return packFloatx80(zSign,
2743                                 floatx80_infinity_high,
2744                                 floatx80_infinity_low);
2745         }
2746         if ( zExp <= 0 ) {
2747             isTiny =
2748                    (status->float_detect_tininess
2749                     == float_tininess_before_rounding)
2750                 || ( zExp < 0 )
2751                 || ! increment
2752                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2753             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2754             zExp = 0;
2755             if (isTiny && zSig1) {
2756                 float_raise(float_flag_underflow, status);
2757             }
2758             if (zSig1) {
2759                 status->float_exception_flags |= float_flag_inexact;
2760             }
2761             switch (roundingMode) {
2762             case float_round_nearest_even:
2763             case float_round_ties_away:
2764                 increment = ((int64_t)zSig1 < 0);
2765                 break;
2766             case float_round_to_zero:
2767                 increment = 0;
2768                 break;
2769             case float_round_up:
2770                 increment = !zSign && zSig1;
2771                 break;
2772             case float_round_down:
2773                 increment = zSign && zSig1;
2774                 break;
2775             default:
2776                 abort();
2777             }
2778             if ( increment ) {
2779                 ++zSig0;
2780                 zSig0 &=
2781                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2782                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2783             }
2784             return packFloatx80( zSign, zExp, zSig0 );
2785         }
2786     }
2787     if (zSig1) {
2788         status->float_exception_flags |= float_flag_inexact;
2789     }
2790     if ( increment ) {
2791         ++zSig0;
2792         if ( zSig0 == 0 ) {
2793             ++zExp;
2794             zSig0 = LIT64( 0x8000000000000000 );
2795         }
2796         else {
2797             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2798         }
2799     }
2800     else {
2801         if ( zSig0 == 0 ) zExp = 0;
2802     }
2803     return packFloatx80( zSign, zExp, zSig0 );
2804
2805 }
2806
2807 /*----------------------------------------------------------------------------
2808 | Takes an abstract floating-point value having sign `zSign', exponent
2809 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2810 | and returns the proper extended double-precision floating-point value
2811 | corresponding to the abstract input.  This routine is just like
2812 | `roundAndPackFloatx80' except that the input significand does not have to be
2813 | normalized.
2814 *----------------------------------------------------------------------------*/
2815
2816 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2817                                        flag zSign, int32_t zExp,
2818                                        uint64_t zSig0, uint64_t zSig1,
2819                                        float_status *status)
2820 {
2821     int8_t shiftCount;
2822
2823     if ( zSig0 == 0 ) {
2824         zSig0 = zSig1;
2825         zSig1 = 0;
2826         zExp -= 64;
2827     }
2828     shiftCount = countLeadingZeros64( zSig0 );
2829     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2830     zExp -= shiftCount;
2831     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2832                                 zSig0, zSig1, status);
2833
2834 }
2835
2836 /*----------------------------------------------------------------------------
2837 | Returns the least-significant 64 fraction bits of the quadruple-precision
2838 | floating-point value `a'.
2839 *----------------------------------------------------------------------------*/
2840
2841 static inline uint64_t extractFloat128Frac1( float128 a )
2842 {
2843
2844     return a.low;
2845
2846 }
2847
2848 /*----------------------------------------------------------------------------
2849 | Returns the most-significant 48 fraction bits of the quadruple-precision
2850 | floating-point value `a'.
2851 *----------------------------------------------------------------------------*/
2852
2853 static inline uint64_t extractFloat128Frac0( float128 a )
2854 {
2855
2856     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2857
2858 }
2859
2860 /*----------------------------------------------------------------------------
2861 | Returns the exponent bits of the quadruple-precision floating-point value
2862 | `a'.
2863 *----------------------------------------------------------------------------*/
2864
2865 static inline int32_t extractFloat128Exp( float128 a )
2866 {
2867
2868     return ( a.high>>48 ) & 0x7FFF;
2869
2870 }
2871
2872 /*----------------------------------------------------------------------------
2873 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2874 *----------------------------------------------------------------------------*/
2875
2876 static inline flag extractFloat128Sign( float128 a )
2877 {
2878
2879     return a.high>>63;
2880
2881 }
2882
2883 /*----------------------------------------------------------------------------
2884 | Normalizes the subnormal quadruple-precision floating-point value
2885 | represented by the denormalized significand formed by the concatenation of
2886 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2887 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2888 | significand are stored at the location pointed to by `zSig0Ptr', and the
2889 | least significant 64 bits of the normalized significand are stored at the
2890 | location pointed to by `zSig1Ptr'.
2891 *----------------------------------------------------------------------------*/
2892
2893 static void
2894  normalizeFloat128Subnormal(
2895      uint64_t aSig0,
2896      uint64_t aSig1,
2897      int32_t *zExpPtr,
2898      uint64_t *zSig0Ptr,
2899      uint64_t *zSig1Ptr
2900  )
2901 {
2902     int8_t shiftCount;
2903
2904     if ( aSig0 == 0 ) {
2905         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2906         if ( shiftCount < 0 ) {
2907             *zSig0Ptr = aSig1>>( - shiftCount );
2908             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2909         }
2910         else {
2911             *zSig0Ptr = aSig1<<shiftCount;
2912             *zSig1Ptr = 0;
2913         }
2914         *zExpPtr = - shiftCount - 63;
2915     }
2916     else {
2917         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2918         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2919         *zExpPtr = 1 - shiftCount;
2920     }
2921
2922 }
2923
2924 /*----------------------------------------------------------------------------
2925 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2926 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2927 | floating-point value, returning the result.  After being shifted into the
2928 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2929 | added together to form the most significant 32 bits of the result.  This
2930 | means that any integer portion of `zSig0' will be added into the exponent.
2931 | Since a properly normalized significand will have an integer portion equal
2932 | to 1, the `zExp' input should be 1 less than the desired result exponent
2933 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2934 | significand.
2935 *----------------------------------------------------------------------------*/
2936
2937 static inline float128
2938  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2939 {
2940     float128 z;
2941
2942     z.low = zSig1;
2943     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2944     return z;
2945
2946 }
2947
2948 /*----------------------------------------------------------------------------
2949 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2950 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2951 | and `zSig2', and returns the proper quadruple-precision floating-point value
2952 | corresponding to the abstract input.  Ordinarily, the abstract value is
2953 | simply rounded and packed into the quadruple-precision format, with the
2954 | inexact exception raised if the abstract input cannot be represented
2955 | exactly.  However, if the abstract value is too large, the overflow and
2956 | inexact exceptions are raised and an infinity or maximal finite value is
2957 | returned.  If the abstract value is too small, the input value is rounded to
2958 | a subnormal number, and the underflow and inexact exceptions are raised if
2959 | the abstract input cannot be represented exactly as a subnormal quadruple-
2960 | precision floating-point number.
2961 |     The input significand must be normalized or smaller.  If the input
2962 | significand is not normalized, `zExp' must be 0; in that case, the result
2963 | returned is a subnormal number, and it must not require rounding.  In the
2964 | usual case that the input significand is normalized, `zExp' must be 1 less
2965 | than the ``true'' floating-point exponent.  The handling of underflow and
2966 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2967 *----------------------------------------------------------------------------*/
2968
2969 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2970                                      uint64_t zSig0, uint64_t zSig1,
2971                                      uint64_t zSig2, float_status *status)
2972 {
2973     int8_t roundingMode;
2974     flag roundNearestEven, increment, isTiny;
2975
2976     roundingMode = status->float_rounding_mode;
2977     roundNearestEven = ( roundingMode == float_round_nearest_even );
2978     switch (roundingMode) {
2979     case float_round_nearest_even:
2980     case float_round_ties_away:
2981         increment = ((int64_t)zSig2 < 0);
2982         break;
2983     case float_round_to_zero:
2984         increment = 0;
2985         break;
2986     case float_round_up:
2987         increment = !zSign && zSig2;
2988         break;
2989     case float_round_down:
2990         increment = zSign && zSig2;
2991         break;
2992     case float_round_to_odd:
2993         increment = !(zSig1 & 0x1) && zSig2;
2994         break;
2995     default:
2996         abort();
2997     }
2998     if ( 0x7FFD <= (uint32_t) zExp ) {
2999         if (    ( 0x7FFD < zExp )
3000              || (    ( zExp == 0x7FFD )
3001                   && eq128(
3002                          LIT64( 0x0001FFFFFFFFFFFF ),
3003                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3004                          zSig0,
3005                          zSig1
3006                      )
3007                   && increment
3008                 )
3009            ) {
3010             float_raise(float_flag_overflow | float_flag_inexact, status);
3011             if (    ( roundingMode == float_round_to_zero )
3012                  || ( zSign && ( roundingMode == float_round_up ) )
3013                  || ( ! zSign && ( roundingMode == float_round_down ) )
3014                  || (roundingMode == float_round_to_odd)
3015                ) {
3016                 return
3017                     packFloat128(
3018                         zSign,
3019                         0x7FFE,
3020                         LIT64( 0x0000FFFFFFFFFFFF ),
3021                         LIT64( 0xFFFFFFFFFFFFFFFF )
3022                     );
3023             }
3024             return packFloat128( zSign, 0x7FFF, 0, 0 );
3025         }
3026         if ( zExp < 0 ) {
3027             if (status->flush_to_zero) {
3028                 float_raise(float_flag_output_denormal, status);
3029                 return packFloat128(zSign, 0, 0, 0);
3030             }
3031             isTiny =
3032                    (status->float_detect_tininess
3033                     == float_tininess_before_rounding)
3034                 || ( zExp < -1 )
3035                 || ! increment
3036                 || lt128(
3037                        zSig0,
3038                        zSig1,
3039                        LIT64( 0x0001FFFFFFFFFFFF ),
3040                        LIT64( 0xFFFFFFFFFFFFFFFF )
3041                    );
3042             shift128ExtraRightJamming(
3043                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3044             zExp = 0;
3045             if (isTiny && zSig2) {
3046                 float_raise(float_flag_underflow, status);
3047             }
3048             switch (roundingMode) {
3049             case float_round_nearest_even:
3050             case float_round_ties_away:
3051                 increment = ((int64_t)zSig2 < 0);
3052                 break;
3053             case float_round_to_zero:
3054                 increment = 0;
3055                 break;
3056             case float_round_up:
3057                 increment = !zSign && zSig2;
3058                 break;
3059             case float_round_down:
3060                 increment = zSign && zSig2;
3061                 break;
3062             case float_round_to_odd:
3063                 increment = !(zSig1 & 0x1) && zSig2;
3064                 break;
3065             default:
3066                 abort();
3067             }
3068         }
3069     }
3070     if (zSig2) {
3071         status->float_exception_flags |= float_flag_inexact;
3072     }
3073     if ( increment ) {
3074         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3075         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3076     }
3077     else {
3078         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3079     }
3080     return packFloat128( zSign, zExp, zSig0, zSig1 );
3081
3082 }
3083
3084 /*----------------------------------------------------------------------------
3085 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3086 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3087 | returns the proper quadruple-precision floating-point value corresponding
3088 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3089 | except that the input significand has fewer bits and does not have to be
3090 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3091 | point exponent.
3092 *----------------------------------------------------------------------------*/
3093
3094 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3095                                               uint64_t zSig0, uint64_t zSig1,
3096                                               float_status *status)
3097 {
3098     int8_t shiftCount;
3099     uint64_t zSig2;
3100
3101     if ( zSig0 == 0 ) {
3102         zSig0 = zSig1;
3103         zSig1 = 0;
3104         zExp -= 64;
3105     }
3106     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3107     if ( 0 <= shiftCount ) {
3108         zSig2 = 0;
3109         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3110     }
3111     else {
3112         shift128ExtraRightJamming(
3113             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3114     }
3115     zExp -= shiftCount;
3116     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3117
3118 }
3119
3120
3121 /*----------------------------------------------------------------------------
3122 | Returns the result of converting the 32-bit two's complement integer `a'
3123 | to the extended double-precision floating-point format.  The conversion
3124 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3125 | Arithmetic.
3126 *----------------------------------------------------------------------------*/
3127
3128 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3129 {
3130     flag zSign;
3131     uint32_t absA;
3132     int8_t shiftCount;
3133     uint64_t zSig;
3134
3135     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3136     zSign = ( a < 0 );
3137     absA = zSign ? - a : a;
3138     shiftCount = countLeadingZeros32( absA ) + 32;
3139     zSig = absA;
3140     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3141
3142 }
3143
3144 /*----------------------------------------------------------------------------
3145 | Returns the result of converting the 32-bit two's complement integer `a' to
3146 | the quadruple-precision floating-point format.  The conversion is performed
3147 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3148 *----------------------------------------------------------------------------*/
3149
3150 float128 int32_to_float128(int32_t a, float_status *status)
3151 {
3152     flag zSign;
3153     uint32_t absA;
3154     int8_t shiftCount;
3155     uint64_t zSig0;
3156
3157     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3158     zSign = ( a < 0 );
3159     absA = zSign ? - a : a;
3160     shiftCount = countLeadingZeros32( absA ) + 17;
3161     zSig0 = absA;
3162     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3163
3164 }
3165
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the 64-bit two's complement integer `a'
3168 | to the extended double-precision floating-point format.  The conversion
3169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3170 | Arithmetic.
3171 *----------------------------------------------------------------------------*/
3172
3173 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3174 {
3175     flag zSign;
3176     uint64_t absA;
3177     int8_t shiftCount;
3178
3179     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3180     zSign = ( a < 0 );
3181     absA = zSign ? - a : a;
3182     shiftCount = countLeadingZeros64( absA );
3183     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3184
3185 }
3186
3187 /*----------------------------------------------------------------------------
3188 | Returns the result of converting the 64-bit two's complement integer `a' to
3189 | the quadruple-precision floating-point format.  The conversion is performed
3190 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3191 *----------------------------------------------------------------------------*/
3192
3193 float128 int64_to_float128(int64_t a, float_status *status)
3194 {
3195     flag zSign;
3196     uint64_t absA;
3197     int8_t shiftCount;
3198     int32_t zExp;
3199     uint64_t zSig0, zSig1;
3200
3201     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3202     zSign = ( a < 0 );
3203     absA = zSign ? - a : a;
3204     shiftCount = countLeadingZeros64( absA ) + 49;
3205     zExp = 0x406E - shiftCount;
3206     if ( 64 <= shiftCount ) {
3207         zSig1 = 0;
3208         zSig0 = absA;
3209         shiftCount -= 64;
3210     }
3211     else {
3212         zSig1 = absA;
3213         zSig0 = 0;
3214     }
3215     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3216     return packFloat128( zSign, zExp, zSig0, zSig1 );
3217
3218 }
3219
3220 /*----------------------------------------------------------------------------
3221 | Returns the result of converting the 64-bit unsigned integer `a'
3222 | to the quadruple-precision floating-point format.  The conversion is performed
3223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3224 *----------------------------------------------------------------------------*/
3225
3226 float128 uint64_to_float128(uint64_t a, float_status *status)
3227 {
3228     if (a == 0) {
3229         return float128_zero;
3230     }
3231     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3232 }
3233
3234 /*----------------------------------------------------------------------------
3235 | Returns the result of converting the single-precision floating-point value
3236 | `a' to the extended double-precision floating-point format.  The conversion
3237 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3238 | Arithmetic.
3239 *----------------------------------------------------------------------------*/
3240
3241 floatx80 float32_to_floatx80(float32 a, float_status *status)
3242 {
3243     flag aSign;
3244     int aExp;
3245     uint32_t aSig;
3246
3247     a = float32_squash_input_denormal(a, status);
3248     aSig = extractFloat32Frac( a );
3249     aExp = extractFloat32Exp( a );
3250     aSign = extractFloat32Sign( a );
3251     if ( aExp == 0xFF ) {
3252         if (aSig) {
3253             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3254         }
3255         return packFloatx80(aSign,
3256                             floatx80_infinity_high,
3257                             floatx80_infinity_low);
3258     }
3259     if ( aExp == 0 ) {
3260         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3261         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3262     }
3263     aSig |= 0x00800000;
3264     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3265
3266 }
3267
3268 /*----------------------------------------------------------------------------
3269 | Returns the result of converting the single-precision floating-point value
3270 | `a' to the double-precision floating-point format.  The conversion is
3271 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3272 | Arithmetic.
3273 *----------------------------------------------------------------------------*/
3274
3275 float128 float32_to_float128(float32 a, float_status *status)
3276 {
3277     flag aSign;
3278     int aExp;
3279     uint32_t aSig;
3280
3281     a = float32_squash_input_denormal(a, status);
3282     aSig = extractFloat32Frac( a );
3283     aExp = extractFloat32Exp( a );
3284     aSign = extractFloat32Sign( a );
3285     if ( aExp == 0xFF ) {
3286         if (aSig) {
3287             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3288         }
3289         return packFloat128( aSign, 0x7FFF, 0, 0 );
3290     }
3291     if ( aExp == 0 ) {
3292         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3293         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3294         --aExp;
3295     }
3296     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3297
3298 }
3299
3300 /*----------------------------------------------------------------------------
3301 | Returns the remainder of the single-precision floating-point value `a'
3302 | with respect to the corresponding value `b'.  The operation is performed
3303 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3304 *----------------------------------------------------------------------------*/
3305
3306 float32 float32_rem(float32 a, float32 b, float_status *status)
3307 {
3308     flag aSign, zSign;
3309     int aExp, bExp, expDiff;
3310     uint32_t aSig, bSig;
3311     uint32_t q;
3312     uint64_t aSig64, bSig64, q64;
3313     uint32_t alternateASig;
3314     int32_t sigMean;
3315     a = float32_squash_input_denormal(a, status);
3316     b = float32_squash_input_denormal(b, status);
3317
3318     aSig = extractFloat32Frac( a );
3319     aExp = extractFloat32Exp( a );
3320     aSign = extractFloat32Sign( a );
3321     bSig = extractFloat32Frac( b );
3322     bExp = extractFloat32Exp( b );
3323     if ( aExp == 0xFF ) {
3324         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3325             return propagateFloat32NaN(a, b, status);
3326         }
3327         float_raise(float_flag_invalid, status);
3328         return float32_default_nan(status);
3329     }
3330     if ( bExp == 0xFF ) {
3331         if (bSig) {
3332             return propagateFloat32NaN(a, b, status);
3333         }
3334         return a;
3335     }
3336     if ( bExp == 0 ) {
3337         if ( bSig == 0 ) {
3338             float_raise(float_flag_invalid, status);
3339             return float32_default_nan(status);
3340         }
3341         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3342     }
3343     if ( aExp == 0 ) {
3344         if ( aSig == 0 ) return a;
3345         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3346     }
3347     expDiff = aExp - bExp;
3348     aSig |= 0x00800000;
3349     bSig |= 0x00800000;
3350     if ( expDiff < 32 ) {
3351         aSig <<= 8;
3352         bSig <<= 8;
3353         if ( expDiff < 0 ) {
3354             if ( expDiff < -1 ) return a;
3355             aSig >>= 1;
3356         }
3357         q = ( bSig <= aSig );
3358         if ( q ) aSig -= bSig;
3359         if ( 0 < expDiff ) {
3360             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3361             q >>= 32 - expDiff;
3362             bSig >>= 2;
3363             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3364         }
3365         else {
3366             aSig >>= 2;
3367             bSig >>= 2;
3368         }
3369     }
3370     else {
3371         if ( bSig <= aSig ) aSig -= bSig;
3372         aSig64 = ( (uint64_t) aSig )<<40;
3373         bSig64 = ( (uint64_t) bSig )<<40;
3374         expDiff -= 64;
3375         while ( 0 < expDiff ) {
3376             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3377             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3378             aSig64 = - ( ( bSig * q64 )<<38 );
3379             expDiff -= 62;
3380         }
3381         expDiff += 64;
3382         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3383         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3384         q = q64>>( 64 - expDiff );
3385         bSig <<= 6;
3386         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3387     }
3388     do {
3389         alternateASig = aSig;
3390         ++q;
3391         aSig -= bSig;
3392     } while ( 0 <= (int32_t) aSig );
3393     sigMean = aSig + alternateASig;
3394     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3395         aSig = alternateASig;
3396     }
3397     zSign = ( (int32_t) aSig < 0 );
3398     if ( zSign ) aSig = - aSig;
3399     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3400 }
3401
3402
3403
3404 /*----------------------------------------------------------------------------
3405 | Returns the binary exponential of the single-precision floating-point value
3406 | `a'. The operation is performed according to the IEC/IEEE Standard for
3407 | Binary Floating-Point Arithmetic.
3408 |
3409 | Uses the following identities:
3410 |
3411 | 1. -------------------------------------------------------------------------
3412 |      x    x*ln(2)
3413 |     2  = e
3414 |
3415 | 2. -------------------------------------------------------------------------
3416 |                      2     3     4     5           n
3417 |      x        x     x     x     x     x           x
3418 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3419 |               1!    2!    3!    4!    5!          n!
3420 *----------------------------------------------------------------------------*/
3421
3422 static const float64 float32_exp2_coefficients[15] =
3423 {
3424     const_float64( 0x3ff0000000000000ll ), /*  1 */
3425     const_float64( 0x3fe0000000000000ll ), /*  2 */
3426     const_float64( 0x3fc5555555555555ll ), /*  3 */
3427     const_float64( 0x3fa5555555555555ll ), /*  4 */
3428     const_float64( 0x3f81111111111111ll ), /*  5 */
3429     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3430     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3431     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3432     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3433     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3434     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3435     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3436     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3437     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3438     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3439 };
3440
3441 float32 float32_exp2(float32 a, float_status *status)
3442 {
3443     flag aSign;
3444     int aExp;
3445     uint32_t aSig;
3446     float64 r, x, xn;
3447     int i;
3448     a = float32_squash_input_denormal(a, status);
3449
3450     aSig = extractFloat32Frac( a );
3451     aExp = extractFloat32Exp( a );
3452     aSign = extractFloat32Sign( a );
3453
3454     if ( aExp == 0xFF) {
3455         if (aSig) {
3456             return propagateFloat32NaN(a, float32_zero, status);
3457         }
3458         return (aSign) ? float32_zero : a;
3459     }
3460     if (aExp == 0) {
3461         if (aSig == 0) return float32_one;
3462     }
3463
3464     float_raise(float_flag_inexact, status);
3465
3466     /* ******************************* */
3467     /* using float64 for approximation */
3468     /* ******************************* */
3469     x = float32_to_float64(a, status);
3470     x = float64_mul(x, float64_ln2, status);
3471
3472     xn = x;
3473     r = float64_one;
3474     for (i = 0 ; i < 15 ; i++) {
3475         float64 f;
3476
3477         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3478         r = float64_add(r, f, status);
3479
3480         xn = float64_mul(xn, x, status);
3481     }
3482
3483     return float64_to_float32(r, status);
3484 }
3485
3486 /*----------------------------------------------------------------------------
3487 | Returns the binary log of the single-precision floating-point value `a'.
3488 | The operation is performed according to the IEC/IEEE Standard for Binary
3489 | Floating-Point Arithmetic.
3490 *----------------------------------------------------------------------------*/
3491 float32 float32_log2(float32 a, float_status *status)
3492 {
3493     flag aSign, zSign;
3494     int aExp;
3495     uint32_t aSig, zSig, i;
3496
3497     a = float32_squash_input_denormal(a, status);
3498     aSig = extractFloat32Frac( a );
3499     aExp = extractFloat32Exp( a );
3500     aSign = extractFloat32Sign( a );
3501
3502     if ( aExp == 0 ) {
3503         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3504         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3505     }
3506     if ( aSign ) {
3507         float_raise(float_flag_invalid, status);
3508         return float32_default_nan(status);
3509     }
3510     if ( aExp == 0xFF ) {
3511         if (aSig) {
3512             return propagateFloat32NaN(a, float32_zero, status);
3513         }
3514         return a;
3515     }
3516
3517     aExp -= 0x7F;
3518     aSig |= 0x00800000;
3519     zSign = aExp < 0;
3520     zSig = aExp << 23;
3521
3522     for (i = 1 << 22; i > 0; i >>= 1) {
3523         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3524         if ( aSig & 0x01000000 ) {
3525             aSig >>= 1;
3526             zSig |= i;
3527         }
3528     }
3529
3530     if ( zSign )
3531         zSig = -zSig;
3532
3533     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3534 }
3535
3536 /*----------------------------------------------------------------------------
3537 | Returns 1 if the single-precision floating-point value `a' is equal to
3538 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3539 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3540 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542
3543 int float32_eq(float32 a, float32 b, float_status *status)
3544 {
3545     uint32_t av, bv;
3546     a = float32_squash_input_denormal(a, status);
3547     b = float32_squash_input_denormal(b, status);
3548
3549     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3550          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3551        ) {
3552         float_raise(float_flag_invalid, status);
3553         return 0;
3554     }
3555     av = float32_val(a);
3556     bv = float32_val(b);
3557     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3558 }
3559
3560 /*----------------------------------------------------------------------------
3561 | Returns 1 if the single-precision floating-point value `a' is less than
3562 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3563 | exception is raised if either operand is a NaN.  The comparison is performed
3564 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3565 *----------------------------------------------------------------------------*/
3566
3567 int float32_le(float32 a, float32 b, float_status *status)
3568 {
3569     flag aSign, bSign;
3570     uint32_t av, bv;
3571     a = float32_squash_input_denormal(a, status);
3572     b = float32_squash_input_denormal(b, status);
3573
3574     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3575          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3576        ) {
3577         float_raise(float_flag_invalid, status);
3578         return 0;
3579     }
3580     aSign = extractFloat32Sign( a );
3581     bSign = extractFloat32Sign( b );
3582     av = float32_val(a);
3583     bv = float32_val(b);
3584     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3585     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3586
3587 }
3588
3589 /*----------------------------------------------------------------------------
3590 | Returns 1 if the single-precision floating-point value `a' is less than
3591 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3592 | raised if either operand is a NaN.  The comparison is performed according
3593 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3594 *----------------------------------------------------------------------------*/
3595
3596 int float32_lt(float32 a, float32 b, float_status *status)
3597 {
3598     flag aSign, bSign;
3599     uint32_t av, bv;
3600     a = float32_squash_input_denormal(a, status);
3601     b = float32_squash_input_denormal(b, status);
3602
3603     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3604          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3605        ) {
3606         float_raise(float_flag_invalid, status);
3607         return 0;
3608     }
3609     aSign = extractFloat32Sign( a );
3610     bSign = extractFloat32Sign( b );
3611     av = float32_val(a);
3612     bv = float32_val(b);
3613     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3614     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3615
3616 }
3617
3618 /*----------------------------------------------------------------------------
3619 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3620 | be compared, and 0 otherwise.  The invalid exception is raised if either
3621 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3622 | Standard for Binary Floating-Point Arithmetic.
3623 *----------------------------------------------------------------------------*/
3624
3625 int float32_unordered(float32 a, float32 b, float_status *status)
3626 {
3627     a = float32_squash_input_denormal(a, status);
3628     b = float32_squash_input_denormal(b, status);
3629
3630     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3631          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3632        ) {
3633         float_raise(float_flag_invalid, status);
3634         return 1;
3635     }
3636     return 0;
3637 }
3638
3639 /*----------------------------------------------------------------------------
3640 | Returns 1 if the single-precision floating-point value `a' is equal to
3641 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3642 | exception.  The comparison is performed according to the IEC/IEEE Standard
3643 | for Binary Floating-Point Arithmetic.
3644 *----------------------------------------------------------------------------*/
3645
3646 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3647 {
3648     a = float32_squash_input_denormal(a, status);
3649     b = float32_squash_input_denormal(b, status);
3650
3651     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3652          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3653        ) {
3654         if (float32_is_signaling_nan(a, status)
3655          || float32_is_signaling_nan(b, status)) {
3656             float_raise(float_flag_invalid, status);
3657         }
3658         return 0;
3659     }
3660     return ( float32_val(a) == float32_val(b) ) ||
3661             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3662 }
3663
3664 /*----------------------------------------------------------------------------
3665 | Returns 1 if the single-precision floating-point value `a' is less than or
3666 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3667 | cause an exception.  Otherwise, the comparison is performed according to the
3668 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3669 *----------------------------------------------------------------------------*/
3670
3671 int float32_le_quiet(float32 a, float32 b, float_status *status)
3672 {
3673     flag aSign, bSign;
3674     uint32_t av, bv;
3675     a = float32_squash_input_denormal(a, status);
3676     b = float32_squash_input_denormal(b, status);
3677
3678     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3679          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3680        ) {
3681         if (float32_is_signaling_nan(a, status)
3682          || float32_is_signaling_nan(b, status)) {
3683             float_raise(float_flag_invalid, status);
3684         }
3685         return 0;
3686     }
3687     aSign = extractFloat32Sign( a );
3688     bSign = extractFloat32Sign( b );
3689     av = float32_val(a);
3690     bv = float32_val(b);
3691     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3692     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3693
3694 }
3695
3696 /*----------------------------------------------------------------------------
3697 | Returns 1 if the single-precision floating-point value `a' is less than
3698 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3699 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3700 | Standard for Binary Floating-Point Arithmetic.
3701 *----------------------------------------------------------------------------*/
3702
3703 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3704 {
3705     flag aSign, bSign;
3706     uint32_t av, bv;
3707     a = float32_squash_input_denormal(a, status);
3708     b = float32_squash_input_denormal(b, status);
3709
3710     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3711          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3712        ) {
3713         if (float32_is_signaling_nan(a, status)
3714          || float32_is_signaling_nan(b, status)) {
3715             float_raise(float_flag_invalid, status);
3716         }
3717         return 0;
3718     }
3719     aSign = extractFloat32Sign( a );
3720     bSign = extractFloat32Sign( b );
3721     av = float32_val(a);
3722     bv = float32_val(b);
3723     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3724     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3725
3726 }
3727
3728 /*----------------------------------------------------------------------------
3729 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3730 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3731 | comparison is performed according to the IEC/IEEE Standard for Binary
3732 | Floating-Point Arithmetic.
3733 *----------------------------------------------------------------------------*/
3734
3735 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3736 {
3737     a = float32_squash_input_denormal(a, status);
3738     b = float32_squash_input_denormal(b, status);
3739
3740     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3741          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3742        ) {
3743         if (float32_is_signaling_nan(a, status)
3744          || float32_is_signaling_nan(b, status)) {
3745             float_raise(float_flag_invalid, status);
3746         }
3747         return 1;
3748     }
3749     return 0;
3750 }
3751
3752 /*----------------------------------------------------------------------------
3753 | If `a' is denormal and we are in flush-to-zero mode then set the
3754 | input-denormal exception and return zero. Otherwise just return the value.
3755 *----------------------------------------------------------------------------*/
3756 float16 float16_squash_input_denormal(float16 a, float_status *status)
3757 {
3758     if (status->flush_inputs_to_zero) {
3759         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3760             float_raise(float_flag_input_denormal, status);
3761             return make_float16(float16_val(a) & 0x8000);
3762         }
3763     }
3764     return a;
3765 }
3766
3767 /*----------------------------------------------------------------------------
3768 | Returns the result of converting the double-precision floating-point value
3769 | `a' to the extended double-precision floating-point format.  The conversion
3770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3771 | Arithmetic.
3772 *----------------------------------------------------------------------------*/
3773
3774 floatx80 float64_to_floatx80(float64 a, float_status *status)
3775 {
3776     flag aSign;
3777     int aExp;
3778     uint64_t aSig;
3779
3780     a = float64_squash_input_denormal(a, status);
3781     aSig = extractFloat64Frac( a );
3782     aExp = extractFloat64Exp( a );
3783     aSign = extractFloat64Sign( a );
3784     if ( aExp == 0x7FF ) {
3785         if (aSig) {
3786             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3787         }
3788         return packFloatx80(aSign,
3789                             floatx80_infinity_high,
3790                             floatx80_infinity_low);
3791     }
3792     if ( aExp == 0 ) {
3793         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3794         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3795     }
3796     return
3797         packFloatx80(
3798             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3799
3800 }
3801
3802 /*----------------------------------------------------------------------------
3803 | Returns the result of converting the double-precision floating-point value
3804 | `a' to the quadruple-precision floating-point format.  The conversion is
3805 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3806 | Arithmetic.
3807 *----------------------------------------------------------------------------*/
3808
3809 float128 float64_to_float128(float64 a, float_status *status)
3810 {
3811     flag aSign;
3812     int aExp;
3813     uint64_t aSig, zSig0, zSig1;
3814
3815     a = float64_squash_input_denormal(a, status);
3816     aSig = extractFloat64Frac( a );
3817     aExp = extractFloat64Exp( a );
3818     aSign = extractFloat64Sign( a );
3819     if ( aExp == 0x7FF ) {
3820         if (aSig) {
3821             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3822         }
3823         return packFloat128( aSign, 0x7FFF, 0, 0 );
3824     }
3825     if ( aExp == 0 ) {
3826         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3827         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3828         --aExp;
3829     }
3830     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3831     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3832
3833 }
3834
3835
3836 /*----------------------------------------------------------------------------
3837 | Returns the remainder of the double-precision floating-point value `a'
3838 | with respect to the corresponding value `b'.  The operation is performed
3839 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3840 *----------------------------------------------------------------------------*/
3841
3842 float64 float64_rem(float64 a, float64 b, float_status *status)
3843 {
3844     flag aSign, zSign;
3845     int aExp, bExp, expDiff;
3846     uint64_t aSig, bSig;
3847     uint64_t q, alternateASig;
3848     int64_t sigMean;
3849
3850     a = float64_squash_input_denormal(a, status);
3851     b = float64_squash_input_denormal(b, status);
3852     aSig = extractFloat64Frac( a );
3853     aExp = extractFloat64Exp( a );
3854     aSign = extractFloat64Sign( a );
3855     bSig = extractFloat64Frac( b );
3856     bExp = extractFloat64Exp( b );
3857     if ( aExp == 0x7FF ) {
3858         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3859             return propagateFloat64NaN(a, b, status);
3860         }
3861         float_raise(float_flag_invalid, status);
3862         return float64_default_nan(status);
3863     }
3864     if ( bExp == 0x7FF ) {
3865         if (bSig) {
3866             return propagateFloat64NaN(a, b, status);
3867         }
3868         return a;
3869     }
3870     if ( bExp == 0 ) {
3871         if ( bSig == 0 ) {
3872             float_raise(float_flag_invalid, status);
3873             return float64_default_nan(status);
3874         }
3875         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3876     }
3877     if ( aExp == 0 ) {
3878         if ( aSig == 0 ) return a;
3879         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3880     }
3881     expDiff = aExp - bExp;
3882     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3883     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3884     if ( expDiff < 0 ) {
3885         if ( expDiff < -1 ) return a;
3886         aSig >>= 1;
3887     }
3888     q = ( bSig <= aSig );
3889     if ( q ) aSig -= bSig;
3890     expDiff -= 64;
3891     while ( 0 < expDiff ) {
3892         q = estimateDiv128To64( aSig, 0, bSig );
3893         q = ( 2 < q ) ? q - 2 : 0;
3894         aSig = - ( ( bSig>>2 ) * q );
3895         expDiff -= 62;
3896     }
3897     expDiff += 64;
3898     if ( 0 < expDiff ) {
3899         q = estimateDiv128To64( aSig, 0, bSig );
3900         q = ( 2 < q ) ? q - 2 : 0;
3901         q >>= 64 - expDiff;
3902         bSig >>= 2;
3903         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3904     }
3905     else {
3906         aSig >>= 2;
3907         bSig >>= 2;
3908     }
3909     do {
3910         alternateASig = aSig;
3911         ++q;
3912         aSig -= bSig;
3913     } while ( 0 <= (int64_t) aSig );
3914     sigMean = aSig + alternateASig;
3915     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3916         aSig = alternateASig;
3917     }
3918     zSign = ( (int64_t) aSig < 0 );
3919     if ( zSign ) aSig = - aSig;
3920     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
3921
3922 }
3923
3924 /*----------------------------------------------------------------------------
3925 | Returns the binary log of the double-precision floating-point value `a'.
3926 | The operation is performed according to the IEC/IEEE Standard for Binary
3927 | Floating-Point Arithmetic.
3928 *----------------------------------------------------------------------------*/
3929 float64 float64_log2(float64 a, float_status *status)
3930 {
3931     flag aSign, zSign;
3932     int aExp;
3933     uint64_t aSig, aSig0, aSig1, zSig, i;
3934     a = float64_squash_input_denormal(a, status);
3935
3936     aSig = extractFloat64Frac( a );
3937     aExp = extractFloat64Exp( a );
3938     aSign = extractFloat64Sign( a );
3939
3940     if ( aExp == 0 ) {
3941         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3942         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3943     }
3944     if ( aSign ) {
3945         float_raise(float_flag_invalid, status);
3946         return float64_default_nan(status);
3947     }
3948     if ( aExp == 0x7FF ) {
3949         if (aSig) {
3950             return propagateFloat64NaN(a, float64_zero, status);
3951         }
3952         return a;
3953     }
3954
3955     aExp -= 0x3FF;
3956     aSig |= LIT64( 0x0010000000000000 );
3957     zSign = aExp < 0;
3958     zSig = (uint64_t)aExp << 52;
3959     for (i = 1LL << 51; i > 0; i >>= 1) {
3960         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3961         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3962         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3963             aSig >>= 1;
3964             zSig |= i;
3965         }
3966     }
3967
3968     if ( zSign )
3969         zSig = -zSig;
3970     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
3971 }
3972
3973 /*----------------------------------------------------------------------------
3974 | Returns 1 if the double-precision floating-point value `a' is equal to the
3975 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3976 | if either operand is a NaN.  Otherwise, the comparison is performed
3977 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3978 *----------------------------------------------------------------------------*/
3979
3980 int float64_eq(float64 a, float64 b, float_status *status)
3981 {
3982     uint64_t av, bv;
3983     a = float64_squash_input_denormal(a, status);
3984     b = float64_squash_input_denormal(b, status);
3985
3986     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3987          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3988        ) {
3989         float_raise(float_flag_invalid, status);
3990         return 0;
3991     }
3992     av = float64_val(a);
3993     bv = float64_val(b);
3994     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3995
3996 }
3997
3998 /*----------------------------------------------------------------------------
3999 | Returns 1 if the double-precision floating-point value `a' is less than or
4000 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4001 | exception is raised if either operand is a NaN.  The comparison is performed
4002 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4003 *----------------------------------------------------------------------------*/
4004
4005 int float64_le(float64 a, float64 b, float_status *status)
4006 {
4007     flag aSign, bSign;
4008     uint64_t av, bv;
4009     a = float64_squash_input_denormal(a, status);
4010     b = float64_squash_input_denormal(b, status);
4011
4012     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4013          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4014        ) {
4015         float_raise(float_flag_invalid, status);
4016         return 0;
4017     }
4018     aSign = extractFloat64Sign( a );
4019     bSign = extractFloat64Sign( b );
4020     av = float64_val(a);
4021     bv = float64_val(b);
4022     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4023     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4024
4025 }
4026
4027 /*----------------------------------------------------------------------------
4028 | Returns 1 if the double-precision floating-point value `a' is less than
4029 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4030 | raised if either operand is a NaN.  The comparison is performed according
4031 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4032 *----------------------------------------------------------------------------*/
4033
4034 int float64_lt(float64 a, float64 b, float_status *status)
4035 {
4036     flag aSign, bSign;
4037     uint64_t av, bv;
4038
4039     a = float64_squash_input_denormal(a, status);
4040     b = float64_squash_input_denormal(b, status);
4041     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4042          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4043        ) {
4044         float_raise(float_flag_invalid, status);
4045         return 0;
4046     }
4047     aSign = extractFloat64Sign( a );
4048     bSign = extractFloat64Sign( b );
4049     av = float64_val(a);
4050     bv = float64_val(b);
4051     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4052     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4053
4054 }
4055
4056 /*----------------------------------------------------------------------------
4057 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4058 | be compared, and 0 otherwise.  The invalid exception is raised if either
4059 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4060 | Standard for Binary Floating-Point Arithmetic.
4061 *----------------------------------------------------------------------------*/
4062
4063 int float64_unordered(float64 a, float64 b, float_status *status)
4064 {
4065     a = float64_squash_input_denormal(a, status);
4066     b = float64_squash_input_denormal(b, status);
4067
4068     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4069          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4070        ) {
4071         float_raise(float_flag_invalid, status);
4072         return 1;
4073     }
4074     return 0;
4075 }
4076
4077 /*----------------------------------------------------------------------------
4078 | Returns 1 if the double-precision floating-point value `a' is equal to the
4079 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4080 | exception.The comparison is performed according to the IEC/IEEE Standard
4081 | for Binary Floating-Point Arithmetic.
4082 *----------------------------------------------------------------------------*/
4083
4084 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4085 {
4086     uint64_t av, bv;
4087     a = float64_squash_input_denormal(a, status);
4088     b = float64_squash_input_denormal(b, status);
4089
4090     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4091          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4092        ) {
4093         if (float64_is_signaling_nan(a, status)
4094          || float64_is_signaling_nan(b, status)) {
4095             float_raise(float_flag_invalid, status);
4096         }
4097         return 0;
4098     }
4099     av = float64_val(a);
4100     bv = float64_val(b);
4101     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4102
4103 }
4104
4105 /*----------------------------------------------------------------------------
4106 | Returns 1 if the double-precision floating-point value `a' is less than or
4107 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4108 | cause an exception.  Otherwise, the comparison is performed according to the
4109 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4110 *----------------------------------------------------------------------------*/
4111
4112 int float64_le_quiet(float64 a, float64 b, float_status *status)
4113 {
4114     flag aSign, bSign;
4115     uint64_t av, bv;
4116     a = float64_squash_input_denormal(a, status);
4117     b = float64_squash_input_denormal(b, status);
4118
4119     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4120          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4121        ) {
4122         if (float64_is_signaling_nan(a, status)
4123          || float64_is_signaling_nan(b, status)) {
4124             float_raise(float_flag_invalid, status);
4125         }
4126         return 0;
4127     }
4128     aSign = extractFloat64Sign( a );
4129     bSign = extractFloat64Sign( b );
4130     av = float64_val(a);
4131     bv = float64_val(b);
4132     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4133     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4134
4135 }
4136
4137 /*----------------------------------------------------------------------------
4138 | Returns 1 if the double-precision floating-point value `a' is less than
4139 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4140 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4141 | Standard for Binary Floating-Point Arithmetic.
4142 *----------------------------------------------------------------------------*/
4143
4144 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4145 {
4146     flag aSign, bSign;
4147     uint64_t av, bv;
4148     a = float64_squash_input_denormal(a, status);
4149     b = float64_squash_input_denormal(b, status);
4150
4151     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4152          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4153        ) {
4154         if (float64_is_signaling_nan(a, status)
4155          || float64_is_signaling_nan(b, status)) {
4156             float_raise(float_flag_invalid, status);
4157         }
4158         return 0;
4159     }
4160     aSign = extractFloat64Sign( a );
4161     bSign = extractFloat64Sign( b );
4162     av = float64_val(a);
4163     bv = float64_val(b);
4164     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4165     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4166
4167 }
4168
4169 /*----------------------------------------------------------------------------
4170 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4171 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4172 | comparison is performed according to the IEC/IEEE Standard for Binary
4173 | Floating-Point Arithmetic.
4174 *----------------------------------------------------------------------------*/
4175
4176 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4177 {
4178     a = float64_squash_input_denormal(a, status);
4179     b = float64_squash_input_denormal(b, status);
4180
4181     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4182          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4183        ) {
4184         if (float64_is_signaling_nan(a, status)
4185          || float64_is_signaling_nan(b, status)) {
4186             float_raise(float_flag_invalid, status);
4187         }
4188         return 1;
4189     }
4190     return 0;
4191 }
4192
4193 /*----------------------------------------------------------------------------
4194 | Returns the result of converting the extended double-precision floating-
4195 | point value `a' to the 32-bit two's complement integer format.  The
4196 | conversion is performed according to the IEC/IEEE Standard for Binary
4197 | Floating-Point Arithmetic---which means in particular that the conversion
4198 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4199 | largest positive integer is returned.  Otherwise, if the conversion
4200 | overflows, the largest integer with the same sign as `a' is returned.
4201 *----------------------------------------------------------------------------*/
4202
4203 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4204 {
4205     flag aSign;
4206     int32_t aExp, shiftCount;
4207     uint64_t aSig;
4208
4209     if (floatx80_invalid_encoding(a)) {
4210         float_raise(float_flag_invalid, status);
4211         return 1 << 31;
4212     }
4213     aSig = extractFloatx80Frac( a );
4214     aExp = extractFloatx80Exp( a );
4215     aSign = extractFloatx80Sign( a );
4216     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4217     shiftCount = 0x4037 - aExp;
4218     if ( shiftCount <= 0 ) shiftCount = 1;
4219     shift64RightJamming( aSig, shiftCount, &aSig );
4220     return roundAndPackInt32(aSign, aSig, status);
4221
4222 }
4223
4224 /*----------------------------------------------------------------------------
4225 | Returns the result of converting the extended double-precision floating-
4226 | point value `a' to the 32-bit two's complement integer format.  The
4227 | conversion is performed according to the IEC/IEEE Standard for Binary
4228 | Floating-Point Arithmetic, except that the conversion is always rounded
4229 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4230 | Otherwise, if the conversion overflows, the largest integer with the same
4231 | sign as `a' is returned.
4232 *----------------------------------------------------------------------------*/
4233
4234 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4235 {
4236     flag aSign;
4237     int32_t aExp, shiftCount;
4238     uint64_t aSig, savedASig;
4239     int32_t z;
4240
4241     if (floatx80_invalid_encoding(a)) {
4242         float_raise(float_flag_invalid, status);
4243         return 1 << 31;
4244     }
4245     aSig = extractFloatx80Frac( a );
4246     aExp = extractFloatx80Exp( a );
4247     aSign = extractFloatx80Sign( a );
4248     if ( 0x401E < aExp ) {
4249         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4250         goto invalid;
4251     }
4252     else if ( aExp < 0x3FFF ) {
4253         if (aExp || aSig) {
4254             status->float_exception_flags |= float_flag_inexact;
4255         }
4256         return 0;
4257     }
4258     shiftCount = 0x403E - aExp;
4259     savedASig = aSig;
4260     aSig >>= shiftCount;
4261     z = aSig;
4262     if ( aSign ) z = - z;
4263     if ( ( z < 0 ) ^ aSign ) {
4264  invalid:
4265         float_raise(float_flag_invalid, status);
4266         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4267     }
4268     if ( ( aSig<<shiftCount ) != savedASig ) {
4269         status->float_exception_flags |= float_flag_inexact;
4270     }
4271     return z;
4272
4273 }
4274
4275 /*----------------------------------------------------------------------------
4276 | Returns the result of converting the extended double-precision floating-
4277 | point value `a' to the 64-bit two's complement integer format.  The
4278 | conversion is performed according to the IEC/IEEE Standard for Binary
4279 | Floating-Point Arithmetic---which means in particular that the conversion
4280 | is rounded according to the current rounding mode.  If `a' is a NaN,
4281 | the largest positive integer is returned.  Otherwise, if the conversion
4282 | overflows, the largest integer with the same sign as `a' is returned.
4283 *----------------------------------------------------------------------------*/
4284
4285 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4286 {
4287     flag aSign;
4288     int32_t aExp, shiftCount;
4289     uint64_t aSig, aSigExtra;
4290
4291     if (floatx80_invalid_encoding(a)) {
4292         float_raise(float_flag_invalid, status);
4293         return 1ULL << 63;
4294     }
4295     aSig = extractFloatx80Frac( a );
4296     aExp = extractFloatx80Exp( a );
4297     aSign = extractFloatx80Sign( a );
4298     shiftCount = 0x403E - aExp;
4299     if ( shiftCount <= 0 ) {
4300         if ( shiftCount ) {
4301             float_raise(float_flag_invalid, status);
4302             if (!aSign || floatx80_is_any_nan(a)) {
4303                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4304             }
4305             return (int64_t) LIT64( 0x8000000000000000 );
4306         }
4307         aSigExtra = 0;
4308     }
4309     else {
4310         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4311     }
4312     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4313
4314 }
4315
4316 /*----------------------------------------------------------------------------
4317 | Returns the result of converting the extended double-precision floating-
4318 | point value `a' to the 64-bit two's complement integer format.  The
4319 | conversion is performed according to the IEC/IEEE Standard for Binary
4320 | Floating-Point Arithmetic, except that the conversion is always rounded
4321 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4322 | Otherwise, if the conversion overflows, the largest integer with the same
4323 | sign as `a' is returned.
4324 *----------------------------------------------------------------------------*/
4325
4326 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4327 {
4328     flag aSign;
4329     int32_t aExp, shiftCount;
4330     uint64_t aSig;
4331     int64_t z;
4332
4333     if (floatx80_invalid_encoding(a)) {
4334         float_raise(float_flag_invalid, status);
4335         return 1ULL << 63;
4336     }
4337     aSig = extractFloatx80Frac( a );
4338     aExp = extractFloatx80Exp( a );
4339     aSign = extractFloatx80Sign( a );
4340     shiftCount = aExp - 0x403E;
4341     if ( 0 <= shiftCount ) {
4342         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4343         if ( ( a.high != 0xC03E ) || aSig ) {
4344             float_raise(float_flag_invalid, status);
4345             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4346                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4347             }
4348         }
4349         return (int64_t) LIT64( 0x8000000000000000 );
4350     }
4351     else if ( aExp < 0x3FFF ) {
4352         if (aExp | aSig) {
4353             status->float_exception_flags |= float_flag_inexact;
4354         }
4355         return 0;
4356     }
4357     z = aSig>>( - shiftCount );
4358     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4359         status->float_exception_flags |= float_flag_inexact;
4360     }
4361     if ( aSign ) z = - z;
4362     return z;
4363
4364 }
4365
4366 /*----------------------------------------------------------------------------
4367 | Returns the result of converting the extended double-precision floating-
4368 | point value `a' to the single-precision floating-point format.  The
4369 | conversion is performed according to the IEC/IEEE Standard for Binary
4370 | Floating-Point Arithmetic.
4371 *----------------------------------------------------------------------------*/
4372
4373 float32 floatx80_to_float32(floatx80 a, float_status *status)
4374 {
4375     flag aSign;
4376     int32_t aExp;
4377     uint64_t aSig;
4378
4379     if (floatx80_invalid_encoding(a)) {
4380         float_raise(float_flag_invalid, status);
4381         return float32_default_nan(status);
4382     }
4383     aSig = extractFloatx80Frac( a );
4384     aExp = extractFloatx80Exp( a );
4385     aSign = extractFloatx80Sign( a );
4386     if ( aExp == 0x7FFF ) {
4387         if ( (uint64_t) ( aSig<<1 ) ) {
4388             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4389         }
4390         return packFloat32( aSign, 0xFF, 0 );
4391     }
4392     shift64RightJamming( aSig, 33, &aSig );
4393     if ( aExp || aSig ) aExp -= 0x3F81;
4394     return roundAndPackFloat32(aSign, aExp, aSig, status);
4395
4396 }
4397
4398 /*----------------------------------------------------------------------------
4399 | Returns the result of converting the extended double-precision floating-
4400 | point value `a' to the double-precision floating-point format.  The
4401 | conversion is performed according to the IEC/IEEE Standard for Binary
4402 | Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4404
4405 float64 floatx80_to_float64(floatx80 a, float_status *status)
4406 {
4407     flag aSign;
4408     int32_t aExp;
4409     uint64_t aSig, zSig;
4410
4411     if (floatx80_invalid_encoding(a)) {
4412         float_raise(float_flag_invalid, status);
4413         return float64_default_nan(status);
4414     }
4415     aSig = extractFloatx80Frac( a );
4416     aExp = extractFloatx80Exp( a );
4417     aSign = extractFloatx80Sign( a );
4418     if ( aExp == 0x7FFF ) {
4419         if ( (uint64_t) ( aSig<<1 ) ) {
4420             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4421         }
4422         return packFloat64( aSign, 0x7FF, 0 );
4423     }
4424     shift64RightJamming( aSig, 1, &zSig );
4425     if ( aExp || aSig ) aExp -= 0x3C01;
4426     return roundAndPackFloat64(aSign, aExp, zSig, status);
4427
4428 }
4429
4430 /*----------------------------------------------------------------------------
4431 | Returns the result of converting the extended double-precision floating-
4432 | point value `a' to the quadruple-precision floating-point format.  The
4433 | conversion is performed according to the IEC/IEEE Standard for Binary
4434 | Floating-Point Arithmetic.
4435 *----------------------------------------------------------------------------*/
4436
4437 float128 floatx80_to_float128(floatx80 a, float_status *status)
4438 {
4439     flag aSign;
4440     int aExp;
4441     uint64_t aSig, zSig0, zSig1;
4442
4443     if (floatx80_invalid_encoding(a)) {
4444         float_raise(float_flag_invalid, status);
4445         return float128_default_nan(status);
4446     }
4447     aSig = extractFloatx80Frac( a );
4448     aExp = extractFloatx80Exp( a );
4449     aSign = extractFloatx80Sign( a );
4450     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4451         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4452     }
4453     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4454     return packFloat128( aSign, aExp, zSig0, zSig1 );
4455
4456 }
4457
4458 /*----------------------------------------------------------------------------
4459 | Rounds the extended double-precision floating-point value `a'
4460 | to the precision provided by floatx80_rounding_precision and returns the
4461 | result as an extended double-precision floating-point value.
4462 | The operation is performed according to the IEC/IEEE Standard for Binary
4463 | Floating-Point Arithmetic.
4464 *----------------------------------------------------------------------------*/
4465
4466 floatx80 floatx80_round(floatx80 a, float_status *status)
4467 {
4468     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4469                                 extractFloatx80Sign(a),
4470                                 extractFloatx80Exp(a),
4471                                 extractFloatx80Frac(a), 0, status);
4472 }
4473
4474 /*----------------------------------------------------------------------------
4475 | Rounds the extended double-precision floating-point value `a' to an integer,
4476 | and returns the result as an extended quadruple-precision floating-point
4477 | value.  The operation is performed according to the IEC/IEEE Standard for
4478 | Binary Floating-Point Arithmetic.
4479 *----------------------------------------------------------------------------*/
4480
4481 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4482 {
4483     flag aSign;
4484     int32_t aExp;
4485     uint64_t lastBitMask, roundBitsMask;
4486     floatx80 z;
4487
4488     if (floatx80_invalid_encoding(a)) {
4489         float_raise(float_flag_invalid, status);
4490         return floatx80_default_nan(status);
4491     }
4492     aExp = extractFloatx80Exp( a );
4493     if ( 0x403E <= aExp ) {
4494         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4495             return propagateFloatx80NaN(a, a, status);
4496         }
4497         return a;
4498     }
4499     if ( aExp < 0x3FFF ) {
4500         if (    ( aExp == 0 )
4501              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4502             return a;
4503         }
4504         status->float_exception_flags |= float_flag_inexact;
4505         aSign = extractFloatx80Sign( a );
4506         switch (status->float_rounding_mode) {
4507          case float_round_nearest_even:
4508             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4509                ) {
4510                 return
4511                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4512             }
4513             break;
4514         case float_round_ties_away:
4515             if (aExp == 0x3FFE) {
4516                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4517             }
4518             break;
4519          case float_round_down:
4520             return
4521                   aSign ?
4522                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4523                 : packFloatx80( 0, 0, 0 );
4524          case float_round_up:
4525             return
4526                   aSign ? packFloatx80( 1, 0, 0 )
4527                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4528         }
4529         return packFloatx80( aSign, 0, 0 );
4530     }
4531     lastBitMask = 1;
4532     lastBitMask <<= 0x403E - aExp;
4533     roundBitsMask = lastBitMask - 1;
4534     z = a;
4535     switch (status->float_rounding_mode) {
4536     case float_round_nearest_even:
4537         z.low += lastBitMask>>1;
4538         if ((z.low & roundBitsMask) == 0) {
4539             z.low &= ~lastBitMask;
4540         }
4541         break;
4542     case float_round_ties_away:
4543         z.low += lastBitMask >> 1;
4544         break;
4545     case float_round_to_zero:
4546         break;
4547     case float_round_up:
4548         if (!extractFloatx80Sign(z)) {
4549             z.low += roundBitsMask;
4550         }
4551         break;
4552     case float_round_down:
4553         if (extractFloatx80Sign(z)) {
4554             z.low += roundBitsMask;
4555         }
4556         break;
4557     default:
4558         abort();
4559     }
4560     z.low &= ~ roundBitsMask;
4561     if ( z.low == 0 ) {
4562         ++z.high;
4563         z.low = LIT64( 0x8000000000000000 );
4564     }
4565     if (z.low != a.low) {
4566         status->float_exception_flags |= float_flag_inexact;
4567     }
4568     return z;
4569
4570 }
4571
4572 /*----------------------------------------------------------------------------
4573 | Returns the result of adding the absolute values of the extended double-
4574 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4575 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4576 | The addition is performed according to the IEC/IEEE Standard for Binary
4577 | Floating-Point Arithmetic.
4578 *----------------------------------------------------------------------------*/
4579
4580 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4581                                 float_status *status)
4582 {
4583     int32_t aExp, bExp, zExp;
4584     uint64_t aSig, bSig, zSig0, zSig1;
4585     int32_t expDiff;
4586
4587     aSig = extractFloatx80Frac( a );
4588     aExp = extractFloatx80Exp( a );
4589     bSig = extractFloatx80Frac( b );
4590     bExp = extractFloatx80Exp( b );
4591     expDiff = aExp - bExp;
4592     if ( 0 < expDiff ) {
4593         if ( aExp == 0x7FFF ) {
4594             if ((uint64_t)(aSig << 1)) {
4595                 return propagateFloatx80NaN(a, b, status);
4596             }
4597             return a;
4598         }
4599         if ( bExp == 0 ) --expDiff;
4600         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4601         zExp = aExp;
4602     }
4603     else if ( expDiff < 0 ) {
4604         if ( bExp == 0x7FFF ) {
4605             if ((uint64_t)(bSig << 1)) {
4606                 return propagateFloatx80NaN(a, b, status);
4607             }
4608             return packFloatx80(zSign,
4609                                 floatx80_infinity_high,
4610                                 floatx80_infinity_low);
4611         }
4612         if ( aExp == 0 ) ++expDiff;
4613         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4614         zExp = bExp;
4615     }
4616     else {
4617         if ( aExp == 0x7FFF ) {
4618             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4619                 return propagateFloatx80NaN(a, b, status);
4620             }
4621             return a;
4622         }
4623         zSig1 = 0;
4624         zSig0 = aSig + bSig;
4625         if ( aExp == 0 ) {
4626             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4627             goto roundAndPack;
4628         }
4629         zExp = aExp;
4630         goto shiftRight1;
4631     }
4632     zSig0 = aSig + bSig;
4633     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4634  shiftRight1:
4635     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4636     zSig0 |= LIT64( 0x8000000000000000 );
4637     ++zExp;
4638  roundAndPack:
4639     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4640                                 zSign, zExp, zSig0, zSig1, status);
4641 }
4642
4643 /*----------------------------------------------------------------------------
4644 | Returns the result of subtracting the absolute values of the extended
4645 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4646 | difference is negated before being returned.  `zSign' is ignored if the
4647 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4648 | Standard for Binary Floating-Point Arithmetic.
4649 *----------------------------------------------------------------------------*/
4650
4651 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4652                                 float_status *status)
4653 {
4654     int32_t aExp, bExp, zExp;
4655     uint64_t aSig, bSig, zSig0, zSig1;
4656     int32_t expDiff;
4657
4658     aSig = extractFloatx80Frac( a );
4659     aExp = extractFloatx80Exp( a );
4660     bSig = extractFloatx80Frac( b );
4661     bExp = extractFloatx80Exp( b );
4662     expDiff = aExp - bExp;
4663     if ( 0 < expDiff ) goto aExpBigger;
4664     if ( expDiff < 0 ) goto bExpBigger;
4665     if ( aExp == 0x7FFF ) {
4666         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4667             return propagateFloatx80NaN(a, b, status);
4668         }
4669         float_raise(float_flag_invalid, status);
4670         return floatx80_default_nan(status);
4671     }
4672     if ( aExp == 0 ) {
4673         aExp = 1;
4674         bExp = 1;
4675     }
4676     zSig1 = 0;
4677     if ( bSig < aSig ) goto aBigger;
4678     if ( aSig < bSig ) goto bBigger;
4679     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4680  bExpBigger:
4681     if ( bExp == 0x7FFF ) {
4682         if ((uint64_t)(bSig << 1)) {
4683             return propagateFloatx80NaN(a, b, status);
4684         }
4685         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4686                             floatx80_infinity_low);
4687     }
4688     if ( aExp == 0 ) ++expDiff;
4689     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4690  bBigger:
4691     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4692     zExp = bExp;
4693     zSign ^= 1;
4694     goto normalizeRoundAndPack;
4695  aExpBigger:
4696     if ( aExp == 0x7FFF ) {
4697         if ((uint64_t)(aSig << 1)) {
4698             return propagateFloatx80NaN(a, b, status);
4699         }
4700         return a;
4701     }
4702     if ( bExp == 0 ) --expDiff;
4703     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4704  aBigger:
4705     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4706     zExp = aExp;
4707  normalizeRoundAndPack:
4708     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4709                                          zSign, zExp, zSig0, zSig1, status);
4710 }
4711
4712 /*----------------------------------------------------------------------------
4713 | Returns the result of adding the extended double-precision floating-point
4714 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4715 | Standard for Binary Floating-Point Arithmetic.
4716 *----------------------------------------------------------------------------*/
4717
4718 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4719 {
4720     flag aSign, bSign;
4721
4722     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4723         float_raise(float_flag_invalid, status);
4724         return floatx80_default_nan(status);
4725     }
4726     aSign = extractFloatx80Sign( a );
4727     bSign = extractFloatx80Sign( b );
4728     if ( aSign == bSign ) {
4729         return addFloatx80Sigs(a, b, aSign, status);
4730     }
4731     else {
4732         return subFloatx80Sigs(a, b, aSign, status);
4733     }
4734
4735 }
4736
4737 /*----------------------------------------------------------------------------
4738 | Returns the result of subtracting the extended double-precision floating-
4739 | point values `a' and `b'.  The operation is performed according to the
4740 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4741 *----------------------------------------------------------------------------*/
4742
4743 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4744 {
4745     flag aSign, bSign;
4746
4747     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4748         float_raise(float_flag_invalid, status);
4749         return floatx80_default_nan(status);
4750     }
4751     aSign = extractFloatx80Sign( a );
4752     bSign = extractFloatx80Sign( b );
4753     if ( aSign == bSign ) {
4754         return subFloatx80Sigs(a, b, aSign, status);
4755     }
4756     else {
4757         return addFloatx80Sigs(a, b, aSign, status);
4758     }
4759
4760 }
4761
4762 /*----------------------------------------------------------------------------
4763 | Returns the result of multiplying the extended double-precision floating-
4764 | point values `a' and `b'.  The operation is performed according to the
4765 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4766 *----------------------------------------------------------------------------*/
4767
4768 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
4769 {
4770     flag aSign, bSign, zSign;
4771     int32_t aExp, bExp, zExp;
4772     uint64_t aSig, bSig, zSig0, zSig1;
4773
4774     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4775         float_raise(float_flag_invalid, status);
4776         return floatx80_default_nan(status);
4777     }
4778     aSig = extractFloatx80Frac( a );
4779     aExp = extractFloatx80Exp( a );
4780     aSign = extractFloatx80Sign( a );
4781     bSig = extractFloatx80Frac( b );
4782     bExp = extractFloatx80Exp( b );
4783     bSign = extractFloatx80Sign( b );
4784     zSign = aSign ^ bSign;
4785     if ( aExp == 0x7FFF ) {
4786         if (    (uint64_t) ( aSig<<1 )
4787              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4788             return propagateFloatx80NaN(a, b, status);
4789         }
4790         if ( ( bExp | bSig ) == 0 ) goto invalid;
4791         return packFloatx80(zSign, floatx80_infinity_high,
4792                                    floatx80_infinity_low);
4793     }
4794     if ( bExp == 0x7FFF ) {
4795         if ((uint64_t)(bSig << 1)) {
4796             return propagateFloatx80NaN(a, b, status);
4797         }
4798         if ( ( aExp | aSig ) == 0 ) {
4799  invalid:
4800             float_raise(float_flag_invalid, status);
4801             return floatx80_default_nan(status);
4802         }
4803         return packFloatx80(zSign, floatx80_infinity_high,
4804                                    floatx80_infinity_low);
4805     }
4806     if ( aExp == 0 ) {
4807         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4808         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4809     }
4810     if ( bExp == 0 ) {
4811         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4812         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4813     }
4814     zExp = aExp + bExp - 0x3FFE;
4815     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4816     if ( 0 < (int64_t) zSig0 ) {
4817         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4818         --zExp;
4819     }
4820     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4821                                 zSign, zExp, zSig0, zSig1, status);
4822 }
4823
4824 /*----------------------------------------------------------------------------
4825 | Returns the result of dividing the extended double-precision floating-point
4826 | value `a' by the corresponding value `b'.  The operation is performed
4827 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4828 *----------------------------------------------------------------------------*/
4829
4830 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
4831 {
4832     flag aSign, bSign, zSign;
4833     int32_t aExp, bExp, zExp;
4834     uint64_t aSig, bSig, zSig0, zSig1;
4835     uint64_t rem0, rem1, rem2, term0, term1, term2;
4836
4837     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4838         float_raise(float_flag_invalid, status);
4839         return floatx80_default_nan(status);
4840     }
4841     aSig = extractFloatx80Frac( a );
4842     aExp = extractFloatx80Exp( a );
4843     aSign = extractFloatx80Sign( a );
4844     bSig = extractFloatx80Frac( b );
4845     bExp = extractFloatx80Exp( b );
4846     bSign = extractFloatx80Sign( b );
4847     zSign = aSign ^ bSign;
4848     if ( aExp == 0x7FFF ) {
4849         if ((uint64_t)(aSig << 1)) {
4850             return propagateFloatx80NaN(a, b, status);
4851         }
4852         if ( bExp == 0x7FFF ) {
4853             if ((uint64_t)(bSig << 1)) {
4854                 return propagateFloatx80NaN(a, b, status);
4855             }
4856             goto invalid;
4857         }
4858         return packFloatx80(zSign, floatx80_infinity_high,
4859                                    floatx80_infinity_low);
4860     }
4861     if ( bExp == 0x7FFF ) {
4862         if ((uint64_t)(bSig << 1)) {
4863             return propagateFloatx80NaN(a, b, status);
4864         }
4865         return packFloatx80( zSign, 0, 0 );
4866     }
4867     if ( bExp == 0 ) {
4868         if ( bSig == 0 ) {
4869             if ( ( aExp | aSig ) == 0 ) {
4870  invalid:
4871                 float_raise(float_flag_invalid, status);
4872                 return floatx80_default_nan(status);
4873             }
4874             float_raise(float_flag_divbyzero, status);
4875             return packFloatx80(zSign, floatx80_infinity_high,
4876                                        floatx80_infinity_low);
4877         }
4878         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4879     }
4880     if ( aExp == 0 ) {
4881         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4882         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4883     }
4884     zExp = aExp - bExp + 0x3FFE;
4885     rem1 = 0;
4886     if ( bSig <= aSig ) {
4887         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4888         ++zExp;
4889     }
4890     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4891     mul64To128( bSig, zSig0, &term0, &term1 );
4892     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4893     while ( (int64_t) rem0 < 0 ) {
4894         --zSig0;
4895         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4896     }
4897     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4898     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4899         mul64To128( bSig, zSig1, &term1, &term2 );
4900         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4901         while ( (int64_t) rem1 < 0 ) {
4902             --zSig1;
4903             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4904         }
4905         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4906     }
4907     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4908                                 zSign, zExp, zSig0, zSig1, status);
4909 }
4910
4911 /*----------------------------------------------------------------------------
4912 | Returns the remainder of the extended double-precision floating-point value
4913 | `a' with respect to the corresponding value `b'.  The operation is performed
4914 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4915 *----------------------------------------------------------------------------*/
4916
4917 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
4918 {
4919     flag aSign, zSign;
4920     int32_t aExp, bExp, expDiff;
4921     uint64_t aSig0, aSig1, bSig;
4922     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4923
4924     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4925         float_raise(float_flag_invalid, status);
4926         return floatx80_default_nan(status);
4927     }
4928     aSig0 = extractFloatx80Frac( a );
4929     aExp = extractFloatx80Exp( a );
4930     aSign = extractFloatx80Sign( a );
4931     bSig = extractFloatx80Frac( b );
4932     bExp = extractFloatx80Exp( b );
4933     if ( aExp == 0x7FFF ) {
4934         if (    (uint64_t) ( aSig0<<1 )
4935              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4936             return propagateFloatx80NaN(a, b, status);
4937         }
4938         goto invalid;
4939     }
4940     if ( bExp == 0x7FFF ) {
4941         if ((uint64_t)(bSig << 1)) {
4942             return propagateFloatx80NaN(a, b, status);
4943         }
4944         return a;
4945     }
4946     if ( bExp == 0 ) {
4947         if ( bSig == 0 ) {
4948  invalid:
4949             float_raise(float_flag_invalid, status);
4950             return floatx80_default_nan(status);
4951         }
4952         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4953     }
4954     if ( aExp == 0 ) {
4955         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4956         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4957     }
4958     bSig |= LIT64( 0x8000000000000000 );
4959     zSign = aSign;
4960     expDiff = aExp - bExp;
4961     aSig1 = 0;
4962     if ( expDiff < 0 ) {
4963         if ( expDiff < -1 ) return a;
4964         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4965         expDiff = 0;
4966     }
4967     q = ( bSig <= aSig0 );
4968     if ( q ) aSig0 -= bSig;
4969     expDiff -= 64;
4970     while ( 0 < expDiff ) {
4971         q = estimateDiv128To64( aSig0, aSig1, bSig );
4972         q = ( 2 < q ) ? q - 2 : 0;
4973         mul64To128( bSig, q, &term0, &term1 );
4974         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4975         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4976         expDiff -= 62;
4977     }
4978     expDiff += 64;
4979     if ( 0 < expDiff ) {
4980         q = estimateDiv128To64( aSig0, aSig1, bSig );
4981         q = ( 2 < q ) ? q - 2 : 0;
4982         q >>= 64 - expDiff;
4983         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4984         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4985         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4986         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4987             ++q;
4988             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4989         }
4990     }
4991     else {
4992         term1 = 0;
4993         term0 = bSig;
4994     }
4995     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4996     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4997          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4998               && ( q & 1 ) )
4999        ) {
5000         aSig0 = alternateASig0;
5001         aSig1 = alternateASig1;
5002         zSign = ! zSign;
5003     }
5004     return
5005         normalizeRoundAndPackFloatx80(
5006             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5007
5008 }
5009
5010 /*----------------------------------------------------------------------------
5011 | Returns the square root of the extended double-precision floating-point
5012 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5013 | for Binary Floating-Point Arithmetic.
5014 *----------------------------------------------------------------------------*/
5015
5016 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5017 {
5018     flag aSign;
5019     int32_t aExp, zExp;
5020     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5021     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5022
5023     if (floatx80_invalid_encoding(a)) {
5024         float_raise(float_flag_invalid, status);
5025         return floatx80_default_nan(status);
5026     }
5027     aSig0 = extractFloatx80Frac( a );
5028     aExp = extractFloatx80Exp( a );
5029     aSign = extractFloatx80Sign( a );
5030     if ( aExp == 0x7FFF ) {
5031         if ((uint64_t)(aSig0 << 1)) {
5032             return propagateFloatx80NaN(a, a, status);
5033         }
5034         if ( ! aSign ) return a;
5035         goto invalid;
5036     }
5037     if ( aSign ) {
5038         if ( ( aExp | aSig0 ) == 0 ) return a;
5039  invalid:
5040         float_raise(float_flag_invalid, status);
5041         return floatx80_default_nan(status);
5042     }
5043     if ( aExp == 0 ) {
5044         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5045         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5046     }
5047     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5048     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5049     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5050     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5051     doubleZSig0 = zSig0<<1;
5052     mul64To128( zSig0, zSig0, &term0, &term1 );
5053     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5054     while ( (int64_t) rem0 < 0 ) {
5055         --zSig0;
5056         doubleZSig0 -= 2;
5057         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5058     }
5059     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5060     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5061         if ( zSig1 == 0 ) zSig1 = 1;
5062         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5063         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5064         mul64To128( zSig1, zSig1, &term2, &term3 );
5065         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5066         while ( (int64_t) rem1 < 0 ) {
5067             --zSig1;
5068             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5069             term3 |= 1;
5070             term2 |= doubleZSig0;
5071             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5072         }
5073         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5074     }
5075     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5076     zSig0 |= doubleZSig0;
5077     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5078                                 0, zExp, zSig0, zSig1, status);
5079 }
5080
5081 /*----------------------------------------------------------------------------
5082 | Returns 1 if the extended double-precision floating-point value `a' is equal
5083 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5084 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5085 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5086 *----------------------------------------------------------------------------*/
5087
5088 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5089 {
5090
5091     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5092         || (extractFloatx80Exp(a) == 0x7FFF
5093             && (uint64_t) (extractFloatx80Frac(a) << 1))
5094         || (extractFloatx80Exp(b) == 0x7FFF
5095             && (uint64_t) (extractFloatx80Frac(b) << 1))
5096        ) {
5097         float_raise(float_flag_invalid, status);
5098         return 0;
5099     }
5100     return
5101            ( a.low == b.low )
5102         && (    ( a.high == b.high )
5103              || (    ( a.low == 0 )
5104                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5105            );
5106
5107 }
5108
5109 /*----------------------------------------------------------------------------
5110 | Returns 1 if the extended double-precision floating-point value `a' is
5111 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5112 | invalid exception is raised if either operand is a NaN.  The comparison is
5113 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5114 | Arithmetic.
5115 *----------------------------------------------------------------------------*/
5116
5117 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5118 {
5119     flag aSign, bSign;
5120
5121     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5122         || (extractFloatx80Exp(a) == 0x7FFF
5123             && (uint64_t) (extractFloatx80Frac(a) << 1))
5124         || (extractFloatx80Exp(b) == 0x7FFF
5125             && (uint64_t) (extractFloatx80Frac(b) << 1))
5126        ) {
5127         float_raise(float_flag_invalid, status);
5128         return 0;
5129     }
5130     aSign = extractFloatx80Sign( a );
5131     bSign = extractFloatx80Sign( b );
5132     if ( aSign != bSign ) {
5133         return
5134                aSign
5135             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5136                  == 0 );
5137     }
5138     return
5139           aSign ? le128( b.high, b.low, a.high, a.low )
5140         : le128( a.high, a.low, b.high, b.low );
5141
5142 }
5143
5144 /*----------------------------------------------------------------------------
5145 | Returns 1 if the extended double-precision floating-point value `a' is
5146 | less than the corresponding value `b', and 0 otherwise.  The invalid
5147 | exception is raised if either operand is a NaN.  The comparison is performed
5148 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5149 *----------------------------------------------------------------------------*/
5150
5151 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5152 {
5153     flag aSign, bSign;
5154
5155     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5156         || (extractFloatx80Exp(a) == 0x7FFF
5157             && (uint64_t) (extractFloatx80Frac(a) << 1))
5158         || (extractFloatx80Exp(b) == 0x7FFF
5159             && (uint64_t) (extractFloatx80Frac(b) << 1))
5160        ) {
5161         float_raise(float_flag_invalid, status);
5162         return 0;
5163     }
5164     aSign = extractFloatx80Sign( a );
5165     bSign = extractFloatx80Sign( b );
5166     if ( aSign != bSign ) {
5167         return
5168                aSign
5169             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5170                  != 0 );
5171     }
5172     return
5173           aSign ? lt128( b.high, b.low, a.high, a.low )
5174         : lt128( a.high, a.low, b.high, b.low );
5175
5176 }
5177
5178 /*----------------------------------------------------------------------------
5179 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5180 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5181 | either operand is a NaN.   The comparison is performed according to the
5182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5183 *----------------------------------------------------------------------------*/
5184 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5185 {
5186     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5187         || (extractFloatx80Exp(a) == 0x7FFF
5188             && (uint64_t) (extractFloatx80Frac(a) << 1))
5189         || (extractFloatx80Exp(b) == 0x7FFF
5190             && (uint64_t) (extractFloatx80Frac(b) << 1))
5191        ) {
5192         float_raise(float_flag_invalid, status);
5193         return 1;
5194     }
5195     return 0;
5196 }
5197
5198 /*----------------------------------------------------------------------------
5199 | Returns 1 if the extended double-precision floating-point value `a' is
5200 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5201 | cause an exception.  The comparison is performed according to the IEC/IEEE
5202 | Standard for Binary Floating-Point Arithmetic.
5203 *----------------------------------------------------------------------------*/
5204
5205 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5206 {
5207
5208     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5209         float_raise(float_flag_invalid, status);
5210         return 0;
5211     }
5212     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5213               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5214          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5215               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5216        ) {
5217         if (floatx80_is_signaling_nan(a, status)
5218          || floatx80_is_signaling_nan(b, status)) {
5219             float_raise(float_flag_invalid, status);
5220         }
5221         return 0;
5222     }
5223     return
5224            ( a.low == b.low )
5225         && (    ( a.high == b.high )
5226              || (    ( a.low == 0 )
5227                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5228            );
5229
5230 }
5231
5232 /*----------------------------------------------------------------------------
5233 | Returns 1 if the extended double-precision floating-point value `a' is less
5234 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5235 | do not cause an exception.  Otherwise, the comparison is performed according
5236 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5237 *----------------------------------------------------------------------------*/
5238
5239 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5240 {
5241     flag aSign, bSign;
5242
5243     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5244         float_raise(float_flag_invalid, status);
5245         return 0;
5246     }
5247     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5248               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5249          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5250               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5251        ) {
5252         if (floatx80_is_signaling_nan(a, status)
5253          || floatx80_is_signaling_nan(b, status)) {
5254             float_raise(float_flag_invalid, status);
5255         }
5256         return 0;
5257     }
5258     aSign = extractFloatx80Sign( a );
5259     bSign = extractFloatx80Sign( b );
5260     if ( aSign != bSign ) {
5261         return
5262                aSign
5263             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5264                  == 0 );
5265     }
5266     return
5267           aSign ? le128( b.high, b.low, a.high, a.low )
5268         : le128( a.high, a.low, b.high, b.low );
5269
5270 }
5271
5272 /*----------------------------------------------------------------------------
5273 | Returns 1 if the extended double-precision floating-point value `a' is less
5274 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5275 | an exception.  Otherwise, the comparison is performed according to the
5276 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5277 *----------------------------------------------------------------------------*/
5278
5279 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5280 {
5281     flag aSign, bSign;
5282
5283     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5284         float_raise(float_flag_invalid, status);
5285         return 0;
5286     }
5287     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5288               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5289          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5290               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5291        ) {
5292         if (floatx80_is_signaling_nan(a, status)
5293          || floatx80_is_signaling_nan(b, status)) {
5294             float_raise(float_flag_invalid, status);
5295         }
5296         return 0;
5297     }
5298     aSign = extractFloatx80Sign( a );
5299     bSign = extractFloatx80Sign( b );
5300     if ( aSign != bSign ) {
5301         return
5302                aSign
5303             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5304                  != 0 );
5305     }
5306     return
5307           aSign ? lt128( b.high, b.low, a.high, a.low )
5308         : lt128( a.high, a.low, b.high, b.low );
5309
5310 }
5311
5312 /*----------------------------------------------------------------------------
5313 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5314 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5315 | The comparison is performed according to the IEC/IEEE Standard for Binary
5316 | Floating-Point Arithmetic.
5317 *----------------------------------------------------------------------------*/
5318 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5319 {
5320     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5321         float_raise(float_flag_invalid, status);
5322         return 1;
5323     }
5324     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5325               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5326          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5327               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5328        ) {
5329         if (floatx80_is_signaling_nan(a, status)
5330          || floatx80_is_signaling_nan(b, status)) {
5331             float_raise(float_flag_invalid, status);
5332         }
5333         return 1;
5334     }
5335     return 0;
5336 }
5337
5338 /*----------------------------------------------------------------------------
5339 | Returns the result of converting the quadruple-precision floating-point
5340 | value `a' to the 32-bit two's complement integer format.  The conversion
5341 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5342 | Arithmetic---which means in particular that the conversion is rounded
5343 | according to the current rounding mode.  If `a' is a NaN, the largest
5344 | positive integer is returned.  Otherwise, if the conversion overflows, the
5345 | largest integer with the same sign as `a' is returned.
5346 *----------------------------------------------------------------------------*/
5347
5348 int32_t float128_to_int32(float128 a, float_status *status)
5349 {
5350     flag aSign;
5351     int32_t aExp, shiftCount;
5352     uint64_t aSig0, aSig1;
5353
5354     aSig1 = extractFloat128Frac1( a );
5355     aSig0 = extractFloat128Frac0( a );
5356     aExp = extractFloat128Exp( a );
5357     aSign = extractFloat128Sign( a );
5358     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5359     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5360     aSig0 |= ( aSig1 != 0 );
5361     shiftCount = 0x4028 - aExp;
5362     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5363     return roundAndPackInt32(aSign, aSig0, status);
5364
5365 }
5366
5367 /*----------------------------------------------------------------------------
5368 | Returns the result of converting the quadruple-precision floating-point
5369 | value `a' to the 32-bit two's complement integer format.  The conversion
5370 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5371 | Arithmetic, except that the conversion is always rounded toward zero.  If
5372 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5373 | conversion overflows, the largest integer with the same sign as `a' is
5374 | returned.
5375 *----------------------------------------------------------------------------*/
5376
5377 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5378 {
5379     flag aSign;
5380     int32_t aExp, shiftCount;
5381     uint64_t aSig0, aSig1, savedASig;
5382     int32_t z;
5383
5384     aSig1 = extractFloat128Frac1( a );
5385     aSig0 = extractFloat128Frac0( a );
5386     aExp = extractFloat128Exp( a );
5387     aSign = extractFloat128Sign( a );
5388     aSig0 |= ( aSig1 != 0 );
5389     if ( 0x401E < aExp ) {
5390         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5391         goto invalid;
5392     }
5393     else if ( aExp < 0x3FFF ) {
5394         if (aExp || aSig0) {
5395             status->float_exception_flags |= float_flag_inexact;
5396         }
5397         return 0;
5398     }
5399     aSig0 |= LIT64( 0x0001000000000000 );
5400     shiftCount = 0x402F - aExp;
5401     savedASig = aSig0;
5402     aSig0 >>= shiftCount;
5403     z = aSig0;
5404     if ( aSign ) z = - z;
5405     if ( ( z < 0 ) ^ aSign ) {
5406  invalid:
5407         float_raise(float_flag_invalid, status);
5408         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5409     }
5410     if ( ( aSig0<<shiftCount ) != savedASig ) {
5411         status->float_exception_flags |= float_flag_inexact;
5412     }
5413     return z;
5414
5415 }
5416
5417 /*----------------------------------------------------------------------------
5418 | Returns the result of converting the quadruple-precision floating-point
5419 | value `a' to the 64-bit two's complement integer format.  The conversion
5420 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5421 | Arithmetic---which means in particular that the conversion is rounded
5422 | according to the current rounding mode.  If `a' is a NaN, the largest
5423 | positive integer is returned.  Otherwise, if the conversion overflows, the
5424 | largest integer with the same sign as `a' is returned.
5425 *----------------------------------------------------------------------------*/
5426
5427 int64_t float128_to_int64(float128 a, float_status *status)
5428 {
5429     flag aSign;
5430     int32_t aExp, shiftCount;
5431     uint64_t aSig0, aSig1;
5432
5433     aSig1 = extractFloat128Frac1( a );
5434     aSig0 = extractFloat128Frac0( a );
5435     aExp = extractFloat128Exp( a );
5436     aSign = extractFloat128Sign( a );
5437     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5438     shiftCount = 0x402F - aExp;
5439     if ( shiftCount <= 0 ) {
5440         if ( 0x403E < aExp ) {
5441             float_raise(float_flag_invalid, status);
5442             if (    ! aSign
5443                  || (    ( aExp == 0x7FFF )
5444                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5445                     )
5446                ) {
5447                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5448             }
5449             return (int64_t) LIT64( 0x8000000000000000 );
5450         }
5451         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5452     }
5453     else {
5454         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5455     }
5456     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5457
5458 }
5459
5460 /*----------------------------------------------------------------------------
5461 | Returns the result of converting the quadruple-precision floating-point
5462 | value `a' to the 64-bit two's complement integer format.  The conversion
5463 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5464 | Arithmetic, except that the conversion is always rounded toward zero.
5465 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5466 | the conversion overflows, the largest integer with the same sign as `a' is
5467 | returned.
5468 *----------------------------------------------------------------------------*/
5469
5470 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5471 {
5472     flag aSign;
5473     int32_t aExp, shiftCount;
5474     uint64_t aSig0, aSig1;
5475     int64_t z;
5476
5477     aSig1 = extractFloat128Frac1( a );
5478     aSig0 = extractFloat128Frac0( a );
5479     aExp = extractFloat128Exp( a );
5480     aSign = extractFloat128Sign( a );
5481     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5482     shiftCount = aExp - 0x402F;
5483     if ( 0 < shiftCount ) {
5484         if ( 0x403E <= aExp ) {
5485             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5486             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5487                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5488                 if (aSig1) {
5489                     status->float_exception_flags |= float_flag_inexact;
5490                 }
5491             }
5492             else {
5493                 float_raise(float_flag_invalid, status);
5494                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5495                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5496                 }
5497             }
5498             return (int64_t) LIT64( 0x8000000000000000 );
5499         }
5500         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5501         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5502             status->float_exception_flags |= float_flag_inexact;
5503         }
5504     }
5505     else {
5506         if ( aExp < 0x3FFF ) {
5507             if ( aExp | aSig0 | aSig1 ) {
5508                 status->float_exception_flags |= float_flag_inexact;
5509             }
5510             return 0;
5511         }
5512         z = aSig0>>( - shiftCount );
5513         if (    aSig1
5514              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5515             status->float_exception_flags |= float_flag_inexact;
5516         }
5517     }
5518     if ( aSign ) z = - z;
5519     return z;
5520
5521 }
5522
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of converting the quadruple-precision floating-point value
5525 | `a' to the 64-bit unsigned integer format.  The conversion is
5526 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5527 | Arithmetic---which means in particular that the conversion is rounded
5528 | according to the current rounding mode.  If `a' is a NaN, the largest
5529 | positive integer is returned.  If the conversion overflows, the
5530 | largest unsigned integer is returned.  If 'a' is negative, the value is
5531 | rounded and zero is returned; negative values that do not round to zero
5532 | will raise the inexact exception.
5533 *----------------------------------------------------------------------------*/
5534
5535 uint64_t float128_to_uint64(float128 a, float_status *status)
5536 {
5537     flag aSign;
5538     int aExp;
5539     int shiftCount;
5540     uint64_t aSig0, aSig1;
5541
5542     aSig0 = extractFloat128Frac0(a);
5543     aSig1 = extractFloat128Frac1(a);
5544     aExp = extractFloat128Exp(a);
5545     aSign = extractFloat128Sign(a);
5546     if (aSign && (aExp > 0x3FFE)) {
5547         float_raise(float_flag_invalid, status);
5548         if (float128_is_any_nan(a)) {
5549             return LIT64(0xFFFFFFFFFFFFFFFF);
5550         } else {
5551             return 0;
5552         }
5553     }
5554     if (aExp) {
5555         aSig0 |= LIT64(0x0001000000000000);
5556     }
5557     shiftCount = 0x402F - aExp;
5558     if (shiftCount <= 0) {
5559         if (0x403E < aExp) {
5560             float_raise(float_flag_invalid, status);
5561             return LIT64(0xFFFFFFFFFFFFFFFF);
5562         }
5563         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5564     } else {
5565         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5566     }
5567     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5568 }
5569
5570 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5571 {
5572     uint64_t v;
5573     signed char current_rounding_mode = status->float_rounding_mode;
5574
5575     set_float_rounding_mode(float_round_to_zero, status);
5576     v = float128_to_uint64(a, status);
5577     set_float_rounding_mode(current_rounding_mode, status);
5578
5579     return v;
5580 }
5581
5582 /*----------------------------------------------------------------------------
5583 | Returns the result of converting the quadruple-precision floating-point
5584 | value `a' to the 32-bit unsigned integer format.  The conversion
5585 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5586 | Arithmetic except that the conversion is always rounded toward zero.
5587 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5588 | if the conversion overflows, the largest unsigned integer is returned.
5589 | If 'a' is negative, the value is rounded and zero is returned; negative
5590 | values that do not round to zero will raise the inexact exception.
5591 *----------------------------------------------------------------------------*/
5592
5593 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5594 {
5595     uint64_t v;
5596     uint32_t res;
5597     int old_exc_flags = get_float_exception_flags(status);
5598
5599     v = float128_to_uint64_round_to_zero(a, status);
5600     if (v > 0xffffffff) {
5601         res = 0xffffffff;
5602     } else {
5603         return v;
5604     }
5605     set_float_exception_flags(old_exc_flags, status);
5606     float_raise(float_flag_invalid, status);
5607     return res;
5608 }
5609
5610 /*----------------------------------------------------------------------------
5611 | Returns the result of converting the quadruple-precision floating-point
5612 | value `a' to the single-precision floating-point format.  The conversion
5613 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5614 | Arithmetic.
5615 *----------------------------------------------------------------------------*/
5616
5617 float32 float128_to_float32(float128 a, float_status *status)
5618 {
5619     flag aSign;
5620     int32_t aExp;
5621     uint64_t aSig0, aSig1;
5622     uint32_t zSig;
5623
5624     aSig1 = extractFloat128Frac1( a );
5625     aSig0 = extractFloat128Frac0( a );
5626     aExp = extractFloat128Exp( a );
5627     aSign = extractFloat128Sign( a );
5628     if ( aExp == 0x7FFF ) {
5629         if ( aSig0 | aSig1 ) {
5630             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5631         }
5632         return packFloat32( aSign, 0xFF, 0 );
5633     }
5634     aSig0 |= ( aSig1 != 0 );
5635     shift64RightJamming( aSig0, 18, &aSig0 );
5636     zSig = aSig0;
5637     if ( aExp || zSig ) {
5638         zSig |= 0x40000000;
5639         aExp -= 0x3F81;
5640     }
5641     return roundAndPackFloat32(aSign, aExp, zSig, status);
5642
5643 }
5644
5645 /*----------------------------------------------------------------------------
5646 | Returns the result of converting the quadruple-precision floating-point
5647 | value `a' to the double-precision floating-point format.  The conversion
5648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5649 | Arithmetic.
5650 *----------------------------------------------------------------------------*/
5651
5652 float64 float128_to_float64(float128 a, float_status *status)
5653 {
5654     flag aSign;
5655     int32_t aExp;
5656     uint64_t aSig0, aSig1;
5657
5658     aSig1 = extractFloat128Frac1( a );
5659     aSig0 = extractFloat128Frac0( a );
5660     aExp = extractFloat128Exp( a );
5661     aSign = extractFloat128Sign( a );
5662     if ( aExp == 0x7FFF ) {
5663         if ( aSig0 | aSig1 ) {
5664             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5665         }
5666         return packFloat64( aSign, 0x7FF, 0 );
5667     }
5668     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5669     aSig0 |= ( aSig1 != 0 );
5670     if ( aExp || aSig0 ) {
5671         aSig0 |= LIT64( 0x4000000000000000 );
5672         aExp -= 0x3C01;
5673     }
5674     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5675
5676 }
5677
5678 /*----------------------------------------------------------------------------
5679 | Returns the result of converting the quadruple-precision floating-point
5680 | value `a' to the extended double-precision floating-point format.  The
5681 | conversion is performed according to the IEC/IEEE Standard for Binary
5682 | Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5684
5685 floatx80 float128_to_floatx80(float128 a, float_status *status)
5686 {
5687     flag aSign;
5688     int32_t aExp;
5689     uint64_t aSig0, aSig1;
5690
5691     aSig1 = extractFloat128Frac1( a );
5692     aSig0 = extractFloat128Frac0( a );
5693     aExp = extractFloat128Exp( a );
5694     aSign = extractFloat128Sign( a );
5695     if ( aExp == 0x7FFF ) {
5696         if ( aSig0 | aSig1 ) {
5697             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5698         }
5699         return packFloatx80(aSign, floatx80_infinity_high,
5700                                    floatx80_infinity_low);
5701     }
5702     if ( aExp == 0 ) {
5703         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5704         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5705     }
5706     else {
5707         aSig0 |= LIT64( 0x0001000000000000 );
5708     }
5709     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5710     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5711
5712 }
5713
5714 /*----------------------------------------------------------------------------
5715 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5716 | returns the result as a quadruple-precision floating-point value.  The
5717 | operation is performed according to the IEC/IEEE Standard for Binary
5718 | Floating-Point Arithmetic.
5719 *----------------------------------------------------------------------------*/
5720
5721 float128 float128_round_to_int(float128 a, float_status *status)
5722 {
5723     flag aSign;
5724     int32_t aExp;
5725     uint64_t lastBitMask, roundBitsMask;
5726     float128 z;
5727
5728     aExp = extractFloat128Exp( a );
5729     if ( 0x402F <= aExp ) {
5730         if ( 0x406F <= aExp ) {
5731             if (    ( aExp == 0x7FFF )
5732                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5733                ) {
5734                 return propagateFloat128NaN(a, a, status);
5735             }
5736             return a;
5737         }
5738         lastBitMask = 1;
5739         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5740         roundBitsMask = lastBitMask - 1;
5741         z = a;
5742         switch (status->float_rounding_mode) {
5743         case float_round_nearest_even:
5744             if ( lastBitMask ) {
5745                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5746                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5747             }
5748             else {
5749                 if ( (int64_t) z.low < 0 ) {
5750                     ++z.high;
5751                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5752                 }
5753             }
5754             break;
5755         case float_round_ties_away:
5756             if (lastBitMask) {
5757                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5758             } else {
5759                 if ((int64_t) z.low < 0) {
5760                     ++z.high;
5761                 }
5762             }
5763             break;
5764         case float_round_to_zero:
5765             break;
5766         case float_round_up:
5767             if (!extractFloat128Sign(z)) {
5768                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5769             }
5770             break;
5771         case float_round_down:
5772             if (extractFloat128Sign(z)) {
5773                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5774             }
5775             break;
5776         default:
5777             abort();
5778         }
5779         z.low &= ~ roundBitsMask;
5780     }
5781     else {
5782         if ( aExp < 0x3FFF ) {
5783             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5784             status->float_exception_flags |= float_flag_inexact;
5785             aSign = extractFloat128Sign( a );
5786             switch (status->float_rounding_mode) {
5787              case float_round_nearest_even:
5788                 if (    ( aExp == 0x3FFE )
5789                      && (   extractFloat128Frac0( a )
5790                           | extractFloat128Frac1( a ) )
5791                    ) {
5792                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5793                 }
5794                 break;
5795             case float_round_ties_away:
5796                 if (aExp == 0x3FFE) {
5797                     return packFloat128(aSign, 0x3FFF, 0, 0);
5798                 }
5799                 break;
5800              case float_round_down:
5801                 return
5802                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5803                     : packFloat128( 0, 0, 0, 0 );
5804              case float_round_up:
5805                 return
5806                       aSign ? packFloat128( 1, 0, 0, 0 )
5807                     : packFloat128( 0, 0x3FFF, 0, 0 );
5808             }
5809             return packFloat128( aSign, 0, 0, 0 );
5810         }
5811         lastBitMask = 1;
5812         lastBitMask <<= 0x402F - aExp;
5813         roundBitsMask = lastBitMask - 1;
5814         z.low = 0;
5815         z.high = a.high;
5816         switch (status->float_rounding_mode) {
5817         case float_round_nearest_even:
5818             z.high += lastBitMask>>1;
5819             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5820                 z.high &= ~ lastBitMask;
5821             }
5822             break;
5823         case float_round_ties_away:
5824             z.high += lastBitMask>>1;
5825             break;
5826         case float_round_to_zero:
5827             break;
5828         case float_round_up:
5829             if (!extractFloat128Sign(z)) {
5830                 z.high |= ( a.low != 0 );
5831                 z.high += roundBitsMask;
5832             }
5833             break;
5834         case float_round_down:
5835             if (extractFloat128Sign(z)) {
5836                 z.high |= (a.low != 0);
5837                 z.high += roundBitsMask;
5838             }
5839             break;
5840         default:
5841             abort();
5842         }
5843         z.high &= ~ roundBitsMask;
5844     }
5845     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5846         status->float_exception_flags |= float_flag_inexact;
5847     }
5848     return z;
5849
5850 }
5851
5852 /*----------------------------------------------------------------------------
5853 | Returns the result of adding the absolute values of the quadruple-precision
5854 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5855 | before being returned.  `zSign' is ignored if the result is a NaN.
5856 | The addition is performed according to the IEC/IEEE Standard for Binary
5857 | Floating-Point Arithmetic.
5858 *----------------------------------------------------------------------------*/
5859
5860 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5861                                 float_status *status)
5862 {
5863     int32_t aExp, bExp, zExp;
5864     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5865     int32_t expDiff;
5866
5867     aSig1 = extractFloat128Frac1( a );
5868     aSig0 = extractFloat128Frac0( a );
5869     aExp = extractFloat128Exp( a );
5870     bSig1 = extractFloat128Frac1( b );
5871     bSig0 = extractFloat128Frac0( b );
5872     bExp = extractFloat128Exp( b );
5873     expDiff = aExp - bExp;
5874     if ( 0 < expDiff ) {
5875         if ( aExp == 0x7FFF ) {
5876             if (aSig0 | aSig1) {
5877                 return propagateFloat128NaN(a, b, status);
5878             }
5879             return a;
5880         }
5881         if ( bExp == 0 ) {
5882             --expDiff;
5883         }
5884         else {
5885             bSig0 |= LIT64( 0x0001000000000000 );
5886         }
5887         shift128ExtraRightJamming(
5888             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5889         zExp = aExp;
5890     }
5891     else if ( expDiff < 0 ) {
5892         if ( bExp == 0x7FFF ) {
5893             if (bSig0 | bSig1) {
5894                 return propagateFloat128NaN(a, b, status);
5895             }
5896             return packFloat128( zSign, 0x7FFF, 0, 0 );
5897         }
5898         if ( aExp == 0 ) {
5899             ++expDiff;
5900         }
5901         else {
5902             aSig0 |= LIT64( 0x0001000000000000 );
5903         }
5904         shift128ExtraRightJamming(
5905             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5906         zExp = bExp;
5907     }
5908     else {
5909         if ( aExp == 0x7FFF ) {
5910             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5911                 return propagateFloat128NaN(a, b, status);
5912             }
5913             return a;
5914         }
5915         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5916         if ( aExp == 0 ) {
5917             if (status->flush_to_zero) {
5918                 if (zSig0 | zSig1) {
5919                     float_raise(float_flag_output_denormal, status);
5920                 }
5921                 return packFloat128(zSign, 0, 0, 0);
5922             }
5923             return packFloat128( zSign, 0, zSig0, zSig1 );
5924         }
5925         zSig2 = 0;
5926         zSig0 |= LIT64( 0x0002000000000000 );
5927         zExp = aExp;
5928         goto shiftRight1;
5929     }
5930     aSig0 |= LIT64( 0x0001000000000000 );
5931     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5932     --zExp;
5933     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5934     ++zExp;
5935  shiftRight1:
5936     shift128ExtraRightJamming(
5937         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5938  roundAndPack:
5939     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
5940
5941 }
5942
5943 /*----------------------------------------------------------------------------
5944 | Returns the result of subtracting the absolute values of the quadruple-
5945 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5946 | difference is negated before being returned.  `zSign' is ignored if the
5947 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5948 | Standard for Binary Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5950
5951 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
5952                                 float_status *status)
5953 {
5954     int32_t aExp, bExp, zExp;
5955     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5956     int32_t expDiff;
5957
5958     aSig1 = extractFloat128Frac1( a );
5959     aSig0 = extractFloat128Frac0( a );
5960     aExp = extractFloat128Exp( a );
5961     bSig1 = extractFloat128Frac1( b );
5962     bSig0 = extractFloat128Frac0( b );
5963     bExp = extractFloat128Exp( b );
5964     expDiff = aExp - bExp;
5965     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5966     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5967     if ( 0 < expDiff ) goto aExpBigger;
5968     if ( expDiff < 0 ) goto bExpBigger;
5969     if ( aExp == 0x7FFF ) {
5970         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5971             return propagateFloat128NaN(a, b, status);
5972         }
5973         float_raise(float_flag_invalid, status);
5974         return float128_default_nan(status);
5975     }
5976     if ( aExp == 0 ) {
5977         aExp = 1;
5978         bExp = 1;
5979     }
5980     if ( bSig0 < aSig0 ) goto aBigger;
5981     if ( aSig0 < bSig0 ) goto bBigger;
5982     if ( bSig1 < aSig1 ) goto aBigger;
5983     if ( aSig1 < bSig1 ) goto bBigger;
5984     return packFloat128(status->float_rounding_mode == float_round_down,
5985                         0, 0, 0);
5986  bExpBigger:
5987     if ( bExp == 0x7FFF ) {
5988         if (bSig0 | bSig1) {
5989             return propagateFloat128NaN(a, b, status);
5990         }
5991         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5992     }
5993     if ( aExp == 0 ) {
5994         ++expDiff;
5995     }
5996     else {
5997         aSig0 |= LIT64( 0x4000000000000000 );
5998     }
5999     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6000     bSig0 |= LIT64( 0x4000000000000000 );
6001  bBigger:
6002     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6003     zExp = bExp;
6004     zSign ^= 1;
6005     goto normalizeRoundAndPack;
6006  aExpBigger:
6007     if ( aExp == 0x7FFF ) {
6008         if (aSig0 | aSig1) {
6009             return propagateFloat128NaN(a, b, status);
6010         }
6011         return a;
6012     }
6013     if ( bExp == 0 ) {
6014         --expDiff;
6015     }
6016     else {
6017         bSig0 |= LIT64( 0x4000000000000000 );
6018     }
6019     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6020     aSig0 |= LIT64( 0x4000000000000000 );
6021  aBigger:
6022     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6023     zExp = aExp;
6024  normalizeRoundAndPack:
6025     --zExp;
6026     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6027                                          status);
6028
6029 }
6030
6031 /*----------------------------------------------------------------------------
6032 | Returns the result of adding the quadruple-precision floating-point values
6033 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6034 | for Binary Floating-Point Arithmetic.
6035 *----------------------------------------------------------------------------*/
6036
6037 float128 float128_add(float128 a, float128 b, float_status *status)
6038 {
6039     flag aSign, bSign;
6040
6041     aSign = extractFloat128Sign( a );
6042     bSign = extractFloat128Sign( b );
6043     if ( aSign == bSign ) {
6044         return addFloat128Sigs(a, b, aSign, status);
6045     }
6046     else {
6047         return subFloat128Sigs(a, b, aSign, status);
6048     }
6049
6050 }
6051
6052 /*----------------------------------------------------------------------------
6053 | Returns the result of subtracting the quadruple-precision floating-point
6054 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6055 | Standard for Binary Floating-Point Arithmetic.
6056 *----------------------------------------------------------------------------*/
6057
6058 float128 float128_sub(float128 a, float128 b, float_status *status)
6059 {
6060     flag aSign, bSign;
6061
6062     aSign = extractFloat128Sign( a );
6063     bSign = extractFloat128Sign( b );
6064     if ( aSign == bSign ) {
6065         return subFloat128Sigs(a, b, aSign, status);
6066     }
6067     else {
6068         return addFloat128Sigs(a, b, aSign, status);
6069     }
6070
6071 }
6072
6073 /*----------------------------------------------------------------------------
6074 | Returns the result of multiplying the quadruple-precision floating-point
6075 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6076 | Standard for Binary Floating-Point Arithmetic.
6077 *----------------------------------------------------------------------------*/
6078
6079 float128 float128_mul(float128 a, float128 b, float_status *status)
6080 {
6081     flag aSign, bSign, zSign;
6082     int32_t aExp, bExp, zExp;
6083     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6084
6085     aSig1 = extractFloat128Frac1( a );
6086     aSig0 = extractFloat128Frac0( a );
6087     aExp = extractFloat128Exp( a );
6088     aSign = extractFloat128Sign( a );
6089     bSig1 = extractFloat128Frac1( b );
6090     bSig0 = extractFloat128Frac0( b );
6091     bExp = extractFloat128Exp( b );
6092     bSign = extractFloat128Sign( b );
6093     zSign = aSign ^ bSign;
6094     if ( aExp == 0x7FFF ) {
6095         if (    ( aSig0 | aSig1 )
6096              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6097             return propagateFloat128NaN(a, b, status);
6098         }
6099         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6100         return packFloat128( zSign, 0x7FFF, 0, 0 );
6101     }
6102     if ( bExp == 0x7FFF ) {
6103         if (bSig0 | bSig1) {
6104             return propagateFloat128NaN(a, b, status);
6105         }
6106         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6107  invalid:
6108             float_raise(float_flag_invalid, status);
6109             return float128_default_nan(status);
6110         }
6111         return packFloat128( zSign, 0x7FFF, 0, 0 );
6112     }
6113     if ( aExp == 0 ) {
6114         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6115         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6116     }
6117     if ( bExp == 0 ) {
6118         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6119         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6120     }
6121     zExp = aExp + bExp - 0x4000;
6122     aSig0 |= LIT64( 0x0001000000000000 );
6123     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6124     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6125     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6126     zSig2 |= ( zSig3 != 0 );
6127     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6128         shift128ExtraRightJamming(
6129             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6130         ++zExp;
6131     }
6132     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6133
6134 }
6135
6136 /*----------------------------------------------------------------------------
6137 | Returns the result of dividing the quadruple-precision floating-point value
6138 | `a' by the corresponding value `b'.  The operation is performed according to
6139 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6140 *----------------------------------------------------------------------------*/
6141
6142 float128 float128_div(float128 a, float128 b, float_status *status)
6143 {
6144     flag aSign, bSign, zSign;
6145     int32_t aExp, bExp, zExp;
6146     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6147     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6148
6149     aSig1 = extractFloat128Frac1( a );
6150     aSig0 = extractFloat128Frac0( a );
6151     aExp = extractFloat128Exp( a );
6152     aSign = extractFloat128Sign( a );
6153     bSig1 = extractFloat128Frac1( b );
6154     bSig0 = extractFloat128Frac0( b );
6155     bExp = extractFloat128Exp( b );
6156     bSign = extractFloat128Sign( b );
6157     zSign = aSign ^ bSign;
6158     if ( aExp == 0x7FFF ) {
6159         if (aSig0 | aSig1) {
6160             return propagateFloat128NaN(a, b, status);
6161         }
6162         if ( bExp == 0x7FFF ) {
6163             if (bSig0 | bSig1) {
6164                 return propagateFloat128NaN(a, b, status);
6165             }
6166             goto invalid;
6167         }
6168         return packFloat128( zSign, 0x7FFF, 0, 0 );
6169     }
6170     if ( bExp == 0x7FFF ) {
6171         if (bSig0 | bSig1) {
6172             return propagateFloat128NaN(a, b, status);
6173         }
6174         return packFloat128( zSign, 0, 0, 0 );
6175     }
6176     if ( bExp == 0 ) {
6177         if ( ( bSig0 | bSig1 ) == 0 ) {
6178             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6179  invalid:
6180                 float_raise(float_flag_invalid, status);
6181                 return float128_default_nan(status);
6182             }
6183             float_raise(float_flag_divbyzero, status);
6184             return packFloat128( zSign, 0x7FFF, 0, 0 );
6185         }
6186         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6187     }
6188     if ( aExp == 0 ) {
6189         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6190         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6191     }
6192     zExp = aExp - bExp + 0x3FFD;
6193     shortShift128Left(
6194         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6195     shortShift128Left(
6196         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6197     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6198         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6199         ++zExp;
6200     }
6201     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6202     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6203     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6204     while ( (int64_t) rem0 < 0 ) {
6205         --zSig0;
6206         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6207     }
6208     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6209     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6210         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6211         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6212         while ( (int64_t) rem1 < 0 ) {
6213             --zSig1;
6214             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6215         }
6216         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6217     }
6218     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6219     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6220
6221 }
6222
6223 /*----------------------------------------------------------------------------
6224 | Returns the remainder of the quadruple-precision floating-point value `a'
6225 | with respect to the corresponding value `b'.  The operation is performed
6226 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6227 *----------------------------------------------------------------------------*/
6228
6229 float128 float128_rem(float128 a, float128 b, float_status *status)
6230 {
6231     flag aSign, zSign;
6232     int32_t aExp, bExp, expDiff;
6233     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6234     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6235     int64_t sigMean0;
6236
6237     aSig1 = extractFloat128Frac1( a );
6238     aSig0 = extractFloat128Frac0( a );
6239     aExp = extractFloat128Exp( a );
6240     aSign = extractFloat128Sign( a );
6241     bSig1 = extractFloat128Frac1( b );
6242     bSig0 = extractFloat128Frac0( b );
6243     bExp = extractFloat128Exp( b );
6244     if ( aExp == 0x7FFF ) {
6245         if (    ( aSig0 | aSig1 )
6246              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6247             return propagateFloat128NaN(a, b, status);
6248         }
6249         goto invalid;
6250     }
6251     if ( bExp == 0x7FFF ) {
6252         if (bSig0 | bSig1) {
6253             return propagateFloat128NaN(a, b, status);
6254         }
6255         return a;
6256     }
6257     if ( bExp == 0 ) {
6258         if ( ( bSig0 | bSig1 ) == 0 ) {
6259  invalid:
6260             float_raise(float_flag_invalid, status);
6261             return float128_default_nan(status);
6262         }
6263         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6264     }
6265     if ( aExp == 0 ) {
6266         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6267         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6268     }
6269     expDiff = aExp - bExp;
6270     if ( expDiff < -1 ) return a;
6271     shortShift128Left(
6272         aSig0 | LIT64( 0x0001000000000000 ),
6273         aSig1,
6274         15 - ( expDiff < 0 ),
6275         &aSig0,
6276         &aSig1
6277     );
6278     shortShift128Left(
6279         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6280     q = le128( bSig0, bSig1, aSig0, aSig1 );
6281     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6282     expDiff -= 64;
6283     while ( 0 < expDiff ) {
6284         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6285         q = ( 4 < q ) ? q - 4 : 0;
6286         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6287         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6288         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6289         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6290         expDiff -= 61;
6291     }
6292     if ( -64 < expDiff ) {
6293         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6294         q = ( 4 < q ) ? q - 4 : 0;
6295         q >>= - expDiff;
6296         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6297         expDiff += 52;
6298         if ( expDiff < 0 ) {
6299             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6300         }
6301         else {
6302             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6303         }
6304         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6305         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6306     }
6307     else {
6308         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6309         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6310     }
6311     do {
6312         alternateASig0 = aSig0;
6313         alternateASig1 = aSig1;
6314         ++q;
6315         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6316     } while ( 0 <= (int64_t) aSig0 );
6317     add128(
6318         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6319     if (    ( sigMean0 < 0 )
6320          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6321         aSig0 = alternateASig0;
6322         aSig1 = alternateASig1;
6323     }
6324     zSign = ( (int64_t) aSig0 < 0 );
6325     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6326     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6327                                          status);
6328 }
6329
6330 /*----------------------------------------------------------------------------
6331 | Returns the square root of the quadruple-precision floating-point value `a'.
6332 | The operation is performed according to the IEC/IEEE Standard for Binary
6333 | Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6335
6336 float128 float128_sqrt(float128 a, float_status *status)
6337 {
6338     flag aSign;
6339     int32_t aExp, zExp;
6340     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6341     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6342
6343     aSig1 = extractFloat128Frac1( a );
6344     aSig0 = extractFloat128Frac0( a );
6345     aExp = extractFloat128Exp( a );
6346     aSign = extractFloat128Sign( a );
6347     if ( aExp == 0x7FFF ) {
6348         if (aSig0 | aSig1) {
6349             return propagateFloat128NaN(a, a, status);
6350         }
6351         if ( ! aSign ) return a;
6352         goto invalid;
6353     }
6354     if ( aSign ) {
6355         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6356  invalid:
6357         float_raise(float_flag_invalid, status);
6358         return float128_default_nan(status);
6359     }
6360     if ( aExp == 0 ) {
6361         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6362         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6363     }
6364     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6365     aSig0 |= LIT64( 0x0001000000000000 );
6366     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6367     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6368     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6369     doubleZSig0 = zSig0<<1;
6370     mul64To128( zSig0, zSig0, &term0, &term1 );
6371     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6372     while ( (int64_t) rem0 < 0 ) {
6373         --zSig0;
6374         doubleZSig0 -= 2;
6375         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6376     }
6377     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6378     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6379         if ( zSig1 == 0 ) zSig1 = 1;
6380         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6381         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6382         mul64To128( zSig1, zSig1, &term2, &term3 );
6383         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6384         while ( (int64_t) rem1 < 0 ) {
6385             --zSig1;
6386             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6387             term3 |= 1;
6388             term2 |= doubleZSig0;
6389             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6390         }
6391         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6392     }
6393     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6394     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6395
6396 }
6397
6398 /*----------------------------------------------------------------------------
6399 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6400 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6401 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6403 *----------------------------------------------------------------------------*/
6404
6405 int float128_eq(float128 a, float128 b, float_status *status)
6406 {
6407
6408     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6409               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6410          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6411               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6412        ) {
6413         float_raise(float_flag_invalid, status);
6414         return 0;
6415     }
6416     return
6417            ( a.low == b.low )
6418         && (    ( a.high == b.high )
6419              || (    ( a.low == 0 )
6420                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6421            );
6422
6423 }
6424
6425 /*----------------------------------------------------------------------------
6426 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6427 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6428 | exception is raised if either operand is a NaN.  The comparison is performed
6429 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6430 *----------------------------------------------------------------------------*/
6431
6432 int float128_le(float128 a, float128 b, float_status *status)
6433 {
6434     flag aSign, bSign;
6435
6436     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6437               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6438          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6439               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6440        ) {
6441         float_raise(float_flag_invalid, status);
6442         return 0;
6443     }
6444     aSign = extractFloat128Sign( a );
6445     bSign = extractFloat128Sign( b );
6446     if ( aSign != bSign ) {
6447         return
6448                aSign
6449             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6450                  == 0 );
6451     }
6452     return
6453           aSign ? le128( b.high, b.low, a.high, a.low )
6454         : le128( a.high, a.low, b.high, b.low );
6455
6456 }
6457
6458 /*----------------------------------------------------------------------------
6459 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6460 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6461 | raised if either operand is a NaN.  The comparison is performed according
6462 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6463 *----------------------------------------------------------------------------*/
6464
6465 int float128_lt(float128 a, float128 b, float_status *status)
6466 {
6467     flag aSign, bSign;
6468
6469     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6470               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6471          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6472               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6473        ) {
6474         float_raise(float_flag_invalid, status);
6475         return 0;
6476     }
6477     aSign = extractFloat128Sign( a );
6478     bSign = extractFloat128Sign( b );
6479     if ( aSign != bSign ) {
6480         return
6481                aSign
6482             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6483                  != 0 );
6484     }
6485     return
6486           aSign ? lt128( b.high, b.low, a.high, a.low )
6487         : lt128( a.high, a.low, b.high, b.low );
6488
6489 }
6490
6491 /*----------------------------------------------------------------------------
6492 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6493 | be compared, and 0 otherwise.  The invalid exception is raised if either
6494 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6495 | Standard for Binary Floating-Point Arithmetic.
6496 *----------------------------------------------------------------------------*/
6497
6498 int float128_unordered(float128 a, float128 b, float_status *status)
6499 {
6500     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6501               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6502          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6503               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6504        ) {
6505         float_raise(float_flag_invalid, status);
6506         return 1;
6507     }
6508     return 0;
6509 }
6510
6511 /*----------------------------------------------------------------------------
6512 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6513 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6514 | exception.  The comparison is performed according to the IEC/IEEE Standard
6515 | for Binary Floating-Point Arithmetic.
6516 *----------------------------------------------------------------------------*/
6517
6518 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6519 {
6520
6521     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6522               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6523          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6524               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6525        ) {
6526         if (float128_is_signaling_nan(a, status)
6527          || float128_is_signaling_nan(b, status)) {
6528             float_raise(float_flag_invalid, status);
6529         }
6530         return 0;
6531     }
6532     return
6533            ( a.low == b.low )
6534         && (    ( a.high == b.high )
6535              || (    ( a.low == 0 )
6536                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6537            );
6538
6539 }
6540
6541 /*----------------------------------------------------------------------------
6542 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6543 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6544 | cause an exception.  Otherwise, the comparison is performed according to the
6545 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6546 *----------------------------------------------------------------------------*/
6547
6548 int float128_le_quiet(float128 a, float128 b, float_status *status)
6549 {
6550     flag aSign, bSign;
6551
6552     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6553               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6554          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6555               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6556        ) {
6557         if (float128_is_signaling_nan(a, status)
6558          || float128_is_signaling_nan(b, status)) {
6559             float_raise(float_flag_invalid, status);
6560         }
6561         return 0;
6562     }
6563     aSign = extractFloat128Sign( a );
6564     bSign = extractFloat128Sign( b );
6565     if ( aSign != bSign ) {
6566         return
6567                aSign
6568             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6569                  == 0 );
6570     }
6571     return
6572           aSign ? le128( b.high, b.low, a.high, a.low )
6573         : le128( a.high, a.low, b.high, b.low );
6574
6575 }
6576
6577 /*----------------------------------------------------------------------------
6578 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6579 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6580 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6581 | Standard for Binary Floating-Point Arithmetic.
6582 *----------------------------------------------------------------------------*/
6583
6584 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6585 {
6586     flag aSign, bSign;
6587
6588     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6589               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6590          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6591               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6592        ) {
6593         if (float128_is_signaling_nan(a, status)
6594          || float128_is_signaling_nan(b, status)) {
6595             float_raise(float_flag_invalid, status);
6596         }
6597         return 0;
6598     }
6599     aSign = extractFloat128Sign( a );
6600     bSign = extractFloat128Sign( b );
6601     if ( aSign != bSign ) {
6602         return
6603                aSign
6604             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6605                  != 0 );
6606     }
6607     return
6608           aSign ? lt128( b.high, b.low, a.high, a.low )
6609         : lt128( a.high, a.low, b.high, b.low );
6610
6611 }
6612
6613 /*----------------------------------------------------------------------------
6614 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6615 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6616 | comparison is performed according to the IEC/IEEE Standard for Binary
6617 | Floating-Point Arithmetic.
6618 *----------------------------------------------------------------------------*/
6619
6620 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6621 {
6622     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6623               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6624          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6625               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6626        ) {
6627         if (float128_is_signaling_nan(a, status)
6628          || float128_is_signaling_nan(b, status)) {
6629             float_raise(float_flag_invalid, status);
6630         }
6631         return 1;
6632     }
6633     return 0;
6634 }
6635
6636 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6637                                             int is_quiet, float_status *status)
6638 {
6639     flag aSign, bSign;
6640
6641     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6642         float_raise(float_flag_invalid, status);
6643         return float_relation_unordered;
6644     }
6645     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6646           ( extractFloatx80Frac( a )<<1 ) ) ||
6647         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6648           ( extractFloatx80Frac( b )<<1 ) )) {
6649         if (!is_quiet ||
6650             floatx80_is_signaling_nan(a, status) ||
6651             floatx80_is_signaling_nan(b, status)) {
6652             float_raise(float_flag_invalid, status);
6653         }
6654         return float_relation_unordered;
6655     }
6656     aSign = extractFloatx80Sign( a );
6657     bSign = extractFloatx80Sign( b );
6658     if ( aSign != bSign ) {
6659
6660         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6661              ( ( a.low | b.low ) == 0 ) ) {
6662             /* zero case */
6663             return float_relation_equal;
6664         } else {
6665             return 1 - (2 * aSign);
6666         }
6667     } else {
6668         if (a.low == b.low && a.high == b.high) {
6669             return float_relation_equal;
6670         } else {
6671             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6672         }
6673     }
6674 }
6675
6676 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6677 {
6678     return floatx80_compare_internal(a, b, 0, status);
6679 }
6680
6681 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6682 {
6683     return floatx80_compare_internal(a, b, 1, status);
6684 }
6685
6686 static inline int float128_compare_internal(float128 a, float128 b,
6687                                             int is_quiet, float_status *status)
6688 {
6689     flag aSign, bSign;
6690
6691     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6692           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6693         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6694           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6695         if (!is_quiet ||
6696             float128_is_signaling_nan(a, status) ||
6697             float128_is_signaling_nan(b, status)) {
6698             float_raise(float_flag_invalid, status);
6699         }
6700         return float_relation_unordered;
6701     }
6702     aSign = extractFloat128Sign( a );
6703     bSign = extractFloat128Sign( b );
6704     if ( aSign != bSign ) {
6705         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6706             /* zero case */
6707             return float_relation_equal;
6708         } else {
6709             return 1 - (2 * aSign);
6710         }
6711     } else {
6712         if (a.low == b.low && a.high == b.high) {
6713             return float_relation_equal;
6714         } else {
6715             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6716         }
6717     }
6718 }
6719
6720 int float128_compare(float128 a, float128 b, float_status *status)
6721 {
6722     return float128_compare_internal(a, b, 0, status);
6723 }
6724
6725 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6726 {
6727     return float128_compare_internal(a, b, 1, status);
6728 }
6729
6730 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6731 {
6732     flag aSign;
6733     int32_t aExp;
6734     uint64_t aSig;
6735
6736     if (floatx80_invalid_encoding(a)) {
6737         float_raise(float_flag_invalid, status);
6738         return floatx80_default_nan(status);
6739     }
6740     aSig = extractFloatx80Frac( a );
6741     aExp = extractFloatx80Exp( a );
6742     aSign = extractFloatx80Sign( a );
6743
6744     if ( aExp == 0x7FFF ) {
6745         if ( aSig<<1 ) {
6746             return propagateFloatx80NaN(a, a, status);
6747         }
6748         return a;
6749     }
6750
6751     if (aExp == 0) {
6752         if (aSig == 0) {
6753             return a;
6754         }
6755         aExp++;
6756     }
6757
6758     if (n > 0x10000) {
6759         n = 0x10000;
6760     } else if (n < -0x10000) {
6761         n = -0x10000;
6762     }
6763
6764     aExp += n;
6765     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6766                                          aSign, aExp, aSig, 0, status);
6767 }
6768
6769 float128 float128_scalbn(float128 a, int n, float_status *status)
6770 {
6771     flag aSign;
6772     int32_t aExp;
6773     uint64_t aSig0, aSig1;
6774
6775     aSig1 = extractFloat128Frac1( a );
6776     aSig0 = extractFloat128Frac0( a );
6777     aExp = extractFloat128Exp( a );
6778     aSign = extractFloat128Sign( a );
6779     if ( aExp == 0x7FFF ) {
6780         if ( aSig0 | aSig1 ) {
6781             return propagateFloat128NaN(a, a, status);
6782         }
6783         return a;
6784     }
6785     if (aExp != 0) {
6786         aSig0 |= LIT64( 0x0001000000000000 );
6787     } else if (aSig0 == 0 && aSig1 == 0) {
6788         return a;
6789     } else {
6790         aExp++;
6791     }
6792
6793     if (n > 0x10000) {
6794         n = 0x10000;
6795     } else if (n < -0x10000) {
6796         n = -0x10000;
6797     }
6798
6799     aExp += n - 1;
6800     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6801                                          , status);
6802
6803 }