fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107  * and trapping on every FP exception is not fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             s->float_exception_flags |= float_flag_input_denormal;      \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
 256 typedef union {
 257     float32 s;
 258     float h;
 259 } union_float32;
 260
 261 typedef union {
 262     float64 s;
 263     double h;
 264 } union_float64;
 265
 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271 typedef float   (*hard_f32_op2_fn)(float a, float b);
 272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 /* Note: @fast_test and @post can be NULL */
 343 static inline float32
 344 float32_gen2(float32 xa, float32 xb, float_status *s,
 345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 346              f32_check_fn pre, f32_check_fn post,
 347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
 348 {
 349     union_float32 ua, ub, ur;
 350
 351     ua.s = xa;
 352     ub.s = xb;
 353
 354     if (unlikely(!can_use_fpu(s))) {
 355         goto soft;
 356     }
 357
 358     float32_input_flush2(&ua.s, &ub.s, s);
 359     if (unlikely(!pre(ua, ub))) {
 360         goto soft;
 361     }
 362     if (fast_test && fast_test(ua, ub)) {
 363         return fast_op(ua.s, ub.s, s);
 364     }
 365
 366     ur.h = hard(ua.h, ub.h);
 367     if (unlikely(f32_is_inf(ur))) {
 368         s->float_exception_flags |= float_flag_overflow;
 369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
 370         if (post == NULL || post(ua, ub)) {
 371             goto soft;
 372         }
 373     }
 374     return ur.s;
 375
 376  soft:
 377     return soft(ua.s, ub.s, s);
 378 }
 379
 380 static inline float64
 381 float64_gen2(float64 xa, float64 xb, float_status *s,
 382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 383              f64_check_fn pre, f64_check_fn post,
 384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
 385 {
 386     union_float64 ua, ub, ur;
 387
 388     ua.s = xa;
 389     ub.s = xb;
 390
 391     if (unlikely(!can_use_fpu(s))) {
 392         goto soft;
 393     }
 394
 395     float64_input_flush2(&ua.s, &ub.s, s);
 396     if (unlikely(!pre(ua, ub))) {
 397         goto soft;
 398     }
 399     if (fast_test && fast_test(ua, ub)) {
 400         return fast_op(ua.s, ub.s, s);
 401     }
 402
 403     ur.h = hard(ua.h, ub.h);
 404     if (unlikely(f64_is_inf(ur))) {
 405         s->float_exception_flags |= float_flag_overflow;
 406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
 407         if (post == NULL || post(ua, ub)) {
 408             goto soft;
 409         }
 410     }
 411     return ur.s;
 412
 413  soft:
 414     return soft(ua.s, ub.s, s);
 415 }
 416
 417 /*----------------------------------------------------------------------------
 418 | Returns the fraction bits of the single-precision floating-point value `a'.
 419 *----------------------------------------------------------------------------*/
 420
 421 static inline uint32_t extractFloat32Frac(float32 a)
 422 {
 423     return float32_val(a) & 0x007FFFFF;
 424 }
 425
 426 /*----------------------------------------------------------------------------
 427 | Returns the exponent bits of the single-precision floating-point value `a'.
 428 *----------------------------------------------------------------------------*/
 429
 430 static inline int extractFloat32Exp(float32 a)
 431 {
 432     return (float32_val(a) >> 23) & 0xFF;
 433 }
 434
 435 /*----------------------------------------------------------------------------
 436 | Returns the sign bit of the single-precision floating-point value `a'.
 437 *----------------------------------------------------------------------------*/
 438
 439 static inline flag extractFloat32Sign(float32 a)
 440 {
 441     return float32_val(a) >> 31;
 442 }
 443
 444 /*----------------------------------------------------------------------------
 445 | Returns the fraction bits of the double-precision floating-point value `a'.
 446 *----------------------------------------------------------------------------*/
 447
 448 static inline uint64_t extractFloat64Frac(float64 a)
 449 {
 450     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
 451 }
 452
 453 /*----------------------------------------------------------------------------
 454 | Returns the exponent bits of the double-precision floating-point value `a'.
 455 *----------------------------------------------------------------------------*/
 456
 457 static inline int extractFloat64Exp(float64 a)
 458 {
 459     return (float64_val(a) >> 52) & 0x7FF;
 460 }
 461
 462 /*----------------------------------------------------------------------------
 463 | Returns the sign bit of the double-precision floating-point value `a'.
 464 *----------------------------------------------------------------------------*/
 465
 466 static inline flag extractFloat64Sign(float64 a)
 467 {
 468     return float64_val(a) >> 63;
 469 }
 470
 471 /*
 472  * Classify a floating point number. Everything above float_class_qnan
 473  * is a NaN so cls >= float_class_qnan is any NaN.
 474  */
 475
 476 typedef enum __attribute__ ((__packed__)) {
 477     float_class_unclassified,
 478     float_class_zero,
 479     float_class_normal,
 480     float_class_inf,
 481     float_class_qnan,  /* all NaNs from here */
 482     float_class_snan,
 483 } FloatClass;
 484
 485 /* Simple helpers for checking if, or what kind of, NaN we have */
 486 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 487 {
 488     return unlikely(c >= float_class_qnan);
 489 }
 490
 491 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 492 {
 493     return c == float_class_snan;
 494 }
 495
 496 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 497 {
 498     return c == float_class_qnan;
 499 }
 500
 501 /*
 502  * Structure holding all of the decomposed parts of a float. The
 503  * exponent is unbiased and the fraction is normalized. All
 504  * calculations are done with a 64 bit fraction and then rounded as
 505  * appropriate for the final format.
 506  *
 507  * Thanks to the packed FloatClass a decent compiler should be able to
 508  * fit the whole structure into registers and avoid using the stack
 509  * for parameter passing.
 510  */
 511
 512 typedef struct {
 513     uint64_t frac;
 514     int32_t  exp;
 515     FloatClass cls;
 516     bool sign;
 517 } FloatParts;
 518
 519 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 520 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 521 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 522
 523 /* Structure holding all of the relevant parameters for a format.
 524  *   exp_size: the size of the exponent field
 525  *   exp_bias: the offset applied to the exponent field
 526  *   exp_max: the maximum normalised exponent
 527  *   frac_size: the size of the fraction field
 528  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 529  * The following are computed based the size of fraction
 530  *   frac_lsb: least significant bit of fraction
 531  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 532  *   round_mask/roundeven_mask: masks used for rounding
 533  * The following optional modifiers are available:
 534  *   arm_althp: handle ARM Alternative Half Precision
 535  */
 536 typedef struct {
 537     int exp_size;
 538     int exp_bias;
 539     int exp_max;
 540     int frac_size;
 541     int frac_shift;
 542     uint64_t frac_lsb;
 543     uint64_t frac_lsbm1;
 544     uint64_t round_mask;
 545     uint64_t roundeven_mask;
 546     bool arm_althp;
 547 } FloatFmt;
 548
 549 /* Expand fields based on the size of exponent and fraction */
 550 #define FLOAT_PARAMS(E, F)                                           \
 551     .exp_size       = E,                                             \
 552     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 553     .exp_max        = (1 << E) - 1,                                  \
 554     .frac_size      = F,                                             \
 555     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 556     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 557     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 558     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 559     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 560
 561 static const FloatFmt float16_params = {
 562     FLOAT_PARAMS(5, 10)
 563 };
 564
 565 static const FloatFmt float16_params_ahp = {
 566     FLOAT_PARAMS(5, 10),
 567     .arm_althp = true
 568 };
 569
 570 static const FloatFmt float32_params = {
 571     FLOAT_PARAMS(8, 23)
 572 };
 573
 574 static const FloatFmt float64_params = {
 575     FLOAT_PARAMS(11, 52)
 576 };
 577
 578 /* Unpack a float to parts, but do not canonicalize.  */
 579 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 580 {
 581     const int sign_pos = fmt.frac_size + fmt.exp_size;
 582
 583     return (FloatParts) {
 584         .cls = float_class_unclassified,
 585         .sign = extract64(raw, sign_pos, 1),
 586         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 587         .frac = extract64(raw, 0, fmt.frac_size),
 588     };
 589 }
 590
 591 static inline FloatParts float16_unpack_raw(float16 f)
 592 {
 593     return unpack_raw(float16_params, f);
 594 }
 595
 596 static inline FloatParts float32_unpack_raw(float32 f)
 597 {
 598     return unpack_raw(float32_params, f);
 599 }
 600
 601 static inline FloatParts float64_unpack_raw(float64 f)
 602 {
 603     return unpack_raw(float64_params, f);
 604 }
 605
 606 /* Pack a float from parts, but do not canonicalize.  */
 607 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 608 {
 609     const int sign_pos = fmt.frac_size + fmt.exp_size;
 610     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 611     return deposit64(ret, sign_pos, 1, p.sign);
 612 }
 613
 614 static inline float16 float16_pack_raw(FloatParts p)
 615 {
 616     return make_float16(pack_raw(float16_params, p));
 617 }
 618
 619 static inline float32 float32_pack_raw(FloatParts p)
 620 {
 621     return make_float32(pack_raw(float32_params, p));
 622 }
 623
 624 static inline float64 float64_pack_raw(FloatParts p)
 625 {
 626     return make_float64(pack_raw(float64_params, p));
 627 }
 628
 629 /*----------------------------------------------------------------------------
 630 | Functions and definitions to determine:  (1) whether tininess for underflow
 631 | is detected before or after rounding by default, (2) what (if anything)
 632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 634 | are propagated from function inputs to output.  These details are target-
 635 | specific.
 636 *----------------------------------------------------------------------------*/
 637 #include "softfloat-specialize.inc.c"
 638
 639 /* Canonicalize EXP and FRAC, setting CLS.  */
 640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
 641                                   float_status *status)
 642 {
 643     if (part.exp == parm->exp_max && !parm->arm_althp) {
 644         if (part.frac == 0) {
 645             part.cls = float_class_inf;
 646         } else {
 647             part.frac <<= parm->frac_shift;
 648             part.cls = (parts_is_snan_frac(part.frac, status)
 649                         ? float_class_snan : float_class_qnan);
 650         }
 651     } else if (part.exp == 0) {
 652         if (likely(part.frac == 0)) {
 653             part.cls = float_class_zero;
 654         } else if (status->flush_inputs_to_zero) {
 655             float_raise(float_flag_input_denormal, status);
 656             part.cls = float_class_zero;
 657             part.frac = 0;
 658         } else {
 659             int shift = clz64(part.frac) - 1;
 660             part.cls = float_class_normal;
 661             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 662             part.frac <<= shift;
 663         }
 664     } else {
 665         part.cls = float_class_normal;
 666         part.exp -= parm->exp_bias;
 667         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 668     }
 669     return part;
 670 }
 671
 672 /* Round and uncanonicalize a floating-point number by parts. There
 673  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 674  * fraction; these bits will be removed. The exponent will be biased
 675  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 676  */
 677
 678 static FloatParts round_canonical(FloatParts p, float_status *s,
 679                                   const FloatFmt *parm)
 680 {
 681     const uint64_t frac_lsb = parm->frac_lsb;
 682     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 683     const uint64_t round_mask = parm->round_mask;
 684     const uint64_t roundeven_mask = parm->roundeven_mask;
 685     const int exp_max = parm->exp_max;
 686     const int frac_shift = parm->frac_shift;
 687     uint64_t frac, inc;
 688     int exp, flags = 0;
 689     bool overflow_norm;
 690
 691     frac = p.frac;
 692     exp = p.exp;
 693
 694     switch (p.cls) {
 695     case float_class_normal:
 696         switch (s->float_rounding_mode) {
 697         case float_round_nearest_even:
 698             overflow_norm = false;
 699             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 700             break;
 701         case float_round_ties_away:
 702             overflow_norm = false;
 703             inc = frac_lsbm1;
 704             break;
 705         case float_round_to_zero:
 706             overflow_norm = true;
 707             inc = 0;
 708             break;
 709         case float_round_up:
 710             inc = p.sign ? 0 : round_mask;
 711             overflow_norm = p.sign;
 712             break;
 713         case float_round_down:
 714             inc = p.sign ? round_mask : 0;
 715             overflow_norm = !p.sign;
 716             break;
 717         case float_round_to_odd:
 718             overflow_norm = true;
 719             inc = frac & frac_lsb ? 0 : round_mask;
 720             break;
 721         default:
 722             g_assert_not_reached();
 723         }
 724
 725         exp += parm->exp_bias;
 726         if (likely(exp > 0)) {
 727             if (frac & round_mask) {
 728                 flags |= float_flag_inexact;
 729                 frac += inc;
 730                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 731                     frac >>= 1;
 732                     exp++;
 733                 }
 734             }
 735             frac >>= frac_shift;
 736
 737             if (parm->arm_althp) {
 738                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 739                 if (unlikely(exp > exp_max)) {
 740                     /* Overflow.  Return the maximum normal.  */
 741                     flags = float_flag_invalid;
 742                     exp = exp_max;
 743                     frac = -1;
 744                 }
 745             } else if (unlikely(exp >= exp_max)) {
 746                 flags |= float_flag_overflow | float_flag_inexact;
 747                 if (overflow_norm) {
 748                     exp = exp_max - 1;
 749                     frac = -1;
 750                 } else {
 751                     p.cls = float_class_inf;
 752                     goto do_inf;
 753                 }
 754             }
 755         } else if (s->flush_to_zero) {
 756             flags |= float_flag_output_denormal;
 757             p.cls = float_class_zero;
 758             goto do_zero;
 759         } else {
 760             bool is_tiny = (s->float_detect_tininess
 761                             == float_tininess_before_rounding)
 762                         || (exp < 0)
 763                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 764
 765             shift64RightJamming(frac, 1 - exp, &frac);
 766             if (frac & round_mask) {
 767                 /* Need to recompute round-to-even.  */
 768                 switch (s->float_rounding_mode) {
 769                 case float_round_nearest_even:
 770                     inc = ((frac & roundeven_mask) != frac_lsbm1
 771                            ? frac_lsbm1 : 0);
 772                     break;
 773                 case float_round_to_odd:
 774                     inc = frac & frac_lsb ? 0 : round_mask;
 775                     break;
 776                 }
 777                 flags |= float_flag_inexact;
 778                 frac += inc;
 779             }
 780
 781             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 782             frac >>= frac_shift;
 783
 784             if (is_tiny && (flags & float_flag_inexact)) {
 785                 flags |= float_flag_underflow;
 786             }
 787             if (exp == 0 && frac == 0) {
 788                 p.cls = float_class_zero;
 789             }
 790         }
 791         break;
 792
 793     case float_class_zero:
 794     do_zero:
 795         exp = 0;
 796         frac = 0;
 797         break;
 798
 799     case float_class_inf:
 800     do_inf:
 801         assert(!parm->arm_althp);
 802         exp = exp_max;
 803         frac = 0;
 804         break;
 805
 806     case float_class_qnan:
 807     case float_class_snan:
 808         assert(!parm->arm_althp);
 809         exp = exp_max;
 810         frac >>= parm->frac_shift;
 811         break;
 812
 813     default:
 814         g_assert_not_reached();
 815     }
 816
 817     float_raise(flags, s);
 818     p.exp = exp;
 819     p.frac = frac;
 820     return p;
 821 }
 822
 823 /* Half-precision unpack using an explicitly supplied FloatFmt. */
 824 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 825                                             const FloatFmt *params)
 826 {
         /* Raw field split first, then sf_canonicalize() under *params. */
 827     FloatParts canon = sf_canonicalize(float16_unpack_raw(f), params, s);
         return canon;
 828 }
 829
 830 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 831 {
         /* Fixed-format variant: delegates with &float16_params. */
         const FloatFmt *fmt = &float16_params;
 832     return float16a_unpack_canonical(f, s, fmt);
 833 }
 834
 835 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 836                                              const FloatFmt *params)
 837 {
         /* round_canonical() runs under *params; its result is then packed. */
 838     float16 packed = float16_pack_raw(round_canonical(p, s, params));
         return packed;
 839 }
 840
 841 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 842 {
         /* Fixed-format variant: delegates with &float16_params. */
         const FloatFmt *fmt = &float16_params;
 843     return float16a_round_pack_canonical(p, s, fmt);
 844 }
 845
 846 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 847 {
         /* Raw field split first, then sf_canonicalize() with float32_params. */
 848     FloatParts canon = sf_canonicalize(float32_unpack_raw(f),
                                            &float32_params, s);
         return canon;
 849 }
 850
 851 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 852 {
         /* round_canonical() with float32_params; its result is then packed. */
 853     float32 packed = float32_pack_raw(round_canonical(p, s, &float32_params));
         return packed;
 854 }
 855
 856 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 857 {
         /* Raw field split first, then sf_canonicalize() with float64_params. */
 858     FloatParts canon = sf_canonicalize(float64_unpack_raw(f),
                                            &float64_params, s);
         return canon;
 859 }
 860
 861 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 862 {
         /* round_canonical() with float64_params; its result is then packed. */
 863     float64 packed = float64_pack_raw(round_canonical(p, s, &float64_params));
         return packed;
 864 }
 865
 866 static FloatParts return_nan(FloatParts a, float_status *s)
 867 {
         /* Only the two NaN classes may reach this helper. */
 868     if (a.cls != float_class_snan && a.cls != float_class_qnan) {
 869         g_assert_not_reached();
 870     }
 871
         /* A signaling NaN raises Invalid and goes through parts_silence_nan(). */
 872     if (a.cls == float_class_snan) {
 873         s->float_exception_flags |= float_flag_invalid;
 874         a = parts_silence_nan(a, s);
 875     }
 876
         /* In default-NaN mode the input NaN is replaced outright. */
 877     if (s->default_nan_mode) {
 878         return parts_default_nan(s);
 879     }
 880     return a;
 881 }
 884
 885 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 886 {
 887     bool a_larger;
 888
         /* Any signaling NaN operand raises Invalid, whichever NaN is returned. */
 889     if (is_snan(a.cls) || is_snan(b.cls)) {
 890         s->float_exception_flags |= float_flag_invalid;
 891     }
 892     if (s->default_nan_mode) {
 893         return parts_default_nan(s);
 894     }
 895
         /* Hint for pickNaN(): larger fraction, ties broken on the sign bit. */
 896     a_larger = a.frac > b.frac || (a.frac == b.frac && a.sign < b.sign);
 897     if (pickNaN(a.cls, b.cls, a_larger)) {
 898         a = b;
 899     }
 900     return is_snan(a.cls) ? parts_silence_nan(a, s) : a;
 901 }
 905
 906 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 907                                   bool inf_zero, float_status *s)
 908 {
 909     int which;
 910
 911     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 912         s->float_exception_flags |= float_flag_invalid;
 913     }
 914
         /*
          * pickNaNMulAdd() is always called, even in default-NaN mode, so
          * that it has an opportunity to set the Invalid flag.
          */
 915     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 916     if (s->default_nan_mode || which == 3) {
 917         return parts_default_nan(s);
 918     }
 919
         /* 0, 1, 2 select operand a, b, c respectively. */
 920     if (which == 1) {
 921         a = b;
 922     } else if (which == 2) {
 923         a = c;
 924     } else if (which != 0) {
 925         g_assert_not_reached();
 926     }
 927
 928     return is_snan(a.cls) ? parts_silence_nan(a, s) : a;
 929 }
 944
 945 /*
 946  * Returns the sum (subtract == false) or the difference (subtract == true)
 947  * of the decomposed floating-point values `a' and `b'. The operation is
 948  * performed according to the IEC/IEEE Standard for Binary Floating-Point
 949  * Arithmetic.
      *
      * No rounding is done here; the callers in this file hand the returned
      * FloatParts to a *_round_pack_canonical() helper.
 950  */
 951
 952 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 953                                 float_status *s)
 954 {
 955     bool a_sign = a.sign;
         /* A subtraction is treated as an addition of b with its sign flipped. */
 956     bool b_sign = b.sign ^ subtract;
 957
 958     if (a_sign != b_sign) {
 959         /* Subtraction */
 960
 961         if (a.cls == float_class_normal && b.cls == float_class_normal) {
                 /*
                  * Subtract the smaller magnitude from the larger one, after
                  * shifting the smaller operand right by the exponent gap.
                  */
 962             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 963                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 964                 a.frac = a.frac - b.frac;
 965             } else {
 966                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 967                 a.frac = b.frac - a.frac;
 968                 a.exp = b.exp;
                     /* b had the larger magnitude, so the result sign flips. */
 969                 a_sign ^= 1;
 970             }
 971
 972             if (a.frac == 0) {
                     /* Exact cancellation: zero, negative only when rounding down. */
 973                 a.cls = float_class_zero;
 974                 a.sign = s->float_rounding_mode == float_round_down;
 975             } else {
                     /*
                      * Renormalize: shift left so that exactly one leading zero
                      * bit remains, and lower the exponent by the same amount.
                      */
 976                 int shift = clz64(a.frac) - 1;
 977                 a.frac = a.frac << shift;
 978                 a.exp = a.exp - shift;
 979                 a.sign = a_sign;
 980             }
 981             return a;
 982         }
 983         if (is_nan(a.cls) || is_nan(b.cls)) {
 984             return pick_nan(a, b, s);
 985         }
 986         if (a.cls == float_class_inf) {
                 /* Inf - Inf is invalid; Inf minus anything else stays a. */
 987             if (b.cls == float_class_inf) {
 988                 float_raise(float_flag_invalid, s);
 989                 return parts_default_nan(s);
 990             }
 991             return a;
 992         }
 993         if (a.cls == float_class_zero && b.cls == float_class_zero) {
                 /* Zeros of opposite effective sign: sign follows rounding mode. */
 994             a.sign = s->float_rounding_mode == float_round_down;
 995             return a;
 996         }
 997         if (a.cls == float_class_zero || b.cls == float_class_inf) {
                 /* Result is b; in this branch a_sign ^ 1 equals b_sign. */
 998             b.sign = a_sign ^ 1;
 999             return b;
1000         }
1001         if (b.cls == float_class_zero) {
1002             return a;
1003         }
1004     } else {
1005         /* Addition */
1006         if (a.cls == float_class_normal && b.cls == float_class_normal) {
                 /* Align the operand with the smaller exponent to the larger. */
1007             if (a.exp > b.exp) {
1008                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1009             } else if (a.exp < b.exp) {
1010                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1011                 a.exp = b.exp;
1012             }
1013             a.frac += b.frac;
                 /* A carry into the overflow bit is shifted back down. */
1014             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1015                 shift64RightJamming(a.frac, 1, &a.frac);
1016                 a.exp += 1;
1017             }
1018             return a;
1019         }
1020         if (is_nan(a.cls) || is_nan(b.cls)) {
1021             return pick_nan(a, b, s);
1022         }
1023         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1024             return a;
1025         }
1026         if (b.cls == float_class_inf || a.cls == float_class_zero) {
                 /* Result is b, carrying its effective (post-negation) sign. */
1027             b.sign = b_sign;
1028             return b;
1029         }
1030     }
         /* Every class combination is expected to have returned above. */
1031     g_assert_not_reached();
1032 }
1033
1034 /*
1035  * Returns the result of adding or subtracting the floating-point
1036  * values `a' and `b'. The operation is performed according to the
1037  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1038  */
1039
1040 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1041 {
         /* Unpack a before b, add, then round and pack the result. */
1042     FloatParts lhs = float16_unpack_canonical(a, status);
1043     FloatParts rhs = float16_unpack_canonical(b, status);
1044
1045     return float16_round_pack_canonical(addsub_floats(lhs, rhs, false, status),
1046                                         status);
1047 }
1048
1049 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1050 {
         /* Unpack a before b, subtract, then round and pack the result. */
1051     FloatParts lhs = float16_unpack_canonical(a, status);
1052     FloatParts rhs = float16_unpack_canonical(b, status);
1053
1054     return float16_round_pack_canonical(addsub_floats(lhs, rhs, true, status),
1055                                         status);
1056 }
1057
1058 static float32 QEMU_SOFTFLOAT_ATTR
1059 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1060 {
         /* Softfloat path: unpack a then b, add or subtract, round and pack. */
1061     FloatParts lhs = float32_unpack_canonical(a, status);
1062     FloatParts rhs = float32_unpack_canonical(b, status);
1063
1064     return float32_round_pack_canonical(
1065         addsub_floats(lhs, rhs, subtract, status), status);
1066 }
1067
1068 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1069 {
         /* Addition is the shared add/sub helper with subtract cleared. */
         const bool subtract = false;
1070     return soft_f32_addsub(a, b, subtract, status);
1071 }
1072
1073 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1074 {
         /* Subtraction is the shared add/sub helper with subtract set. */
         const bool subtract = true;
1075     return soft_f32_addsub(a, b, subtract, status);
1076 }
1077
1078 static float64 QEMU_SOFTFLOAT_ATTR
1079 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1080 {
         /* Softfloat path: unpack a then b, add or subtract, round and pack. */
1081     FloatParts lhs = float64_unpack_canonical(a, status);
1082     FloatParts rhs = float64_unpack_canonical(b, status);
1083
1084     return float64_round_pack_canonical(
1085         addsub_floats(lhs, rhs, subtract, status), status);
1086 }
1087
1088 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1089 {
         /* Addition is the shared add/sub helper with subtract cleared. */
         const bool subtract = false;
1090     return soft_f64_addsub(a, b, subtract, status);
1091 }
1092
1093 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1094 {
         /* Subtraction is the shared add/sub helper with subtract set. */
         const bool subtract = true;
1095     return soft_f64_addsub(a, b, subtract, status);
1096 }
1097
1098 static float hard_f32_add(float a, float b)
1099 {
         /* Host single-precision addition. */
         float sum = a + b;
1100     return sum;
1101 }
1102
1103 static float hard_f32_sub(float a, float b)
1104 {
         /* Host single-precision subtraction. */
         float diff = a - b;
1105     return diff;
1106 }
1107
1108 static double hard_f64_add(double a, double b)
1109 {
         /* Host double-precision addition. */
         double sum = a + b;
1110     return sum;
1111 }
1112
1113 static double hard_f64_sub(double a, double b)
1114 {
         /* Host double-precision subtraction. */
         double diff = a - b;
1115     return diff;
1116 }
1117
1118 static bool f32_addsub_post(union_float32 a, union_float32 b)
1119 {
         /* True unless both operands are zero, tested on host or soft view. */
1120     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1121         return fpclassify(a.h) != FP_ZERO || fpclassify(b.h) != FP_ZERO;
1122     } else {
1123         return !float32_is_zero(a.s) || !float32_is_zero(b.s);
1124     }
         }
1125
1126 static bool f64_addsub_post(union_float64 a, union_float64 b)
1127 {
         /* True unless both operands are zero, tested on host or soft view. */
1128     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1129         return fpclassify(a.h) != FP_ZERO || fpclassify(b.h) != FP_ZERO;
1130     }
1131     return !float64_is_zero(a.s) || !float64_is_zero(b.s);
1133 }
1134
1135 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1136                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1137 {
         /* Generic two-operand dispatch; no fast-path test or op is supplied. */
1138     float32 result = float32_gen2(a, b, s, hard, soft,
1139                                   f32_is_zon2, f32_addsub_post, NULL, NULL);
         return result;
1140 }
1141
1142 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1143                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1144 {
         /* Generic two-operand dispatch; no fast-path test or op is supplied. */
1145     float64 result = float64_gen2(a, b, s, hard, soft,
1146                                   f64_is_zon2, f64_addsub_post, NULL, NULL);
         return result;
1147 }
1148
1149 float32 QEMU_FLATTEN
1150 float32_add(float32 a, float32 b, float_status *s)
1151 {
         /* Pair the host and softfloat adders and let float32_addsub() choose. */
1152     float32 sum = float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
         return sum;
1153 }
1154
1155 float32 QEMU_FLATTEN
1156 float32_sub(float32 a, float32 b, float_status *s)
1157 {
         /* Pair the host and softfloat subtractors for float32_addsub(). */
1158     float32 diff = float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
         return diff;
1159 }
1160
1161 float64 QEMU_FLATTEN
1162 float64_add(float64 a, float64 b, float_status *s)
1163 {
         /* Pair the host and softfloat adders and let float64_addsub() choose. */
1164     float64 sum = float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
         return sum;
1165 }
1166
1167 float64 QEMU_FLATTEN
1168 float64_sub(float64 a, float64 b, float_status *s)
1169 {
         /* Pair the host and softfloat subtractors for float64_addsub(). */
1170     float64 diff = float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
         return diff;
1171 }
1172
1173 /*
1174  * Returns the result of multiplying the floating-point values `a' and
1175  * `b'. The operation is performed according to the IEC/IEEE Standard
1176  * for Binary Floating-Point Arithmetic.
1177  */
1178
1179 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1180 {
1181     bool result_sign = a.sign ^ b.sign;
1182     bool a_special, b_special;
1183
1184     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1185         uint64_t prod_hi, prod_lo;
1186         int prod_exp = a.exp + b.exp;
1187
             /*
              * 128-bit product of the fractions, shifted right by
              * DECOMPOSED_BINARY_POINT so the result lands in the low word.
              */
1188         mul64To128(a.frac, b.frac, &prod_hi, &prod_lo);
1189         shift128RightJamming(prod_hi, prod_lo, DECOMPOSED_BINARY_POINT,
1190                              &prod_hi, &prod_lo);
1191         if (prod_lo & DECOMPOSED_OVERFLOW_BIT) {
1192             shift64RightJamming(prod_lo, 1, &prod_lo);
1193             prod_exp += 1;
1194         }
1195
             /* The result is returned in a. */
1196         a.exp = prod_exp;
1197         a.sign = result_sign;
1198         a.frac = prod_lo;
1199         return a;
1200     }
1201
         /* NaN operands take priority over every other special case. */
1202     if (is_nan(a.cls) || is_nan(b.cls)) {
1203         return pick_nan(a, b, s);
1204     }
1205
1206     a_special = a.cls == float_class_inf || a.cls == float_class_zero;
1207     b_special = b.cls == float_class_inf || b.cls == float_class_zero;
1208
         /* One infinity and one zero: Inf * Zero is invalid. */
1209     if (a_special && b_special && a.cls != b.cls) {
1210         s->float_exception_flags |= float_flag_invalid;
1211         return parts_default_nan(s);
1212     }
         /* Otherwise a zero or infinite operand is the result, sign adjusted. */
1213     if (a_special) {
1214         a.sign = result_sign;
1215         return a;
1216     }
1217     if (b_special) {
1218         b.sign = result_sign;
1219         return b;
1220     }
         g_assert_not_reached();
         }
1221
1222 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1223 {
         /* Unpack a before b, multiply, then round and pack the result. */
1224     FloatParts lhs = float16_unpack_canonical(a, status);
1225     FloatParts rhs = float16_unpack_canonical(b, status);
1226
1227     return float16_round_pack_canonical(mul_floats(lhs, rhs, status), status);
1228 }
1230
1231 static float32 QEMU_SOFTFLOAT_ATTR
1232 soft_f32_mul(float32 a, float32 b, float_status *status)
1233 {
         /* Softfloat path: unpack a then b, multiply, round and pack. */
1234     FloatParts lhs = float32_unpack_canonical(a, status);
1235     FloatParts rhs = float32_unpack_canonical(b, status);
1236
1237     return float32_round_pack_canonical(mul_floats(lhs, rhs, status), status);
1238 }
1240
1241 static float64 QEMU_SOFTFLOAT_ATTR
1242 soft_f64_mul(float64 a, float64 b, float_status *status)
1243 {
         /* Softfloat path: unpack a then b, multiply, round and pack. */
1244     FloatParts lhs = float64_unpack_canonical(a, status);
1245     FloatParts rhs = float64_unpack_canonical(b, status);
1246
1247     return float64_round_pack_canonical(mul_floats(lhs, rhs, status), status);
1248 }
1250
1251 static float hard_f32_mul(float a, float b)
1252 {
         /* Host single-precision multiplication. */
         float product = a * b;
1253     return product;
1254 }
1255
1256 static double hard_f64_mul(double a, double b)
1257 {
         /* Host double-precision multiplication. */
         double product = a * b;
1258     return product;
1259 }
1260
1261 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1262 {
         /* The fast path applies as soon as either factor is a zero. */
         if (float32_is_zero(a.s)) {
             return true;
         }
1263     return float32_is_zero(b.s);
1264 }
1265
1266 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1267 {
         /* The fast path applies as soon as either factor is a zero. */
         if (float64_is_zero(a.s)) {
             return true;
         }
1268     return float64_is_zero(b.s);
1269 }
1270
1271 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1272 {
         /* The result is a zero, negative when the operand signs differ. */
1273     bool negative = float32_is_neg(a) != float32_is_neg(b);
1274
1275     return float32_set_sign(float32_zero, negative);
1276 }
1277
1278 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1279 {
         /* The result is a zero, negative when the operand signs differ. */
1280     bool negative = float64_is_neg(a) != float64_is_neg(b);
1281
1282     return float64_set_sign(float64_zero, negative);
1283 }
1284
1285 float32 QEMU_FLATTEN
1286 float32_mul(float32 a, float32 b, float_status *s)
1287 {
         /* Generic dispatch with the zero-operand fast path; no post check. */
1288     float32 product = float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1289                                    f32_is_zon2, NULL,
                                        f32_mul_fast_test, f32_mul_fast_op);
         return product;
1290 }
1291
1292 float64 QEMU_FLATTEN
1293 float64_mul(float64 a, float64 b, float_status *s)
1294 {
         /* Generic dispatch with the zero-operand fast path; no post check. */
1295     float64 product = float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1296                                    f64_is_zon2, NULL,
                                        f64_mul_fast_test, f64_mul_fast_op);
         return product;
1297 }
1298
1299 /*
1300  * Returns the result of multiplying the floating-point values `a' and
1301  * `b' then adding 'c', with no intermediate rounding step after the
1302  * multiplication. The operation is performed according to the
1303  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304  * The flags argument allows the caller to select negation of the
1305  * addend, the intermediate product, or the final result. (The
1306  * difference between this and having the caller do a separate
1307  * negation is that negating externally will flip the sign bit on
1308  * NaNs.)
1309  */
1310
1311 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1312                                 int flags, float_status *s)
1313 {
         /* True when one of a, b is an infinity and the other is a zero. */
1314     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1315                     ((1 << float_class_inf) | (1 << float_class_zero));
1316     bool p_sign;
         /* Applied to the sign of every non-NaN result returned below. */
1317     bool sign_flip = flags & float_muladd_negate_result;
1318     FloatClass p_class;
1319     uint64_t hi, lo;
1320     int p_exp;
1321
1322     /* It is implementation-defined whether the cases of (0,inf,qnan)
1323      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324      * they return if they do), so we have to hand this information
1325      * off to the target-specific pick-a-NaN routine.
1326      */
1327     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1328         return pick_nan_muladd(a, b, c, inf_zero, s);
1329     }
1330
         /* No NaN operand: Inf * Zero is invalid whatever the addend is. */
1331     if (inf_zero) {
1332         s->float_exception_flags |= float_flag_invalid;
1333         return parts_default_nan(s);
1334     }
1335
1336     if (flags & float_muladd_negate_c) {
1337         c.sign ^= 1;
1338     }
1339
1340     p_sign = a.sign ^ b.sign;
1341
1342     if (flags & float_muladd_negate_product) {
1343         p_sign ^= 1;
1344     }
1345
         /* Classify the product a * b before looking at the addend. */
1346     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1347         p_class = float_class_inf;
1348     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1349         p_class = float_class_zero;
1350     } else {
1351         p_class = float_class_normal;
1352     }
1353
1354     if (c.cls == float_class_inf) {
             /* Infinite product and addend of opposite sign: invalid. */
1355         if (p_class == float_class_inf && p_sign != c.sign) {
1356             s->float_exception_flags |= float_flag_invalid;
1357             return parts_default_nan(s);
1358         } else {
1359             a.cls = float_class_inf;
1360             a.sign = c.sign ^ sign_flip;
1361             return a;
1362         }
1363     }
1364
1365     if (p_class == float_class_inf) {
1366         a.cls = float_class_inf;
1367         a.sign = p_sign ^ sign_flip;
1368         return a;
1369     }
1370
         /* Zero product: the result is c, with zero-sign and halving fix-ups. */
1371     if (p_class == float_class_zero) {
1372         if (c.cls == float_class_zero) {
1373             if (p_sign != c.sign) {
1374                 p_sign = s->float_rounding_mode == float_round_down;
1375             }
1376             c.sign = p_sign;
1377         } else if (flags & float_muladd_halve_result) {
1378             c.exp -= 1;
1379         }
1380         c.sign ^= sign_flip;
1381         return c;
1382     }
1383
1384     /* a & b should be normals now... */
1385     assert(a.cls == float_class_normal &&
1386            b.cls == float_class_normal);
1387
1388     p_exp = a.exp + b.exp;
1389
1390     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1391      * result.
1392      */
1393     mul64To128(a.frac, b.frac, &hi, &lo);
1394     /* binary point now at bit 124 */
1395
1396     /* check for overflow */
1397     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1398         shift128RightJamming(hi, lo, 1, &hi, &lo);
1399         p_exp += 1;
1400     }
1401
1402     /* + add/sub */
1403     if (c.cls == float_class_zero) {
1404         /* move binary point back to 62 */
1405         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1406     } else {
1407         int exp_diff = p_exp - c.exp;
1408         if (p_sign == c.sign) {
1409             /* Addition */
1410             if (exp_diff <= 0) {
                     /*
                      * c has the larger (or equal) exponent: bring the product
                      * down to c's scale with the binary point at 62, then add
                      * c's fraction in the low word.
                      */
1411                 shift128RightJamming(hi, lo,
1412                                      DECOMPOSED_BINARY_POINT - exp_diff,
1413                                      &hi, &lo);
1414                 lo += c.frac;
1415                 p_exp = c.exp;
1416             } else {
1417                 uint64_t c_hi, c_lo;
1418                 /* shift c to the same binary point as the product (124) */
1419                 c_hi = c.frac >> 2;
1420                 c_lo = 0;
1421                 shift128RightJamming(c_hi, c_lo,
1422                                      exp_diff,
1423                                      &c_hi, &c_lo);
1424                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1425                 /* move binary point back to 62 */
1426                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1427             }
1428
                 /* A carry out of the addition is shifted back down. */
1429             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1430                 shift64RightJamming(lo, 1, &lo);
1431                 p_exp += 1;
1432             }
1433
1434         } else {
1435             /* Subtraction */
1436             uint64_t c_hi, c_lo;
1437             /* make C binary point match product at bit 124 */
1438             c_hi = c.frac >> 2;
1439             c_lo = 0;
1440
1441             if (exp_diff <= 0) {
1442                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                     /*
                      * With equal exponents and product >= c, subtract c from
                      * the product; in every other case here c is taken as the
                      * minuend, so the sign flips and c's exponent is used.
                      */
1443                 if (exp_diff == 0
1444                     &&
1445                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1446                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1447                 } else {
1448                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1449                     p_sign ^= 1;
1450                     p_exp = c.exp;
1451                 }
1452             } else {
1453                 shift128RightJamming(c_hi, c_lo,
1454                                      exp_diff,
1455                                      &c_hi, &c_lo);
1456                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1457             }
1458
1459             if (hi == 0 && lo == 0) {
                     /* Exact cancellation: zero, negative only when rounding down. */
1460                 a.cls = float_class_zero;
1461                 a.sign = s->float_rounding_mode == float_round_down;
1462                 a.sign ^= sign_flip;
1463                 return a;
1464             } else {
1465                 int shift;
1466                 if (hi != 0) {
1467                     shift = clz64(hi);
1468                 } else {
1469                     shift = clz64(lo) + 64;
1470                 }
1471                 /* Normalizing to a binary point of 124 is the
1472                    correct adjust for the exponent.  However since we're
1473                    shifting, we might as well put the binary point back
1474                    at 62 where we really want it.  Therefore shift as
1475                    if we're leaving 1 bit at the top of the word, but
1476                    adjust the exponent as if we're leaving 3 bits.  */
1477                 shift -= 1;
1478                 if (shift >= 64) {
1479                     lo = lo << (shift - 64);
1480                 } else {
1481                     hi = (hi << shift) | (lo >> (64 - shift));
                         /* Fold any bits left below into the lsb as a sticky bit. */
1482                     lo = hi | ((lo << shift) != 0);
1483                 }
1484                 p_exp -= shift - 2;
1485             }
1486         }
1487     }
1488
1489     if (flags & float_muladd_halve_result) {
1490         p_exp -= 1;
1491     }
1492
1493     /* finally prepare our result */
1494     a.cls = float_class_normal;
1495     a.sign = p_sign ^ sign_flip;
1496     a.exp = p_exp;
1497     a.frac = lo;
1498
1499     return a;
1500 }
1501
1502 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1503                                     int flags, float_status *status)
1504 {
         /* Unpack a, b, c in that order, fuse, then round and pack. */
1505     FloatParts fa = float16_unpack_canonical(a, status);
1506     FloatParts fb = float16_unpack_canonical(b, status);
1507     FloatParts fc = float16_unpack_canonical(c, status);
1508
1509     return float16_round_pack_canonical(
1510         muladd_floats(fa, fb, fc, flags, status), status);
1511 }
1512
1513 static float32 QEMU_SOFTFLOAT_ATTR
1514 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1515                 float_status *status)
1516 {
         /* Softfloat path: unpack a, b, c in that order, fuse, round and pack. */
1517     FloatParts fa = float32_unpack_canonical(a, status);
1518     FloatParts fb = float32_unpack_canonical(b, status);
1519     FloatParts fc = float32_unpack_canonical(c, status);
1520
1521     return float32_round_pack_canonical(
1522         muladd_floats(fa, fb, fc, flags, status), status);
1523 }
1524
1525 static float64 QEMU_SOFTFLOAT_ATTR
1526 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1527                 float_status *status)
1528 {
         /* Softfloat path: unpack a, b, c in that order, fuse, round and pack. */
1529     FloatParts fa = float64_unpack_canonical(a, status);
1530     FloatParts fb = float64_unpack_canonical(b, status);
1531     FloatParts fc = float64_unpack_canonical(c, status);
1532
1533     return float64_round_pack_canonical(
1534         muladd_floats(fa, fb, fc, flags, status), status);
1535 }
1536
1537 static bool force_soft_fma; /* true: float{32,64}_muladd() bypass host fma */
1538
1539 float32 QEMU_FLATTEN
1540 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1541 {
         /*
          * Fused multiply-add with a host-FPU fast path.  Any condition the
          * fast path does not handle jumps to `soft', which recomputes the
          * operation with soft_f32_muladd().
          */
1542     union_float32 ua, ub, uc, ur;
1543
1544     ua.s = xa;
1545     ub.s = xb;
1546     uc.s = xc;
1547
1548     if (unlikely(!can_use_fpu(s))) {
1549         goto soft;
1550     }
         /* Halving the result is only implemented on the soft path. */
1551     if (unlikely(flags & float_muladd_halve_result)) {
1552         goto soft;
1553     }
1554
         /* From here on the soft path sees the operands as left by this call. */
1555     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1556     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1557         goto soft;
1558     }
1559
1560     if (unlikely(force_soft_fma)) {
1561         goto soft;
1562     }
1563
1564     /*
1565      * When (a || b) == 0, there's no need to check for under/over flow,
1566      * since we know the addend is (normal || 0) and the product is 0.
1567      */
1568     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1569         union_float32 up;
1570         bool prod_sign;
1571
             /* Build the signed zero product by hand, then add c on the host. */
1572         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1573         prod_sign ^= !!(flags & float_muladd_negate_product);
1574         up.s = float32_set_sign(float32_zero, prod_sign);
1575
1576         if (flags & float_muladd_negate_c) {
1577             uc.h = -uc.h;
1578         }
1579         ur.h = up.h + uc.h;
1580     } else {
             /* Saved so the soft fallback gets the operands before negation. */
1581         union_float32 ua_orig = ua;
1582         union_float32 uc_orig = uc;
1583
             /* Negating a negates the product a * b. */
1584         if (flags & float_muladd_negate_product) {
1585             ua.h = -ua.h;
1586         }
1587         if (flags & float_muladd_negate_c) {
1588             uc.h = -uc.h;
1589         }
1590
1591         ur.h = fmaf(ua.h, ub.h, uc.h);
1592
1593         if (unlikely(f32_is_inf(ur))) {
1594             s->float_exception_flags |= float_flag_overflow;
1595         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
                 /* Result magnitude at or below FLT_MIN: redo it in softfloat. */
1596             ua = ua_orig;
1597             uc = uc_orig;
1598             goto soft;
1599         }
1600     }
1601     if (flags & float_muladd_negate_result) {
1602         return float32_chs(ur.s);
1603     }
1604     return ur.s;
1605
1606  soft:
1607     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1608 }
1609
1610 float64 QEMU_FLATTEN
1611 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1612 {
         /*
          * Fused multiply-add with a host-FPU fast path.  Any condition the
          * fast path does not handle jumps to `soft', which recomputes the
          * operation with soft_f64_muladd().
          */
1613     union_float64 ua, ub, uc, ur;
1614
1615     ua.s = xa;
1616     ub.s = xb;
1617     uc.s = xc;
1618
1619     if (unlikely(!can_use_fpu(s))) {
1620         goto soft;
1621     }
         /* Halving the result is only implemented on the soft path. */
1622     if (unlikely(flags & float_muladd_halve_result)) {
1623         goto soft;
1624     }
1625
         /* From here on the soft path sees the operands as left by this call. */
1626     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1627     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1628         goto soft;
1629     }
1630
1631     if (unlikely(force_soft_fma)) {
1632         goto soft;
1633     }
1634
1635     /*
1636      * When (a || b) == 0, there's no need to check for under/over flow,
1637      * since we know the addend is (normal || 0) and the product is 0.
1638      */
1639     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1640         union_float64 up;
1641         bool prod_sign;
1642
             /* Build the signed zero product by hand, then add c on the host. */
1643         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1644         prod_sign ^= !!(flags & float_muladd_negate_product);
1645         up.s = float64_set_sign(float64_zero, prod_sign);
1646
1647         if (flags & float_muladd_negate_c) {
1648             uc.h = -uc.h;
1649         }
1650         ur.h = up.h + uc.h;
1651     } else {
             /* Saved so the soft fallback gets the operands before negation. */
1652         union_float64 ua_orig = ua;
1653         union_float64 uc_orig = uc;
1654
             /* Negating a negates the product a * b. */
1655         if (flags & float_muladd_negate_product) {
1656             ua.h = -ua.h;
1657         }
1658         if (flags & float_muladd_negate_c) {
1659             uc.h = -uc.h;
1660         }
1661
1662         ur.h = fma(ua.h, ub.h, uc.h);
1663
1664         if (unlikely(f64_is_inf(ur))) {
1665             s->float_exception_flags |= float_flag_overflow;
1666         } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
                 /*
                  * Result magnitude at or below the smallest normal double:
                  * redo it in softfloat.  The bound is DBL_MIN, the double
                  * counterpart of the FLT_MIN bound in float32_muladd();
                  * comparing against FLT_MIN here would also divert every
                  * normal double result in (DBL_MIN, FLT_MIN] to the slow path.
                  */
1667             ua = ua_orig;
1668             uc = uc_orig;
1669             goto soft;
1670         }
1671     }
1672     if (flags & float_muladd_negate_result) {
1673         return float64_chs(ur.s);
1674     }
1675     return ur.s;
1676
1677  soft:
1678     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1679 }
1680
1681 /*
1682  * Returns the result of dividing the floating-point value `a' by the
1683  * corresponding value `b'. The operation is performed according to
1684  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1685  */
1686
1687 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1688 {
1689     bool sign = a.sign ^ b.sign;
1690
1691     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1692         uint64_t n0, n1, q, r;
1693         int exp = a.exp - b.exp;
1694         int lshift = DECOMPOSED_BINARY_POINT + 1;
1695
             /*
              * We want a 2*N / N-bit division to produce exactly an N-bit
              * result, so that no precision is lost and no renormalization
              * is needed afterward.  If A.frac < B.frac the division would
              * produce an (N-1)-bit result, so A is shifted left by one more
              * bit to produce an N-bit result, and the exponent is
              * decremented to match.
              *
              * The udiv_qrnnd algorithm requires normalization, i.e. the msb
              * of the denominator must be set.  Since DECOMPOSED_BINARY_POINT
              * is msb-1, both inputs are shifted left by one (more); the
              * remainder would have to be shifted right by one.
              */
1696         if (a.frac < b.frac) {
1697             exp -= 1;
1698             lshift += 1;
1699         }
1700         shift128Left(0, a.frac, lshift, &n1, &n0);
1701         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1702
             /*
              * Set the lsb if there is a remainder, to signal inexact.  Only
              * the non-zero-ness of the remainder matters, so the right shift
              * mentioned above is not performed; the remainder is always even
              * because both inputs to the division primitive are even.
              */
1703         a.frac = q | (r != 0);
1704         a.sign = sign;
1705         a.exp = exp;
1706         return a;
1707     }
1708
         /* NaN operands take priority over every other special case. */
1709     if (is_nan(a.cls) || is_nan(b.cls)) {
1710         return pick_nan(a, b, s);
1711     }
1712
1713     if (a.cls == float_class_inf || a.cls == float_class_zero) {
             /* 0/0 and Inf/Inf are invalid. */
1714         if (a.cls == b.cls) {
1715             s->float_exception_flags |= float_flag_invalid;
1716             return parts_default_nan(s);
1717         }
             /* Inf / x or 0 / x keeps a's class. */
1718         a.sign = sign;
1719         return a;
1720     }
1721
1722     switch (b.cls) {
1723     case float_class_zero:
             /* Division by zero raises divbyzero and yields an infinity. */
1724         s->float_exception_flags |= float_flag_divbyzero;
1725         a.cls = float_class_inf;
1726         a.sign = sign;
1727         return a;
1728     case float_class_inf:
             /* Division by an infinity yields a zero. */
1729         a.cls = float_class_zero;
1730         a.sign = sign;
1731         return a;
1732     default:
1733         g_assert_not_reached();
1734     }
1735 }
1759
1760 float16 float16_div(float16 a, float16 b, float_status *status)
1761 {
1762     FloatParts pa = float16_unpack_canonical(a, status);
1763     FloatParts pb = float16_unpack_canonical(b, status);
1764     FloatParts pr = div_floats(pa, pb, status);
1765
1766     return float16_round_pack_canonical(pr, status);
1767 }
1768
1769 static float32 QEMU_SOFTFLOAT_ATTR
1770 soft_f32_div(float32 a, float32 b, float_status *status)
1771 {
1772     FloatParts pa = float32_unpack_canonical(a, status);
1773     FloatParts pb = float32_unpack_canonical(b, status);
1774     FloatParts pr = div_floats(pa, pb, status);
1775
1776     return float32_round_pack_canonical(pr, status);
1777 }
1778
1779 static float64 QEMU_SOFTFLOAT_ATTR
1780 soft_f64_div(float64 a, float64 b, float_status *status)
1781 {
1782     FloatParts pa = float64_unpack_canonical(a, status);
1783     FloatParts pb = float64_unpack_canonical(b, status);
1784     FloatParts pr = div_floats(pa, pb, status);
1785
1786     return float64_round_pack_canonical(pr, status);
1787 }
1788
/* Single-precision division performed with the host's `/' operator. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1793
/* Double-precision division performed with the host's `/' operator. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1798
1799 static bool f32_div_pre(union_float32 a, union_float32 b)
1800 {
1801     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1802         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1803                fpclassify(b.h) == FP_NORMAL;
1804     }
1805     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1806 }
1807
1808 static bool f64_div_pre(union_float64 a, union_float64 b)
1809 {
1810     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1811         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1812                fpclassify(b.h) == FP_NORMAL;
1813     }
1814     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1815 }
1816
1817 static bool f32_div_post(union_float32 a, union_float32 b)
1818 {
1819     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820         return fpclassify(a.h) != FP_ZERO;
1821     }
1822     return !float32_is_zero(a.s);
1823 }
1824
1825 static bool f64_div_post(union_float64 a, union_float64 b)
1826 {
1827     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1828         return fpclassify(a.h) != FP_ZERO;
1829     }
1830     return !float64_is_zero(a.s);
1831 }
1832
1833 float32 QEMU_FLATTEN
1834 float32_div(float32 a, float32 b, float_status *s)
1835 {
1836     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1837                         f32_div_pre, f32_div_post, NULL, NULL);
1838 }
1839
1840 float64 QEMU_FLATTEN
1841 float64_div(float64 a, float64 b, float_status *s)
1842 {
1843     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1844                         f64_div_pre, f64_div_post, NULL, NULL);
1845 }
1846
1847 /*
1848  * Float to Float conversions
1849  *
1850  * Returns the result of converting one float format to another. The
1851  * conversion is performed according to the IEC/IEEE Standard for
1852  * Binary Floating-Point Arithmetic.
1853  *
1854  * The float_to_float helper only needs to take care of raising
1855  * invalid exceptions and handling the conversion on NaNs.
1856  */
1857
1858 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1859                                  float_status *s)
1860 {
1861     if (dstf->arm_althp) {
1862         switch (a.cls) {
1863         case float_class_qnan:
1864         case float_class_snan:
1865             /* There is no NaN in the destination format.  Raise Invalid
1866              * and return a zero with the sign of the input NaN.
1867              */
1868             s->float_exception_flags |= float_flag_invalid;
1869             a.cls = float_class_zero;
1870             a.frac = 0;
1871             a.exp = 0;
1872             break;
1873
1874         case float_class_inf:
1875             /* There is no Inf in the destination format.  Raise Invalid
1876              * and return the maximum normal with the correct sign.
1877              */
1878             s->float_exception_flags |= float_flag_invalid;
1879             a.cls = float_class_normal;
1880             a.exp = dstf->exp_max;
1881             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1882             break;
1883
1884         default:
1885             break;
1886         }
1887     } else if (is_nan(a.cls)) {
1888         if (is_snan(a.cls)) {
1889             s->float_exception_flags |= float_flag_invalid;
1890             a = parts_silence_nan(a, s);
1891         }
1892         if (s->default_nan_mode) {
1893             return parts_default_nan(s);
1894         }
1895     }
1896     return a;
1897 }
1898
1899 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1900 {
1901     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1902     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1903     FloatParts pr = float_to_float(p, &float32_params, s);
1904     return float32_round_pack_canonical(pr, s);
1905 }
1906
1907 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1908 {
1909     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911     FloatParts pr = float_to_float(p, &float64_params, s);
1912     return float64_round_pack_canonical(pr, s);
1913 }
1914
1915 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1916 {
1917     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918     FloatParts p = float32_unpack_canonical(a, s);
1919     FloatParts pr = float_to_float(p, fmt16, s);
1920     return float16a_round_pack_canonical(pr, s, fmt16);
1921 }
1922
1923 float64 float32_to_float64(float32 a, float_status *s)
1924 {
1925     FloatParts p = float32_unpack_canonical(a, s);
1926     FloatParts pr = float_to_float(p, &float64_params, s);
1927     return float64_round_pack_canonical(pr, s);
1928 }
1929
1930 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1931 {
1932     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1933     FloatParts p = float64_unpack_canonical(a, s);
1934     FloatParts pr = float_to_float(p, fmt16, s);
1935     return float16a_round_pack_canonical(pr, s, fmt16);
1936 }
1937
1938 float32 float64_to_float32(float64 a, float_status *s)
1939 {
1940     FloatParts p = float64_unpack_canonical(a, s);
1941     FloatParts pr = float_to_float(p, &float32_params, s);
1942     return float32_round_pack_canonical(pr, s);
1943 }
1944
1945 /*
1946  * Rounds the floating-point value `a' to an integer, and returns the
1947  * result as a floating-point value. The operation is performed
1948  * according to the IEC/IEEE Standard for Binary Floating-Point
1949  * Arithmetic.
1950  */
1951
1952 static FloatParts round_to_int(FloatParts a, int rmode,
1953                                int scale, float_status *s)
1954 {
1955     switch (a.cls) {
1956     case float_class_qnan:
1957     case float_class_snan:
1958         return return_nan(a, s);
1959
1960     case float_class_zero:
1961     case float_class_inf:
1962         /* already "integral" */
1963         break;
1964
1965     case float_class_normal:
1966         scale = MIN(MAX(scale, -0x10000), 0x10000);
1967         a.exp += scale;
1968
1969         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1970             /* already integral */
1971             break;
1972         }
1973         if (a.exp < 0) {
1974             bool one;
1975             /* all fractional */
1976             s->float_exception_flags |= float_flag_inexact;
1977             switch (rmode) {
1978             case float_round_nearest_even:
1979                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1980                 break;
1981             case float_round_ties_away:
1982                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1983                 break;
1984             case float_round_to_zero:
1985                 one = false;
1986                 break;
1987             case float_round_up:
1988                 one = !a.sign;
1989                 break;
1990             case float_round_down:
1991                 one = a.sign;
1992                 break;
1993             case float_round_to_odd:
1994                 one = true;
1995                 break;
1996             default:
1997                 g_assert_not_reached();
1998             }
1999
2000             if (one) {
2001                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2002                 a.exp = 0;
2003             } else {
2004                 a.cls = float_class_zero;
2005             }
2006         } else {
2007             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2008             uint64_t frac_lsbm1 = frac_lsb >> 1;
2009             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2010             uint64_t rnd_mask = rnd_even_mask >> 1;
2011             uint64_t inc;
2012
2013             switch (rmode) {
2014             case float_round_nearest_even:
2015                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2016                 break;
2017             case float_round_ties_away:
2018                 inc = frac_lsbm1;
2019                 break;
2020             case float_round_to_zero:
2021                 inc = 0;
2022                 break;
2023             case float_round_up:
2024                 inc = a.sign ? 0 : rnd_mask;
2025                 break;
2026             case float_round_down:
2027                 inc = a.sign ? rnd_mask : 0;
2028                 break;
2029             case float_round_to_odd:
2030                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2031                 break;
2032             default:
2033                 g_assert_not_reached();
2034             }
2035
2036             if (a.frac & rnd_mask) {
2037                 s->float_exception_flags |= float_flag_inexact;
2038                 a.frac += inc;
2039                 a.frac &= ~rnd_mask;
2040                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2041                     a.frac >>= 1;
2042                     a.exp++;
2043                 }
2044             }
2045         }
2046         break;
2047     default:
2048         g_assert_not_reached();
2049     }
2050     return a;
2051 }
2052
2053 float16 float16_round_to_int(float16 a, float_status *s)
2054 {
2055     FloatParts pa = float16_unpack_canonical(a, s);
2056     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2057     return float16_round_pack_canonical(pr, s);
2058 }
2059
2060 float32 float32_round_to_int(float32 a, float_status *s)
2061 {
2062     FloatParts pa = float32_unpack_canonical(a, s);
2063     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2064     return float32_round_pack_canonical(pr, s);
2065 }
2066
2067 float64 float64_round_to_int(float64 a, float_status *s)
2068 {
2069     FloatParts pa = float64_unpack_canonical(a, s);
2070     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2071     return float64_round_pack_canonical(pr, s);
2072 }
2073
2074 /*
2075  * Returns the result of converting the floating-point value `a' to
2076  * the two's complement integer format. The conversion is performed
2077  * according to the IEC/IEEE Standard for Binary Floating-Point
2078  * Arithmetic---which means in particular that the conversion is
2079  * rounded according to the current rounding mode. If `a' is a NaN,
2080  * the largest positive integer is returned. Otherwise, if the
2081  * conversion overflows, the largest integer with the same sign as `a'
2082  * is returned.
2083 */
2084
2085 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2086                                      int64_t min, int64_t max,
2087                                      float_status *s)
2088 {
2089     uint64_t r;
2090     int orig_flags = get_float_exception_flags(s);
2091     FloatParts p = round_to_int(in, rmode, scale, s);
2092
2093     switch (p.cls) {
2094     case float_class_snan:
2095     case float_class_qnan:
2096         s->float_exception_flags = orig_flags | float_flag_invalid;
2097         return max;
2098     case float_class_inf:
2099         s->float_exception_flags = orig_flags | float_flag_invalid;
2100         return p.sign ? min : max;
2101     case float_class_zero:
2102         return 0;
2103     case float_class_normal:
2104         if (p.exp < DECOMPOSED_BINARY_POINT) {
2105             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2106         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2107             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2108         } else {
2109             r = UINT64_MAX;
2110         }
2111         if (p.sign) {
2112             if (r <= -(uint64_t) min) {
2113                 return -r;
2114             } else {
2115                 s->float_exception_flags = orig_flags | float_flag_invalid;
2116                 return min;
2117             }
2118         } else {
2119             if (r <= max) {
2120                 return r;
2121             } else {
2122                 s->float_exception_flags = orig_flags | float_flag_invalid;
2123                 return max;
2124             }
2125         }
2126     default:
2127         g_assert_not_reached();
2128     }
2129 }
2130
2131 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2132                                 float_status *s)
2133 {
2134     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2135                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2136 }
2137
2138 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2139                                 float_status *s)
2140 {
2141     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2142                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2143 }
2144
2145 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2146                                 float_status *s)
2147 {
2148     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2149                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2150 }
2151
2152 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2153                                 float_status *s)
2154 {
2155     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2156                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2157 }
2158
2159 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2160                                 float_status *s)
2161 {
2162     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2163                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2164 }
2165
2166 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2167                                 float_status *s)
2168 {
2169     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2170                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2171 }
2172
2173 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2174                                 float_status *s)
2175 {
2176     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2177                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2178 }
2179
2180 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2181                                 float_status *s)
2182 {
2183     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2184                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2185 }
2186
2187 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2188                                 float_status *s)
2189 {
2190     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2191                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2192 }
2193
2194 int16_t float16_to_int16(float16 a, float_status *s)
2195 {
2196     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2197 }
2198
2199 int32_t float16_to_int32(float16 a, float_status *s)
2200 {
2201     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2202 }
2203
2204 int64_t float16_to_int64(float16 a, float_status *s)
2205 {
2206     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2207 }
2208
2209 int16_t float32_to_int16(float32 a, float_status *s)
2210 {
2211     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2212 }
2213
2214 int32_t float32_to_int32(float32 a, float_status *s)
2215 {
2216     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2217 }
2218
2219 int64_t float32_to_int64(float32 a, float_status *s)
2220 {
2221     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2222 }
2223
2224 int16_t float64_to_int16(float64 a, float_status *s)
2225 {
2226     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2227 }
2228
2229 int32_t float64_to_int32(float64 a, float_status *s)
2230 {
2231     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2232 }
2233
2234 int64_t float64_to_int64(float64 a, float_status *s)
2235 {
2236     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2237 }
2238
2239 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2240 {
2241     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2242 }
2243
2244 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2245 {
2246     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2247 }
2248
2249 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2250 {
2251     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2252 }
2253
2254 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2255 {
2256     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2257 }
2258
2259 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2260 {
2261     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2262 }
2263
2264 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2265 {
2266     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2267 }
2268
2269 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2270 {
2271     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2272 }
2273
2274 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2275 {
2276     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2277 }
2278
2279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2280 {
2281     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2282 }
2283
2284 /*
2285  *  Returns the result of converting the floating-point value `a' to
2286  *  the unsigned integer format. The conversion is performed according
2287  *  to the IEC/IEEE Standard for Binary Floating-Point
2288  *  Arithmetic---which means in particular that the conversion is
2289  *  rounded according to the current rounding mode. If `a' is a NaN,
2290  *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the value is rounded and zero is returned;
 *  negative values that do not round to zero raise the invalid
 *  exception flag.
2295  */
2296
2297 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2298                                        uint64_t max, float_status *s)
2299 {
2300     int orig_flags = get_float_exception_flags(s);
2301     FloatParts p = round_to_int(in, rmode, scale, s);
2302     uint64_t r;
2303
2304     switch (p.cls) {
2305     case float_class_snan:
2306     case float_class_qnan:
2307         s->float_exception_flags = orig_flags | float_flag_invalid;
2308         return max;
2309     case float_class_inf:
2310         s->float_exception_flags = orig_flags | float_flag_invalid;
2311         return p.sign ? 0 : max;
2312     case float_class_zero:
2313         return 0;
2314     case float_class_normal:
2315         if (p.sign) {
2316             s->float_exception_flags = orig_flags | float_flag_invalid;
2317             return 0;
2318         }
2319
2320         if (p.exp < DECOMPOSED_BINARY_POINT) {
2321             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2322         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2323             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2324         } else {
2325             s->float_exception_flags = orig_flags | float_flag_invalid;
2326             return max;
2327         }
2328
2329         /* For uint64 this will never trip, but if p.exp is too large
2330          * to shift a decomposed fraction we shall have exited via the
2331          * 3rd leg above.
2332          */
2333         if (r > max) {
2334             s->float_exception_flags = orig_flags | float_flag_invalid;
2335             return max;
2336         }
2337         return r;
2338     default:
2339         g_assert_not_reached();
2340     }
2341 }
2342
2343 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2344                                   float_status *s)
2345 {
2346     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2347                                   rmode, scale, UINT16_MAX, s);
2348 }
2349
2350 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2351                                   float_status *s)
2352 {
2353     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2354                                   rmode, scale, UINT32_MAX, s);
2355 }
2356
2357 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2358                                   float_status *s)
2359 {
2360     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2361                                   rmode, scale, UINT64_MAX, s);
2362 }
2363
2364 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2365                                   float_status *s)
2366 {
2367     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2368                                   rmode, scale, UINT16_MAX, s);
2369 }
2370
2371 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2372                                   float_status *s)
2373 {
2374     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2375                                   rmode, scale, UINT32_MAX, s);
2376 }
2377
2378 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2379                                   float_status *s)
2380 {
2381     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2382                                   rmode, scale, UINT64_MAX, s);
2383 }
2384
2385 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2386                                   float_status *s)
2387 {
2388     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2389                                   rmode, scale, UINT16_MAX, s);
2390 }
2391
2392 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2393                                   float_status *s)
2394 {
2395     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2396                                   rmode, scale, UINT32_MAX, s);
2397 }
2398
2399 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2400                                   float_status *s)
2401 {
2402     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2403                                   rmode, scale, UINT64_MAX, s);
2404 }
2405
2406 uint16_t float16_to_uint16(float16 a, float_status *s)
2407 {
2408     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2409 }
2410
2411 uint32_t float16_to_uint32(float16 a, float_status *s)
2412 {
2413     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2414 }
2415
2416 uint64_t float16_to_uint64(float16 a, float_status *s)
2417 {
2418     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2419 }
2420
2421 uint16_t float32_to_uint16(float32 a, float_status *s)
2422 {
2423     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2424 }
2425
2426 uint32_t float32_to_uint32(float32 a, float_status *s)
2427 {
2428     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2429 }
2430
2431 uint64_t float32_to_uint64(float32 a, float_status *s)
2432 {
2433     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2434 }
2435
2436 uint16_t float64_to_uint16(float64 a, float_status *s)
2437 {
2438     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2439 }
2440
2441 uint32_t float64_to_uint32(float64 a, float_status *s)
2442 {
2443     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2444 }
2445
2446 uint64_t float64_to_uint64(float64 a, float_status *s)
2447 {
2448     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2449 }
2450
2451 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2452 {
2453     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2454 }
2455
2456 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2457 {
2458     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2459 }
2460
2461 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2462 {
2463     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2464 }
2465
2466 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2467 {
2468     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2469 }
2470
2471 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2472 {
2473     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2474 }
2475
2476 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2477 {
2478     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2479 }
2480
2481 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2482 {
2483     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2484 }
2485
2486 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2487 {
2488     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2489 }
2490
2491 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2492 {
2493     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2494 }
2495
2496 /*
2497  * Integer to float conversions
2498  *
2499  * Returns the result of converting the two's complement integer `a'
2500  * to the floating-point format. The conversion is performed according
2501  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2502  */
2503
2504 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2505 {
2506     FloatParts r = { .sign = false };
2507
2508     if (a == 0) {
2509         r.cls = float_class_zero;
2510     } else {
2511         uint64_t f = a;
2512         int shift;
2513
2514         r.cls = float_class_normal;
2515         if (a < 0) {
2516             f = -f;
2517             r.sign = true;
2518         }
2519         shift = clz64(f) - 1;
2520         scale = MIN(MAX(scale, -0x10000), 0x10000);
2521
2522         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2523         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2524     }
2525
2526     return r;
2527 }
2528
2529 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2530 {
2531     FloatParts pa = int_to_float(a, scale, status);
2532     return float16_round_pack_canonical(pa, status);
2533 }
2534
2535 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2536 {
2537     return int64_to_float16_scalbn(a, scale, status);
2538 }
2539
2540 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2541 {
2542     return int64_to_float16_scalbn(a, scale, status);
2543 }
2544
2545 float16 int64_to_float16(int64_t a, float_status *status)
2546 {
2547     return int64_to_float16_scalbn(a, 0, status);
2548 }
2549
2550 float16 int32_to_float16(int32_t a, float_status *status)
2551 {
2552     return int64_to_float16_scalbn(a, 0, status);
2553 }
2554
2555 float16 int16_to_float16(int16_t a, float_status *status)
2556 {
2557     return int64_to_float16_scalbn(a, 0, status);
2558 }
2559
2560 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2561 {
2562     FloatParts pa = int_to_float(a, scale, status);
2563     return float32_round_pack_canonical(pa, status);
2564 }
2565
2566 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2567 {
2568     return int64_to_float32_scalbn(a, scale, status);
2569 }
2570
2571 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2572 {
2573     return int64_to_float32_scalbn(a, scale, status);
2574 }
2575
2576 float32 int64_to_float32(int64_t a, float_status *status)
2577 {
2578     return int64_to_float32_scalbn(a, 0, status);
2579 }
2580
2581 float32 int32_to_float32(int32_t a, float_status *status)
2582 {
2583     return int64_to_float32_scalbn(a, 0, status);
2584 }
2585
2586 float32 int16_to_float32(int16_t a, float_status *status)
2587 {
2588     return int64_to_float32_scalbn(a, 0, status);
2589 }
2590
2591 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2592 {
2593     FloatParts pa = int_to_float(a, scale, status);
2594     return float64_round_pack_canonical(pa, status);
2595 }
2596
2597 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2598 {
2599     return int64_to_float64_scalbn(a, scale, status);
2600 }
2601
2602 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2603 {
2604     return int64_to_float64_scalbn(a, scale, status);
2605 }
2606
2607 float64 int64_to_float64(int64_t a, float_status *status)
2608 {
2609     return int64_to_float64_scalbn(a, 0, status);
2610 }
2611
2612 float64 int32_to_float64(int32_t a, float_status *status)
2613 {
2614     return int64_to_float64_scalbn(a, 0, status);
2615 }
2616
2617 float64 int16_to_float64(int16_t a, float_status *status)
2618 {
2619     return int64_to_float64_scalbn(a, 0, status);
2620 }
2621
2622
2623 /*
2624  * Unsigned Integer to float conversions
2625  *
2626  * Returns the result of converting the unsigned integer `a' to the
2627  * floating-point format. The conversion is performed according to the
2628  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2629  */
2630
2631 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2632 {
2633     FloatParts r = { .sign = false };
2634
2635     if (a == 0) {
2636         r.cls = float_class_zero;
2637     } else {
2638         scale = MIN(MAX(scale, -0x10000), 0x10000);
2639         r.cls = float_class_normal;
2640         if ((int64_t)a < 0) {
2641             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2642             shift64RightJamming(a, 1, &a);
2643             r.frac = a;
2644         } else {
2645             int shift = clz64(a) - 1;
2646             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2647             r.frac = a << shift;
2648         }
2649     }
2650
2651     return r;
2652 }
2653
2654 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2655 {
2656     FloatParts pa = uint_to_float(a, scale, status);
2657     return float16_round_pack_canonical(pa, status);
2658 }
2659
2660 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2661 {
2662     return uint64_to_float16_scalbn(a, scale, status);
2663 }
2664
2665 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2666 {
2667     return uint64_to_float16_scalbn(a, scale, status);
2668 }
2669
2670 float16 uint64_to_float16(uint64_t a, float_status *status)
2671 {
2672     return uint64_to_float16_scalbn(a, 0, status);
2673 }
2674
2675 float16 uint32_to_float16(uint32_t a, float_status *status)
2676 {
2677     return uint64_to_float16_scalbn(a, 0, status);
2678 }
2679
2680 float16 uint16_to_float16(uint16_t a, float_status *status)
2681 {
2682     return uint64_to_float16_scalbn(a, 0, status);
2683 }
2684
2685 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2686 {
2687     FloatParts pa = uint_to_float(a, scale, status);
2688     return float32_round_pack_canonical(pa, status);
2689 }
2690
2691 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2692 {
2693     return uint64_to_float32_scalbn(a, scale, status);
2694 }
2695
2696 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2697 {
2698     return uint64_to_float32_scalbn(a, scale, status);
2699 }
2700
2701 float32 uint64_to_float32(uint64_t a, float_status *status)
2702 {
2703     return uint64_to_float32_scalbn(a, 0, status);
2704 }
2705
2706 float32 uint32_to_float32(uint32_t a, float_status *status)
2707 {
2708     return uint64_to_float32_scalbn(a, 0, status);
2709 }
2710
2711 float32 uint16_to_float32(uint16_t a, float_status *status)
2712 {
2713     return uint64_to_float32_scalbn(a, 0, status);
2714 }
2715
2716 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2717 {
2718     FloatParts pa = uint_to_float(a, scale, status);
2719     return float64_round_pack_canonical(pa, status);
2720 }
2721
2722 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2723 {
2724     return uint64_to_float64_scalbn(a, scale, status);
2725 }
2726
2727 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2728 {
2729     return uint64_to_float64_scalbn(a, scale, status);
2730 }
2731
2732 float64 uint64_to_float64(uint64_t a, float_status *status)
2733 {
2734     return uint64_to_float64_scalbn(a, 0, status);
2735 }
2736
2737 float64 uint32_to_float64(uint32_t a, float_status *status)
2738 {
2739     return uint64_to_float64_scalbn(a, 0, status);
2740 }
2741
2742 float64 uint16_to_float64(uint16_t a, float_status *status)
2743 {
2744     return uint64_to_float64_scalbn(a, 0, status);
2745 }
2746
2747 /* Float Min/Max */
2748 /* min() and max() functions. These can't be implemented as
2749  * 'compare and pick one input' because that would mishandle
2750  * NaNs and +0 vs -0.
2751  *
2752  * minnum() and maxnum() functions. These are similar to the min()
2753  * and max() functions but if one of the arguments is a QNaN and
2754  * the other is numerical then the numerical argument is returned.
2755  * SNaNs will get quietened before being returned.
2756  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2757  * and maxNum() operations. min() and max() are the typical min/max
2758  * semantics provided by many CPUs which predate that specification.
2759  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2762  */
2763 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2764                                 bool ieee, bool ismag, float_status *s)
2765 {
2766     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2767         if (ieee) {
2768             /* Takes two floating-point values `a' and `b', one of
2769              * which is a NaN, and returns the appropriate NaN
2770              * result. If either `a' or `b' is a signaling NaN,
2771              * the invalid exception is raised.
2772              */
2773             if (is_snan(a.cls) || is_snan(b.cls)) {
2774                 return pick_nan(a, b, s);
2775             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2776                 return b;
2777             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2778                 return a;
2779             }
2780         }
2781         return pick_nan(a, b, s);
2782     } else {
2783         int a_exp, b_exp;
2784
2785         switch (a.cls) {
2786         case float_class_normal:
2787             a_exp = a.exp;
2788             break;
2789         case float_class_inf:
2790             a_exp = INT_MAX;
2791             break;
2792         case float_class_zero:
2793             a_exp = INT_MIN;
2794             break;
2795         default:
2796             g_assert_not_reached();
2797             break;
2798         }
2799         switch (b.cls) {
2800         case float_class_normal:
2801             b_exp = b.exp;
2802             break;
2803         case float_class_inf:
2804             b_exp = INT_MAX;
2805             break;
2806         case float_class_zero:
2807             b_exp = INT_MIN;
2808             break;
2809         default:
2810             g_assert_not_reached();
2811             break;
2812         }
2813
2814         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2815             bool a_less = a_exp < b_exp;
2816             if (a_exp == b_exp) {
2817                 a_less = a.frac < b.frac;
2818             }
2819             return a_less ^ ismin ? b : a;
2820         }
2821
2822         if (a.sign == b.sign) {
2823             bool a_less = a_exp < b_exp;
2824             if (a_exp == b_exp) {
2825                 a_less = a.frac < b.frac;
2826             }
2827             return a.sign ^ a_less ^ ismin ? b : a;
2828         } else {
2829             return a.sign ^ ismin ? b : a;
2830         }
2831     }
2832 }
2833
2834 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2835 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2836                                      float_status *s)                   \
2837 {                                                                       \
2838     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2839     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2840     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2841                                                                         \
2842     return float ## sz ## _round_pack_canonical(pr, s);                 \
2843 }
2844
2845 MINMAX(16, min, true, false, false)
2846 MINMAX(16, minnum, true, true, false)
2847 MINMAX(16, minnummag, true, true, true)
2848 MINMAX(16, max, false, false, false)
2849 MINMAX(16, maxnum, false, true, false)
2850 MINMAX(16, maxnummag, false, true, true)
2851
2852 MINMAX(32, min, true, false, false)
2853 MINMAX(32, minnum, true, true, false)
2854 MINMAX(32, minnummag, true, true, true)
2855 MINMAX(32, max, false, false, false)
2856 MINMAX(32, maxnum, false, true, false)
2857 MINMAX(32, maxnummag, false, true, true)
2858
2859 MINMAX(64, min, true, false, false)
2860 MINMAX(64, minnum, true, true, false)
2861 MINMAX(64, minnummag, true, true, true)
2862 MINMAX(64, max, false, false, false)
2863 MINMAX(64, maxnum, false, true, false)
2864 MINMAX(64, maxnummag, false, true, true)
2865
2866 #undef MINMAX
2867
2868 /* Floating point compare */
2869 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2870                           float_status *s)
2871 {
2872     if (is_nan(a.cls) || is_nan(b.cls)) {
2873         if (!is_quiet ||
2874             a.cls == float_class_snan ||
2875             b.cls == float_class_snan) {
2876             s->float_exception_flags |= float_flag_invalid;
2877         }
2878         return float_relation_unordered;
2879     }
2880
2881     if (a.cls == float_class_zero) {
2882         if (b.cls == float_class_zero) {
2883             return float_relation_equal;
2884         }
2885         return b.sign ? float_relation_greater : float_relation_less;
2886     } else if (b.cls == float_class_zero) {
2887         return a.sign ? float_relation_less : float_relation_greater;
2888     }
2889
2890     /* The only really important thing about infinity is its sign. If
2891      * both are infinities the sign marks the smallest of the two.
2892      */
2893     if (a.cls == float_class_inf) {
2894         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2895             return float_relation_equal;
2896         }
2897         return a.sign ? float_relation_less : float_relation_greater;
2898     } else if (b.cls == float_class_inf) {
2899         return b.sign ? float_relation_greater : float_relation_less;
2900     }
2901
2902     if (a.sign != b.sign) {
2903         return a.sign ? float_relation_less : float_relation_greater;
2904     }
2905
2906     if (a.exp == b.exp) {
2907         if (a.frac == b.frac) {
2908             return float_relation_equal;
2909         }
2910         if (a.sign) {
2911             return a.frac > b.frac ?
2912                 float_relation_less : float_relation_greater;
2913         } else {
2914             return a.frac > b.frac ?
2915                 float_relation_greater : float_relation_less;
2916         }
2917     } else {
2918         if (a.sign) {
2919             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2920         } else {
2921             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2922         }
2923     }
2924 }
2925
2926 #define COMPARE(name, attr, sz)                                         \
2927 static int attr                                                         \
2928 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2929 {                                                                       \
2930     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2931     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2932     return compare_floats(pa, pb, is_quiet, s);                         \
2933 }
2934
2935 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2936 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2937 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2938
2939 #undef COMPARE
2940
2941 int float16_compare(float16 a, float16 b, float_status *s)
2942 {
2943     return soft_f16_compare(a, b, false, s);
2944 }
2945
2946 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2947 {
2948     return soft_f16_compare(a, b, true, s);
2949 }
2950
2951 static int QEMU_FLATTEN
2952 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2953 {
2954     union_float32 ua, ub;
2955
2956     ua.s = xa;
2957     ub.s = xb;
2958
2959     if (QEMU_NO_HARDFLOAT) {
2960         goto soft;
2961     }
2962
2963     float32_input_flush2(&ua.s, &ub.s, s);
2964     if (isgreaterequal(ua.h, ub.h)) {
2965         if (isgreater(ua.h, ub.h)) {
2966             return float_relation_greater;
2967         }
2968         return float_relation_equal;
2969     }
2970     if (likely(isless(ua.h, ub.h))) {
2971         return float_relation_less;
2972     }
2973     /* The only condition remaining is unordered.
2974      * Fall through to set flags.
2975      */
2976  soft:
2977     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2978 }
2979
2980 int float32_compare(float32 a, float32 b, float_status *s)
2981 {
2982     return f32_compare(a, b, false, s);
2983 }
2984
2985 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2986 {
2987     return f32_compare(a, b, true, s);
2988 }
2989
2990 static int QEMU_FLATTEN
2991 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2992 {
2993     union_float64 ua, ub;
2994
2995     ua.s = xa;
2996     ub.s = xb;
2997
2998     if (QEMU_NO_HARDFLOAT) {
2999         goto soft;
3000     }
3001
3002     float64_input_flush2(&ua.s, &ub.s, s);
3003     if (isgreaterequal(ua.h, ub.h)) {
3004         if (isgreater(ua.h, ub.h)) {
3005             return float_relation_greater;
3006         }
3007         return float_relation_equal;
3008     }
3009     if (likely(isless(ua.h, ub.h))) {
3010         return float_relation_less;
3011     }
3012     /* The only condition remaining is unordered.
3013      * Fall through to set flags.
3014      */
3015  soft:
3016     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3017 }
3018
3019 int float64_compare(float64 a, float64 b, float_status *s)
3020 {
3021     return f64_compare(a, b, false, s);
3022 }
3023
3024 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3025 {
3026     return f64_compare(a, b, true, s);
3027 }
3028
3029 /* Multiply A by 2 raised to the power N.  */
3030 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3031 {
3032     if (unlikely(is_nan(a.cls))) {
3033         return return_nan(a, s);
3034     }
3035     if (a.cls == float_class_normal) {
3036         /* The largest float type (even though not supported by FloatParts)
3037          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3038          * still allows rounding to infinity, without allowing overflow
3039          * within the int32_t that backs FloatParts.exp.
3040          */
3041         n = MIN(MAX(n, -0x10000), 0x10000);
3042         a.exp += n;
3043     }
3044     return a;
3045 }
3046
3047 float16 float16_scalbn(float16 a, int n, float_status *status)
3048 {
3049     FloatParts pa = float16_unpack_canonical(a, status);
3050     FloatParts pr = scalbn_decomposed(pa, n, status);
3051     return float16_round_pack_canonical(pr, status);
3052 }
3053
3054 float32 float32_scalbn(float32 a, int n, float_status *status)
3055 {
3056     FloatParts pa = float32_unpack_canonical(a, status);
3057     FloatParts pr = scalbn_decomposed(pa, n, status);
3058     return float32_round_pack_canonical(pr, status);
3059 }
3060
3061 float64 float64_scalbn(float64 a, int n, float_status *status)
3062 {
3063     FloatParts pa = float64_unpack_canonical(a, status);
3064     FloatParts pr = scalbn_decomposed(pa, n, status);
3065     return float64_round_pack_canonical(pr, status);
3066 }
3067
3068 /*
3069  * Square Root
3070  *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity, we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
3075  *
3076  * This does mean however the calculation is slower than before,
3077  * especially for 64 bit floats.
3078  */
3079
3080 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3081 {
3082     uint64_t a_frac, r_frac, s_frac;
3083     int bit, last_bit;
3084
3085     if (is_nan(a.cls)) {
3086         return return_nan(a, s);
3087     }
3088     if (a.cls == float_class_zero) {
3089         return a;  /* sqrt(+-0) = +-0 */
3090     }
3091     if (a.sign) {
3092         s->float_exception_flags |= float_flag_invalid;
3093         return parts_default_nan(s);
3094     }
3095     if (a.cls == float_class_inf) {
3096         return a;  /* sqrt(+inf) = +inf */
3097     }
3098
3099     assert(a.cls == float_class_normal);
3100
3101     /* We need two overflow bits at the top. Adding room for that is a
3102      * right shift. If the exponent is odd, we can discard the low bit
3103      * by multiplying the fraction by 2; that's a left shift. Combine
3104      * those and we shift right if the exponent is even.
3105      */
3106     a_frac = a.frac;
3107     if (!(a.exp & 1)) {
3108         a_frac >>= 1;
3109     }
3110     a.exp >>= 1;
3111
3112     /* Bit-by-bit computation of sqrt.  */
3113     r_frac = 0;
3114     s_frac = 0;
3115
3116     /* Iterate from implicit bit down to the 3 extra bits to compute a
3117      * properly rounded result. Remember we've inserted one more bit
3118      * at the top, so these positions are one less.
3119      */
3120     bit = DECOMPOSED_BINARY_POINT - 1;
3121     last_bit = MAX(p->frac_shift - 4, 0);
3122     do {
3123         uint64_t q = 1ULL << bit;
3124         uint64_t t_frac = s_frac + q;
3125         if (t_frac <= a_frac) {
3126             s_frac = t_frac + q;
3127             a_frac -= t_frac;
3128             r_frac += q;
3129         }
3130         a_frac <<= 1;
3131     } while (--bit >= last_bit);
3132
3133     /* Undo the right shift done above. If there is any remaining
3134      * fraction, the result is inexact. Set the sticky bit.
3135      */
3136     a.frac = (r_frac << 1) + (a_frac != 0);
3137
3138     return a;
3139 }
3140
3141 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3142 {
3143     FloatParts pa = float16_unpack_canonical(a, status);
3144     FloatParts pr = sqrt_float(pa, status, &float16_params);
3145     return float16_round_pack_canonical(pr, status);
3146 }
3147
3148 static float32 QEMU_SOFTFLOAT_ATTR
3149 soft_f32_sqrt(float32 a, float_status *status)
3150 {
3151     FloatParts pa = float32_unpack_canonical(a, status);
3152     FloatParts pr = sqrt_float(pa, status, &float32_params);
3153     return float32_round_pack_canonical(pr, status);
3154 }
3155
3156 static float64 QEMU_SOFTFLOAT_ATTR
3157 soft_f64_sqrt(float64 a, float_status *status)
3158 {
3159     FloatParts pa = float64_unpack_canonical(a, status);
3160     FloatParts pr = sqrt_float(pa, status, &float64_params);
3161     return float64_round_pack_canonical(pr, status);
3162 }
3163
3164 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3165 {
3166     union_float32 ua, ur;
3167
3168     ua.s = xa;
3169     if (unlikely(!can_use_fpu(s))) {
3170         goto soft;
3171     }
3172
3173     float32_input_flush1(&ua.s, s);
3174     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3175         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3176                        fpclassify(ua.h) == FP_ZERO) ||
3177                      signbit(ua.h))) {
3178             goto soft;
3179         }
3180     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3181                         float32_is_neg(ua.s))) {
3182         goto soft;
3183     }
3184     ur.h = sqrtf(ua.h);
3185     return ur.s;
3186
3187  soft:
3188     return soft_f32_sqrt(ua.s, s);
3189 }
3190
3191 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3192 {
3193     union_float64 ua, ur;
3194
3195     ua.s = xa;
3196     if (unlikely(!can_use_fpu(s))) {
3197         goto soft;
3198     }
3199
3200     float64_input_flush1(&ua.s, s);
3201     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3202         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3203                        fpclassify(ua.h) == FP_ZERO) ||
3204                      signbit(ua.h))) {
3205             goto soft;
3206         }
3207     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3208                         float64_is_neg(ua.s))) {
3209         goto soft;
3210     }
3211     ur.h = sqrt(ua.h);
3212     return ur.s;
3213
3214  soft:
3215     return soft_f64_sqrt(ua.s, s);
3216 }
3217
3218 /*----------------------------------------------------------------------------
3219 | The pattern for a default generated NaN.
3220 *----------------------------------------------------------------------------*/
3221
3222 float16 float16_default_nan(float_status *status)
3223 {
3224     FloatParts p = parts_default_nan(status);
3225     p.frac >>= float16_params.frac_shift;
3226     return float16_pack_raw(p);
3227 }
3228
3229 float32 float32_default_nan(float_status *status)
3230 {
3231     FloatParts p = parts_default_nan(status);
3232     p.frac >>= float32_params.frac_shift;
3233     return float32_pack_raw(p);
3234 }
3235
3236 float64 float64_default_nan(float_status *status)
3237 {
3238     FloatParts p = parts_default_nan(status);
3239     p.frac >>= float64_params.frac_shift;
3240     return float64_pack_raw(p);
3241 }
3242
3243 float128 float128_default_nan(float_status *status)
3244 {
3245     FloatParts p = parts_default_nan(status);
3246     float128 r;
3247
3248     /* Extrapolate from the choices made by parts_default_nan to fill
3249      * in the quad-floating format.  If the low bit is set, assume we
3250      * want to set all non-snan bits.
3251      */
3252     r.low = -(p.frac & 1);
3253     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3254     r.high |= UINT64_C(0x7FFF000000000000);
3255     r.high |= (uint64_t)p.sign << 63;
3256
3257     return r;
3258 }
3259
3260 /*----------------------------------------------------------------------------
3261 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3262 *----------------------------------------------------------------------------*/
3263
3264 float16 float16_silence_nan(float16 a, float_status *status)
3265 {
3266     FloatParts p = float16_unpack_raw(a);
3267     p.frac <<= float16_params.frac_shift;
3268     p = parts_silence_nan(p, status);
3269     p.frac >>= float16_params.frac_shift;
3270     return float16_pack_raw(p);
3271 }
3272
3273 float32 float32_silence_nan(float32 a, float_status *status)
3274 {
3275     FloatParts p = float32_unpack_raw(a);
3276     p.frac <<= float32_params.frac_shift;
3277     p = parts_silence_nan(p, status);
3278     p.frac >>= float32_params.frac_shift;
3279     return float32_pack_raw(p);
3280 }
3281
3282 float64 float64_silence_nan(float64 a, float_status *status)
3283 {
3284     FloatParts p = float64_unpack_raw(a);
3285     p.frac <<= float64_params.frac_shift;
3286     p = parts_silence_nan(p, status);
3287     p.frac >>= float64_params.frac_shift;
3288     return float64_pack_raw(p);
3289 }
3290
3291
3292 /*----------------------------------------------------------------------------
3293 | If `a' is denormal and we are in flush-to-zero mode then set the
3294 | input-denormal exception and return zero. Otherwise just return the value.
3295 *----------------------------------------------------------------------------*/
3296
3297 static bool parts_squash_denormal(FloatParts p, float_status *status)
3298 {
3299     if (p.exp == 0 && p.frac != 0) {
3300         float_raise(float_flag_input_denormal, status);
3301         return true;
3302     }
3303
3304     return false;
3305 }
3306
3307 float16 float16_squash_input_denormal(float16 a, float_status *status)
3308 {
3309     if (status->flush_inputs_to_zero) {
3310         FloatParts p = float16_unpack_raw(a);
3311         if (parts_squash_denormal(p, status)) {
3312             return float16_set_sign(float16_zero, p.sign);
3313         }
3314     }
3315     return a;
3316 }
3317
3318 float32 float32_squash_input_denormal(float32 a, float_status *status)
3319 {
3320     if (status->flush_inputs_to_zero) {
3321         FloatParts p = float32_unpack_raw(a);
3322         if (parts_squash_denormal(p, status)) {
3323             return float32_set_sign(float32_zero, p.sign);
3324         }
3325     }
3326     return a;
3327 }
3328
3329 float64 float64_squash_input_denormal(float64 a, float_status *status)
3330 {
3331     if (status->flush_inputs_to_zero) {
3332         FloatParts p = float64_unpack_raw(a);
3333         if (parts_squash_denormal(p, status)) {
3334             return float64_set_sign(float64_zero, p.sign);
3335         }
3336     }
3337     return a;
3338 }
3339
3340 /*----------------------------------------------------------------------------
3341 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3342 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3343 | input.  If `zSign' is 1, the input is negated before being converted to an
3344 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3345 | is simply rounded to an integer, with the inexact exception raised if the
3346 | input cannot be represented exactly as an integer.  However, if the fixed-
3347 | point input is too large, the invalid exception is raised and the largest
3348 | positive or negative integer is returned.
3349 *----------------------------------------------------------------------------*/
3350
3351 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3352 {
3353     int8_t roundingMode;
3354     flag roundNearestEven;
3355     int8_t roundIncrement, roundBits;
3356     int32_t z;
3357
3358     roundingMode = status->float_rounding_mode;
3359     roundNearestEven = ( roundingMode == float_round_nearest_even );
3360     switch (roundingMode) {
3361     case float_round_nearest_even:
3362     case float_round_ties_away:
3363         roundIncrement = 0x40;
3364         break;
3365     case float_round_to_zero:
3366         roundIncrement = 0;
3367         break;
3368     case float_round_up:
3369         roundIncrement = zSign ? 0 : 0x7f;
3370         break;
3371     case float_round_down:
3372         roundIncrement = zSign ? 0x7f : 0;
3373         break;
3374     case float_round_to_odd:
3375         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3376         break;
3377     default:
3378         abort();
3379     }
3380     roundBits = absZ & 0x7F;
3381     absZ = ( absZ + roundIncrement )>>7;
3382     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3383     z = absZ;
3384     if ( zSign ) z = - z;
3385     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3386         float_raise(float_flag_invalid, status);
3387         return zSign ? INT32_MIN : INT32_MAX;
3388     }
3389     if (roundBits) {
3390         status->float_exception_flags |= float_flag_inexact;
3391     }
3392     return z;
3393
3394 }
3395
3396 /*----------------------------------------------------------------------------
3397 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3398 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3399 | and returns the properly rounded 64-bit integer corresponding to the input.
3400 | If `zSign' is 1, the input is negated before being converted to an integer.
3401 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3402 | the inexact exception raised if the input cannot be represented exactly as
3403 | an integer.  However, if the fixed-point input is too large, the invalid
3404 | exception is raised and the largest positive or negative integer is
3405 | returned.
3406 *----------------------------------------------------------------------------*/
3407
3408 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3409                                float_status *status)
3410 {
3411     int8_t roundingMode;
3412     flag roundNearestEven, increment;
3413     int64_t z;
3414
3415     roundingMode = status->float_rounding_mode;
3416     roundNearestEven = ( roundingMode == float_round_nearest_even );
3417     switch (roundingMode) {
3418     case float_round_nearest_even:
3419     case float_round_ties_away:
3420         increment = ((int64_t) absZ1 < 0);
3421         break;
3422     case float_round_to_zero:
3423         increment = 0;
3424         break;
3425     case float_round_up:
3426         increment = !zSign && absZ1;
3427         break;
3428     case float_round_down:
3429         increment = zSign && absZ1;
3430         break;
3431     case float_round_to_odd:
3432         increment = !(absZ0 & 1) && absZ1;
3433         break;
3434     default:
3435         abort();
3436     }
3437     if ( increment ) {
3438         ++absZ0;
3439         if ( absZ0 == 0 ) goto overflow;
3440         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3441     }
3442     z = absZ0;
3443     if ( zSign ) z = - z;
3444     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3445  overflow:
3446         float_raise(float_flag_invalid, status);
3447         return zSign ? INT64_MIN : INT64_MAX;
3448     }
3449     if (absZ1) {
3450         status->float_exception_flags |= float_flag_inexact;
3451     }
3452     return z;
3453
3454 }
3455
3456 /*----------------------------------------------------------------------------
3457 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3458 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3459 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3460 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3461 | with the inexact exception raised if the input cannot be represented exactly
3462 | as an integer.  However, if the fixed-point input is too large, the invalid
3463 | exception is raised and the largest unsigned integer is returned.
3464 *----------------------------------------------------------------------------*/
3465
3466 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3467                                 uint64_t absZ1, float_status *status)
3468 {
3469     int8_t roundingMode;
3470     flag roundNearestEven, increment;
3471
3472     roundingMode = status->float_rounding_mode;
3473     roundNearestEven = (roundingMode == float_round_nearest_even);
3474     switch (roundingMode) {
3475     case float_round_nearest_even:
3476     case float_round_ties_away:
3477         increment = ((int64_t)absZ1 < 0);
3478         break;
3479     case float_round_to_zero:
3480         increment = 0;
3481         break;
3482     case float_round_up:
3483         increment = !zSign && absZ1;
3484         break;
3485     case float_round_down:
3486         increment = zSign && absZ1;
3487         break;
3488     case float_round_to_odd:
3489         increment = !(absZ0 & 1) && absZ1;
3490         break;
3491     default:
3492         abort();
3493     }
3494     if (increment) {
3495         ++absZ0;
3496         if (absZ0 == 0) {
3497             float_raise(float_flag_invalid, status);
3498             return UINT64_MAX;
3499         }
3500         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3501     }
3502
3503     if (zSign && absZ0) {
3504         float_raise(float_flag_invalid, status);
3505         return 0;
3506     }
3507
3508     if (absZ1) {
3509         status->float_exception_flags |= float_flag_inexact;
3510     }
3511     return absZ0;
3512 }
3513
/*----------------------------------------------------------------------------
| Normalizes a subnormal single-precision significand.  `aSig' is shifted
| left until its leading one sits in bit 23; the shifted significand is
| written through `zSigPtr' and the matching exponent (1 minus the shift
| distance) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* clz32() counts from bit 31; 8 of those positions lie above bit 23. */
    const int8_t leftShift = clz32(aSig) - 8;
    const uint32_t normalizedSig = aSig << leftShift;

    *zSigPtr = normalizedSig;
    *zExpPtr = 1 - leftShift;
}
3531
3532 /*----------------------------------------------------------------------------
3533 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3534 | and significand `zSig', and returns the proper single-precision floating-
3535 | point value corresponding to the abstract input.  Ordinarily, the abstract
3536 | value is simply rounded and packed into the single-precision format, with
3537 | the inexact exception raised if the abstract input cannot be represented
3538 | exactly.  However, if the abstract value is too large, the overflow and
3539 | inexact exceptions are raised and an infinity or maximal finite value is
3540 | returned.  If the abstract value is too small, the input value is rounded to
3541 | a subnormal number, and the underflow and inexact exceptions are raised if
3542 | the abstract input cannot be represented exactly as a subnormal single-
3543 | precision floating-point number.
3544 |     The input significand `zSig' has its binary point between bits 30
3545 | and 29, which is 7 bits to the left of the usual location.  This shifted
3546 | significand must be normalized or smaller.  If `zSig' is not normalized,
3547 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3548 | and it must not require rounding.  In the usual case that `zSig' is
3549 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3550 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3551 | Binary Floating-Point Arithmetic.
3552 *----------------------------------------------------------------------------*/
3553
3554 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3555                                    float_status *status)
3556 {
3557     int8_t roundingMode;
3558     flag roundNearestEven;
3559     int8_t roundIncrement, roundBits;
3560     flag isTiny;
3561
3562     roundingMode = status->float_rounding_mode;
3563     roundNearestEven = ( roundingMode == float_round_nearest_even );
3564     switch (roundingMode) {
3565     case float_round_nearest_even:
3566     case float_round_ties_away:
3567         roundIncrement = 0x40;
3568         break;
3569     case float_round_to_zero:
3570         roundIncrement = 0;
3571         break;
3572     case float_round_up:
3573         roundIncrement = zSign ? 0 : 0x7f;
3574         break;
3575     case float_round_down:
3576         roundIncrement = zSign ? 0x7f : 0;
3577         break;
3578     case float_round_to_odd:
3579         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3580         break;
3581     default:
3582         abort();
3583         break;
3584     }
3585     roundBits = zSig & 0x7F;
3586     if ( 0xFD <= (uint16_t) zExp ) {
3587         if (    ( 0xFD < zExp )
3588              || (    ( zExp == 0xFD )
3589                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3590            ) {
3591             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3592                                    roundIncrement != 0;
3593             float_raise(float_flag_overflow | float_flag_inexact, status);
3594             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3595         }
3596         if ( zExp < 0 ) {
3597             if (status->flush_to_zero) {
3598                 float_raise(float_flag_output_denormal, status);
3599                 return packFloat32(zSign, 0, 0);
3600             }
3601             isTiny =
3602                 (status->float_detect_tininess
3603                  == float_tininess_before_rounding)
3604                 || ( zExp < -1 )
3605                 || ( zSig + roundIncrement < 0x80000000 );
3606             shift32RightJamming( zSig, - zExp, &zSig );
3607             zExp = 0;
3608             roundBits = zSig & 0x7F;
3609             if (isTiny && roundBits) {
3610                 float_raise(float_flag_underflow, status);
3611             }
3612             if (roundingMode == float_round_to_odd) {
3613                 /*
3614                  * For round-to-odd case, the roundIncrement depends on
3615                  * zSig which just changed.
3616                  */
3617                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3618             }
3619         }
3620     }
3621     if (roundBits) {
3622         status->float_exception_flags |= float_flag_inexact;
3623     }
3624     zSig = ( zSig + roundIncrement )>>7;
3625     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3626     if ( zSig == 0 ) zExp = 0;
3627     return packFloat32( zSign, zExp, zSig );
3628
3629 }
3630
3631 /*----------------------------------------------------------------------------
3632 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3633 | and significand `zSig', and returns the proper single-precision floating-
3634 | point value corresponding to the abstract input.  This routine is just like
3635 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3636 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3637 | floating-point exponent.
3638 *----------------------------------------------------------------------------*/
3639
3640 static float32
3641  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3642                               float_status *status)
3643 {
3644     int8_t shiftCount;
3645
3646     shiftCount = clz32(zSig) - 1;
3647     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3648                                status);
3649
3650 }
3651
/*----------------------------------------------------------------------------
| Normalizes a subnormal double-precision significand.  `aSig' is shifted
| left until its leading one sits in bit 52; the shifted significand is
| written through `zSigPtr' and the matching exponent (1 minus the shift
| distance) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr,
                                      uint64_t *zSigPtr)
{
    /* clz64() counts from bit 63; 11 of those positions lie above bit 52. */
    const int8_t leftShift = clz64(aSig) - 11;
    const uint64_t normalizedSig = aSig << leftShift;

    *zSigPtr = normalizedSig;
    *zExpPtr = 1 - leftShift;
}
3669
3670 /*----------------------------------------------------------------------------
3671 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3672 | double-precision floating-point value, returning the result.  After being
3673 | shifted into the proper positions, the three fields are simply added
3674 | together to form the result.  This means that any integer portion of `zSig'
3675 | will be added into the exponent.  Since a properly normalized significand
3676 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3677 | than the desired result exponent whenever `zSig' is a complete, normalized
3678 | significand.
3679 *----------------------------------------------------------------------------*/
3680
3681 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3682 {
3683
3684     return make_float64(
3685         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3686
3687 }
3688
3689 /*----------------------------------------------------------------------------
3690 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3691 | and significand `zSig', and returns the proper double-precision floating-
3692 | point value corresponding to the abstract input.  Ordinarily, the abstract
3693 | value is simply rounded and packed into the double-precision format, with
3694 | the inexact exception raised if the abstract input cannot be represented
3695 | exactly.  However, if the abstract value is too large, the overflow and
3696 | inexact exceptions are raised and an infinity or maximal finite value is
3697 | returned.  If the abstract value is too small, the input value is rounded to
3698 | a subnormal number, and the underflow and inexact exceptions are raised if
3699 | the abstract input cannot be represented exactly as a subnormal double-
3700 | precision floating-point number.
3701 |     The input significand `zSig' has its binary point between bits 62
3702 | and 61, which is 10 bits to the left of the usual location.  This shifted
3703 | significand must be normalized or smaller.  If `zSig' is not normalized,
3704 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3705 | and it must not require rounding.  In the usual case that `zSig' is
3706 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3707 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3708 | Binary Floating-Point Arithmetic.
3709 *----------------------------------------------------------------------------*/
3710
3711 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3712                                    float_status *status)
3713 {
3714     int8_t roundingMode;
3715     flag roundNearestEven;
3716     int roundIncrement, roundBits;
3717     flag isTiny;
3718
3719     roundingMode = status->float_rounding_mode;
3720     roundNearestEven = ( roundingMode == float_round_nearest_even );
3721     switch (roundingMode) {
3722     case float_round_nearest_even:
3723     case float_round_ties_away:
3724         roundIncrement = 0x200;
3725         break;
3726     case float_round_to_zero:
3727         roundIncrement = 0;
3728         break;
3729     case float_round_up:
3730         roundIncrement = zSign ? 0 : 0x3ff;
3731         break;
3732     case float_round_down:
3733         roundIncrement = zSign ? 0x3ff : 0;
3734         break;
3735     case float_round_to_odd:
3736         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3737         break;
3738     default:
3739         abort();
3740     }
3741     roundBits = zSig & 0x3FF;
3742     if ( 0x7FD <= (uint16_t) zExp ) {
3743         if (    ( 0x7FD < zExp )
3744              || (    ( zExp == 0x7FD )
3745                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3746            ) {
3747             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3748                                    roundIncrement != 0;
3749             float_raise(float_flag_overflow | float_flag_inexact, status);
3750             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3751         }
3752         if ( zExp < 0 ) {
3753             if (status->flush_to_zero) {
3754                 float_raise(float_flag_output_denormal, status);
3755                 return packFloat64(zSign, 0, 0);
3756             }
3757             isTiny =
3758                    (status->float_detect_tininess
3759                     == float_tininess_before_rounding)
3760                 || ( zExp < -1 )
3761                 || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) );
3762             shift64RightJamming( zSig, - zExp, &zSig );
3763             zExp = 0;
3764             roundBits = zSig & 0x3FF;
3765             if (isTiny && roundBits) {
3766                 float_raise(float_flag_underflow, status);
3767             }
3768             if (roundingMode == float_round_to_odd) {
3769                 /*
3770                  * For round-to-odd case, the roundIncrement depends on
3771                  * zSig which just changed.
3772                  */
3773                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3774             }
3775         }
3776     }
3777     if (roundBits) {
3778         status->float_exception_flags |= float_flag_inexact;
3779     }
3780     zSig = ( zSig + roundIncrement )>>10;
3781     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3782     if ( zSig == 0 ) zExp = 0;
3783     return packFloat64( zSign, zExp, zSig );
3784
3785 }
3786
3787 /*----------------------------------------------------------------------------
3788 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3789 | and significand `zSig', and returns the proper double-precision floating-
3790 | point value corresponding to the abstract input.  This routine is just like
3791 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3792 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3793 | floating-point exponent.
3794 *----------------------------------------------------------------------------*/
3795
3796 static float64
3797  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3798                               float_status *status)
3799 {
3800     int8_t shiftCount;
3801
3802     shiftCount = clz64(zSig) - 1;
3803     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3804                                status);
3805
3806 }
3807
/*----------------------------------------------------------------------------
| Normalizes a subnormal extended double-precision significand.  `aSig' is
| shifted left until its leading one sits in bit 63; the shifted significand
| is written through `zSigPtr' and the matching exponent (1 minus the shift
| distance) through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig);
    const uint64_t normalizedSig = aSig << leftShift;

    *zSigPtr = normalizedSig;
    *zExpPtr = 1 - leftShift;
}
3824
3825 /*----------------------------------------------------------------------------
3826 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3827 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3828 | and returns the proper extended double-precision floating-point value
3829 | corresponding to the abstract input.  Ordinarily, the abstract value is
3830 | rounded and packed into the extended double-precision format, with the
3831 | inexact exception raised if the abstract input cannot be represented
3832 | exactly.  However, if the abstract value is too large, the overflow and
3833 | inexact exceptions are raised and an infinity or maximal finite value is
3834 | returned.  If the abstract value is too small, the input value is rounded to
3835 | a subnormal number, and the underflow and inexact exceptions are raised if
3836 | the abstract input cannot be represented exactly as a subnormal extended
3837 | double-precision floating-point number.
3838 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3839 | number of bits as single or double precision, respectively.  Otherwise, the
3840 | result is rounded to the full precision of the extended double-precision
3841 | format.
3842 |     The input significand must be normalized or smaller.  If the input
3843 | significand is not normalized, `zExp' must be 0; in that case, the result
3844 | returned is a subnormal number, and it must not require rounding.  The
3845 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3846 | Floating-Point Arithmetic.
3847 *----------------------------------------------------------------------------*/
3848
3849 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3850                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3851                               float_status *status)
3852 {
3853     int8_t roundingMode;
3854     flag roundNearestEven, increment, isTiny;
3855     int64_t roundIncrement, roundMask, roundBits;
3856
3857     roundingMode = status->float_rounding_mode;
3858     roundNearestEven = ( roundingMode == float_round_nearest_even );
3859     if ( roundingPrecision == 80 ) goto precision80;
3860     if ( roundingPrecision == 64 ) {
3861         roundIncrement = UINT64_C(0x0000000000000400);
3862         roundMask = UINT64_C(0x00000000000007FF);
3863     }
3864     else if ( roundingPrecision == 32 ) {
3865         roundIncrement = UINT64_C(0x0000008000000000);
3866         roundMask = UINT64_C(0x000000FFFFFFFFFF);
3867     }
3868     else {
3869         goto precision80;
3870     }
3871     zSig0 |= ( zSig1 != 0 );
3872     switch (roundingMode) {
3873     case float_round_nearest_even:
3874     case float_round_ties_away:
3875         break;
3876     case float_round_to_zero:
3877         roundIncrement = 0;
3878         break;
3879     case float_round_up:
3880         roundIncrement = zSign ? 0 : roundMask;
3881         break;
3882     case float_round_down:
3883         roundIncrement = zSign ? roundMask : 0;
3884         break;
3885     default:
3886         abort();
3887     }
3888     roundBits = zSig0 & roundMask;
3889     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3890         if (    ( 0x7FFE < zExp )
3891              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3892            ) {
3893             goto overflow;
3894         }
3895         if ( zExp <= 0 ) {
3896             if (status->flush_to_zero) {
3897                 float_raise(float_flag_output_denormal, status);
3898                 return packFloatx80(zSign, 0, 0);
3899             }
3900             isTiny =
3901                    (status->float_detect_tininess
3902                     == float_tininess_before_rounding)
3903                 || ( zExp < 0 )
3904                 || ( zSig0 <= zSig0 + roundIncrement );
3905             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3906             zExp = 0;
3907             roundBits = zSig0 & roundMask;
3908             if (isTiny && roundBits) {
3909                 float_raise(float_flag_underflow, status);
3910             }
3911             if (roundBits) {
3912                 status->float_exception_flags |= float_flag_inexact;
3913             }
3914             zSig0 += roundIncrement;
3915             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3916             roundIncrement = roundMask + 1;
3917             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3918                 roundMask |= roundIncrement;
3919             }
3920             zSig0 &= ~ roundMask;
3921             return packFloatx80( zSign, zExp, zSig0 );
3922         }
3923     }
3924     if (roundBits) {
3925         status->float_exception_flags |= float_flag_inexact;
3926     }
3927     zSig0 += roundIncrement;
3928     if ( zSig0 < roundIncrement ) {
3929         ++zExp;
3930         zSig0 = UINT64_C(0x8000000000000000);
3931     }
3932     roundIncrement = roundMask + 1;
3933     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3934         roundMask |= roundIncrement;
3935     }
3936     zSig0 &= ~ roundMask;
3937     if ( zSig0 == 0 ) zExp = 0;
3938     return packFloatx80( zSign, zExp, zSig0 );
3939  precision80:
3940     switch (roundingMode) {
3941     case float_round_nearest_even:
3942     case float_round_ties_away:
3943         increment = ((int64_t)zSig1 < 0);
3944         break;
3945     case float_round_to_zero:
3946         increment = 0;
3947         break;
3948     case float_round_up:
3949         increment = !zSign && zSig1;
3950         break;
3951     case float_round_down:
3952         increment = zSign && zSig1;
3953         break;
3954     default:
3955         abort();
3956     }
3957     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3958         if (    ( 0x7FFE < zExp )
3959              || (    ( zExp == 0x7FFE )
3960                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
3961                   && increment
3962                 )
3963            ) {
3964             roundMask = 0;
3965  overflow:
3966             float_raise(float_flag_overflow | float_flag_inexact, status);
3967             if (    ( roundingMode == float_round_to_zero )
3968                  || ( zSign && ( roundingMode == float_round_up ) )
3969                  || ( ! zSign && ( roundingMode == float_round_down ) )
3970                ) {
3971                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3972             }
3973             return packFloatx80(zSign,
3974                                 floatx80_infinity_high,
3975                                 floatx80_infinity_low);
3976         }
3977         if ( zExp <= 0 ) {
3978             isTiny =
3979                    (status->float_detect_tininess
3980                     == float_tininess_before_rounding)
3981                 || ( zExp < 0 )
3982                 || ! increment
3983                 || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) );
3984             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3985             zExp = 0;
3986             if (isTiny && zSig1) {
3987                 float_raise(float_flag_underflow, status);
3988             }
3989             if (zSig1) {
3990                 status->float_exception_flags |= float_flag_inexact;
3991             }
3992             switch (roundingMode) {
3993             case float_round_nearest_even:
3994             case float_round_ties_away:
3995                 increment = ((int64_t)zSig1 < 0);
3996                 break;
3997             case float_round_to_zero:
3998                 increment = 0;
3999                 break;
4000             case float_round_up:
4001                 increment = !zSign && zSig1;
4002                 break;
4003             case float_round_down:
4004                 increment = zSign && zSig1;
4005                 break;
4006             default:
4007                 abort();
4008             }
4009             if ( increment ) {
4010                 ++zSig0;
4011                 zSig0 &=
4012                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4013                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4014             }
4015             return packFloatx80( zSign, zExp, zSig0 );
4016         }
4017     }
4018     if (zSig1) {
4019         status->float_exception_flags |= float_flag_inexact;
4020     }
4021     if ( increment ) {
4022         ++zSig0;
4023         if ( zSig0 == 0 ) {
4024             ++zExp;
4025             zSig0 = UINT64_C(0x8000000000000000);
4026         }
4027         else {
4028             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4029         }
4030     }
4031     else {
4032         if ( zSig0 == 0 ) zExp = 0;
4033     }
4034     return packFloatx80( zSign, zExp, zSig0 );
4035
4036 }
4037
4038 /*----------------------------------------------------------------------------
4039 | Takes an abstract floating-point value having sign `zSign', exponent
4040 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4041 | and returns the proper extended double-precision floating-point value
4042 | corresponding to the abstract input.  This routine is just like
4043 | `roundAndPackFloatx80' except that the input significand does not have to be
4044 | normalized.
4045 *----------------------------------------------------------------------------*/
4046
4047 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4048                                        flag zSign, int32_t zExp,
4049                                        uint64_t zSig0, uint64_t zSig1,
4050                                        float_status *status)
4051 {
4052     int8_t shiftCount;
4053
4054     if ( zSig0 == 0 ) {
4055         zSig0 = zSig1;
4056         zSig1 = 0;
4057         zExp -= 64;
4058     }
4059     shiftCount = clz64(zSig0);
4060     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4061     zExp -= shiftCount;
4062     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4063                                 zSig0, zSig1, status);
4064
4065 }
4066
4067 /*----------------------------------------------------------------------------
4068 | Returns the least-significant 64 fraction bits of the quadruple-precision
4069 | floating-point value `a'.
4070 *----------------------------------------------------------------------------*/
4071
4072 static inline uint64_t extractFloat128Frac1( float128 a )
4073 {
4074
4075     return a.low;
4076
4077 }
4078
4079 /*----------------------------------------------------------------------------
4080 | Returns the most-significant 48 fraction bits of the quadruple-precision
4081 | floating-point value `a'.
4082 *----------------------------------------------------------------------------*/
4083
4084 static inline uint64_t extractFloat128Frac0( float128 a )
4085 {
4086
4087     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4088
4089 }
4090
4091 /*----------------------------------------------------------------------------
4092 | Returns the exponent bits of the quadruple-precision floating-point value
4093 | `a'.
4094 *----------------------------------------------------------------------------*/
4095
4096 static inline int32_t extractFloat128Exp( float128 a )
4097 {
4098
4099     return ( a.high>>48 ) & 0x7FFF;
4100
4101 }
4102
4103 /*----------------------------------------------------------------------------
4104 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4105 *----------------------------------------------------------------------------*/
4106
4107 static inline flag extractFloat128Sign( float128 a )
4108 {
4109
4110     return a.high>>63;
4111
4112 }
4113
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision value whose denormalized
| significand is the concatenation of `aSig0' and `aSig1'.  The normalized
| exponent is written through `zExpPtr'.  The most significant 49 bits of
| the normalized significand are written through `zSig0Ptr' and the least
| significant 64 bits through `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat128Subnormal(uint64_t aSig0,
                                       uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t leftShift;

    if (aSig0 != 0) {
        /* Leading one is in the upper word: one short 128-bit left shift
         * brings it to bit 48 of that word. */
        leftShift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, leftShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - leftShift;
        return;
    }

    /* Upper word empty: the lower word supplies the leading one. */
    leftShift = clz64(aSig1) - 15;
    if (leftShift < 0) {
        /* Leading one lies above bit 48, so the word straddles both
         * outputs: the top part goes high, the remainder goes low. */
        *zSig0Ptr = aSig1 >> (-leftShift);
        *zSig1Ptr = aSig1 << (leftShift & 63);
    } else {
        *zSig0Ptr = aSig1 << leftShift;
        *zSig1Ptr = 0;
    }
    /* 63 accounts for the implicit 64-bit move into the upper word. */
    *zExpPtr = -leftShift - 63;
}
4154
4155 /*----------------------------------------------------------------------------
4156 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4157 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4158 | floating-point value, returning the result.  After being shifted into the
4159 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4160 | added together to form the most significant 32 bits of the result.  This
4161 | means that any integer portion of `zSig0' will be added into the exponent.
4162 | Since a properly normalized significand will have an integer portion equal
4163 | to 1, the `zExp' input should be 1 less than the desired result exponent
4164 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4165 | significand.
4166 *----------------------------------------------------------------------------*/
4167
4168 static inline float128
4169  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4170 {
4171     float128 z;
4172
4173     z.low = zSig1;
4174     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4175     return z;
4176
4177 }
4178
4179 /*----------------------------------------------------------------------------
4180 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4181 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4182 | and `zSig2', and returns the proper quadruple-precision floating-point value
4183 | corresponding to the abstract input.  Ordinarily, the abstract value is
4184 | simply rounded and packed into the quadruple-precision format, with the
4185 | inexact exception raised if the abstract input cannot be represented
4186 | exactly.  However, if the abstract value is too large, the overflow and
4187 | inexact exceptions are raised and an infinity or maximal finite value is
4188 | returned.  If the abstract value is too small, the input value is rounded to
4189 | a subnormal number, and the underflow and inexact exceptions are raised if
4190 | the abstract input cannot be represented exactly as a subnormal quadruple-
4191 | precision floating-point number.
4192 |     The input significand must be normalized or smaller.  If the input
4193 | significand is not normalized, `zExp' must be 0; in that case, the result
4194 | returned is a subnormal number, and it must not require rounding.  In the
4195 | usual case that the input significand is normalized, `zExp' must be 1 less
4196 | than the ``true'' floating-point exponent.  The handling of underflow and
4197 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4198 *----------------------------------------------------------------------------*/
4199
4200 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4201                                      uint64_t zSig0, uint64_t zSig1,
4202                                      uint64_t zSig2, float_status *status)
4203 {
4204     int8_t roundingMode;
4205     flag roundNearestEven, increment, isTiny;
4206
4207     roundingMode = status->float_rounding_mode;
4208     roundNearestEven = ( roundingMode == float_round_nearest_even );
4209     switch (roundingMode) {
4210     case float_round_nearest_even:
4211     case float_round_ties_away:
4212         increment = ((int64_t)zSig2 < 0);
4213         break;
4214     case float_round_to_zero:
4215         increment = 0;
4216         break;
4217     case float_round_up:
4218         increment = !zSign && zSig2;
4219         break;
4220     case float_round_down:
4221         increment = zSign && zSig2;
4222         break;
4223     case float_round_to_odd:
4224         increment = !(zSig1 & 0x1) && zSig2;
4225         break;
4226     default:
4227         abort();
4228     }
4229     if ( 0x7FFD <= (uint32_t) zExp ) {
4230         if (    ( 0x7FFD < zExp )
4231              || (    ( zExp == 0x7FFD )
4232                   && eq128(
4233                          UINT64_C(0x0001FFFFFFFFFFFF),
4234                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4235                          zSig0,
4236                          zSig1
4237                      )
4238                   && increment
4239                 )
4240            ) {
4241             float_raise(float_flag_overflow | float_flag_inexact, status);
4242             if (    ( roundingMode == float_round_to_zero )
4243                  || ( zSign && ( roundingMode == float_round_up ) )
4244                  || ( ! zSign && ( roundingMode == float_round_down ) )
4245                  || (roundingMode == float_round_to_odd)
4246                ) {
4247                 return
4248                     packFloat128(
4249                         zSign,
4250                         0x7FFE,
4251                         UINT64_C(0x0000FFFFFFFFFFFF),
4252                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4253                     );
4254             }
4255             return packFloat128( zSign, 0x7FFF, 0, 0 );
4256         }
4257         if ( zExp < 0 ) {
4258             if (status->flush_to_zero) {
4259                 float_raise(float_flag_output_denormal, status);
4260                 return packFloat128(zSign, 0, 0, 0);
4261             }
4262             isTiny =
4263                    (status->float_detect_tininess
4264                     == float_tininess_before_rounding)
4265                 || ( zExp < -1 )
4266                 || ! increment
4267                 || lt128(
4268                        zSig0,
4269                        zSig1,
4270                        UINT64_C(0x0001FFFFFFFFFFFF),
4271                        UINT64_C(0xFFFFFFFFFFFFFFFF)
4272                    );
4273             shift128ExtraRightJamming(
4274                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4275             zExp = 0;
4276             if (isTiny && zSig2) {
4277                 float_raise(float_flag_underflow, status);
4278             }
4279             switch (roundingMode) {
4280             case float_round_nearest_even:
4281             case float_round_ties_away:
4282                 increment = ((int64_t)zSig2 < 0);
4283                 break;
4284             case float_round_to_zero:
4285                 increment = 0;
4286                 break;
4287             case float_round_up:
4288                 increment = !zSign && zSig2;
4289                 break;
4290             case float_round_down:
4291                 increment = zSign && zSig2;
4292                 break;
4293             case float_round_to_odd:
4294                 increment = !(zSig1 & 0x1) && zSig2;
4295                 break;
4296             default:
4297                 abort();
4298             }
4299         }
4300     }
4301     if (zSig2) {
4302         status->float_exception_flags |= float_flag_inexact;
4303     }
4304     if ( increment ) {
4305         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4306         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4307     }
4308     else {
4309         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4310     }
4311     return packFloat128( zSign, zExp, zSig0, zSig1 );
4312
4313 }
4314
4315 /*----------------------------------------------------------------------------
4316 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4317 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4318 | returns the proper quadruple-precision floating-point value corresponding
4319 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4320 | except that the input significand has fewer bits and does not have to be
4321 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4322 | point exponent.
4323 *----------------------------------------------------------------------------*/
4324
4325 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4326                                               uint64_t zSig0, uint64_t zSig1,
4327                                               float_status *status)
4328 {
4329     int8_t shiftCount;
4330     uint64_t zSig2;
4331
4332     if ( zSig0 == 0 ) {
4333         zSig0 = zSig1;
4334         zSig1 = 0;
4335         zExp -= 64;
4336     }
4337     shiftCount = clz64(zSig0) - 15;
4338     if ( 0 <= shiftCount ) {
4339         zSig2 = 0;
4340         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4341     }
4342     else {
4343         shift128ExtraRightJamming(
4344             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4345     }
4346     zExp -= shiftCount;
4347     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4348
4349 }
4350
4351
4352 /*----------------------------------------------------------------------------
4353 | Returns the result of converting the 32-bit two's complement integer `a'
4354 | to the extended double-precision floating-point format.  The conversion
4355 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4356 | Arithmetic.
4357 *----------------------------------------------------------------------------*/
4358
4359 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4360 {
4361     flag zSign;
4362     uint32_t absA;
4363     int8_t shiftCount;
4364     uint64_t zSig;
4365
4366     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4367     zSign = ( a < 0 );
4368     absA = zSign ? - a : a;
4369     shiftCount = clz32(absA) + 32;
4370     zSig = absA;
4371     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4372
4373 }
4374
4375 /*----------------------------------------------------------------------------
4376 | Returns the result of converting the 32-bit two's complement integer `a' to
4377 | the quadruple-precision floating-point format.  The conversion is performed
4378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4379 *----------------------------------------------------------------------------*/
4380
4381 float128 int32_to_float128(int32_t a, float_status *status)
4382 {
4383     flag zSign;
4384     uint32_t absA;
4385     int8_t shiftCount;
4386     uint64_t zSig0;
4387
4388     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4389     zSign = ( a < 0 );
4390     absA = zSign ? - a : a;
4391     shiftCount = clz32(absA) + 17;
4392     zSig0 = absA;
4393     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4394
4395 }
4396
4397 /*----------------------------------------------------------------------------
4398 | Returns the result of converting the 64-bit two's complement integer `a'
4399 | to the extended double-precision floating-point format.  The conversion
4400 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4401 | Arithmetic.
4402 *----------------------------------------------------------------------------*/
4403
4404 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4405 {
4406     flag zSign;
4407     uint64_t absA;
4408     int8_t shiftCount;
4409
4410     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4411     zSign = ( a < 0 );
4412     absA = zSign ? - a : a;
4413     shiftCount = clz64(absA);
4414     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4415
4416 }
4417
4418 /*----------------------------------------------------------------------------
4419 | Returns the result of converting the 64-bit two's complement integer `a' to
4420 | the quadruple-precision floating-point format.  The conversion is performed
4421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4422 *----------------------------------------------------------------------------*/
4423
4424 float128 int64_to_float128(int64_t a, float_status *status)
4425 {
4426     flag zSign;
4427     uint64_t absA;
4428     int8_t shiftCount;
4429     int32_t zExp;
4430     uint64_t zSig0, zSig1;
4431
4432     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4433     zSign = ( a < 0 );
4434     absA = zSign ? - a : a;
4435     shiftCount = clz64(absA) + 49;
4436     zExp = 0x406E - shiftCount;
4437     if ( 64 <= shiftCount ) {
4438         zSig1 = 0;
4439         zSig0 = absA;
4440         shiftCount -= 64;
4441     }
4442     else {
4443         zSig1 = absA;
4444         zSig0 = 0;
4445     }
4446     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4447     return packFloat128( zSign, zExp, zSig0, zSig1 );
4448
4449 }
4450
4451 /*----------------------------------------------------------------------------
4452 | Returns the result of converting the 64-bit unsigned integer `a'
4453 | to the quadruple-precision floating-point format.  The conversion is performed
4454 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4455 *----------------------------------------------------------------------------*/
4456
4457 float128 uint64_to_float128(uint64_t a, float_status *status)
4458 {
4459     if (a == 0) {
4460         return float128_zero;
4461     }
4462     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4463 }
4464
4465 /*----------------------------------------------------------------------------
4466 | Returns the result of converting the single-precision floating-point value
4467 | `a' to the extended double-precision floating-point format.  The conversion
4468 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4469 | Arithmetic.
4470 *----------------------------------------------------------------------------*/
4471
4472 floatx80 float32_to_floatx80(float32 a, float_status *status)
4473 {
4474     flag aSign;
4475     int aExp;
4476     uint32_t aSig;
4477
4478     a = float32_squash_input_denormal(a, status);
4479     aSig = extractFloat32Frac( a );
4480     aExp = extractFloat32Exp( a );
4481     aSign = extractFloat32Sign( a );
4482     if ( aExp == 0xFF ) {
4483         if (aSig) {
4484             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4485         }
4486         return packFloatx80(aSign,
4487                             floatx80_infinity_high,
4488                             floatx80_infinity_low);
4489     }
4490     if ( aExp == 0 ) {
4491         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4492         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4493     }
4494     aSig |= 0x00800000;
4495     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4496
4497 }
4498
4499 /*----------------------------------------------------------------------------
4500 | Returns the result of converting the single-precision floating-point value
4501 | `a' to the double-precision floating-point format.  The conversion is
4502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4503 | Arithmetic.
4504 *----------------------------------------------------------------------------*/
4505
4506 float128 float32_to_float128(float32 a, float_status *status)
4507 {
4508     flag aSign;
4509     int aExp;
4510     uint32_t aSig;
4511
4512     a = float32_squash_input_denormal(a, status);
4513     aSig = extractFloat32Frac( a );
4514     aExp = extractFloat32Exp( a );
4515     aSign = extractFloat32Sign( a );
4516     if ( aExp == 0xFF ) {
4517         if (aSig) {
4518             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4519         }
4520         return packFloat128( aSign, 0x7FFF, 0, 0 );
4521     }
4522     if ( aExp == 0 ) {
4523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4525         --aExp;
4526     }
4527     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4528
4529 }
4530
4531 /*----------------------------------------------------------------------------
4532 | Returns the remainder of the single-precision floating-point value `a'
4533 | with respect to the corresponding value `b'.  The operation is performed
4534 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4536
4537 float32 float32_rem(float32 a, float32 b, float_status *status)
4538 {
4539     flag aSign, zSign;
4540     int aExp, bExp, expDiff;
4541     uint32_t aSig, bSig;
4542     uint32_t q;
4543     uint64_t aSig64, bSig64, q64;
4544     uint32_t alternateASig;
4545     int32_t sigMean;
4546     a = float32_squash_input_denormal(a, status);
4547     b = float32_squash_input_denormal(b, status);
4548
4549     aSig = extractFloat32Frac( a );
4550     aExp = extractFloat32Exp( a );
4551     aSign = extractFloat32Sign( a );
4552     bSig = extractFloat32Frac( b );
4553     bExp = extractFloat32Exp( b );
4554     if ( aExp == 0xFF ) {
4555         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4556             return propagateFloat32NaN(a, b, status);
4557         }
4558         float_raise(float_flag_invalid, status);
4559         return float32_default_nan(status);
4560     }
4561     if ( bExp == 0xFF ) {
4562         if (bSig) {
4563             return propagateFloat32NaN(a, b, status);
4564         }
4565         return a;
4566     }
4567     if ( bExp == 0 ) {
4568         if ( bSig == 0 ) {
4569             float_raise(float_flag_invalid, status);
4570             return float32_default_nan(status);
4571         }
4572         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4573     }
4574     if ( aExp == 0 ) {
4575         if ( aSig == 0 ) return a;
4576         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4577     }
4578     expDiff = aExp - bExp;
4579     aSig |= 0x00800000;
4580     bSig |= 0x00800000;
4581     if ( expDiff < 32 ) {
4582         aSig <<= 8;
4583         bSig <<= 8;
4584         if ( expDiff < 0 ) {
4585             if ( expDiff < -1 ) return a;
4586             aSig >>= 1;
4587         }
4588         q = ( bSig <= aSig );
4589         if ( q ) aSig -= bSig;
4590         if ( 0 < expDiff ) {
4591             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4592             q >>= 32 - expDiff;
4593             bSig >>= 2;
4594             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4595         }
4596         else {
4597             aSig >>= 2;
4598             bSig >>= 2;
4599         }
4600     }
4601     else {
4602         if ( bSig <= aSig ) aSig -= bSig;
4603         aSig64 = ( (uint64_t) aSig )<<40;
4604         bSig64 = ( (uint64_t) bSig )<<40;
4605         expDiff -= 64;
4606         while ( 0 < expDiff ) {
4607             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4608             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4609             aSig64 = - ( ( bSig * q64 )<<38 );
4610             expDiff -= 62;
4611         }
4612         expDiff += 64;
4613         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4614         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4615         q = q64>>( 64 - expDiff );
4616         bSig <<= 6;
4617         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4618     }
4619     do {
4620         alternateASig = aSig;
4621         ++q;
4622         aSig -= bSig;
4623     } while ( 0 <= (int32_t) aSig );
4624     sigMean = aSig + alternateASig;
4625     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4626         aSig = alternateASig;
4627     }
4628     zSign = ( (int32_t) aSig < 0 );
4629     if ( zSign ) aSig = - aSig;
4630     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4631 }
4632
4633
4634
4635 /*----------------------------------------------------------------------------
4636 | Returns the binary exponential of the single-precision floating-point value
4637 | `a'. The operation is performed according to the IEC/IEEE Standard for
4638 | Binary Floating-Point Arithmetic.
4639 |
4640 | Uses the following identities:
4641 |
4642 | 1. -------------------------------------------------------------------------
4643 |      x    x*ln(2)
4644 |     2  = e
4645 |
4646 | 2. -------------------------------------------------------------------------
4647 |                      2     3     4     5           n
4648 |      x        x     x     x     x     x           x
4649 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4650 |               1!    2!    3!    4!    5!          n!
4651 *----------------------------------------------------------------------------*/
4652
4653 static const float64 float32_exp2_coefficients[15] =
4654 {
4655     const_float64( 0x3ff0000000000000ll ), /*  1 */
4656     const_float64( 0x3fe0000000000000ll ), /*  2 */
4657     const_float64( 0x3fc5555555555555ll ), /*  3 */
4658     const_float64( 0x3fa5555555555555ll ), /*  4 */
4659     const_float64( 0x3f81111111111111ll ), /*  5 */
4660     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4661     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4662     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4663     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4664     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4665     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4666     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4667     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4668     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4669     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4670 };
4671
4672 float32 float32_exp2(float32 a, float_status *status)
4673 {
4674     flag aSign;
4675     int aExp;
4676     uint32_t aSig;
4677     float64 r, x, xn;
4678     int i;
4679     a = float32_squash_input_denormal(a, status);
4680
4681     aSig = extractFloat32Frac( a );
4682     aExp = extractFloat32Exp( a );
4683     aSign = extractFloat32Sign( a );
4684
4685     if ( aExp == 0xFF) {
4686         if (aSig) {
4687             return propagateFloat32NaN(a, float32_zero, status);
4688         }
4689         return (aSign) ? float32_zero : a;
4690     }
4691     if (aExp == 0) {
4692         if (aSig == 0) return float32_one;
4693     }
4694
4695     float_raise(float_flag_inexact, status);
4696
4697     /* ******************************* */
4698     /* using float64 for approximation */
4699     /* ******************************* */
4700     x = float32_to_float64(a, status);
4701     x = float64_mul(x, float64_ln2, status);
4702
4703     xn = x;
4704     r = float64_one;
4705     for (i = 0 ; i < 15 ; i++) {
4706         float64 f;
4707
4708         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4709         r = float64_add(r, f, status);
4710
4711         xn = float64_mul(xn, x, status);
4712     }
4713
4714     return float64_to_float32(r, status);
4715 }
4716
4717 /*----------------------------------------------------------------------------
4718 | Returns the binary log of the single-precision floating-point value `a'.
4719 | The operation is performed according to the IEC/IEEE Standard for Binary
4720 | Floating-Point Arithmetic.
4721 *----------------------------------------------------------------------------*/
4722 float32 float32_log2(float32 a, float_status *status)
4723 {
4724     flag aSign, zSign;
4725     int aExp;
4726     uint32_t aSig, zSig, i;
4727
4728     a = float32_squash_input_denormal(a, status);
4729     aSig = extractFloat32Frac( a );
4730     aExp = extractFloat32Exp( a );
4731     aSign = extractFloat32Sign( a );
4732
4733     if ( aExp == 0 ) {
4734         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4735         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4736     }
4737     if ( aSign ) {
4738         float_raise(float_flag_invalid, status);
4739         return float32_default_nan(status);
4740     }
4741     if ( aExp == 0xFF ) {
4742         if (aSig) {
4743             return propagateFloat32NaN(a, float32_zero, status);
4744         }
4745         return a;
4746     }
4747
4748     aExp -= 0x7F;
4749     aSig |= 0x00800000;
4750     zSign = aExp < 0;
4751     zSig = aExp << 23;
4752
4753     for (i = 1 << 22; i > 0; i >>= 1) {
4754         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4755         if ( aSig & 0x01000000 ) {
4756             aSig >>= 1;
4757             zSig |= i;
4758         }
4759     }
4760
4761     if ( zSign )
4762         zSig = -zSig;
4763
4764     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4765 }
4766
4767 /*----------------------------------------------------------------------------
4768 | Returns 1 if the single-precision floating-point value `a' is equal to
4769 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4770 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773
4774 int float32_eq(float32 a, float32 b, float_status *status)
4775 {
4776     uint32_t av, bv;
4777     a = float32_squash_input_denormal(a, status);
4778     b = float32_squash_input_denormal(b, status);
4779
4780     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4781          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4782        ) {
4783         float_raise(float_flag_invalid, status);
4784         return 0;
4785     }
4786     av = float32_val(a);
4787     bv = float32_val(b);
4788     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4789 }
4790
4791 /*----------------------------------------------------------------------------
4792 | Returns 1 if the single-precision floating-point value `a' is less than
4793 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4794 | exception is raised if either operand is a NaN.  The comparison is performed
4795 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4797
4798 int float32_le(float32 a, float32 b, float_status *status)
4799 {
4800     flag aSign, bSign;
4801     uint32_t av, bv;
4802     a = float32_squash_input_denormal(a, status);
4803     b = float32_squash_input_denormal(b, status);
4804
4805     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4806          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4807        ) {
4808         float_raise(float_flag_invalid, status);
4809         return 0;
4810     }
4811     aSign = extractFloat32Sign( a );
4812     bSign = extractFloat32Sign( b );
4813     av = float32_val(a);
4814     bv = float32_val(b);
4815     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4816     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4817
4818 }
4819
4820 /*----------------------------------------------------------------------------
4821 | Returns 1 if the single-precision floating-point value `a' is less than
4822 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4823 | raised if either operand is a NaN.  The comparison is performed according
4824 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4826
4827 int float32_lt(float32 a, float32 b, float_status *status)
4828 {
4829     flag aSign, bSign;
4830     uint32_t av, bv;
4831     a = float32_squash_input_denormal(a, status);
4832     b = float32_squash_input_denormal(b, status);
4833
4834     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4835          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4836        ) {
4837         float_raise(float_flag_invalid, status);
4838         return 0;
4839     }
4840     aSign = extractFloat32Sign( a );
4841     bSign = extractFloat32Sign( b );
4842     av = float32_val(a);
4843     bv = float32_val(b);
4844     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4845     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4846
4847 }
4848
4849 /*----------------------------------------------------------------------------
4850 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4851 | be compared, and 0 otherwise.  The invalid exception is raised if either
4852 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4853 | Standard for Binary Floating-Point Arithmetic.
4854 *----------------------------------------------------------------------------*/
4855
4856 int float32_unordered(float32 a, float32 b, float_status *status)
4857 {
4858     a = float32_squash_input_denormal(a, status);
4859     b = float32_squash_input_denormal(b, status);
4860
4861     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4862          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4863        ) {
4864         float_raise(float_flag_invalid, status);
4865         return 1;
4866     }
4867     return 0;
4868 }
4869
4870 /*----------------------------------------------------------------------------
4871 | Returns 1 if the single-precision floating-point value `a' is equal to
4872 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4873 | exception.  The comparison is performed according to the IEC/IEEE Standard
4874 | for Binary Floating-Point Arithmetic.
4875 *----------------------------------------------------------------------------*/
4876
4877 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4878 {
4879     a = float32_squash_input_denormal(a, status);
4880     b = float32_squash_input_denormal(b, status);
4881
4882     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4883          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4884        ) {
4885         if (float32_is_signaling_nan(a, status)
4886          || float32_is_signaling_nan(b, status)) {
4887             float_raise(float_flag_invalid, status);
4888         }
4889         return 0;
4890     }
4891     return ( float32_val(a) == float32_val(b) ) ||
4892             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4893 }
4894
4895 /*----------------------------------------------------------------------------
4896 | Returns 1 if the single-precision floating-point value `a' is less than or
4897 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4898 | cause an exception.  Otherwise, the comparison is performed according to the
4899 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4900 *----------------------------------------------------------------------------*/
4901
4902 int float32_le_quiet(float32 a, float32 b, float_status *status)
4903 {
4904     flag aSign, bSign;
4905     uint32_t av, bv;
4906     a = float32_squash_input_denormal(a, status);
4907     b = float32_squash_input_denormal(b, status);
4908
4909     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4910          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4911        ) {
4912         if (float32_is_signaling_nan(a, status)
4913          || float32_is_signaling_nan(b, status)) {
4914             float_raise(float_flag_invalid, status);
4915         }
4916         return 0;
4917     }
4918     aSign = extractFloat32Sign( a );
4919     bSign = extractFloat32Sign( b );
4920     av = float32_val(a);
4921     bv = float32_val(b);
4922     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4923     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4924
4925 }
4926
4927 /*----------------------------------------------------------------------------
4928 | Returns 1 if the single-precision floating-point value `a' is less than
4929 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4930 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4931 | Standard for Binary Floating-Point Arithmetic.
4932 *----------------------------------------------------------------------------*/
4933
4934 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4935 {
4936     flag aSign, bSign;
4937     uint32_t av, bv;
4938     a = float32_squash_input_denormal(a, status);
4939     b = float32_squash_input_denormal(b, status);
4940
4941     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4942          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4943        ) {
4944         if (float32_is_signaling_nan(a, status)
4945          || float32_is_signaling_nan(b, status)) {
4946             float_raise(float_flag_invalid, status);
4947         }
4948         return 0;
4949     }
4950     aSign = extractFloat32Sign( a );
4951     bSign = extractFloat32Sign( b );
4952     av = float32_val(a);
4953     bv = float32_val(b);
4954     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4955     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4956
4957 }
4958
4959 /*----------------------------------------------------------------------------
4960 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4961 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4962 | comparison is performed according to the IEC/IEEE Standard for Binary
4963 | Floating-Point Arithmetic.
4964 *----------------------------------------------------------------------------*/
4965
4966 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4967 {
4968     a = float32_squash_input_denormal(a, status);
4969     b = float32_squash_input_denormal(b, status);
4970
4971     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4972          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4973        ) {
4974         if (float32_is_signaling_nan(a, status)
4975          || float32_is_signaling_nan(b, status)) {
4976             float_raise(float_flag_invalid, status);
4977         }
4978         return 1;
4979     }
4980     return 0;
4981 }
4982
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of converting the double-precision floating-point value
4985 | `a' to the extended double-precision floating-point format.  The conversion
4986 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4987 | Arithmetic.
4988 *----------------------------------------------------------------------------*/
4989
4990 floatx80 float64_to_floatx80(float64 a, float_status *status)
4991 {
4992     flag aSign;
4993     int aExp;
4994     uint64_t aSig;
4995
4996     a = float64_squash_input_denormal(a, status);
4997     aSig = extractFloat64Frac( a );
4998     aExp = extractFloat64Exp( a );
4999     aSign = extractFloat64Sign( a );
5000     if ( aExp == 0x7FF ) {
5001         if (aSig) {
5002             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5003         }
5004         return packFloatx80(aSign,
5005                             floatx80_infinity_high,
5006                             floatx80_infinity_low);
5007     }
5008     if ( aExp == 0 ) {
5009         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5010         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5011     }
5012     return
5013         packFloatx80(
5014             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5015
5016 }
5017
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of converting the double-precision floating-point value
5020 | `a' to the quadruple-precision floating-point format.  The conversion is
5021 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5022 | Arithmetic.
5023 *----------------------------------------------------------------------------*/
5024
5025 float128 float64_to_float128(float64 a, float_status *status)
5026 {
5027     flag aSign;
5028     int aExp;
5029     uint64_t aSig, zSig0, zSig1;
5030
5031     a = float64_squash_input_denormal(a, status);
5032     aSig = extractFloat64Frac( a );
5033     aExp = extractFloat64Exp( a );
5034     aSign = extractFloat64Sign( a );
5035     if ( aExp == 0x7FF ) {
5036         if (aSig) {
5037             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5038         }
5039         return packFloat128( aSign, 0x7FFF, 0, 0 );
5040     }
5041     if ( aExp == 0 ) {
5042         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5043         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5044         --aExp;
5045     }
5046     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5047     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5048
5049 }
5050
5051
5052 /*----------------------------------------------------------------------------
5053 | Returns the remainder of the double-precision floating-point value `a'
5054 | with respect to the corresponding value `b'.  The operation is performed
5055 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056 *----------------------------------------------------------------------------*/
5057
5058 float64 float64_rem(float64 a, float64 b, float_status *status)
5059 {
5060     flag aSign, zSign;
5061     int aExp, bExp, expDiff;
5062     uint64_t aSig, bSig;
5063     uint64_t q, alternateASig;
5064     int64_t sigMean;
5065
5066     a = float64_squash_input_denormal(a, status);
5067     b = float64_squash_input_denormal(b, status);
5068     aSig = extractFloat64Frac( a );
5069     aExp = extractFloat64Exp( a );
5070     aSign = extractFloat64Sign( a );
5071     bSig = extractFloat64Frac( b );
5072     bExp = extractFloat64Exp( b );
5073     if ( aExp == 0x7FF ) {
5074         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5075             return propagateFloat64NaN(a, b, status);
5076         }
5077         float_raise(float_flag_invalid, status);
5078         return float64_default_nan(status);
5079     }
5080     if ( bExp == 0x7FF ) {
5081         if (bSig) {
5082             return propagateFloat64NaN(a, b, status);
5083         }
5084         return a;
5085     }
5086     if ( bExp == 0 ) {
5087         if ( bSig == 0 ) {
5088             float_raise(float_flag_invalid, status);
5089             return float64_default_nan(status);
5090         }
5091         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5092     }
5093     if ( aExp == 0 ) {
5094         if ( aSig == 0 ) return a;
5095         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5096     }
5097     expDiff = aExp - bExp;
5098     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5099     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5100     if ( expDiff < 0 ) {
5101         if ( expDiff < -1 ) return a;
5102         aSig >>= 1;
5103     }
5104     q = ( bSig <= aSig );
5105     if ( q ) aSig -= bSig;
5106     expDiff -= 64;
5107     while ( 0 < expDiff ) {
5108         q = estimateDiv128To64( aSig, 0, bSig );
5109         q = ( 2 < q ) ? q - 2 : 0;
5110         aSig = - ( ( bSig>>2 ) * q );
5111         expDiff -= 62;
5112     }
5113     expDiff += 64;
5114     if ( 0 < expDiff ) {
5115         q = estimateDiv128To64( aSig, 0, bSig );
5116         q = ( 2 < q ) ? q - 2 : 0;
5117         q >>= 64 - expDiff;
5118         bSig >>= 2;
5119         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5120     }
5121     else {
5122         aSig >>= 2;
5123         bSig >>= 2;
5124     }
5125     do {
5126         alternateASig = aSig;
5127         ++q;
5128         aSig -= bSig;
5129     } while ( 0 <= (int64_t) aSig );
5130     sigMean = aSig + alternateASig;
5131     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5132         aSig = alternateASig;
5133     }
5134     zSign = ( (int64_t) aSig < 0 );
5135     if ( zSign ) aSig = - aSig;
5136     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5137
5138 }
5139
5140 /*----------------------------------------------------------------------------
5141 | Returns the binary log of the double-precision floating-point value `a'.
5142 | The operation is performed according to the IEC/IEEE Standard for Binary
5143 | Floating-Point Arithmetic.
5144 *----------------------------------------------------------------------------*/
5145 float64 float64_log2(float64 a, float_status *status)
5146 {
5147     flag aSign, zSign;
5148     int aExp;
5149     uint64_t aSig, aSig0, aSig1, zSig, i;
5150     a = float64_squash_input_denormal(a, status);
5151
5152     aSig = extractFloat64Frac( a );
5153     aExp = extractFloat64Exp( a );
5154     aSign = extractFloat64Sign( a );
5155
5156     if ( aExp == 0 ) {
5157         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5158         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5159     }
5160     if ( aSign ) {
5161         float_raise(float_flag_invalid, status);
5162         return float64_default_nan(status);
5163     }
5164     if ( aExp == 0x7FF ) {
5165         if (aSig) {
5166             return propagateFloat64NaN(a, float64_zero, status);
5167         }
5168         return a;
5169     }
5170
5171     aExp -= 0x3FF;
5172     aSig |= UINT64_C(0x0010000000000000);
5173     zSign = aExp < 0;
5174     zSig = (uint64_t)aExp << 52;
5175     for (i = 1LL << 51; i > 0; i >>= 1) {
5176         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5177         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5178         if ( aSig & UINT64_C(0x0020000000000000) ) {
5179             aSig >>= 1;
5180             zSig |= i;
5181         }
5182     }
5183
5184     if ( zSign )
5185         zSig = -zSig;
5186     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5187 }
5188
5189 /*----------------------------------------------------------------------------
5190 | Returns 1 if the double-precision floating-point value `a' is equal to the
5191 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5192 | if either operand is a NaN.  Otherwise, the comparison is performed
5193 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5194 *----------------------------------------------------------------------------*/
5195
5196 int float64_eq(float64 a, float64 b, float_status *status)
5197 {
5198     uint64_t av, bv;
5199     a = float64_squash_input_denormal(a, status);
5200     b = float64_squash_input_denormal(b, status);
5201
5202     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5203          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5204        ) {
5205         float_raise(float_flag_invalid, status);
5206         return 0;
5207     }
5208     av = float64_val(a);
5209     bv = float64_val(b);
5210     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5211
5212 }
5213
5214 /*----------------------------------------------------------------------------
5215 | Returns 1 if the double-precision floating-point value `a' is less than or
5216 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5217 | exception is raised if either operand is a NaN.  The comparison is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5220
5221 int float64_le(float64 a, float64 b, float_status *status)
5222 {
5223     flag aSign, bSign;
5224     uint64_t av, bv;
5225     a = float64_squash_input_denormal(a, status);
5226     b = float64_squash_input_denormal(b, status);
5227
5228     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5229          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5230        ) {
5231         float_raise(float_flag_invalid, status);
5232         return 0;
5233     }
5234     aSign = extractFloat64Sign( a );
5235     bSign = extractFloat64Sign( b );
5236     av = float64_val(a);
5237     bv = float64_val(b);
5238     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5239     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5240
5241 }
5242
5243 /*----------------------------------------------------------------------------
5244 | Returns 1 if the double-precision floating-point value `a' is less than
5245 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5246 | raised if either operand is a NaN.  The comparison is performed according
5247 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5248 *----------------------------------------------------------------------------*/
5249
5250 int float64_lt(float64 a, float64 b, float_status *status)
5251 {
5252     flag aSign, bSign;
5253     uint64_t av, bv;
5254
5255     a = float64_squash_input_denormal(a, status);
5256     b = float64_squash_input_denormal(b, status);
5257     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5258          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5259        ) {
5260         float_raise(float_flag_invalid, status);
5261         return 0;
5262     }
5263     aSign = extractFloat64Sign( a );
5264     bSign = extractFloat64Sign( b );
5265     av = float64_val(a);
5266     bv = float64_val(b);
5267     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5268     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5269
5270 }
5271
5272 /*----------------------------------------------------------------------------
5273 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5274 | be compared, and 0 otherwise.  The invalid exception is raised if either
5275 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5276 | Standard for Binary Floating-Point Arithmetic.
5277 *----------------------------------------------------------------------------*/
5278
5279 int float64_unordered(float64 a, float64 b, float_status *status)
5280 {
5281     a = float64_squash_input_denormal(a, status);
5282     b = float64_squash_input_denormal(b, status);
5283
5284     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5285          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5286        ) {
5287         float_raise(float_flag_invalid, status);
5288         return 1;
5289     }
5290     return 0;
5291 }
5292
5293 /*----------------------------------------------------------------------------
5294 | Returns 1 if the double-precision floating-point value `a' is equal to the
5295 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5296 | exception.The comparison is performed according to the IEC/IEEE Standard
5297 | for Binary Floating-Point Arithmetic.
5298 *----------------------------------------------------------------------------*/
5299
5300 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5301 {
5302     uint64_t av, bv;
5303     a = float64_squash_input_denormal(a, status);
5304     b = float64_squash_input_denormal(b, status);
5305
5306     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5307          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5308        ) {
5309         if (float64_is_signaling_nan(a, status)
5310          || float64_is_signaling_nan(b, status)) {
5311             float_raise(float_flag_invalid, status);
5312         }
5313         return 0;
5314     }
5315     av = float64_val(a);
5316     bv = float64_val(b);
5317     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5318
5319 }
5320
5321 /*----------------------------------------------------------------------------
5322 | Returns 1 if the double-precision floating-point value `a' is less than or
5323 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5324 | cause an exception.  Otherwise, the comparison is performed according to the
5325 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5326 *----------------------------------------------------------------------------*/
5327
5328 int float64_le_quiet(float64 a, float64 b, float_status *status)
5329 {
5330     flag aSign, bSign;
5331     uint64_t av, bv;
5332     a = float64_squash_input_denormal(a, status);
5333     b = float64_squash_input_denormal(b, status);
5334
5335     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5336          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5337        ) {
5338         if (float64_is_signaling_nan(a, status)
5339          || float64_is_signaling_nan(b, status)) {
5340             float_raise(float_flag_invalid, status);
5341         }
5342         return 0;
5343     }
5344     aSign = extractFloat64Sign( a );
5345     bSign = extractFloat64Sign( b );
5346     av = float64_val(a);
5347     bv = float64_val(b);
5348     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5349     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5350
5351 }
5352
5353 /*----------------------------------------------------------------------------
5354 | Returns 1 if the double-precision floating-point value `a' is less than
5355 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5356 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5357 | Standard for Binary Floating-Point Arithmetic.
5358 *----------------------------------------------------------------------------*/
5359
5360 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5361 {
5362     flag aSign, bSign;
5363     uint64_t av, bv;
5364     a = float64_squash_input_denormal(a, status);
5365     b = float64_squash_input_denormal(b, status);
5366
5367     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5368          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5369        ) {
5370         if (float64_is_signaling_nan(a, status)
5371          || float64_is_signaling_nan(b, status)) {
5372             float_raise(float_flag_invalid, status);
5373         }
5374         return 0;
5375     }
5376     aSign = extractFloat64Sign( a );
5377     bSign = extractFloat64Sign( b );
5378     av = float64_val(a);
5379     bv = float64_val(b);
5380     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5381     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5382
5383 }
5384
5385 /*----------------------------------------------------------------------------
5386 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5387 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5388 | comparison is performed according to the IEC/IEEE Standard for Binary
5389 | Floating-Point Arithmetic.
5390 *----------------------------------------------------------------------------*/
5391
5392 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5393 {
5394     a = float64_squash_input_denormal(a, status);
5395     b = float64_squash_input_denormal(b, status);
5396
5397     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5398          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5399        ) {
5400         if (float64_is_signaling_nan(a, status)
5401          || float64_is_signaling_nan(b, status)) {
5402             float_raise(float_flag_invalid, status);
5403         }
5404         return 1;
5405     }
5406     return 0;
5407 }
5408
5409 /*----------------------------------------------------------------------------
5410 | Returns the result of converting the extended double-precision floating-
5411 | point value `a' to the 32-bit two's complement integer format.  The
5412 | conversion is performed according to the IEC/IEEE Standard for Binary
5413 | Floating-Point Arithmetic---which means in particular that the conversion
5414 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5415 | largest positive integer is returned.  Otherwise, if the conversion
5416 | overflows, the largest integer with the same sign as `a' is returned.
5417 *----------------------------------------------------------------------------*/
5418
5419 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5420 {
5421     flag aSign;
5422     int32_t aExp, shiftCount;
5423     uint64_t aSig;
5424
5425     if (floatx80_invalid_encoding(a)) {
5426         float_raise(float_flag_invalid, status);
5427         return 1 << 31;
5428     }
5429     aSig = extractFloatx80Frac( a );
5430     aExp = extractFloatx80Exp( a );
5431     aSign = extractFloatx80Sign( a );
5432     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5433     shiftCount = 0x4037 - aExp;
5434     if ( shiftCount <= 0 ) shiftCount = 1;
5435     shift64RightJamming( aSig, shiftCount, &aSig );
5436     return roundAndPackInt32(aSign, aSig, status);
5437
5438 }
5439
5440 /*----------------------------------------------------------------------------
5441 | Returns the result of converting the extended double-precision floating-
5442 | point value `a' to the 32-bit two's complement integer format.  The
5443 | conversion is performed according to the IEC/IEEE Standard for Binary
5444 | Floating-Point Arithmetic, except that the conversion is always rounded
5445 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5446 | Otherwise, if the conversion overflows, the largest integer with the same
5447 | sign as `a' is returned.
5448 *----------------------------------------------------------------------------*/
5449
5450 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5451 {
5452     flag aSign;
5453     int32_t aExp, shiftCount;
5454     uint64_t aSig, savedASig;
5455     int32_t z;
5456
5457     if (floatx80_invalid_encoding(a)) {
5458         float_raise(float_flag_invalid, status);
5459         return 1 << 31;
5460     }
5461     aSig = extractFloatx80Frac( a );
5462     aExp = extractFloatx80Exp( a );
5463     aSign = extractFloatx80Sign( a );
5464     if ( 0x401E < aExp ) {
5465         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5466         goto invalid;
5467     }
5468     else if ( aExp < 0x3FFF ) {
5469         if (aExp || aSig) {
5470             status->float_exception_flags |= float_flag_inexact;
5471         }
5472         return 0;
5473     }
5474     shiftCount = 0x403E - aExp;
5475     savedASig = aSig;
5476     aSig >>= shiftCount;
5477     z = aSig;
5478     if ( aSign ) z = - z;
5479     if ( ( z < 0 ) ^ aSign ) {
5480  invalid:
5481         float_raise(float_flag_invalid, status);
5482         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5483     }
5484     if ( ( aSig<<shiftCount ) != savedASig ) {
5485         status->float_exception_flags |= float_flag_inexact;
5486     }
5487     return z;
5488
5489 }
5490
5491 /*----------------------------------------------------------------------------
5492 | Returns the result of converting the extended double-precision floating-
5493 | point value `a' to the 64-bit two's complement integer format.  The
5494 | conversion is performed according to the IEC/IEEE Standard for Binary
5495 | Floating-Point Arithmetic---which means in particular that the conversion
5496 | is rounded according to the current rounding mode.  If `a' is a NaN,
5497 | the largest positive integer is returned.  Otherwise, if the conversion
5498 | overflows, the largest integer with the same sign as `a' is returned.
5499 *----------------------------------------------------------------------------*/
5500
5501 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5502 {
5503     flag aSign;
5504     int32_t aExp, shiftCount;
5505     uint64_t aSig, aSigExtra;
5506
5507     if (floatx80_invalid_encoding(a)) {
5508         float_raise(float_flag_invalid, status);
5509         return 1ULL << 63;
5510     }
5511     aSig = extractFloatx80Frac( a );
5512     aExp = extractFloatx80Exp( a );
5513     aSign = extractFloatx80Sign( a );
5514     shiftCount = 0x403E - aExp;
5515     if ( shiftCount <= 0 ) {
5516         if ( shiftCount ) {
5517             float_raise(float_flag_invalid, status);
5518             if (!aSign || floatx80_is_any_nan(a)) {
5519                 return INT64_MAX;
5520             }
5521             return INT64_MIN;
5522         }
5523         aSigExtra = 0;
5524     }
5525     else {
5526         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5527     }
5528     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5529
5530 }
5531
5532 /*----------------------------------------------------------------------------
5533 | Returns the result of converting the extended double-precision floating-
5534 | point value `a' to the 64-bit two's complement integer format.  The
5535 | conversion is performed according to the IEC/IEEE Standard for Binary
5536 | Floating-Point Arithmetic, except that the conversion is always rounded
5537 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5538 | Otherwise, if the conversion overflows, the largest integer with the same
5539 | sign as `a' is returned.
5540 *----------------------------------------------------------------------------*/
5541
5542 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5543 {
5544     flag aSign;
5545     int32_t aExp, shiftCount;
5546     uint64_t aSig;
5547     int64_t z;
5548
5549     if (floatx80_invalid_encoding(a)) {
5550         float_raise(float_flag_invalid, status);
5551         return 1ULL << 63;
5552     }
5553     aSig = extractFloatx80Frac( a );
5554     aExp = extractFloatx80Exp( a );
5555     aSign = extractFloatx80Sign( a );
5556     shiftCount = aExp - 0x403E;
5557     if ( 0 <= shiftCount ) {
5558         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5559         if ( ( a.high != 0xC03E ) || aSig ) {
5560             float_raise(float_flag_invalid, status);
5561             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5562                 return INT64_MAX;
5563             }
5564         }
5565         return INT64_MIN;
5566     }
5567     else if ( aExp < 0x3FFF ) {
5568         if (aExp | aSig) {
5569             status->float_exception_flags |= float_flag_inexact;
5570         }
5571         return 0;
5572     }
5573     z = aSig>>( - shiftCount );
5574     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5575         status->float_exception_flags |= float_flag_inexact;
5576     }
5577     if ( aSign ) z = - z;
5578     return z;
5579
5580 }
5581
5582 /*----------------------------------------------------------------------------
5583 | Returns the result of converting the extended double-precision floating-
5584 | point value `a' to the single-precision floating-point format.  The
5585 | conversion is performed according to the IEC/IEEE Standard for Binary
5586 | Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5588
5589 float32 floatx80_to_float32(floatx80 a, float_status *status)
5590 {
5591     flag aSign;
5592     int32_t aExp;
5593     uint64_t aSig;
5594
5595     if (floatx80_invalid_encoding(a)) {
5596         float_raise(float_flag_invalid, status);
5597         return float32_default_nan(status);
5598     }
5599     aSig = extractFloatx80Frac( a );
5600     aExp = extractFloatx80Exp( a );
5601     aSign = extractFloatx80Sign( a );
5602     if ( aExp == 0x7FFF ) {
5603         if ( (uint64_t) ( aSig<<1 ) ) {
5604             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5605         }
5606         return packFloat32( aSign, 0xFF, 0 );
5607     }
5608     shift64RightJamming( aSig, 33, &aSig );
5609     if ( aExp || aSig ) aExp -= 0x3F81;
5610     return roundAndPackFloat32(aSign, aExp, aSig, status);
5611
5612 }
5613
5614 /*----------------------------------------------------------------------------
5615 | Returns the result of converting the extended double-precision floating-
5616 | point value `a' to the double-precision floating-point format.  The
5617 | conversion is performed according to the IEC/IEEE Standard for Binary
5618 | Floating-Point Arithmetic.
5619 *----------------------------------------------------------------------------*/
5620
5621 float64 floatx80_to_float64(floatx80 a, float_status *status)
5622 {
5623     flag aSign;
5624     int32_t aExp;
5625     uint64_t aSig, zSig;
5626
5627     if (floatx80_invalid_encoding(a)) {
5628         float_raise(float_flag_invalid, status);
5629         return float64_default_nan(status);
5630     }
5631     aSig = extractFloatx80Frac( a );
5632     aExp = extractFloatx80Exp( a );
5633     aSign = extractFloatx80Sign( a );
5634     if ( aExp == 0x7FFF ) {
5635         if ( (uint64_t) ( aSig<<1 ) ) {
5636             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5637         }
5638         return packFloat64( aSign, 0x7FF, 0 );
5639     }
5640     shift64RightJamming( aSig, 1, &zSig );
5641     if ( aExp || aSig ) aExp -= 0x3C01;
5642     return roundAndPackFloat64(aSign, aExp, zSig, status);
5643
5644 }
5645
5646 /*----------------------------------------------------------------------------
5647 | Returns the result of converting the extended double-precision floating-
5648 | point value `a' to the quadruple-precision floating-point format.  The
5649 | conversion is performed according to the IEC/IEEE Standard for Binary
5650 | Floating-Point Arithmetic.
5651 *----------------------------------------------------------------------------*/
5652
5653 float128 floatx80_to_float128(floatx80 a, float_status *status)
5654 {
5655     flag aSign;
5656     int aExp;
5657     uint64_t aSig, zSig0, zSig1;
5658
5659     if (floatx80_invalid_encoding(a)) {
5660         float_raise(float_flag_invalid, status);
5661         return float128_default_nan(status);
5662     }
5663     aSig = extractFloatx80Frac( a );
5664     aExp = extractFloatx80Exp( a );
5665     aSign = extractFloatx80Sign( a );
5666     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5667         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5668     }
5669     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5670     return packFloat128( aSign, aExp, zSig0, zSig1 );
5671
5672 }
5673
5674 /*----------------------------------------------------------------------------
5675 | Rounds the extended double-precision floating-point value `a'
5676 | to the precision provided by floatx80_rounding_precision and returns the
5677 | result as an extended double-precision floating-point value.
5678 | The operation is performed according to the IEC/IEEE Standard for Binary
5679 | Floating-Point Arithmetic.
5680 *----------------------------------------------------------------------------*/
5681
5682 floatx80 floatx80_round(floatx80 a, float_status *status)
5683 {
5684     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5685                                 extractFloatx80Sign(a),
5686                                 extractFloatx80Exp(a),
5687                                 extractFloatx80Frac(a), 0, status);
5688 }
5689
5690 /*----------------------------------------------------------------------------
5691 | Rounds the extended double-precision floating-point value `a' to an integer,
5692 | and returns the result as an extended quadruple-precision floating-point
5693 | value.  The operation is performed according to the IEC/IEEE Standard for
5694 | Binary Floating-Point Arithmetic.
5695 *----------------------------------------------------------------------------*/
5696
5697 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5698 {
5699     flag aSign;
5700     int32_t aExp;
5701     uint64_t lastBitMask, roundBitsMask;
5702     floatx80 z;
5703
5704     if (floatx80_invalid_encoding(a)) {
5705         float_raise(float_flag_invalid, status);
5706         return floatx80_default_nan(status);
5707     }
5708     aExp = extractFloatx80Exp( a );
5709     if ( 0x403E <= aExp ) {
5710         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5711             return propagateFloatx80NaN(a, a, status);
5712         }
5713         return a;
5714     }
5715     if ( aExp < 0x3FFF ) {
5716         if (    ( aExp == 0 )
5717              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5718             return a;
5719         }
5720         status->float_exception_flags |= float_flag_inexact;
5721         aSign = extractFloatx80Sign( a );
5722         switch (status->float_rounding_mode) {
5723          case float_round_nearest_even:
5724             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5725                ) {
5726                 return
5727                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5728             }
5729             break;
5730         case float_round_ties_away:
5731             if (aExp == 0x3FFE) {
5732                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5733             }
5734             break;
5735          case float_round_down:
5736             return
5737                   aSign ?
5738                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5739                 : packFloatx80( 0, 0, 0 );
5740          case float_round_up:
5741             return
5742                   aSign ? packFloatx80( 1, 0, 0 )
5743                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5744         }
5745         return packFloatx80( aSign, 0, 0 );
5746     }
5747     lastBitMask = 1;
5748     lastBitMask <<= 0x403E - aExp;
5749     roundBitsMask = lastBitMask - 1;
5750     z = a;
5751     switch (status->float_rounding_mode) {
5752     case float_round_nearest_even:
5753         z.low += lastBitMask>>1;
5754         if ((z.low & roundBitsMask) == 0) {
5755             z.low &= ~lastBitMask;
5756         }
5757         break;
5758     case float_round_ties_away:
5759         z.low += lastBitMask >> 1;
5760         break;
5761     case float_round_to_zero:
5762         break;
5763     case float_round_up:
5764         if (!extractFloatx80Sign(z)) {
5765             z.low += roundBitsMask;
5766         }
5767         break;
5768     case float_round_down:
5769         if (extractFloatx80Sign(z)) {
5770             z.low += roundBitsMask;
5771         }
5772         break;
5773     default:
5774         abort();
5775     }
5776     z.low &= ~ roundBitsMask;
5777     if ( z.low == 0 ) {
5778         ++z.high;
5779         z.low = UINT64_C(0x8000000000000000);
5780     }
5781     if (z.low != a.low) {
5782         status->float_exception_flags |= float_flag_inexact;
5783     }
5784     return z;
5785
5786 }
5787
5788 /*----------------------------------------------------------------------------
5789 | Returns the result of adding the absolute values of the extended double-
5790 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5791 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5792 | The addition is performed according to the IEC/IEEE Standard for Binary
5793 | Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5795
5796 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5797                                 float_status *status)
5798 {
5799     int32_t aExp, bExp, zExp;
5800     uint64_t aSig, bSig, zSig0, zSig1;
5801     int32_t expDiff;
5802
5803     aSig = extractFloatx80Frac( a );
5804     aExp = extractFloatx80Exp( a );
5805     bSig = extractFloatx80Frac( b );
5806     bExp = extractFloatx80Exp( b );
5807     expDiff = aExp - bExp;
5808     if ( 0 < expDiff ) {
5809         if ( aExp == 0x7FFF ) {
5810             if ((uint64_t)(aSig << 1)) {
5811                 return propagateFloatx80NaN(a, b, status);
5812             }
5813             return a;
5814         }
5815         if ( bExp == 0 ) --expDiff;
5816         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5817         zExp = aExp;
5818     }
5819     else if ( expDiff < 0 ) {
5820         if ( bExp == 0x7FFF ) {
5821             if ((uint64_t)(bSig << 1)) {
5822                 return propagateFloatx80NaN(a, b, status);
5823             }
5824             return packFloatx80(zSign,
5825                                 floatx80_infinity_high,
5826                                 floatx80_infinity_low);
5827         }
5828         if ( aExp == 0 ) ++expDiff;
5829         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5830         zExp = bExp;
5831     }
5832     else {
5833         if ( aExp == 0x7FFF ) {
5834             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5835                 return propagateFloatx80NaN(a, b, status);
5836             }
5837             return a;
5838         }
5839         zSig1 = 0;
5840         zSig0 = aSig + bSig;
5841         if ( aExp == 0 ) {
5842             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5843             goto roundAndPack;
5844         }
5845         zExp = aExp;
5846         goto shiftRight1;
5847     }
5848     zSig0 = aSig + bSig;
5849     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5850  shiftRight1:
5851     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5852     zSig0 |= UINT64_C(0x8000000000000000);
5853     ++zExp;
5854  roundAndPack:
5855     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5856                                 zSign, zExp, zSig0, zSig1, status);
5857 }
5858
5859 /*----------------------------------------------------------------------------
5860 | Returns the result of subtracting the absolute values of the extended
5861 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5862 | difference is negated before being returned.  `zSign' is ignored if the
5863 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5864 | Standard for Binary Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5866
5867 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5868                                 float_status *status)
5869 {
5870     int32_t aExp, bExp, zExp;
5871     uint64_t aSig, bSig, zSig0, zSig1;
5872     int32_t expDiff;
5873
5874     aSig = extractFloatx80Frac( a );
5875     aExp = extractFloatx80Exp( a );
5876     bSig = extractFloatx80Frac( b );
5877     bExp = extractFloatx80Exp( b );
5878     expDiff = aExp - bExp;
5879     if ( 0 < expDiff ) goto aExpBigger;
5880     if ( expDiff < 0 ) goto bExpBigger;
5881     if ( aExp == 0x7FFF ) {
5882         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5883             return propagateFloatx80NaN(a, b, status);
5884         }
5885         float_raise(float_flag_invalid, status);
5886         return floatx80_default_nan(status);
5887     }
5888     if ( aExp == 0 ) {
5889         aExp = 1;
5890         bExp = 1;
5891     }
5892     zSig1 = 0;
5893     if ( bSig < aSig ) goto aBigger;
5894     if ( aSig < bSig ) goto bBigger;
5895     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5896  bExpBigger:
5897     if ( bExp == 0x7FFF ) {
5898         if ((uint64_t)(bSig << 1)) {
5899             return propagateFloatx80NaN(a, b, status);
5900         }
5901         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5902                             floatx80_infinity_low);
5903     }
5904     if ( aExp == 0 ) ++expDiff;
5905     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5906  bBigger:
5907     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5908     zExp = bExp;
5909     zSign ^= 1;
5910     goto normalizeRoundAndPack;
5911  aExpBigger:
5912     if ( aExp == 0x7FFF ) {
5913         if ((uint64_t)(aSig << 1)) {
5914             return propagateFloatx80NaN(a, b, status);
5915         }
5916         return a;
5917     }
5918     if ( bExp == 0 ) --expDiff;
5919     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5920  aBigger:
5921     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5922     zExp = aExp;
5923  normalizeRoundAndPack:
5924     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5925                                          zSign, zExp, zSig0, zSig1, status);
5926 }
5927
5928 /*----------------------------------------------------------------------------
5929 | Returns the result of adding the extended double-precision floating-point
5930 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5931 | Standard for Binary Floating-Point Arithmetic.
5932 *----------------------------------------------------------------------------*/
5933
5934 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5935 {
5936     flag aSign, bSign;
5937
5938     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5939         float_raise(float_flag_invalid, status);
5940         return floatx80_default_nan(status);
5941     }
5942     aSign = extractFloatx80Sign( a );
5943     bSign = extractFloatx80Sign( b );
5944     if ( aSign == bSign ) {
5945         return addFloatx80Sigs(a, b, aSign, status);
5946     }
5947     else {
5948         return subFloatx80Sigs(a, b, aSign, status);
5949     }
5950
5951 }
5952
5953 /*----------------------------------------------------------------------------
5954 | Returns the result of subtracting the extended double-precision floating-
5955 | point values `a' and `b'.  The operation is performed according to the
5956 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5957 *----------------------------------------------------------------------------*/
5958
5959 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5960 {
5961     flag aSign, bSign;
5962
5963     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5964         float_raise(float_flag_invalid, status);
5965         return floatx80_default_nan(status);
5966     }
5967     aSign = extractFloatx80Sign( a );
5968     bSign = extractFloatx80Sign( b );
5969     if ( aSign == bSign ) {
5970         return subFloatx80Sigs(a, b, aSign, status);
5971     }
5972     else {
5973         return addFloatx80Sigs(a, b, aSign, status);
5974     }
5975
5976 }
5977
5978 /*----------------------------------------------------------------------------
5979 | Returns the result of multiplying the extended double-precision floating-
5980 | point values `a' and `b'.  The operation is performed according to the
5981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5982 *----------------------------------------------------------------------------*/
5983
5984 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5985 {
5986     flag aSign, bSign, zSign;
5987     int32_t aExp, bExp, zExp;
5988     uint64_t aSig, bSig, zSig0, zSig1;
5989
5990     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5991         float_raise(float_flag_invalid, status);
5992         return floatx80_default_nan(status);
5993     }
5994     aSig = extractFloatx80Frac( a );
5995     aExp = extractFloatx80Exp( a );
5996     aSign = extractFloatx80Sign( a );
5997     bSig = extractFloatx80Frac( b );
5998     bExp = extractFloatx80Exp( b );
5999     bSign = extractFloatx80Sign( b );
6000     zSign = aSign ^ bSign;
6001     if ( aExp == 0x7FFF ) {
6002         if (    (uint64_t) ( aSig<<1 )
6003              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6004             return propagateFloatx80NaN(a, b, status);
6005         }
6006         if ( ( bExp | bSig ) == 0 ) goto invalid;
6007         return packFloatx80(zSign, floatx80_infinity_high,
6008                                    floatx80_infinity_low);
6009     }
6010     if ( bExp == 0x7FFF ) {
6011         if ((uint64_t)(bSig << 1)) {
6012             return propagateFloatx80NaN(a, b, status);
6013         }
6014         if ( ( aExp | aSig ) == 0 ) {
6015  invalid:
6016             float_raise(float_flag_invalid, status);
6017             return floatx80_default_nan(status);
6018         }
6019         return packFloatx80(zSign, floatx80_infinity_high,
6020                                    floatx80_infinity_low);
6021     }
6022     if ( aExp == 0 ) {
6023         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6024         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6025     }
6026     if ( bExp == 0 ) {
6027         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6028         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6029     }
6030     zExp = aExp + bExp - 0x3FFE;
6031     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6032     if ( 0 < (int64_t) zSig0 ) {
6033         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6034         --zExp;
6035     }
6036     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6037                                 zSign, zExp, zSig0, zSig1, status);
6038 }
6039
/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the division proper: invalid encodings,
| infinity / infinity and 0 / 0 raise the invalid exception and return the
| default NaN; a nonzero finite value divided by zero raises the divide-by-
| zero exception and returns a signed infinity; a finite value divided by
| infinity, and zero divided by a nonzero finite value, return a signed zero.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* The quotient is negative exactly when the operand signs differ. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN or an infinity. */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / infinity */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Division by zero: 0 / 0 is invalid, otherwise divbyzero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /*
     * If the dividend significand is not smaller than the divisor's, halve
     * it (keeping the shifted-out bit in rem1) and compensate in the
     * exponent.
     */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /*
     * High 64 quotient bits.  NOTE(review): estimateDiv128To64 is defined
     * elsewhere; the loop below treats its result as a possible
     * overestimate and steps it down until the remainder is non-negative.
     */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low 64 quotient bits, estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    /*
     * Only when the estimate's low bits are small is the exact remainder
     * computed; the estimate is then corrected and a sticky bit is ORed in
     * if anything nonzero is left over.
     */
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6126
6127 /*----------------------------------------------------------------------------
6128 | Returns the remainder of the extended double-precision floating-point value
6129 | `a' with respect to the corresponding value `b'.  The operation is performed
6130 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6131 *----------------------------------------------------------------------------*/
6132
6133 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6134 {
6135     flag aSign, zSign;
6136     int32_t aExp, bExp, expDiff;
6137     uint64_t aSig0, aSig1, bSig;
6138     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6139
6140     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6141         float_raise(float_flag_invalid, status);
6142         return floatx80_default_nan(status);
6143     }
6144     aSig0 = extractFloatx80Frac( a );
6145     aExp = extractFloatx80Exp( a );
6146     aSign = extractFloatx80Sign( a );
6147     bSig = extractFloatx80Frac( b );
6148     bExp = extractFloatx80Exp( b );
6149     if ( aExp == 0x7FFF ) {
6150         if (    (uint64_t) ( aSig0<<1 )
6151              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6152             return propagateFloatx80NaN(a, b, status);
6153         }
6154         goto invalid;
6155     }
6156     if ( bExp == 0x7FFF ) {
6157         if ((uint64_t)(bSig << 1)) {
6158             return propagateFloatx80NaN(a, b, status);
6159         }
6160         return a;
6161     }
6162     if ( bExp == 0 ) {
6163         if ( bSig == 0 ) {
6164  invalid:
6165             float_raise(float_flag_invalid, status);
6166             return floatx80_default_nan(status);
6167         }
6168         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6169     }
6170     if ( aExp == 0 ) {
6171         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6172         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6173     }
6174     bSig |= UINT64_C(0x8000000000000000);
6175     zSign = aSign;
6176     expDiff = aExp - bExp;
6177     aSig1 = 0;
6178     if ( expDiff < 0 ) {
6179         if ( expDiff < -1 ) return a;
6180         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6181         expDiff = 0;
6182     }
6183     q = ( bSig <= aSig0 );
6184     if ( q ) aSig0 -= bSig;
6185     expDiff -= 64;
6186     while ( 0 < expDiff ) {
6187         q = estimateDiv128To64( aSig0, aSig1, bSig );
6188         q = ( 2 < q ) ? q - 2 : 0;
6189         mul64To128( bSig, q, &term0, &term1 );
6190         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6191         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6192         expDiff -= 62;
6193     }
6194     expDiff += 64;
6195     if ( 0 < expDiff ) {
6196         q = estimateDiv128To64( aSig0, aSig1, bSig );
6197         q = ( 2 < q ) ? q - 2 : 0;
6198         q >>= 64 - expDiff;
6199         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6200         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6201         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6202         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6203             ++q;
6204             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6205         }
6206     }
6207     else {
6208         term1 = 0;
6209         term0 = bSig;
6210     }
6211     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6212     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6213          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6214               && ( q & 1 ) )
6215        ) {
6216         aSig0 = alternateASig0;
6217         aSig1 = alternateASig1;
6218         zSign = ! zSign;
6219     }
6220     return
6221         normalizeRoundAndPackFloatx80(
6222             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6223
6224 }
6225
/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
|
| An invalid encoding, negative infinity, or any negative nonzero value
| raises the invalid exception and returns the default NaN.  Positive
| infinity and zeros of either sign are returned unchanged (a positive zero
| is repacked as +0).
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* +infinity is its own square root; -infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as-is; other negatives are invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent, then re-apply the bias. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /*
     * Initial estimate from the top 32 significand bits.
     * NOTE(review): estimateSqrt32 / estimateDiv128To64 are defined
     * elsewhere; their accuracy guarantees are not visible here.
     */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Shift by 2 or 3 depending on the parity of the exponent. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; step the root down while that is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low-order root bits estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * Only when the estimate's low bits are small is the exact remainder
     * computed; the estimate is then corrected and a sticky bit is ORed in
     * if anything nonzero is left over.
     */
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Merge the doubled high part with the low part shifted left by one. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    /* The result sign is always positive on this path. */
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
6296
6297 /*----------------------------------------------------------------------------
6298 | Returns 1 if the extended double-precision floating-point value `a' is equal
6299 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6300 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6301 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6302 *----------------------------------------------------------------------------*/
6303
6304 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6305 {
6306
6307     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6308         || (extractFloatx80Exp(a) == 0x7FFF
6309             && (uint64_t) (extractFloatx80Frac(a) << 1))
6310         || (extractFloatx80Exp(b) == 0x7FFF
6311             && (uint64_t) (extractFloatx80Frac(b) << 1))
6312        ) {
6313         float_raise(float_flag_invalid, status);
6314         return 0;
6315     }
6316     return
6317            ( a.low == b.low )
6318         && (    ( a.high == b.high )
6319              || (    ( a.low == 0 )
6320                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6321            );
6322
6323 }
6324
6325 /*----------------------------------------------------------------------------
6326 | Returns 1 if the extended double-precision floating-point value `a' is
6327 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6328 | invalid exception is raised if either operand is a NaN.  The comparison is
6329 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6330 | Arithmetic.
6331 *----------------------------------------------------------------------------*/
6332
6333 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6334 {
6335     flag aSign, bSign;
6336
6337     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6338         || (extractFloatx80Exp(a) == 0x7FFF
6339             && (uint64_t) (extractFloatx80Frac(a) << 1))
6340         || (extractFloatx80Exp(b) == 0x7FFF
6341             && (uint64_t) (extractFloatx80Frac(b) << 1))
6342        ) {
6343         float_raise(float_flag_invalid, status);
6344         return 0;
6345     }
6346     aSign = extractFloatx80Sign( a );
6347     bSign = extractFloatx80Sign( b );
6348     if ( aSign != bSign ) {
6349         return
6350                aSign
6351             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6352                  == 0 );
6353     }
6354     return
6355           aSign ? le128( b.high, b.low, a.high, a.low )
6356         : le128( a.high, a.low, b.high, b.low );
6357
6358 }
6359
6360 /*----------------------------------------------------------------------------
6361 | Returns 1 if the extended double-precision floating-point value `a' is
6362 | less than the corresponding value `b', and 0 otherwise.  The invalid
6363 | exception is raised if either operand is a NaN.  The comparison is performed
6364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6365 *----------------------------------------------------------------------------*/
6366
6367 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6368 {
6369     flag aSign, bSign;
6370
6371     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6372         || (extractFloatx80Exp(a) == 0x7FFF
6373             && (uint64_t) (extractFloatx80Frac(a) << 1))
6374         || (extractFloatx80Exp(b) == 0x7FFF
6375             && (uint64_t) (extractFloatx80Frac(b) << 1))
6376        ) {
6377         float_raise(float_flag_invalid, status);
6378         return 0;
6379     }
6380     aSign = extractFloatx80Sign( a );
6381     bSign = extractFloatx80Sign( b );
6382     if ( aSign != bSign ) {
6383         return
6384                aSign
6385             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6386                  != 0 );
6387     }
6388     return
6389           aSign ? lt128( b.high, b.low, a.high, a.low )
6390         : lt128( a.high, a.low, b.high, b.low );
6391
6392 }
6393
6394 /*----------------------------------------------------------------------------
6395 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6396 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6397 | either operand is a NaN.   The comparison is performed according to the
6398 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6399 *----------------------------------------------------------------------------*/
6400 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6401 {
6402     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6403         || (extractFloatx80Exp(a) == 0x7FFF
6404             && (uint64_t) (extractFloatx80Frac(a) << 1))
6405         || (extractFloatx80Exp(b) == 0x7FFF
6406             && (uint64_t) (extractFloatx80Frac(b) << 1))
6407        ) {
6408         float_raise(float_flag_invalid, status);
6409         return 1;
6410     }
6411     return 0;
6412 }
6413
6414 /*----------------------------------------------------------------------------
6415 | Returns 1 if the extended double-precision floating-point value `a' is
6416 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6417 | cause an exception.  The comparison is performed according to the IEC/IEEE
6418 | Standard for Binary Floating-Point Arithmetic.
6419 *----------------------------------------------------------------------------*/
6420
6421 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6422 {
6423
6424     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6425         float_raise(float_flag_invalid, status);
6426         return 0;
6427     }
6428     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6429               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6430          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6431               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6432        ) {
6433         if (floatx80_is_signaling_nan(a, status)
6434          || floatx80_is_signaling_nan(b, status)) {
6435             float_raise(float_flag_invalid, status);
6436         }
6437         return 0;
6438     }
6439     return
6440            ( a.low == b.low )
6441         && (    ( a.high == b.high )
6442              || (    ( a.low == 0 )
6443                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6444            );
6445
6446 }
6447
6448 /*----------------------------------------------------------------------------
6449 | Returns 1 if the extended double-precision floating-point value `a' is less
6450 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6451 | do not cause an exception.  Otherwise, the comparison is performed according
6452 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6453 *----------------------------------------------------------------------------*/
6454
6455 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6456 {
6457     flag aSign, bSign;
6458
6459     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6460         float_raise(float_flag_invalid, status);
6461         return 0;
6462     }
6463     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6464               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6465          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6466               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6467        ) {
6468         if (floatx80_is_signaling_nan(a, status)
6469          || floatx80_is_signaling_nan(b, status)) {
6470             float_raise(float_flag_invalid, status);
6471         }
6472         return 0;
6473     }
6474     aSign = extractFloatx80Sign( a );
6475     bSign = extractFloatx80Sign( b );
6476     if ( aSign != bSign ) {
6477         return
6478                aSign
6479             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6480                  == 0 );
6481     }
6482     return
6483           aSign ? le128( b.high, b.low, a.high, a.low )
6484         : le128( a.high, a.low, b.high, b.low );
6485
6486 }
6487
6488 /*----------------------------------------------------------------------------
6489 | Returns 1 if the extended double-precision floating-point value `a' is less
6490 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6491 | an exception.  Otherwise, the comparison is performed according to the
6492 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6493 *----------------------------------------------------------------------------*/
6494
6495 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6496 {
6497     flag aSign, bSign;
6498
6499     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6500         float_raise(float_flag_invalid, status);
6501         return 0;
6502     }
6503     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6504               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6505          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6506               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6507        ) {
6508         if (floatx80_is_signaling_nan(a, status)
6509          || floatx80_is_signaling_nan(b, status)) {
6510             float_raise(float_flag_invalid, status);
6511         }
6512         return 0;
6513     }
6514     aSign = extractFloatx80Sign( a );
6515     bSign = extractFloatx80Sign( b );
6516     if ( aSign != bSign ) {
6517         return
6518                aSign
6519             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6520                  != 0 );
6521     }
6522     return
6523           aSign ? lt128( b.high, b.low, a.high, a.low )
6524         : lt128( a.high, a.low, b.high, b.low );
6525
6526 }
6527
6528 /*----------------------------------------------------------------------------
6529 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6530 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6531 | The comparison is performed according to the IEC/IEEE Standard for Binary
6532 | Floating-Point Arithmetic.
6533 *----------------------------------------------------------------------------*/
6534 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6535 {
6536     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6537         float_raise(float_flag_invalid, status);
6538         return 1;
6539     }
6540     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6541               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6542          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6543               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6544        ) {
6545         if (floatx80_is_signaling_nan(a, status)
6546          || floatx80_is_signaling_nan(b, status)) {
6547             float_raise(float_flag_invalid, status);
6548         }
6549         return 1;
6550     }
6551     return 0;
6552 }
6553
6554 /*----------------------------------------------------------------------------
6555 | Returns the result of converting the quadruple-precision floating-point
6556 | value `a' to the 32-bit two's complement integer format.  The conversion
6557 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6558 | Arithmetic---which means in particular that the conversion is rounded
6559 | according to the current rounding mode.  If `a' is a NaN, the largest
6560 | positive integer is returned.  Otherwise, if the conversion overflows, the
6561 | largest integer with the same sign as `a' is returned.
6562 *----------------------------------------------------------------------------*/
6563
6564 int32_t float128_to_int32(float128 a, float_status *status)
6565 {
6566     flag aSign;
6567     int32_t aExp, shiftCount;
6568     uint64_t aSig0, aSig1;
6569
6570     aSig1 = extractFloat128Frac1( a );
6571     aSig0 = extractFloat128Frac0( a );
6572     aExp = extractFloat128Exp( a );
6573     aSign = extractFloat128Sign( a );
6574     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6575     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6576     aSig0 |= ( aSig1 != 0 );
6577     shiftCount = 0x4028 - aExp;
6578     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6579     return roundAndPackInt32(aSign, aSig0, status);
6580
6581 }
6582
6583 /*----------------------------------------------------------------------------
6584 | Returns the result of converting the quadruple-precision floating-point
6585 | value `a' to the 32-bit two's complement integer format.  The conversion
6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6587 | Arithmetic, except that the conversion is always rounded toward zero.  If
6588 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6589 | conversion overflows, the largest integer with the same sign as `a' is
6590 | returned.
6591 *----------------------------------------------------------------------------*/
6592
6593 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6594 {
6595     flag aSign;
6596     int32_t aExp, shiftCount;
6597     uint64_t aSig0, aSig1, savedASig;
6598     int32_t z;
6599
6600     aSig1 = extractFloat128Frac1( a );
6601     aSig0 = extractFloat128Frac0( a );
6602     aExp = extractFloat128Exp( a );
6603     aSign = extractFloat128Sign( a );
6604     aSig0 |= ( aSig1 != 0 );
6605     if ( 0x401E < aExp ) {
6606         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6607         goto invalid;
6608     }
6609     else if ( aExp < 0x3FFF ) {
6610         if (aExp || aSig0) {
6611             status->float_exception_flags |= float_flag_inexact;
6612         }
6613         return 0;
6614     }
6615     aSig0 |= UINT64_C(0x0001000000000000);
6616     shiftCount = 0x402F - aExp;
6617     savedASig = aSig0;
6618     aSig0 >>= shiftCount;
6619     z = aSig0;
6620     if ( aSign ) z = - z;
6621     if ( ( z < 0 ) ^ aSign ) {
6622  invalid:
6623         float_raise(float_flag_invalid, status);
6624         return aSign ? INT32_MIN : INT32_MAX;
6625     }
6626     if ( ( aSig0<<shiftCount ) != savedASig ) {
6627         status->float_exception_flags |= float_flag_inexact;
6628     }
6629     return z;
6630
6631 }
6632
6633 /*----------------------------------------------------------------------------
6634 | Returns the result of converting the quadruple-precision floating-point
6635 | value `a' to the 64-bit two's complement integer format.  The conversion
6636 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6637 | Arithmetic---which means in particular that the conversion is rounded
6638 | according to the current rounding mode.  If `a' is a NaN, the largest
6639 | positive integer is returned.  Otherwise, if the conversion overflows, the
6640 | largest integer with the same sign as `a' is returned.
6641 *----------------------------------------------------------------------------*/
6642
6643 int64_t float128_to_int64(float128 a, float_status *status)
6644 {
6645     flag aSign;
6646     int32_t aExp, shiftCount;
6647     uint64_t aSig0, aSig1;
6648
6649     aSig1 = extractFloat128Frac1( a );
6650     aSig0 = extractFloat128Frac0( a );
6651     aExp = extractFloat128Exp( a );
6652     aSign = extractFloat128Sign( a );
6653     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6654     shiftCount = 0x402F - aExp;
6655     if ( shiftCount <= 0 ) {
6656         if ( 0x403E < aExp ) {
6657             float_raise(float_flag_invalid, status);
6658             if (    ! aSign
6659                  || (    ( aExp == 0x7FFF )
6660                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6661                     )
6662                ) {
6663                 return INT64_MAX;
6664             }
6665             return INT64_MIN;
6666         }
6667         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6668     }
6669     else {
6670         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6671     }
6672     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6673
6674 }
6675
6676 /*----------------------------------------------------------------------------
6677 | Returns the result of converting the quadruple-precision floating-point
6678 | value `a' to the 64-bit two's complement integer format.  The conversion
6679 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6680 | Arithmetic, except that the conversion is always rounded toward zero.
6681 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6682 | the conversion overflows, the largest integer with the same sign as `a' is
6683 | returned.
6684 *----------------------------------------------------------------------------*/
6685
6686 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6687 {
6688     flag aSign;
6689     int32_t aExp, shiftCount;
6690     uint64_t aSig0, aSig1;
6691     int64_t z;
6692
6693     aSig1 = extractFloat128Frac1( a );
6694     aSig0 = extractFloat128Frac0( a );
6695     aExp = extractFloat128Exp( a );
6696     aSign = extractFloat128Sign( a );
6697     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6698     shiftCount = aExp - 0x402F;
6699     if ( 0 < shiftCount ) {
6700         if ( 0x403E <= aExp ) {
6701             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6702             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6703                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6704                 if (aSig1) {
6705                     status->float_exception_flags |= float_flag_inexact;
6706                 }
6707             }
6708             else {
6709                 float_raise(float_flag_invalid, status);
6710                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6711                     return INT64_MAX;
6712                 }
6713             }
6714             return INT64_MIN;
6715         }
6716         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6717         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6718             status->float_exception_flags |= float_flag_inexact;
6719         }
6720     }
6721     else {
6722         if ( aExp < 0x3FFF ) {
6723             if ( aExp | aSig0 | aSig1 ) {
6724                 status->float_exception_flags |= float_flag_inexact;
6725             }
6726             return 0;
6727         }
6728         z = aSig0>>( - shiftCount );
6729         if (    aSig1
6730              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6731             status->float_exception_flags |= float_flag_inexact;
6732         }
6733     }
6734     if ( aSign ) z = - z;
6735     return z;
6736
6737 }
6738
6739 /*----------------------------------------------------------------------------
6740 | Returns the result of converting the quadruple-precision floating-point value
6741 | `a' to the 64-bit unsigned integer format.  The conversion is
6742 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6743 | Arithmetic---which means in particular that the conversion is rounded
6744 | according to the current rounding mode.  If `a' is a NaN, the largest
6745 | positive integer is returned.  If the conversion overflows, the
6746 | largest unsigned integer is returned.  If 'a' is negative, the value is
6747 | rounded and zero is returned; negative values that do not round to zero
6748 | will raise the inexact exception.
6749 *----------------------------------------------------------------------------*/
6750
6751 uint64_t float128_to_uint64(float128 a, float_status *status)
6752 {
6753     flag aSign;
6754     int aExp;
6755     int shiftCount;
6756     uint64_t aSig0, aSig1;
6757
6758     aSig0 = extractFloat128Frac0(a);
6759     aSig1 = extractFloat128Frac1(a);
6760     aExp = extractFloat128Exp(a);
6761     aSign = extractFloat128Sign(a);
6762     if (aSign && (aExp > 0x3FFE)) {
6763         float_raise(float_flag_invalid, status);
6764         if (float128_is_any_nan(a)) {
6765             return UINT64_MAX;
6766         } else {
6767             return 0;
6768         }
6769     }
6770     if (aExp) {
6771         aSig0 |= UINT64_C(0x0001000000000000);
6772     }
6773     shiftCount = 0x402F - aExp;
6774     if (shiftCount <= 0) {
6775         if (0x403E < aExp) {
6776             float_raise(float_flag_invalid, status);
6777             return UINT64_MAX;
6778         }
6779         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6780     } else {
6781         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6782     }
6783     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6784 }
6785
6786 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6787 {
6788     uint64_t v;
6789     signed char current_rounding_mode = status->float_rounding_mode;
6790
6791     set_float_rounding_mode(float_round_to_zero, status);
6792     v = float128_to_uint64(a, status);
6793     set_float_rounding_mode(current_rounding_mode, status);
6794
6795     return v;
6796 }
6797
6798 /*----------------------------------------------------------------------------
6799 | Returns the result of converting the quadruple-precision floating-point
6800 | value `a' to the 32-bit unsigned integer format.  The conversion
6801 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6802 | Arithmetic except that the conversion is always rounded toward zero.
6803 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6804 | if the conversion overflows, the largest unsigned integer is returned.
6805 | If 'a' is negative, the value is rounded and zero is returned; negative
6806 | values that do not round to zero will raise the inexact exception.
6807 *----------------------------------------------------------------------------*/
6808
6809 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6810 {
6811     uint64_t v;
6812     uint32_t res;
6813     int old_exc_flags = get_float_exception_flags(status);
6814
6815     v = float128_to_uint64_round_to_zero(a, status);
6816     if (v > 0xffffffff) {
6817         res = 0xffffffff;
6818     } else {
6819         return v;
6820     }
6821     set_float_exception_flags(old_exc_flags, status);
6822     float_raise(float_flag_invalid, status);
6823     return res;
6824 }
6825
6826 /*----------------------------------------------------------------------------
6827 | Returns the result of converting the quadruple-precision floating-point value
6828 | `a' to the 32-bit unsigned integer format.  The conversion is
6829 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6830 | Arithmetic---which means in particular that the conversion is rounded
6831 | according to the current rounding mode.  If `a' is a NaN, the largest
6832 | positive integer is returned.  If the conversion overflows, the
6833 | largest unsigned integer is returned.  If 'a' is negative, the value is
6834 | rounded and zero is returned; negative values that do not round to zero
6835 | will raise the inexact exception.
6836 *----------------------------------------------------------------------------*/
6837
6838 uint32_t float128_to_uint32(float128 a, float_status *status)
6839 {
6840     uint64_t v;
6841     uint32_t res;
6842     int old_exc_flags = get_float_exception_flags(status);
6843
6844     v = float128_to_uint64(a, status);
6845     if (v > 0xffffffff) {
6846         res = 0xffffffff;
6847     } else {
6848         return v;
6849     }
6850     set_float_exception_flags(old_exc_flags, status);
6851     float_raise(float_flag_invalid, status);
6852     return res;
6853 }
6854
6855 /*----------------------------------------------------------------------------
6856 | Returns the result of converting the quadruple-precision floating-point
6857 | value `a' to the single-precision floating-point format.  The conversion
6858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6859 | Arithmetic.
6860 *----------------------------------------------------------------------------*/
6861
6862 float32 float128_to_float32(float128 a, float_status *status)
6863 {
6864     flag aSign;
6865     int32_t aExp;
6866     uint64_t aSig0, aSig1;
6867     uint32_t zSig;
6868
6869     aSig1 = extractFloat128Frac1( a );
6870     aSig0 = extractFloat128Frac0( a );
6871     aExp = extractFloat128Exp( a );
6872     aSign = extractFloat128Sign( a );
6873     if ( aExp == 0x7FFF ) {
6874         if ( aSig0 | aSig1 ) {
6875             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6876         }
6877         return packFloat32( aSign, 0xFF, 0 );
6878     }
6879     aSig0 |= ( aSig1 != 0 );
6880     shift64RightJamming( aSig0, 18, &aSig0 );
6881     zSig = aSig0;
6882     if ( aExp || zSig ) {
6883         zSig |= 0x40000000;
6884         aExp -= 0x3F81;
6885     }
6886     return roundAndPackFloat32(aSign, aExp, zSig, status);
6887
6888 }
6889
6890 /*----------------------------------------------------------------------------
6891 | Returns the result of converting the quadruple-precision floating-point
6892 | value `a' to the double-precision floating-point format.  The conversion
6893 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6894 | Arithmetic.
6895 *----------------------------------------------------------------------------*/
6896
6897 float64 float128_to_float64(float128 a, float_status *status)
6898 {
6899     flag aSign;
6900     int32_t aExp;
6901     uint64_t aSig0, aSig1;
6902
6903     aSig1 = extractFloat128Frac1( a );
6904     aSig0 = extractFloat128Frac0( a );
6905     aExp = extractFloat128Exp( a );
6906     aSign = extractFloat128Sign( a );
6907     if ( aExp == 0x7FFF ) {
6908         if ( aSig0 | aSig1 ) {
6909             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6910         }
6911         return packFloat64( aSign, 0x7FF, 0 );
6912     }
6913     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6914     aSig0 |= ( aSig1 != 0 );
6915     if ( aExp || aSig0 ) {
6916         aSig0 |= UINT64_C(0x4000000000000000);
6917         aExp -= 0x3C01;
6918     }
6919     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6920
6921 }
6922
6923 /*----------------------------------------------------------------------------
6924 | Returns the result of converting the quadruple-precision floating-point
6925 | value `a' to the extended double-precision floating-point format.  The
6926 | conversion is performed according to the IEC/IEEE Standard for Binary
6927 | Floating-Point Arithmetic.
6928 *----------------------------------------------------------------------------*/
6929
6930 floatx80 float128_to_floatx80(float128 a, float_status *status)
6931 {
6932     flag aSign;
6933     int32_t aExp;
6934     uint64_t aSig0, aSig1;
6935
6936     aSig1 = extractFloat128Frac1( a );
6937     aSig0 = extractFloat128Frac0( a );
6938     aExp = extractFloat128Exp( a );
6939     aSign = extractFloat128Sign( a );
6940     if ( aExp == 0x7FFF ) {
6941         if ( aSig0 | aSig1 ) {
6942             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6943         }
6944         return packFloatx80(aSign, floatx80_infinity_high,
6945                                    floatx80_infinity_low);
6946     }
6947     if ( aExp == 0 ) {
6948         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6949         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6950     }
6951     else {
6952         aSig0 |= UINT64_C(0x0001000000000000);
6953     }
6954     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6955     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6956
6957 }
6958
6959 /*----------------------------------------------------------------------------
6960 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6961 | returns the result as a quadruple-precision floating-point value.  The
6962 | operation is performed according to the IEC/IEEE Standard for Binary
6963 | Floating-Point Arithmetic.
6964 *----------------------------------------------------------------------------*/
6965
6966 float128 float128_round_to_int(float128 a, float_status *status)
6967 {
6968     flag aSign;
6969     int32_t aExp;
6970     uint64_t lastBitMask, roundBitsMask;
6971     float128 z;
6972
6973     aExp = extractFloat128Exp( a );
6974     if ( 0x402F <= aExp ) {
6975         if ( 0x406F <= aExp ) {
6976             if (    ( aExp == 0x7FFF )
6977                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6978                ) {
6979                 return propagateFloat128NaN(a, a, status);
6980             }
6981             return a;
6982         }
6983         lastBitMask = 1;
6984         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6985         roundBitsMask = lastBitMask - 1;
6986         z = a;
6987         switch (status->float_rounding_mode) {
6988         case float_round_nearest_even:
6989             if ( lastBitMask ) {
6990                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6991                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6992             }
6993             else {
6994                 if ( (int64_t) z.low < 0 ) {
6995                     ++z.high;
6996                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6997                 }
6998             }
6999             break;
7000         case float_round_ties_away:
7001             if (lastBitMask) {
7002                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7003             } else {
7004                 if ((int64_t) z.low < 0) {
7005                     ++z.high;
7006                 }
7007             }
7008             break;
7009         case float_round_to_zero:
7010             break;
7011         case float_round_up:
7012             if (!extractFloat128Sign(z)) {
7013                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7014             }
7015             break;
7016         case float_round_down:
7017             if (extractFloat128Sign(z)) {
7018                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7019             }
7020             break;
7021         case float_round_to_odd:
7022             /*
7023              * Note that if lastBitMask == 0, the last bit is the lsb
7024              * of high, and roundBitsMask == -1.
7025              */
7026             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7027                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7028             }
7029             break;
7030         default:
7031             abort();
7032         }
7033         z.low &= ~ roundBitsMask;
7034     }
7035     else {
7036         if ( aExp < 0x3FFF ) {
7037             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7038             status->float_exception_flags |= float_flag_inexact;
7039             aSign = extractFloat128Sign( a );
7040             switch (status->float_rounding_mode) {
7041             case float_round_nearest_even:
7042                 if (    ( aExp == 0x3FFE )
7043                      && (   extractFloat128Frac0( a )
7044                           | extractFloat128Frac1( a ) )
7045                    ) {
7046                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7047                 }
7048                 break;
7049             case float_round_ties_away:
7050                 if (aExp == 0x3FFE) {
7051                     return packFloat128(aSign, 0x3FFF, 0, 0);
7052                 }
7053                 break;
7054             case float_round_down:
7055                 return
7056                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7057                     : packFloat128( 0, 0, 0, 0 );
7058             case float_round_up:
7059                 return
7060                       aSign ? packFloat128( 1, 0, 0, 0 )
7061                     : packFloat128( 0, 0x3FFF, 0, 0 );
7062
7063             case float_round_to_odd:
7064                 return packFloat128(aSign, 0x3FFF, 0, 0);
7065             }
7066             return packFloat128( aSign, 0, 0, 0 );
7067         }
7068         lastBitMask = 1;
7069         lastBitMask <<= 0x402F - aExp;
7070         roundBitsMask = lastBitMask - 1;
7071         z.low = 0;
7072         z.high = a.high;
7073         switch (status->float_rounding_mode) {
7074         case float_round_nearest_even:
7075             z.high += lastBitMask>>1;
7076             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7077                 z.high &= ~ lastBitMask;
7078             }
7079             break;
7080         case float_round_ties_away:
7081             z.high += lastBitMask>>1;
7082             break;
7083         case float_round_to_zero:
7084             break;
7085         case float_round_up:
7086             if (!extractFloat128Sign(z)) {
7087                 z.high |= ( a.low != 0 );
7088                 z.high += roundBitsMask;
7089             }
7090             break;
7091         case float_round_down:
7092             if (extractFloat128Sign(z)) {
7093                 z.high |= (a.low != 0);
7094                 z.high += roundBitsMask;
7095             }
7096             break;
7097         case float_round_to_odd:
7098             if ((z.high & lastBitMask) == 0) {
7099                 z.high |= (a.low != 0);
7100                 z.high += roundBitsMask;
7101             }
7102             break;
7103         default:
7104             abort();
7105         }
7106         z.high &= ~ roundBitsMask;
7107     }
7108     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7109         status->float_exception_flags |= float_flag_inexact;
7110     }
7111     return z;
7112
7113 }
7114
7115 /*----------------------------------------------------------------------------
7116 | Returns the result of adding the absolute values of the quadruple-precision
7117 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7118 | before being returned.  `zSign' is ignored if the result is a NaN.
7119 | The addition is performed according to the IEC/IEEE Standard for Binary
7120 | Floating-Point Arithmetic.
7121 *----------------------------------------------------------------------------*/
7122
7123 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7124                                 float_status *status)
7125 {
7126     int32_t aExp, bExp, zExp;
7127     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7128     int32_t expDiff;
7129
7130     aSig1 = extractFloat128Frac1( a );
7131     aSig0 = extractFloat128Frac0( a );
7132     aExp = extractFloat128Exp( a );
7133     bSig1 = extractFloat128Frac1( b );
7134     bSig0 = extractFloat128Frac0( b );
7135     bExp = extractFloat128Exp( b );
7136     expDiff = aExp - bExp;
7137     if ( 0 < expDiff ) {
7138         if ( aExp == 0x7FFF ) {
7139             if (aSig0 | aSig1) {
7140                 return propagateFloat128NaN(a, b, status);
7141             }
7142             return a;
7143         }
7144         if ( bExp == 0 ) {
7145             --expDiff;
7146         }
7147         else {
7148             bSig0 |= UINT64_C(0x0001000000000000);
7149         }
7150         shift128ExtraRightJamming(
7151             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7152         zExp = aExp;
7153     }
7154     else if ( expDiff < 0 ) {
7155         if ( bExp == 0x7FFF ) {
7156             if (bSig0 | bSig1) {
7157                 return propagateFloat128NaN(a, b, status);
7158             }
7159             return packFloat128( zSign, 0x7FFF, 0, 0 );
7160         }
7161         if ( aExp == 0 ) {
7162             ++expDiff;
7163         }
7164         else {
7165             aSig0 |= UINT64_C(0x0001000000000000);
7166         }
7167         shift128ExtraRightJamming(
7168             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7169         zExp = bExp;
7170     }
7171     else {
7172         if ( aExp == 0x7FFF ) {
7173             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7174                 return propagateFloat128NaN(a, b, status);
7175             }
7176             return a;
7177         }
7178         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7179         if ( aExp == 0 ) {
7180             if (status->flush_to_zero) {
7181                 if (zSig0 | zSig1) {
7182                     float_raise(float_flag_output_denormal, status);
7183                 }
7184                 return packFloat128(zSign, 0, 0, 0);
7185             }
7186             return packFloat128( zSign, 0, zSig0, zSig1 );
7187         }
7188         zSig2 = 0;
7189         zSig0 |= UINT64_C(0x0002000000000000);
7190         zExp = aExp;
7191         goto shiftRight1;
7192     }
7193     aSig0 |= UINT64_C(0x0001000000000000);
7194     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7195     --zExp;
7196     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7197     ++zExp;
7198  shiftRight1:
7199     shift128ExtraRightJamming(
7200         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7201  roundAndPack:
7202     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7203
7204 }
7205
7206 /*----------------------------------------------------------------------------
7207 | Returns the result of subtracting the absolute values of the quadruple-
7208 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7209 | difference is negated before being returned.  `zSign' is ignored if the
7210 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7211 | Standard for Binary Floating-Point Arithmetic.
7212 *----------------------------------------------------------------------------*/
7213
7214 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7215                                 float_status *status)
7216 {
7217     int32_t aExp, bExp, zExp;
7218     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7219     int32_t expDiff;
7220
7221     aSig1 = extractFloat128Frac1( a );
7222     aSig0 = extractFloat128Frac0( a );
7223     aExp = extractFloat128Exp( a );
7224     bSig1 = extractFloat128Frac1( b );
7225     bSig0 = extractFloat128Frac0( b );
7226     bExp = extractFloat128Exp( b );
7227     expDiff = aExp - bExp;
7228     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7229     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7230     if ( 0 < expDiff ) goto aExpBigger;
7231     if ( expDiff < 0 ) goto bExpBigger;
7232     if ( aExp == 0x7FFF ) {
7233         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7234             return propagateFloat128NaN(a, b, status);
7235         }
7236         float_raise(float_flag_invalid, status);
7237         return float128_default_nan(status);
7238     }
7239     if ( aExp == 0 ) {
7240         aExp = 1;
7241         bExp = 1;
7242     }
7243     if ( bSig0 < aSig0 ) goto aBigger;
7244     if ( aSig0 < bSig0 ) goto bBigger;
7245     if ( bSig1 < aSig1 ) goto aBigger;
7246     if ( aSig1 < bSig1 ) goto bBigger;
7247     return packFloat128(status->float_rounding_mode == float_round_down,
7248                         0, 0, 0);
7249  bExpBigger:
7250     if ( bExp == 0x7FFF ) {
7251         if (bSig0 | bSig1) {
7252             return propagateFloat128NaN(a, b, status);
7253         }
7254         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7255     }
7256     if ( aExp == 0 ) {
7257         ++expDiff;
7258     }
7259     else {
7260         aSig0 |= UINT64_C(0x4000000000000000);
7261     }
7262     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7263     bSig0 |= UINT64_C(0x4000000000000000);
7264  bBigger:
7265     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7266     zExp = bExp;
7267     zSign ^= 1;
7268     goto normalizeRoundAndPack;
7269  aExpBigger:
7270     if ( aExp == 0x7FFF ) {
7271         if (aSig0 | aSig1) {
7272             return propagateFloat128NaN(a, b, status);
7273         }
7274         return a;
7275     }
7276     if ( bExp == 0 ) {
7277         --expDiff;
7278     }
7279     else {
7280         bSig0 |= UINT64_C(0x4000000000000000);
7281     }
7282     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7283     aSig0 |= UINT64_C(0x4000000000000000);
7284  aBigger:
7285     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7286     zExp = aExp;
7287  normalizeRoundAndPack:
7288     --zExp;
7289     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7290                                          status);
7291
7292 }
7293
7294 /*----------------------------------------------------------------------------
7295 | Returns the result of adding the quadruple-precision floating-point values
7296 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7297 | for Binary Floating-Point Arithmetic.
7298 *----------------------------------------------------------------------------*/
7299
7300 float128 float128_add(float128 a, float128 b, float_status *status)
7301 {
7302     flag aSign, bSign;
7303
7304     aSign = extractFloat128Sign( a );
7305     bSign = extractFloat128Sign( b );
7306     if ( aSign == bSign ) {
7307         return addFloat128Sigs(a, b, aSign, status);
7308     }
7309     else {
7310         return subFloat128Sigs(a, b, aSign, status);
7311     }
7312
7313 }
7314
7315 /*----------------------------------------------------------------------------
7316 | Returns the result of subtracting the quadruple-precision floating-point
7317 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7318 | Standard for Binary Floating-Point Arithmetic.
7319 *----------------------------------------------------------------------------*/
7320
7321 float128 float128_sub(float128 a, float128 b, float_status *status)
7322 {
7323     flag aSign, bSign;
7324
7325     aSign = extractFloat128Sign( a );
7326     bSign = extractFloat128Sign( b );
7327     if ( aSign == bSign ) {
7328         return subFloat128Sigs(a, b, aSign, status);
7329     }
7330     else {
7331         return addFloat128Sigs(a, b, aSign, status);
7332     }
7333
7334 }
7335
7336 /*----------------------------------------------------------------------------
7337 | Returns the result of multiplying the quadruple-precision floating-point
7338 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7339 | Standard for Binary Floating-Point Arithmetic.
7340 *----------------------------------------------------------------------------*/
7341
float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    /* Unpack both operands into sign, exponent field and two fraction words. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The sign of the product is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    /* Exponent field 0x7FFF: `a' is an infinity (zero fraction) or a NaN. */
    if ( aExp == 0x7FFF ) {
        /* Either operand being a NaN takes the NaN-propagation path. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity * zero is an invalid operation. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        /* infinity * anything else: infinity carrying the product sign. */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* `a' is finite here; now handle `b' being an infinity or a NaN. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* zero * infinity: shares the `invalid' exit with the case above. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /*
     * Exponent field 0: a zero operand yields a signed zero immediately;
     * a subnormal operand is normalized so the common path below applies.
     */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /*
     * Sum of the exponent fields less 0x4000 (presumably the exponent bias
     * plus one, matching the significand alignment chosen below; the
     * conditional ++zExp further down compensates when the product is
     * one bit wider).
     */
    zExp = aExp + bExp - 0x4000;
    /* Make the implicit leading one of `a' explicit (bit 48 of aSig0). */
    aSig0 |= UINT64_C(0x0001000000000000);
    /*
     * `b' is shifted left by 16 WITHOUT its implicit leading one.  After
     * the shift that bit would lie just above the 128-bit value, so its
     * share of the product is `a' itself added to the upper 128 bits of
     * the 256-bit result -- which is what the add128() below does.
     */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the last product word into zSig2 as a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    /*
     * If the product significand reached bit 49 of zSig0, shift it right
     * by one (keeping the shifted-out bit sticky) and bump the exponent.
     */
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    /* Rounding and packing (including any flags raised) are delegated. */
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7398
7399 /*----------------------------------------------------------------------------
7400 | Returns the result of dividing the quadruple-precision floating-point value
7401 | `a' by the corresponding value `b'.  The operation is performed according to
7402 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7403 *----------------------------------------------------------------------------*/
7404
float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack both operands into sign, exponent field and two fraction words. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The sign of the quotient is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    /* Dividend is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity is an invalid operation. */
            goto invalid;
        }
        /* infinity / finite: infinity carrying the quotient sign. */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* Dividend is finite; divisor is an infinity or a NaN. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / infinity: signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    /* Divisor is zero or subnormal. */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0 / 0 is invalid (label also reached from inf / inf). */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* non-zero / 0: raise divide-by-zero, return signed infinity. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /* Dividend is zero (signed zero result) or subnormal (normalize). */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Difference of the exponent fields, re-biased by the constant 0x3FFD. */
    zExp = aExp - bExp + 0x3FFD;
    /*
     * Make the implicit leading ones explicit and shift both significands
     * left by 15 so the leading one lands in bit 63 of the first word.
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /*
     * If the divisor significand does not exceed the dividend's, halve the
     * dividend and compensate in the exponent, so the dividend significand
     * is strictly smaller than the divisor's before the digit estimates.
     */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /*
     * First 64-bit quotient word: take an estimate, form the remainder
     * a - b * zSig0, and while that remainder is negative (estimate too
     * large) decrement the estimate and add the divisor back.
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second 64-bit quotient word, estimated from the running remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /*
     * The exact correction is only performed when the low 14 bits of the
     * estimate are <= 4 (presumably the only case in which the estimate's
     * error could change the rounding of the final result, whose low 15
     * bits are shifted out below).  In that case the same
     * multiply/subtract/add-back correction is applied and any non-zero
     * remainder is recorded as a sticky bit in zSig1.
     */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /*
     * Shift the 128-bit quotient right by 15; the bits shifted out are
     * collected in zSig2 for rounding.
     */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7485
7486 /*----------------------------------------------------------------------------
7487 | Returns the remainder of the quadruple-precision floating-point value `a'
7488 | with respect to the corresponding value `b'.  The operation is performed
7489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7490 *----------------------------------------------------------------------------*/
7491
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    /* Unpack the operands; the sign of `b' is never needed. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* The remainder of an infinity is an invalid operation. */
        goto invalid;
    }
    /* `a' is finite; `b' is an infinity or a NaN. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite rem infinity: `a' is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        /* Remainder with respect to zero is invalid. */
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned unchanged. */
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /*
     * When the exponent of `a' is at least two below that of `b',
     * |a| < |b| / 2, so `a' is already the remainder.
     */
    if ( expDiff < -1 ) return a;
    /*
     * Make the leading ones explicit and left-align both significands.
     * When expDiff == -1 the dividend is shifted one bit less, which
     * accounts for the one-position exponent difference.
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Leading quotient bit: subtract the divisor once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Main reduction loop.  Each pass takes a quotient estimate, lowers it
     * by 4 (clamped at 0; presumably so it never exceeds the true
     * quotient), and reduces the partial remainder accordingly while
     * consuming 61 bits of exponent difference.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /*
     * Final partial step for the remaining (fewer than 64) quotient bits.
     * Otherwise only rescale both significands by 12 bits so that they
     * match the bExp - 4 exponent used when packing the result.
     */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        /* Keep only the quotient bits that belong to this step. */
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Keep subtracting the divisor, counting in q, until the remainder
     * turns negative.  `alternateASig' holds the last non-negative value,
     * `aSig' the first negative one.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * Choose between the two candidates by the sign of their sum.
     * A negative sum means the non-negative candidate has the smaller
     * magnitude.  On an exact tie (sum zero) the non-negative candidate is
     * chosen when q is odd, i.e. when its quotient (q - 1) is even.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative chosen remainder is negated and flips the result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7592
7593 /*----------------------------------------------------------------------------
7594 | Returns the square root of the quadruple-precision floating-point value `a'.
7595 | The operation is performed according to the IEC/IEEE Standard for Binary
7596 | Floating-Point Arithmetic.
7597 *----------------------------------------------------------------------------*/
7598
float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack the operand into sign, exponent field and two fraction words. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaN: propagate.  +infinity: returned as is.  -infinity: invalid. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    /* Negative operands: -0 is returned unchanged, anything else is invalid. */
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    /* +0 yields +0; a positive subnormal is normalized first. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the exponent relative to 0x3FFF and re-bias with 0x3FFE. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    /* Make the implicit leading one explicit (bit 48 of aSig0). */
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial estimate from the exponent and the top significand bits. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Align the significand; an odd exponent field shifts one bit less. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    /* Refine the estimate into the upper 64-bit root word. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /*
     * Remainder a - zSig0^2.  While it is negative the root word is too
     * large: decrement it and add (2 * zSig0 + 1) back, since
     * (z + 1)^2 - z^2 = 2z + 1.
     */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64-bit root word, estimated as remainder / (2 * zSig0). */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * The exact correction is only performed when the low 13 bits of the
     * estimate are <= 5 (presumably the only case in which the estimate's
     * error could affect rounding, as the low 14 bits are shifted out
     * below).  Any non-zero final remainder is recorded as a sticky bit.
     */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            /* Add back 2 * (zSig0:zSig1) + 1 for the decremented root. */
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /*
     * Shift the 128-bit root right by 14, collecting the bits shifted out
     * in zSig2 for rounding.  The result sign is always 0.
     */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7660
7661 /*----------------------------------------------------------------------------
7662 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7663 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7664 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7665 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7666 *----------------------------------------------------------------------------*/
7667
7668 int float128_eq(float128 a, float128 b, float_status *status)
7669 {
7670
7671     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7672               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7673          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7674               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7675        ) {
7676         float_raise(float_flag_invalid, status);
7677         return 0;
7678     }
7679     return
7680            ( a.low == b.low )
7681         && (    ( a.high == b.high )
7682              || (    ( a.low == 0 )
7683                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7684            );
7685
7686 }
7687
7688 /*----------------------------------------------------------------------------
7689 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7690 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7691 | exception is raised if either operand is a NaN.  The comparison is performed
7692 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7693 *----------------------------------------------------------------------------*/
7694
7695 int float128_le(float128 a, float128 b, float_status *status)
7696 {
7697     flag aSign, bSign;
7698
7699     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7700               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7701          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7702               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7703        ) {
7704         float_raise(float_flag_invalid, status);
7705         return 0;
7706     }
7707     aSign = extractFloat128Sign( a );
7708     bSign = extractFloat128Sign( b );
7709     if ( aSign != bSign ) {
7710         return
7711                aSign
7712             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7713                  == 0 );
7714     }
7715     return
7716           aSign ? le128( b.high, b.low, a.high, a.low )
7717         : le128( a.high, a.low, b.high, b.low );
7718
7719 }
7720
7721 /*----------------------------------------------------------------------------
7722 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7723 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7724 | raised if either operand is a NaN.  The comparison is performed according
7725 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7726 *----------------------------------------------------------------------------*/
7727
7728 int float128_lt(float128 a, float128 b, float_status *status)
7729 {
7730     flag aSign, bSign;
7731
7732     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7733               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7734          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7735               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7736        ) {
7737         float_raise(float_flag_invalid, status);
7738         return 0;
7739     }
7740     aSign = extractFloat128Sign( a );
7741     bSign = extractFloat128Sign( b );
7742     if ( aSign != bSign ) {
7743         return
7744                aSign
7745             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7746                  != 0 );
7747     }
7748     return
7749           aSign ? lt128( b.high, b.low, a.high, a.low )
7750         : lt128( a.high, a.low, b.high, b.low );
7751
7752 }
7753
7754 /*----------------------------------------------------------------------------
7755 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7756 | be compared, and 0 otherwise.  The invalid exception is raised if either
7757 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7758 | Standard for Binary Floating-Point Arithmetic.
7759 *----------------------------------------------------------------------------*/
7760
7761 int float128_unordered(float128 a, float128 b, float_status *status)
7762 {
7763     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7764               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7765          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7766               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7767        ) {
7768         float_raise(float_flag_invalid, status);
7769         return 1;
7770     }
7771     return 0;
7772 }
7773
7774 /*----------------------------------------------------------------------------
7775 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7776 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7777 | exception.  The comparison is performed according to the IEC/IEEE Standard
7778 | for Binary Floating-Point Arithmetic.
7779 *----------------------------------------------------------------------------*/
7780
7781 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7782 {
7783
7784     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7785               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7786          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7787               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7788        ) {
7789         if (float128_is_signaling_nan(a, status)
7790          || float128_is_signaling_nan(b, status)) {
7791             float_raise(float_flag_invalid, status);
7792         }
7793         return 0;
7794     }
7795     return
7796            ( a.low == b.low )
7797         && (    ( a.high == b.high )
7798              || (    ( a.low == 0 )
7799                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7800            );
7801
7802 }
7803
7804 /*----------------------------------------------------------------------------
7805 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7806 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7807 | cause an exception.  Otherwise, the comparison is performed according to the
7808 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7809 *----------------------------------------------------------------------------*/
7810
7811 int float128_le_quiet(float128 a, float128 b, float_status *status)
7812 {
7813     flag aSign, bSign;
7814
7815     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7816               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7817          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7818               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7819        ) {
7820         if (float128_is_signaling_nan(a, status)
7821          || float128_is_signaling_nan(b, status)) {
7822             float_raise(float_flag_invalid, status);
7823         }
7824         return 0;
7825     }
7826     aSign = extractFloat128Sign( a );
7827     bSign = extractFloat128Sign( b );
7828     if ( aSign != bSign ) {
7829         return
7830                aSign
7831             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7832                  == 0 );
7833     }
7834     return
7835           aSign ? le128( b.high, b.low, a.high, a.low )
7836         : le128( a.high, a.low, b.high, b.low );
7837
7838 }
7839
7840 /*----------------------------------------------------------------------------
7841 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7842 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7843 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7844 | Standard for Binary Floating-Point Arithmetic.
7845 *----------------------------------------------------------------------------*/
7846
7847 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7848 {
7849     flag aSign, bSign;
7850
7851     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7852               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7853          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7854               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7855        ) {
7856         if (float128_is_signaling_nan(a, status)
7857          || float128_is_signaling_nan(b, status)) {
7858             float_raise(float_flag_invalid, status);
7859         }
7860         return 0;
7861     }
7862     aSign = extractFloat128Sign( a );
7863     bSign = extractFloat128Sign( b );
7864     if ( aSign != bSign ) {
7865         return
7866                aSign
7867             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7868                  != 0 );
7869     }
7870     return
7871           aSign ? lt128( b.high, b.low, a.high, a.low )
7872         : lt128( a.high, a.low, b.high, b.low );
7873
7874 }
7875
7876 /*----------------------------------------------------------------------------
7877 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7878 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7879 | comparison is performed according to the IEC/IEEE Standard for Binary
7880 | Floating-Point Arithmetic.
7881 *----------------------------------------------------------------------------*/
7882
7883 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7884 {
7885     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7886               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7887          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7888               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7889        ) {
7890         if (float128_is_signaling_nan(a, status)
7891          || float128_is_signaling_nan(b, status)) {
7892             float_raise(float_flag_invalid, status);
7893         }
7894         return 1;
7895     }
7896     return 0;
7897 }
7898
7899 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7900                                             int is_quiet, float_status *status)
7901 {
7902     flag aSign, bSign;
7903
7904     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7905         float_raise(float_flag_invalid, status);
7906         return float_relation_unordered;
7907     }
7908     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7909           ( extractFloatx80Frac( a )<<1 ) ) ||
7910         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7911           ( extractFloatx80Frac( b )<<1 ) )) {
7912         if (!is_quiet ||
7913             floatx80_is_signaling_nan(a, status) ||
7914             floatx80_is_signaling_nan(b, status)) {
7915             float_raise(float_flag_invalid, status);
7916         }
7917         return float_relation_unordered;
7918     }
7919     aSign = extractFloatx80Sign( a );
7920     bSign = extractFloatx80Sign( b );
7921     if ( aSign != bSign ) {
7922
7923         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7924              ( ( a.low | b.low ) == 0 ) ) {
7925             /* zero case */
7926             return float_relation_equal;
7927         } else {
7928             return 1 - (2 * aSign);
7929         }
7930     } else {
7931         if (a.low == b.low && a.high == b.high) {
7932             return float_relation_equal;
7933         } else {
7934             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7935         }
7936     }
7937 }
7938
7939 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7940 {
7941     return floatx80_compare_internal(a, b, 0, status);
7942 }
7943
7944 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7945 {
7946     return floatx80_compare_internal(a, b, 1, status);
7947 }
7948
7949 static inline int float128_compare_internal(float128 a, float128 b,
7950                                             int is_quiet, float_status *status)
7951 {
7952     flag aSign, bSign;
7953
7954     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7955           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7956         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7957           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7958         if (!is_quiet ||
7959             float128_is_signaling_nan(a, status) ||
7960             float128_is_signaling_nan(b, status)) {
7961             float_raise(float_flag_invalid, status);
7962         }
7963         return float_relation_unordered;
7964     }
7965     aSign = extractFloat128Sign( a );
7966     bSign = extractFloat128Sign( b );
7967     if ( aSign != bSign ) {
7968         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7969             /* zero case */
7970             return float_relation_equal;
7971         } else {
7972             return 1 - (2 * aSign);
7973         }
7974     } else {
7975         if (a.low == b.low && a.high == b.high) {
7976             return float_relation_equal;
7977         } else {
7978             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7979         }
7980     }
7981 }
7982
7983 int float128_compare(float128 a, float128 b, float_status *status)
7984 {
7985     return float128_compare_internal(a, b, 0, status);
7986 }
7987
7988 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7989 {
7990     return float128_compare_internal(a, b, 1, status);
7991 }
7992
7993 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7994 {
7995     flag aSign;
7996     int32_t aExp;
7997     uint64_t aSig;
7998
7999     if (floatx80_invalid_encoding(a)) {
8000         float_raise(float_flag_invalid, status);
8001         return floatx80_default_nan(status);
8002     }
8003     aSig = extractFloatx80Frac( a );
8004     aExp = extractFloatx80Exp( a );
8005     aSign = extractFloatx80Sign( a );
8006
8007     if ( aExp == 0x7FFF ) {
8008         if ( aSig<<1 ) {
8009             return propagateFloatx80NaN(a, a, status);
8010         }
8011         return a;
8012     }
8013
8014     if (aExp == 0) {
8015         if (aSig == 0) {
8016             return a;
8017         }
8018         aExp++;
8019     }
8020
8021     if (n > 0x10000) {
8022         n = 0x10000;
8023     } else if (n < -0x10000) {
8024         n = -0x10000;
8025     }
8026
8027     aExp += n;
8028     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8029                                          aSign, aExp, aSig, 0, status);
8030 }
8031
8032 float128 float128_scalbn(float128 a, int n, float_status *status)
8033 {
8034     flag aSign;
8035     int32_t aExp;
8036     uint64_t aSig0, aSig1;
8037
8038     aSig1 = extractFloat128Frac1( a );
8039     aSig0 = extractFloat128Frac0( a );
8040     aExp = extractFloat128Exp( a );
8041     aSign = extractFloat128Sign( a );
8042     if ( aExp == 0x7FFF ) {
8043         if ( aSig0 | aSig1 ) {
8044             return propagateFloat128NaN(a, a, status);
8045         }
8046         return a;
8047     }
8048     if (aExp != 0) {
8049         aSig0 |= UINT64_C(0x0001000000000000);
8050     } else if (aSig0 == 0 && aSig1 == 0) {
8051         return a;
8052     } else {
8053         aExp++;
8054     }
8055
8056     if (n > 0x10000) {
8057         n = 0x10000;
8058     } else if (n < -0x10000) {
8059         n = -0x10000;
8060     }
8061
8062     aExp += n - 1;
8063     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8064                                          , status);
8065
8066 }
8067
8068 static void __attribute__((constructor)) softfloat_init(void)
8069 {
8070     union_float64 ua, ub, uc, ur;
8071
8072     if (QEMU_NO_HARDFLOAT) {
8073         return;
8074     }
8075     /*
8076      * Test that the host's FMA is not obviously broken. For example,
8077      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8078      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8079      */
8080     ua.s = 0x0020000000000001ULL;
8081     ub.s = 0x3ca0000000000000ULL;
8082     uc.s = 0x0020000000000000ULL;
8083     ur.h = fma(ua.h, ub.h, uc.h);
8084     if (ur.s != 0x0020000000000001ULL) {
8085         force_soft_fma = true;
8086     }
8087 }