fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             s->float_exception_flags |= float_flag_input_denormal;      \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
 256 typedef union {
 257     float32 s;
 258     float h;
 259 } union_float32;
 260
 261 typedef union {
 262     float64 s;
 263     double h;
 264 } union_float64;
 265
 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271 typedef float   (*hard_f32_op2_fn)(float a, float b);
 272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 /* Note: @fast_test and @post can be NULL */
 343 static inline float32
 344 float32_gen2(float32 xa, float32 xb, float_status *s,
 345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 346              f32_check_fn pre, f32_check_fn post,
 347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
 348 {
 349     union_float32 ua, ub, ur;
 350
 351     ua.s = xa;
 352     ub.s = xb;
 353
 354     if (unlikely(!can_use_fpu(s))) {
 355         goto soft;
 356     }
 357
 358     float32_input_flush2(&ua.s, &ub.s, s);
 359     if (unlikely(!pre(ua, ub))) {
 360         goto soft;
 361     }
 362     if (fast_test && fast_test(ua, ub)) {
 363         return fast_op(ua.s, ub.s, s);
 364     }
 365
 366     ur.h = hard(ua.h, ub.h);
 367     if (unlikely(f32_is_inf(ur))) {
 368         s->float_exception_flags |= float_flag_overflow;
 369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
 370         if (post == NULL || post(ua, ub)) {
 371             goto soft;
 372         }
 373     }
 374     return ur.s;
 375
 376  soft:
 377     return soft(ua.s, ub.s, s);
 378 }
 379
 380 static inline float64
 381 float64_gen2(float64 xa, float64 xb, float_status *s,
 382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 383              f64_check_fn pre, f64_check_fn post,
 384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
 385 {
 386     union_float64 ua, ub, ur;
 387
 388     ua.s = xa;
 389     ub.s = xb;
 390
 391     if (unlikely(!can_use_fpu(s))) {
 392         goto soft;
 393     }
 394
 395     float64_input_flush2(&ua.s, &ub.s, s);
 396     if (unlikely(!pre(ua, ub))) {
 397         goto soft;
 398     }
 399     if (fast_test && fast_test(ua, ub)) {
 400         return fast_op(ua.s, ub.s, s);
 401     }
 402
 403     ur.h = hard(ua.h, ub.h);
 404     if (unlikely(f64_is_inf(ur))) {
 405         s->float_exception_flags |= float_flag_overflow;
 406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
 407         if (post == NULL || post(ua, ub)) {
 408             goto soft;
 409         }
 410     }
 411     return ur.s;
 412
 413  soft:
 414     return soft(ua.s, ub.s, s);
 415 }
 416
 417 /*----------------------------------------------------------------------------
 418 | Returns the fraction bits of the half-precision floating-point value `a'.
 419 *----------------------------------------------------------------------------*/
 420
 421 static inline uint32_t extractFloat16Frac(float16 a)
 422 {
 423     return float16_val(a) & 0x3ff;
 424 }
 425
 426 /*----------------------------------------------------------------------------
 427 | Returns the exponent bits of the half-precision floating-point value `a'.
 428 *----------------------------------------------------------------------------*/
 429
 430 static inline int extractFloat16Exp(float16 a)
 431 {
 432     return (float16_val(a) >> 10) & 0x1f;
 433 }
 434
 435 /*----------------------------------------------------------------------------
 436 | Returns the fraction bits of the single-precision floating-point value `a'.
 437 *----------------------------------------------------------------------------*/
 438
 439 static inline uint32_t extractFloat32Frac(float32 a)
 440 {
 441     return float32_val(a) & 0x007FFFFF;
 442 }
 443
 444 /*----------------------------------------------------------------------------
 445 | Returns the exponent bits of the single-precision floating-point value `a'.
 446 *----------------------------------------------------------------------------*/
 447
 448 static inline int extractFloat32Exp(float32 a)
 449 {
 450     return (float32_val(a) >> 23) & 0xFF;
 451 }
 452
 453 /*----------------------------------------------------------------------------
 454 | Returns the sign bit of the single-precision floating-point value `a'.
 455 *----------------------------------------------------------------------------*/
 456
 457 static inline flag extractFloat32Sign(float32 a)
 458 {
 459     return float32_val(a) >> 31;
 460 }
 461
 462 /*----------------------------------------------------------------------------
 463 | Returns the fraction bits of the double-precision floating-point value `a'.
 464 *----------------------------------------------------------------------------*/
 465
 466 static inline uint64_t extractFloat64Frac(float64 a)
 467 {
 468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 469 }
 470
 471 /*----------------------------------------------------------------------------
 472 | Returns the exponent bits of the double-precision floating-point value `a'.
 473 *----------------------------------------------------------------------------*/
 474
 475 static inline int extractFloat64Exp(float64 a)
 476 {
 477     return (float64_val(a) >> 52) & 0x7FF;
 478 }
 479
 480 /*----------------------------------------------------------------------------
 481 | Returns the sign bit of the double-precision floating-point value `a'.
 482 *----------------------------------------------------------------------------*/
 483
 484 static inline flag extractFloat64Sign(float64 a)
 485 {
 486     return float64_val(a) >> 63;
 487 }
 488
 489 /*
 490  * Classify a floating point number. Everything above float_class_qnan
 491  * is a NaN so cls >= float_class_qnan is any NaN.
 492  */
 493
 494 typedef enum __attribute__ ((__packed__)) {
 495     float_class_unclassified,
 496     float_class_zero,
 497     float_class_normal,
 498     float_class_inf,
 499     float_class_qnan,  /* all NaNs from here */
 500     float_class_snan,
 501 } FloatClass;
 502
 503 /* Simple helpers for checking if, or what kind of, NaN we have */
 504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 505 {
 506     return unlikely(c >= float_class_qnan);
 507 }
 508
 509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 510 {
 511     return c == float_class_snan;
 512 }
 513
 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 515 {
 516     return c == float_class_qnan;
 517 }
 518
 519 /*
 520  * Structure holding all of the decomposed parts of a float. The
 521  * exponent is unbiased and the fraction is normalized. All
 522  * calculations are done with a 64 bit fraction and then rounded as
 523  * appropriate for the final format.
 524  *
 525  * Thanks to the packed FloatClass a decent compiler should be able to
 526  * fit the whole structure into registers and avoid using the stack
 527  * for parameter passing.
 528  */
 529
 530 typedef struct {
 531     uint64_t frac;
 532     int32_t  exp;
 533     FloatClass cls;
 534     bool sign;
 535 } FloatParts;
 536
 537 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 538 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 539 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 540
 541 /* Structure holding all of the relevant parameters for a format.
 542  *   exp_size: the size of the exponent field
 543  *   exp_bias: the offset applied to the exponent field
 544  *   exp_max: the maximum normalised exponent
 545  *   frac_size: the size of the fraction field
 546  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 547  * The following are computed based the size of fraction
 548  *   frac_lsb: least significant bit of fraction
 549  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 550  *   round_mask/roundeven_mask: masks used for rounding
 551  * The following optional modifiers are available:
 552  *   arm_althp: handle ARM Alternative Half Precision
 553  */
 554 typedef struct {
 555     int exp_size;
 556     int exp_bias;
 557     int exp_max;
 558     int frac_size;
 559     int frac_shift;
 560     uint64_t frac_lsb;
 561     uint64_t frac_lsbm1;
 562     uint64_t round_mask;
 563     uint64_t roundeven_mask;
 564     bool arm_althp;
 565 } FloatFmt;
 566
 567 /* Expand fields based on the size of exponent and fraction */
 568 #define FLOAT_PARAMS(E, F)                                           \
 569     .exp_size       = E,                                             \
 570     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 571     .exp_max        = (1 << E) - 1,                                  \
 572     .frac_size      = F,                                             \
 573     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 574     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 575     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 576     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 577     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 578
 579 static const FloatFmt float16_params = {
 580     FLOAT_PARAMS(5, 10)
 581 };
 582
 583 static const FloatFmt float16_params_ahp = {
 584     FLOAT_PARAMS(5, 10),
 585     .arm_althp = true
 586 };
 587
 588 static const FloatFmt float32_params = {
 589     FLOAT_PARAMS(8, 23)
 590 };
 591
 592 static const FloatFmt float64_params = {
 593     FLOAT_PARAMS(11, 52)
 594 };
 595
 596 /* Unpack a float to parts, but do not canonicalize.  */
 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 598 {
 599     const int sign_pos = fmt.frac_size + fmt.exp_size;
 600
 601     return (FloatParts) {
 602         .cls = float_class_unclassified,
 603         .sign = extract64(raw, sign_pos, 1),
 604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 605         .frac = extract64(raw, 0, fmt.frac_size),
 606     };
 607 }
 608
 609 static inline FloatParts float16_unpack_raw(float16 f)
 610 {
 611     return unpack_raw(float16_params, f);
 612 }
 613
 614 static inline FloatParts float32_unpack_raw(float32 f)
 615 {
 616     return unpack_raw(float32_params, f);
 617 }
 618
 619 static inline FloatParts float64_unpack_raw(float64 f)
 620 {
 621     return unpack_raw(float64_params, f);
 622 }
 623
 624 /* Pack a float from parts, but do not canonicalize.  */
 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 626 {
 627     const int sign_pos = fmt.frac_size + fmt.exp_size;
 628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 629     return deposit64(ret, sign_pos, 1, p.sign);
 630 }
 631
 632 static inline float16 float16_pack_raw(FloatParts p)
 633 {
 634     return make_float16(pack_raw(float16_params, p));
 635 }
 636
 637 static inline float32 float32_pack_raw(FloatParts p)
 638 {
 639     return make_float32(pack_raw(float32_params, p));
 640 }
 641
 642 static inline float64 float64_pack_raw(FloatParts p)
 643 {
 644     return make_float64(pack_raw(float64_params, p));
 645 }
 646
 647 /*----------------------------------------------------------------------------
 648 | Functions and definitions to determine:  (1) whether tininess for underflow
 649 | is detected before or after rounding by default, (2) what (if anything)
 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 652 | are propagated from function inputs to output.  These details are target-
 653 | specific.
 654 *----------------------------------------------------------------------------*/
 655 #include "softfloat-specialize.h"
 656
 657 /* Canonicalize EXP and FRAC, setting CLS.  */
 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
 659                                   float_status *status)
 660 {
 661     if (part.exp == parm->exp_max && !parm->arm_althp) {
 662         if (part.frac == 0) {
 663             part.cls = float_class_inf;
 664         } else {
 665             part.frac <<= parm->frac_shift;
 666             part.cls = (parts_is_snan_frac(part.frac, status)
 667                         ? float_class_snan : float_class_qnan);
 668         }
 669     } else if (part.exp == 0) {
 670         if (likely(part.frac == 0)) {
 671             part.cls = float_class_zero;
 672         } else if (status->flush_inputs_to_zero) {
 673             float_raise(float_flag_input_denormal, status);
 674             part.cls = float_class_zero;
 675             part.frac = 0;
 676         } else {
 677             int shift = clz64(part.frac) - 1;
 678             part.cls = float_class_normal;
 679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 680             part.frac <<= shift;
 681         }
 682     } else {
 683         part.cls = float_class_normal;
 684         part.exp -= parm->exp_bias;
 685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 686     }
 687     return part;
 688 }
 689
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 *   p:    the canonical (decomposed) value to round
 *   s:    supplies the rounding mode, flush_to_zero and the tininess
 *         detection mode; exception flags are raised on it
 *   parm: description of the destination format
 *
 * Returns p with exp and frac rewritten for the destination format.
 * p.cls may change to float_class_zero or float_class_inf as a result of
 * rounding.  Exception flags are accumulated locally and raised once, via
 * float_raise(), just before returning.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, pick the increment added to the fraction
         * before the discarded bits are dropped (inc) and whether an
         * overflow yields the maximum normal (overflow_norm == true)
         * rather than infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* An exact tie with an even LSB gets no increment. */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            /* Increment only when the LSB is currently even. */
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* The biased exponent is positive: a normal result. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* A carry out of the fraction needs renormalizing. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Flags are replaced here, not ORed: only invalid is raised. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate to the largest finite value. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Would be denormal: flush the output to zero instead. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result.  It counts as tiny when tininess is
             * detected before rounding, when the biased exponent is
             * negative, or when adding the increment would not carry
             * into the overflow bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /*
             * NOTE(review): shift64RightJamming is defined elsewhere; it
             * presumably shifts right while keeping discarded bits sticky
             * in the LSB -- confirm against softfloat-macros.h.
             */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                /*
                 * Only these two modes derive inc from frac, which has
                 * just changed; the other modes keep their earlier inc.
                 */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have carried into the implicit bit. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            /* Underflow is only signalled for results both tiny and inexact. */
            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* The ARM Alt HP format has no infinity. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* The ARM Alt HP format has no NaN. */
        assert(!parm->arm_althp);
        exp = exp_max;
        /* The fraction is moved back down without any rounding. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
 840
 841 /* Explicit FloatFmt version */
 842 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 843                                             const FloatFmt *params)
 844 {
 845     return sf_canonicalize(float16_unpack_raw(f), params, s);
 846 }
 847
 848 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 849 {
 850     return float16a_unpack_canonical(f, s, &float16_params);
 851 }
 852
 853 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 854                                              const FloatFmt *params)
 855 {
 856     return float16_pack_raw(round_canonical(p, s, params));
 857 }
 858
 859 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 860 {
 861     return float16a_round_pack_canonical(p, s, &float16_params);
 862 }
 863
 864 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 865 {
 866     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
 867 }
 868
 869 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 870 {
 871     return float32_pack_raw(round_canonical(p, s, &float32_params));
 872 }
 873
 874 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 875 {
 876     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
 877 }
 878
 879 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 880 {
 881     return float64_pack_raw(round_canonical(p, s, &float64_params));
 882 }
 883
 884 static FloatParts return_nan(FloatParts a, float_status *s)
 885 {
 886     switch (a.cls) {
 887     case float_class_snan:
 888         s->float_exception_flags |= float_flag_invalid;
 889         a = parts_silence_nan(a, s);
 890         /* fall through */
 891     case float_class_qnan:
 892         if (s->default_nan_mode) {
 893             return parts_default_nan(s);
 894         }
 895         break;
 896
 897     default:
 898         g_assert_not_reached();
 899     }
 900     return a;
 901 }
 902
 903 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 904 {
 905     if (is_snan(a.cls) || is_snan(b.cls)) {
 906         s->float_exception_flags |= float_flag_invalid;
 907     }
 908
 909     if (s->default_nan_mode) {
 910         return parts_default_nan(s);
 911     } else {
 912         if (pickNaN(a.cls, b.cls,
 913                     a.frac > b.frac ||
 914                     (a.frac == b.frac && a.sign < b.sign))) {
 915             a = b;
 916         }
 917         if (is_snan(a.cls)) {
 918             return parts_silence_nan(a, s);
 919         }
 920     }
 921     return a;
 922 }
 923
 924 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 925                                   bool inf_zero, float_status *s)
 926 {
 927     int which;
 928
 929     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 930         s->float_exception_flags |= float_flag_invalid;
 931     }
 932
 933     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 934
 935     if (s->default_nan_mode) {
 936         /* Note that this check is after pickNaNMulAdd so that function
 937          * has an opportunity to set the Invalid flag.
 938          */
 939         which = 3;
 940     }
 941
 942     switch (which) {
 943     case 0:
 944         break;
 945     case 1:
 946         a = b;
 947         break;
 948     case 2:
 949         a = c;
 950         break;
 951     case 3:
 952         return parts_default_nan(s);
 953     default:
 954         g_assert_not_reached();
 955     }
 956
 957     if (is_snan(a.cls)) {
 958         return parts_silence_nan(a, s);
 959     }
 960     return a;
 961 }
 962
 963 /*
 964  * Returns the result of adding or subtracting the values of the
 965  * floating-point values `a' and `b'. The operation is performed
 966  * according to the IEC/IEEE Standard for Binary Floating-Point
 967  * Arithmetic.
 968  */
 969
 970 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 971                                 float_status *s)
 972 {
 973     bool a_sign = a.sign;
 974     bool b_sign = b.sign ^ subtract;
 975
 976     if (a_sign != b_sign) {
 977         /* Subtraction */
 978
 979         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 980             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 981                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 982                 a.frac = a.frac - b.frac;
 983             } else {
 984                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 985                 a.frac = b.frac - a.frac;
 986                 a.exp = b.exp;
 987                 a_sign ^= 1;
 988             }
 989
 990             if (a.frac == 0) {
 991                 a.cls = float_class_zero;
 992                 a.sign = s->float_rounding_mode == float_round_down;
 993             } else {
 994                 int shift = clz64(a.frac) - 1;
 995                 a.frac = a.frac << shift;
 996                 a.exp = a.exp - shift;
 997                 a.sign = a_sign;
 998             }
 999             return a;
1000         }
1001         if (is_nan(a.cls) || is_nan(b.cls)) {
1002             return pick_nan(a, b, s);
1003         }
1004         if (a.cls == float_class_inf) {
1005             if (b.cls == float_class_inf) {
1006                 float_raise(float_flag_invalid, s);
1007                 return parts_default_nan(s);
1008             }
1009             return a;
1010         }
1011         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1012             a.sign = s->float_rounding_mode == float_round_down;
1013             return a;
1014         }
1015         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1016             b.sign = a_sign ^ 1;
1017             return b;
1018         }
1019         if (b.cls == float_class_zero) {
1020             return a;
1021         }
1022     } else {
1023         /* Addition */
1024         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1025             if (a.exp > b.exp) {
1026                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1027             } else if (a.exp < b.exp) {
1028                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1029                 a.exp = b.exp;
1030             }
1031             a.frac += b.frac;
1032             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1033                 shift64RightJamming(a.frac, 1, &a.frac);
1034                 a.exp += 1;
1035             }
1036             return a;
1037         }
1038         if (is_nan(a.cls) || is_nan(b.cls)) {
1039             return pick_nan(a, b, s);
1040         }
1041         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1042             return a;
1043         }
1044         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1045             b.sign = b_sign;
1046             return b;
1047         }
1048     }
1049     g_assert_not_reached();
1050 }
1051
1052 /*
1053  * Returns the result of adding or subtracting the floating-point
1054  * values `a' and `b'. The operation is performed according to the
1055  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1056  */
1057
1058 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1059 {
1060     FloatParts pa = float16_unpack_canonical(a, status);
1061     FloatParts pb = float16_unpack_canonical(b, status);
1062     FloatParts pr = addsub_floats(pa, pb, false, status);
1063
1064     return float16_round_pack_canonical(pr, status);
1065 }
1066
1067 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1068 {
1069     FloatParts pa = float16_unpack_canonical(a, status);
1070     FloatParts pb = float16_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, true, status);
1072
1073     return float16_round_pack_canonical(pr, status);
1074 }
1075
1076 static float32 QEMU_SOFTFLOAT_ATTR
1077 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1078 {
1079     FloatParts pa = float32_unpack_canonical(a, status);
1080     FloatParts pb = float32_unpack_canonical(b, status);
1081     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1082
1083     return float32_round_pack_canonical(pr, status);
1084 }
1085
1086 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1087 {
1088     return soft_f32_addsub(a, b, false, status);
1089 }
1090
1091 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1092 {
1093     return soft_f32_addsub(a, b, true, status);
1094 }
1095
1096 static float64 QEMU_SOFTFLOAT_ATTR
1097 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1098 {
1099     FloatParts pa = float64_unpack_canonical(a, status);
1100     FloatParts pb = float64_unpack_canonical(b, status);
1101     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1102
1103     return float64_round_pack_canonical(pr, status);
1104 }
1105
1106 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1107 {
1108     return soft_f64_addsub(a, b, false, status);
1109 }
1110
1111 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1112 {
1113     return soft_f64_addsub(a, b, true, status);
1114 }
1115
/* Host single-precision a + b; float32_add() passes this as its host op. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1120
/* Host single-precision a - b; float32_sub() passes this as its host op. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1125
/* Host double-precision a + b; float64_add() passes this as its host op. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1130
/* Host double-precision a - b; float64_sub() passes this as its host op. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1135
1136 static bool f32_addsub_post(union_float32 a, union_float32 b)
1137 {
1138     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1139         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1140     }
1141     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1142 }
1143
1144 static bool f64_addsub_post(union_float64 a, union_float64 b)
1145 {
1146     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1147         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1148     } else {
1149         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1150     }
1151 }
1152
1153 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1154                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1155 {
1156     return float32_gen2(a, b, s, hard, soft,
1157                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1158 }
1159
1160 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1161                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1162 {
1163     return float64_gen2(a, b, s, hard, soft,
1164                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1165 }
1166
1167 float32 QEMU_FLATTEN
1168 float32_add(float32 a, float32 b, float_status *s)
1169 {
1170     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1171 }
1172
1173 float32 QEMU_FLATTEN
1174 float32_sub(float32 a, float32 b, float_status *s)
1175 {
1176     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1177 }
1178
1179 float64 QEMU_FLATTEN
1180 float64_add(float64 a, float64 b, float_status *s)
1181 {
1182     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1183 }
1184
1185 float64 QEMU_FLATTEN
1186 float64_sub(float64 a, float64 b, float_status *s)
1187 {
1188     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1189 }
1190
1191 /*
1192  * Returns the result of multiplying the floating-point values `a' and
1193  * `b'. The operation is performed according to the IEC/IEEE Standard
1194  * for Binary Floating-Point Arithmetic.
1195  */
1196
1197 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1198 {
1199     bool sign = a.sign ^ b.sign;
1200
1201     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1202         uint64_t hi, lo;
1203         int exp = a.exp + b.exp;
1204
1205         mul64To128(a.frac, b.frac, &hi, &lo);
1206         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1207         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1208             shift64RightJamming(lo, 1, &lo);
1209             exp += 1;
1210         }
1211
1212         /* Re-use a */
1213         a.exp = exp;
1214         a.sign = sign;
1215         a.frac = lo;
1216         return a;
1217     }
1218     /* handle all the NaN cases */
1219     if (is_nan(a.cls) || is_nan(b.cls)) {
1220         return pick_nan(a, b, s);
1221     }
1222     /* Inf * Zero == NaN */
1223     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1224         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1225         s->float_exception_flags |= float_flag_invalid;
1226         return parts_default_nan(s);
1227     }
1228     /* Multiply by 0 or Inf */
1229     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1230         a.sign = sign;
1231         return a;
1232     }
1233     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1234         b.sign = sign;
1235         return b;
1236     }
1237     g_assert_not_reached();
1238 }
1239
1240 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1241 {
1242     FloatParts pa = float16_unpack_canonical(a, status);
1243     FloatParts pb = float16_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245
1246     return float16_round_pack_canonical(pr, status);
1247 }
1248
1249 static float32 QEMU_SOFTFLOAT_ATTR
1250 soft_f32_mul(float32 a, float32 b, float_status *status)
1251 {
1252     FloatParts pa = float32_unpack_canonical(a, status);
1253     FloatParts pb = float32_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255
1256     return float32_round_pack_canonical(pr, status);
1257 }
1258
1259 static float64 QEMU_SOFTFLOAT_ATTR
1260 soft_f64_mul(float64 a, float64 b, float_status *status)
1261 {
1262     FloatParts pa = float64_unpack_canonical(a, status);
1263     FloatParts pb = float64_unpack_canonical(b, status);
1264     FloatParts pr = mul_floats(pa, pb, status);
1265
1266     return float64_round_pack_canonical(pr, status);
1267 }
1268
/* Host single-precision a * b; float32_mul() passes this as its host op. */
static float hard_f32_mul(float a, float b)
{
    float product = a * b;

    return product;
}
1273
/* Host double-precision a * b; float64_mul() passes this as its host op. */
static double hard_f64_mul(double a, double b)
{
    double product = a * b;

    return product;
}
1278
1279 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1280 {
1281     return float32_is_zero(a.s) || float32_is_zero(b.s);
1282 }
1283
1284 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1285 {
1286     return float64_is_zero(a.s) || float64_is_zero(b.s);
1287 }
1288
1289 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1290 {
1291     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1292
1293     return float32_set_sign(float32_zero, signbit);
1294 }
1295
1296 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1297 {
1298     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1299
1300     return float64_set_sign(float64_zero, signbit);
1301 }
1302
1303 float32 QEMU_FLATTEN
1304 float32_mul(float32 a, float32 b, float_status *s)
1305 {
1306     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1307                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1308 }
1309
1310 float64 QEMU_FLATTEN
1311 float64_mul(float64 a, float64 b, float_status *s)
1312 {
1313     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1314                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1315 }
1316
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding `c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * Flags handled here:
 *   float_muladd_negate_c       - flip the sign of `c' before adding
 *   float_muladd_negate_product - flip the sign of a * b
 *   float_muladd_negate_result  - flip the sign of the final result
 *   float_muladd_halve_result   - decrement the result exponent by one
 *
 * The operands are decomposed FloatParts and the result is returned
 * unrounded.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when {a, b} is exactly one infinity and one zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 with no NaN operand: invalid, default NaN. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b from the operand classes. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf - Inf */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            /* Infinite addend wins; result reuses a. */
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is c, with sign and halving adjustments. */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* Zeros of opposite sign: negative only when rounding down. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        /* Positive when the product has the larger exponent. */
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Bring the product down to c's scale and binary point 62. */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* Carry out of the sum: renormalize by one bit. */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Same exponent and product >= c: product - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise c - product, taking c's exponent and
                       flipping the result sign. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero is negative only when rounding
                   down, then the negate_result flip is applied. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                /* Count leading zeros across the 128-bit difference. */
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits shifted out of lo into a sticky lsb. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1519
1520 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1521                                                 int flags, float_status *status)
1522 {
1523     FloatParts pa = float16_unpack_canonical(a, status);
1524     FloatParts pb = float16_unpack_canonical(b, status);
1525     FloatParts pc = float16_unpack_canonical(c, status);
1526     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1527
1528     return float16_round_pack_canonical(pr, status);
1529 }
1530
1531 static float32 QEMU_SOFTFLOAT_ATTR
1532 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1533                 float_status *status)
1534 {
1535     FloatParts pa = float32_unpack_canonical(a, status);
1536     FloatParts pb = float32_unpack_canonical(b, status);
1537     FloatParts pc = float32_unpack_canonical(c, status);
1538     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1539
1540     return float32_round_pack_canonical(pr, status);
1541 }
1542
1543 static float64 QEMU_SOFTFLOAT_ATTR
1544 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1545                 float_status *status)
1546 {
1547     FloatParts pa = float64_unpack_canonical(a, status);
1548     FloatParts pb = float64_unpack_canonical(b, status);
1549     FloatParts pc = float64_unpack_canonical(c, status);
1550     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1551
1552     return float64_round_pack_canonical(pr, status);
1553 }
1554
/*
 * When true, float32_muladd() and float64_muladd() skip the host fma
 * fast path and use the softfloat implementation instead.
 * NOTE(review): nothing in this part of the file assigns it; presumably
 * it is set once during initialization -- confirm.
 */
static bool force_soft_fma;
1556
1557 float32 QEMU_FLATTEN
1558 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1559 {
1560     union_float32 ua, ub, uc, ur;
1561
1562     ua.s = xa;
1563     ub.s = xb;
1564     uc.s = xc;
1565
1566     if (unlikely(!can_use_fpu(s))) {
1567         goto soft;
1568     }
1569     if (unlikely(flags & float_muladd_halve_result)) {
1570         goto soft;
1571     }
1572
1573     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1574     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1575         goto soft;
1576     }
1577
1578     if (unlikely(force_soft_fma)) {
1579         goto soft;
1580     }
1581
1582     /*
1583      * When (a || b) == 0, there's no need to check for under/over flow,
1584      * since we know the addend is (normal || 0) and the product is 0.
1585      */
1586     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1587         union_float32 up;
1588         bool prod_sign;
1589
1590         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1591         prod_sign ^= !!(flags & float_muladd_negate_product);
1592         up.s = float32_set_sign(float32_zero, prod_sign);
1593
1594         if (flags & float_muladd_negate_c) {
1595             uc.h = -uc.h;
1596         }
1597         ur.h = up.h + uc.h;
1598     } else {
1599         if (flags & float_muladd_negate_product) {
1600             ua.h = -ua.h;
1601         }
1602         if (flags & float_muladd_negate_c) {
1603             uc.h = -uc.h;
1604         }
1605
1606         ur.h = fmaf(ua.h, ub.h, uc.h);
1607
1608         if (unlikely(f32_is_inf(ur))) {
1609             s->float_exception_flags |= float_flag_overflow;
1610         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1611             goto soft;
1612         }
1613     }
1614     if (flags & float_muladd_negate_result) {
1615         return float32_chs(ur.s);
1616     }
1617     return ur.s;
1618
1619  soft:
1620     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1621 }
1622
1623 float64 QEMU_FLATTEN
1624 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1625 {
1626     union_float64 ua, ub, uc, ur;
1627
1628     ua.s = xa;
1629     ub.s = xb;
1630     uc.s = xc;
1631
1632     if (unlikely(!can_use_fpu(s))) {
1633         goto soft;
1634     }
1635     if (unlikely(flags & float_muladd_halve_result)) {
1636         goto soft;
1637     }
1638
1639     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1640     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1641         goto soft;
1642     }
1643
1644     if (unlikely(force_soft_fma)) {
1645         goto soft;
1646     }
1647
1648     /*
1649      * When (a || b) == 0, there's no need to check for under/over flow,
1650      * since we know the addend is (normal || 0) and the product is 0.
1651      */
1652     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1653         union_float64 up;
1654         bool prod_sign;
1655
1656         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1657         prod_sign ^= !!(flags & float_muladd_negate_product);
1658         up.s = float64_set_sign(float64_zero, prod_sign);
1659
1660         if (flags & float_muladd_negate_c) {
1661             uc.h = -uc.h;
1662         }
1663         ur.h = up.h + uc.h;
1664     } else {
1665         if (flags & float_muladd_negate_product) {
1666             ua.h = -ua.h;
1667         }
1668         if (flags & float_muladd_negate_c) {
1669             uc.h = -uc.h;
1670         }
1671
1672         ur.h = fma(ua.h, ub.h, uc.h);
1673
1674         if (unlikely(f64_is_inf(ur))) {
1675             s->float_exception_flags |= float_flag_overflow;
1676         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1677             goto soft;
1678         }
1679     }
1680     if (flags & float_muladd_negate_result) {
1681         return float64_chs(ur.s);
1682     }
1683     return ur.s;
1684
1685  soft:
1686     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1687 }
1688
1689 /*
1690  * Returns the result of dividing the floating-point value `a' by the
1691  * corresponding value `b'. The operation is performed according to
1692  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1693  */
1694
1695 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1696 {
1697     bool sign = a.sign ^ b.sign;
1698
1699     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1700         uint64_t n0, n1, q, r;
1701         int exp = a.exp - b.exp;
1702
1703         /*
1704          * We want a 2*N / N-bit division to produce exactly an N-bit
1705          * result, so that we do not lose any precision and so that we
1706          * do not have to renormalize afterward.  If A.frac < B.frac,
1707          * then division would produce an (N-1)-bit result; shift A left
1708          * by one to produce the an N-bit result, and decrement the
1709          * exponent to match.
1710          *
1711          * The udiv_qrnnd algorithm that we're using requires normalization,
1712          * i.e. the msb of the denominator must be set.  Since we know that
1713          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1714          * by one (more), and the remainder must be shifted right by one.
1715          */
1716         if (a.frac < b.frac) {
1717             exp -= 1;
1718             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1719         } else {
1720             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1721         }
1722         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1723
1724         /*
1725          * Set lsb if there is a remainder, to set inexact.
1726          * As mentioned above, to find the actual value of the remainder we
1727          * would need to shift right, but (1) we are only concerned about
1728          * non-zero-ness, and (2) the remainder will always be even because
1729          * both inputs to the division primitive are even.
1730          */
1731         a.frac = q | (r != 0);
1732         a.sign = sign;
1733         a.exp = exp;
1734         return a;
1735     }
1736     /* handle all the NaN cases */
1737     if (is_nan(a.cls) || is_nan(b.cls)) {
1738         return pick_nan(a, b, s);
1739     }
1740     /* 0/0 or Inf/Inf */
1741     if (a.cls == b.cls
1742         &&
1743         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1744         s->float_exception_flags |= float_flag_invalid;
1745         return parts_default_nan(s);
1746     }
1747     /* Inf / x or 0 / x */
1748     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1749         a.sign = sign;
1750         return a;
1751     }
1752     /* Div 0 => Inf */
1753     if (b.cls == float_class_zero) {
1754         s->float_exception_flags |= float_flag_divbyzero;
1755         a.cls = float_class_inf;
1756         a.sign = sign;
1757         return a;
1758     }
1759     /* Div by Inf */
1760     if (b.cls == float_class_inf) {
1761         a.cls = float_class_zero;
1762         a.sign = sign;
1763         return a;
1764     }
1765     g_assert_not_reached();
1766 }
1767
1768 float16 float16_div(float16 a, float16 b, float_status *status)
1769 {
1770     FloatParts pa = float16_unpack_canonical(a, status);
1771     FloatParts pb = float16_unpack_canonical(b, status);
1772     FloatParts pr = div_floats(pa, pb, status);
1773
1774     return float16_round_pack_canonical(pr, status);
1775 }
1776
1777 static float32 QEMU_SOFTFLOAT_ATTR
1778 soft_f32_div(float32 a, float32 b, float_status *status)
1779 {
1780     FloatParts pa = float32_unpack_canonical(a, status);
1781     FloatParts pb = float32_unpack_canonical(b, status);
1782     FloatParts pr = div_floats(pa, pb, status);
1783
1784     return float32_round_pack_canonical(pr, status);
1785 }
1786
1787 static float64 QEMU_SOFTFLOAT_ATTR
1788 soft_f64_div(float64 a, float64 b, float_status *status)
1789 {
1790     FloatParts pa = float64_unpack_canonical(a, status);
1791     FloatParts pb = float64_unpack_canonical(b, status);
1792     FloatParts pr = div_floats(pa, pb, status);
1793
1794     return float64_round_pack_canonical(pr, status);
1795 }
1796
/* Divide `a' by `b' with the host's single-precision `/' operator. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1801
/* Divide `a' by `b' with the host's double-precision `/' operator. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1806
1807 static bool f32_div_pre(union_float32 a, union_float32 b)
1808 {
1809     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1810         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1811                fpclassify(b.h) == FP_NORMAL;
1812     }
1813     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1814 }
1815
1816 static bool f64_div_pre(union_float64 a, union_float64 b)
1817 {
1818     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1819         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1820                fpclassify(b.h) == FP_NORMAL;
1821     }
1822     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1823 }
1824
1825 static bool f32_div_post(union_float32 a, union_float32 b)
1826 {
1827     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1828         return fpclassify(a.h) != FP_ZERO;
1829     }
1830     return !float32_is_zero(a.s);
1831 }
1832
1833 static bool f64_div_post(union_float64 a, union_float64 b)
1834 {
1835     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1836         return fpclassify(a.h) != FP_ZERO;
1837     }
1838     return !float64_is_zero(a.s);
1839 }
1840
1841 float32 QEMU_FLATTEN
1842 float32_div(float32 a, float32 b, float_status *s)
1843 {
1844     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1845                         f32_div_pre, f32_div_post, NULL, NULL);
1846 }
1847
1848 float64 QEMU_FLATTEN
1849 float64_div(float64 a, float64 b, float_status *s)
1850 {
1851     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1852                         f64_div_pre, f64_div_post, NULL, NULL);
1853 }
1854
1855 /*
1856  * Float to Float conversions
1857  *
1858  * Returns the result of converting one float format to another. The
1859  * conversion is performed according to the IEC/IEEE Standard for
1860  * Binary Floating-Point Arithmetic.
1861  *
1862  * The float_to_float helper only needs to take care of raising
1863  * invalid exceptions and handling the conversion on NaNs.
1864  */
1865
1866 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1867                                  float_status *s)
1868 {
1869     if (dstf->arm_althp) {
1870         switch (a.cls) {
1871         case float_class_qnan:
1872         case float_class_snan:
1873             /* There is no NaN in the destination format.  Raise Invalid
1874              * and return a zero with the sign of the input NaN.
1875              */
1876             s->float_exception_flags |= float_flag_invalid;
1877             a.cls = float_class_zero;
1878             a.frac = 0;
1879             a.exp = 0;
1880             break;
1881
1882         case float_class_inf:
1883             /* There is no Inf in the destination format.  Raise Invalid
1884              * and return the maximum normal with the correct sign.
1885              */
1886             s->float_exception_flags |= float_flag_invalid;
1887             a.cls = float_class_normal;
1888             a.exp = dstf->exp_max;
1889             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1890             break;
1891
1892         default:
1893             break;
1894         }
1895     } else if (is_nan(a.cls)) {
1896         if (is_snan(a.cls)) {
1897             s->float_exception_flags |= float_flag_invalid;
1898             a = parts_silence_nan(a, s);
1899         }
1900         if (s->default_nan_mode) {
1901             return parts_default_nan(s);
1902         }
1903     }
1904     return a;
1905 }
1906
1907 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1908 {
1909     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911     FloatParts pr = float_to_float(p, &float32_params, s);
1912     return float32_round_pack_canonical(pr, s);
1913 }
1914
1915 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1916 {
1917     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1919     FloatParts pr = float_to_float(p, &float64_params, s);
1920     return float64_round_pack_canonical(pr, s);
1921 }
1922
1923 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1924 {
1925     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1926     FloatParts p = float32_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, fmt16, s);
1928     return float16a_round_pack_canonical(pr, s, fmt16);
1929 }
1930
1931 float64 float32_to_float64(float32 a, float_status *s)
1932 {
1933     FloatParts p = float32_unpack_canonical(a, s);
1934     FloatParts pr = float_to_float(p, &float64_params, s);
1935     return float64_round_pack_canonical(pr, s);
1936 }
1937
1938 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1939 {
1940     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1941     FloatParts p = float64_unpack_canonical(a, s);
1942     FloatParts pr = float_to_float(p, fmt16, s);
1943     return float16a_round_pack_canonical(pr, s, fmt16);
1944 }
1945
1946 float32 float64_to_float32(float64 a, float_status *s)
1947 {
1948     FloatParts p = float64_unpack_canonical(a, s);
1949     FloatParts pr = float_to_float(p, &float32_params, s);
1950     return float32_round_pack_canonical(pr, s);
1951 }
1952
1953 /*
1954  * Rounds the floating-point value `a' to an integer, and returns the
1955  * result as a floating-point value. The operation is performed
1956  * according to the IEC/IEEE Standard for Binary Floating-Point
1957  * Arithmetic.
1958  */
1959
1960 static FloatParts round_to_int(FloatParts a, int rmode,
1961                                int scale, float_status *s)
1962 {
1963     switch (a.cls) {
1964     case float_class_qnan:
1965     case float_class_snan:
1966         return return_nan(a, s);
1967
1968     case float_class_zero:
1969     case float_class_inf:
1970         /* already "integral" */
1971         break;
1972
1973     case float_class_normal:
1974         scale = MIN(MAX(scale, -0x10000), 0x10000);
1975         a.exp += scale;
1976
1977         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1978             /* already integral */
1979             break;
1980         }
1981         if (a.exp < 0) {
1982             bool one;
1983             /* all fractional */
1984             s->float_exception_flags |= float_flag_inexact;
1985             switch (rmode) {
1986             case float_round_nearest_even:
1987                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1988                 break;
1989             case float_round_ties_away:
1990                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1991                 break;
1992             case float_round_to_zero:
1993                 one = false;
1994                 break;
1995             case float_round_up:
1996                 one = !a.sign;
1997                 break;
1998             case float_round_down:
1999                 one = a.sign;
2000                 break;
2001             case float_round_to_odd:
2002                 one = true;
2003                 break;
2004             default:
2005                 g_assert_not_reached();
2006             }
2007
2008             if (one) {
2009                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2010                 a.exp = 0;
2011             } else {
2012                 a.cls = float_class_zero;
2013             }
2014         } else {
2015             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2016             uint64_t frac_lsbm1 = frac_lsb >> 1;
2017             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2018             uint64_t rnd_mask = rnd_even_mask >> 1;
2019             uint64_t inc;
2020
2021             switch (rmode) {
2022             case float_round_nearest_even:
2023                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2024                 break;
2025             case float_round_ties_away:
2026                 inc = frac_lsbm1;
2027                 break;
2028             case float_round_to_zero:
2029                 inc = 0;
2030                 break;
2031             case float_round_up:
2032                 inc = a.sign ? 0 : rnd_mask;
2033                 break;
2034             case float_round_down:
2035                 inc = a.sign ? rnd_mask : 0;
2036                 break;
2037             case float_round_to_odd:
2038                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2039                 break;
2040             default:
2041                 g_assert_not_reached();
2042             }
2043
2044             if (a.frac & rnd_mask) {
2045                 s->float_exception_flags |= float_flag_inexact;
2046                 a.frac += inc;
2047                 a.frac &= ~rnd_mask;
2048                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2049                     a.frac >>= 1;
2050                     a.exp++;
2051                 }
2052             }
2053         }
2054         break;
2055     default:
2056         g_assert_not_reached();
2057     }
2058     return a;
2059 }
2060
2061 float16 float16_round_to_int(float16 a, float_status *s)
2062 {
2063     FloatParts pa = float16_unpack_canonical(a, s);
2064     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2065     return float16_round_pack_canonical(pr, s);
2066 }
2067
2068 float32 float32_round_to_int(float32 a, float_status *s)
2069 {
2070     FloatParts pa = float32_unpack_canonical(a, s);
2071     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2072     return float32_round_pack_canonical(pr, s);
2073 }
2074
2075 float64 float64_round_to_int(float64 a, float_status *s)
2076 {
2077     FloatParts pa = float64_unpack_canonical(a, s);
2078     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2079     return float64_round_pack_canonical(pr, s);
2080 }
2081
2082 /*
2083  * Returns the result of converting the floating-point value `a' to
2084  * the two's complement integer format. The conversion is performed
2085  * according to the IEC/IEEE Standard for Binary Floating-Point
2086  * Arithmetic---which means in particular that the conversion is
2087  * rounded according to the current rounding mode. If `a' is a NaN,
2088  * the largest positive integer is returned. Otherwise, if the
2089  * conversion overflows, the largest integer with the same sign as `a'
2090  * is returned.
2091 */
2092
2093 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2094                                      int64_t min, int64_t max,
2095                                      float_status *s)
2096 {
2097     uint64_t r;
2098     int orig_flags = get_float_exception_flags(s);
2099     FloatParts p = round_to_int(in, rmode, scale, s);
2100
2101     switch (p.cls) {
2102     case float_class_snan:
2103     case float_class_qnan:
2104         s->float_exception_flags = orig_flags | float_flag_invalid;
2105         return max;
2106     case float_class_inf:
2107         s->float_exception_flags = orig_flags | float_flag_invalid;
2108         return p.sign ? min : max;
2109     case float_class_zero:
2110         return 0;
2111     case float_class_normal:
2112         if (p.exp < DECOMPOSED_BINARY_POINT) {
2113             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2114         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2115             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2116         } else {
2117             r = UINT64_MAX;
2118         }
2119         if (p.sign) {
2120             if (r <= -(uint64_t) min) {
2121                 return -r;
2122             } else {
2123                 s->float_exception_flags = orig_flags | float_flag_invalid;
2124                 return min;
2125             }
2126         } else {
2127             if (r <= max) {
2128                 return r;
2129             } else {
2130                 s->float_exception_flags = orig_flags | float_flag_invalid;
2131                 return max;
2132             }
2133         }
2134     default:
2135         g_assert_not_reached();
2136     }
2137 }
2138
2139 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2140                                 float_status *s)
2141 {
2142     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2143                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2144 }
2145
2146 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2147                                 float_status *s)
2148 {
2149     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2150                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2151 }
2152
2153 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2154                                 float_status *s)
2155 {
2156     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2157                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2158 }
2159
2160 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2161                                 float_status *s)
2162 {
2163     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2164                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2165 }
2166
2167 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2168                                 float_status *s)
2169 {
2170     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2171                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2172 }
2173
2174 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2175                                 float_status *s)
2176 {
2177     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2178                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2179 }
2180
2181 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2182                                 float_status *s)
2183 {
2184     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2185                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2186 }
2187
2188 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2189                                 float_status *s)
2190 {
2191     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2192                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2193 }
2194
2195 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2196                                 float_status *s)
2197 {
2198     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2199                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2200 }
2201
2202 int16_t float16_to_int16(float16 a, float_status *s)
2203 {
2204     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2205 }
2206
2207 int32_t float16_to_int32(float16 a, float_status *s)
2208 {
2209     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2210 }
2211
2212 int64_t float16_to_int64(float16 a, float_status *s)
2213 {
2214     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2215 }
2216
2217 int16_t float32_to_int16(float32 a, float_status *s)
2218 {
2219     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2220 }
2221
2222 int32_t float32_to_int32(float32 a, float_status *s)
2223 {
2224     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2225 }
2226
2227 int64_t float32_to_int64(float32 a, float_status *s)
2228 {
2229     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2230 }
2231
2232 int16_t float64_to_int16(float64 a, float_status *s)
2233 {
2234     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2235 }
2236
2237 int32_t float64_to_int32(float64 a, float_status *s)
2238 {
2239     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2240 }
2241
2242 int64_t float64_to_int64(float64 a, float_status *s)
2243 {
2244     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2245 }
2246
2247 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2248 {
2249     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2250 }
2251
2252 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2253 {
2254     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2255 }
2256
2257 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2258 {
2259     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2260 }
2261
2262 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2263 {
2264     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2265 }
2266
2267 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2268 {
2269     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2270 }
2271
2272 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2273 {
2274     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2275 }
2276
2277 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2278 {
2279     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2280 }
2281
2282 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2283 {
2284     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2285 }
2286
2287 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2288 {
2289     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2290 }
2291
2292 /*
2293  *  Returns the result of converting the floating-point value `a' to
2294  *  the unsigned integer format. The conversion is performed according
2295  *  to the IEC/IEEE Standard for Binary Floating-Point
2296  *  Arithmetic---which means in particular that the conversion is
2297  *  rounded according to the current rounding mode. If `a' is a NaN,
2298  *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
2303  */
2304
2305 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2306                                        uint64_t max, float_status *s)
2307 {
2308     int orig_flags = get_float_exception_flags(s);
2309     FloatParts p = round_to_int(in, rmode, scale, s);
2310     uint64_t r;
2311
2312     switch (p.cls) {
2313     case float_class_snan:
2314     case float_class_qnan:
2315         s->float_exception_flags = orig_flags | float_flag_invalid;
2316         return max;
2317     case float_class_inf:
2318         s->float_exception_flags = orig_flags | float_flag_invalid;
2319         return p.sign ? 0 : max;
2320     case float_class_zero:
2321         return 0;
2322     case float_class_normal:
2323         if (p.sign) {
2324             s->float_exception_flags = orig_flags | float_flag_invalid;
2325             return 0;
2326         }
2327
2328         if (p.exp < DECOMPOSED_BINARY_POINT) {
2329             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2330         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2331             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2332         } else {
2333             s->float_exception_flags = orig_flags | float_flag_invalid;
2334             return max;
2335         }
2336
2337         /* For uint64 this will never trip, but if p.exp is too large
2338          * to shift a decomposed fraction we shall have exited via the
2339          * 3rd leg above.
2340          */
2341         if (r > max) {
2342             s->float_exception_flags = orig_flags | float_flag_invalid;
2343             return max;
2344         }
2345         return r;
2346     default:
2347         g_assert_not_reached();
2348     }
2349 }
2350
2351 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2352                                   float_status *s)
2353 {
2354     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2355                                   rmode, scale, UINT16_MAX, s);
2356 }
2357
2358 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2359                                   float_status *s)
2360 {
2361     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2362                                   rmode, scale, UINT32_MAX, s);
2363 }
2364
2365 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2366                                   float_status *s)
2367 {
2368     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2369                                   rmode, scale, UINT64_MAX, s);
2370 }
2371
2372 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2373                                   float_status *s)
2374 {
2375     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2376                                   rmode, scale, UINT16_MAX, s);
2377 }
2378
2379 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2380                                   float_status *s)
2381 {
2382     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2383                                   rmode, scale, UINT32_MAX, s);
2384 }
2385
2386 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2387                                   float_status *s)
2388 {
2389     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2390                                   rmode, scale, UINT64_MAX, s);
2391 }
2392
2393 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2394                                   float_status *s)
2395 {
2396     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2397                                   rmode, scale, UINT16_MAX, s);
2398 }
2399
2400 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2401                                   float_status *s)
2402 {
2403     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2404                                   rmode, scale, UINT32_MAX, s);
2405 }
2406
2407 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2408                                   float_status *s)
2409 {
2410     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2411                                   rmode, scale, UINT64_MAX, s);
2412 }
2413
2414 uint16_t float16_to_uint16(float16 a, float_status *s)
2415 {
2416     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2417 }
2418
2419 uint32_t float16_to_uint32(float16 a, float_status *s)
2420 {
2421     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2422 }
2423
2424 uint64_t float16_to_uint64(float16 a, float_status *s)
2425 {
2426     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2427 }
2428
2429 uint16_t float32_to_uint16(float32 a, float_status *s)
2430 {
2431     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2432 }
2433
2434 uint32_t float32_to_uint32(float32 a, float_status *s)
2435 {
2436     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2437 }
2438
2439 uint64_t float32_to_uint64(float32 a, float_status *s)
2440 {
2441     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2442 }
2443
2444 uint16_t float64_to_uint16(float64 a, float_status *s)
2445 {
2446     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2447 }
2448
2449 uint32_t float64_to_uint32(float64 a, float_status *s)
2450 {
2451     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2452 }
2453
2454 uint64_t float64_to_uint64(float64 a, float_status *s)
2455 {
2456     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2457 }
2458
2459 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2460 {
2461     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2462 }
2463
2464 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2465 {
2466     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2467 }
2468
2469 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2470 {
2471     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2472 }
2473
2474 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2475 {
2476     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2477 }
2478
2479 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2480 {
2481     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2482 }
2483
2484 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2485 {
2486     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2487 }
2488
2489 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2490 {
2491     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2492 }
2493
2494 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2495 {
2496     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2497 }
2498
2499 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2500 {
2501     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2502 }
2503
2504 /*
2505  * Integer to float conversions
2506  *
2507  * Returns the result of converting the two's complement integer `a'
2508  * to the floating-point format. The conversion is performed according
2509  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2510  */
2511
2512 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2513 {
2514     FloatParts r = { .sign = false };
2515
2516     if (a == 0) {
2517         r.cls = float_class_zero;
2518     } else {
2519         uint64_t f = a;
2520         int shift;
2521
2522         r.cls = float_class_normal;
2523         if (a < 0) {
2524             f = -f;
2525             r.sign = true;
2526         }
2527         shift = clz64(f) - 1;
2528         scale = MIN(MAX(scale, -0x10000), 0x10000);
2529
2530         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2531         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2532     }
2533
2534     return r;
2535 }
2536
2537 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2538 {
2539     FloatParts pa = int_to_float(a, scale, status);
2540     return float16_round_pack_canonical(pa, status);
2541 }
2542
2543 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2544 {
2545     return int64_to_float16_scalbn(a, scale, status);
2546 }
2547
2548 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2549 {
2550     return int64_to_float16_scalbn(a, scale, status);
2551 }
2552
2553 float16 int64_to_float16(int64_t a, float_status *status)
2554 {
2555     return int64_to_float16_scalbn(a, 0, status);
2556 }
2557
2558 float16 int32_to_float16(int32_t a, float_status *status)
2559 {
2560     return int64_to_float16_scalbn(a, 0, status);
2561 }
2562
2563 float16 int16_to_float16(int16_t a, float_status *status)
2564 {
2565     return int64_to_float16_scalbn(a, 0, status);
2566 }
2567
2568 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2569 {
2570     FloatParts pa = int_to_float(a, scale, status);
2571     return float32_round_pack_canonical(pa, status);
2572 }
2573
2574 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2575 {
2576     return int64_to_float32_scalbn(a, scale, status);
2577 }
2578
2579 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2580 {
2581     return int64_to_float32_scalbn(a, scale, status);
2582 }
2583
2584 float32 int64_to_float32(int64_t a, float_status *status)
2585 {
2586     return int64_to_float32_scalbn(a, 0, status);
2587 }
2588
2589 float32 int32_to_float32(int32_t a, float_status *status)
2590 {
2591     return int64_to_float32_scalbn(a, 0, status);
2592 }
2593
2594 float32 int16_to_float32(int16_t a, float_status *status)
2595 {
2596     return int64_to_float32_scalbn(a, 0, status);
2597 }
2598
2599 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2600 {
2601     FloatParts pa = int_to_float(a, scale, status);
2602     return float64_round_pack_canonical(pa, status);
2603 }
2604
2605 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2606 {
2607     return int64_to_float64_scalbn(a, scale, status);
2608 }
2609
2610 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2611 {
2612     return int64_to_float64_scalbn(a, scale, status);
2613 }
2614
2615 float64 int64_to_float64(int64_t a, float_status *status)
2616 {
2617     return int64_to_float64_scalbn(a, 0, status);
2618 }
2619
2620 float64 int32_to_float64(int32_t a, float_status *status)
2621 {
2622     return int64_to_float64_scalbn(a, 0, status);
2623 }
2624
2625 float64 int16_to_float64(int16_t a, float_status *status)
2626 {
2627     return int64_to_float64_scalbn(a, 0, status);
2628 }
2629
2630
2631 /*
2632  * Unsigned Integer to float conversions
2633  *
2634  * Returns the result of converting the unsigned integer `a' to the
2635  * floating-point format. The conversion is performed according to the
2636  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2637  */
2638
2639 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2640 {
2641     FloatParts r = { .sign = false };
2642
2643     if (a == 0) {
2644         r.cls = float_class_zero;
2645     } else {
2646         scale = MIN(MAX(scale, -0x10000), 0x10000);
2647         r.cls = float_class_normal;
2648         if ((int64_t)a < 0) {
2649             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2650             shift64RightJamming(a, 1, &a);
2651             r.frac = a;
2652         } else {
2653             int shift = clz64(a) - 1;
2654             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2655             r.frac = a << shift;
2656         }
2657     }
2658
2659     return r;
2660 }
2661
2662 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2663 {
2664     FloatParts pa = uint_to_float(a, scale, status);
2665     return float16_round_pack_canonical(pa, status);
2666 }
2667
2668 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2669 {
2670     return uint64_to_float16_scalbn(a, scale, status);
2671 }
2672
2673 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2674 {
2675     return uint64_to_float16_scalbn(a, scale, status);
2676 }
2677
2678 float16 uint64_to_float16(uint64_t a, float_status *status)
2679 {
2680     return uint64_to_float16_scalbn(a, 0, status);
2681 }
2682
2683 float16 uint32_to_float16(uint32_t a, float_status *status)
2684 {
2685     return uint64_to_float16_scalbn(a, 0, status);
2686 }
2687
2688 float16 uint16_to_float16(uint16_t a, float_status *status)
2689 {
2690     return uint64_to_float16_scalbn(a, 0, status);
2691 }
2692
2693 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2694 {
2695     FloatParts pa = uint_to_float(a, scale, status);
2696     return float32_round_pack_canonical(pa, status);
2697 }
2698
2699 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2700 {
2701     return uint64_to_float32_scalbn(a, scale, status);
2702 }
2703
2704 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2705 {
2706     return uint64_to_float32_scalbn(a, scale, status);
2707 }
2708
2709 float32 uint64_to_float32(uint64_t a, float_status *status)
2710 {
2711     return uint64_to_float32_scalbn(a, 0, status);
2712 }
2713
2714 float32 uint32_to_float32(uint32_t a, float_status *status)
2715 {
2716     return uint64_to_float32_scalbn(a, 0, status);
2717 }
2718
2719 float32 uint16_to_float32(uint16_t a, float_status *status)
2720 {
2721     return uint64_to_float32_scalbn(a, 0, status);
2722 }
2723
2724 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2725 {
2726     FloatParts pa = uint_to_float(a, scale, status);
2727     return float64_round_pack_canonical(pa, status);
2728 }
2729
2730 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2731 {
2732     return uint64_to_float64_scalbn(a, scale, status);
2733 }
2734
2735 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2736 {
2737     return uint64_to_float64_scalbn(a, scale, status);
2738 }
2739
2740 float64 uint64_to_float64(uint64_t a, float_status *status)
2741 {
2742     return uint64_to_float64_scalbn(a, 0, status);
2743 }
2744
2745 float64 uint32_to_float64(uint32_t a, float_status *status)
2746 {
2747     return uint64_to_float64_scalbn(a, 0, status);
2748 }
2749
2750 float64 uint16_to_float64(uint16_t a, float_status *status)
2751 {
2752     return uint64_to_float64_scalbn(a, 0, status);
2753 }
2754
2755 /* Float Min/Max */
2756 /* min() and max() functions. These can't be implemented as
2757  * 'compare and pick one input' because that would mishandle
2758  * NaNs and +0 vs -0.
2759  *
2760  * minnum() and maxnum() functions. These are similar to the min()
2761  * and max() functions but if one of the arguments is a QNaN and
2762  * the other is numerical then the numerical argument is returned.
2763  * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
2767  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2770  */
2771 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2772                                 bool ieee, bool ismag, float_status *s)
2773 {
2774     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2775         if (ieee) {
2776             /* Takes two floating-point values `a' and `b', one of
2777              * which is a NaN, and returns the appropriate NaN
2778              * result. If either `a' or `b' is a signaling NaN,
2779              * the invalid exception is raised.
2780              */
2781             if (is_snan(a.cls) || is_snan(b.cls)) {
2782                 return pick_nan(a, b, s);
2783             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2784                 return b;
2785             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2786                 return a;
2787             }
2788         }
2789         return pick_nan(a, b, s);
2790     } else {
2791         int a_exp, b_exp;
2792
2793         switch (a.cls) {
2794         case float_class_normal:
2795             a_exp = a.exp;
2796             break;
2797         case float_class_inf:
2798             a_exp = INT_MAX;
2799             break;
2800         case float_class_zero:
2801             a_exp = INT_MIN;
2802             break;
2803         default:
2804             g_assert_not_reached();
2805             break;
2806         }
2807         switch (b.cls) {
2808         case float_class_normal:
2809             b_exp = b.exp;
2810             break;
2811         case float_class_inf:
2812             b_exp = INT_MAX;
2813             break;
2814         case float_class_zero:
2815             b_exp = INT_MIN;
2816             break;
2817         default:
2818             g_assert_not_reached();
2819             break;
2820         }
2821
2822         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2823             bool a_less = a_exp < b_exp;
2824             if (a_exp == b_exp) {
2825                 a_less = a.frac < b.frac;
2826             }
2827             return a_less ^ ismin ? b : a;
2828         }
2829
2830         if (a.sign == b.sign) {
2831             bool a_less = a_exp < b_exp;
2832             if (a_exp == b_exp) {
2833                 a_less = a.frac < b.frac;
2834             }
2835             return a.sign ^ a_less ^ ismin ? b : a;
2836         } else {
2837             return a.sign ^ ismin ? b : a;
2838         }
2839     }
2840 }
2841
2842 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2843 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2844                                      float_status *s)                   \
2845 {                                                                       \
2846     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2847     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2848     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2849                                                                         \
2850     return float ## sz ## _round_pack_canonical(pr, s);                 \
2851 }
2852
2853 MINMAX(16, min, true, false, false)
2854 MINMAX(16, minnum, true, true, false)
2855 MINMAX(16, minnummag, true, true, true)
2856 MINMAX(16, max, false, false, false)
2857 MINMAX(16, maxnum, false, true, false)
2858 MINMAX(16, maxnummag, false, true, true)
2859
2860 MINMAX(32, min, true, false, false)
2861 MINMAX(32, minnum, true, true, false)
2862 MINMAX(32, minnummag, true, true, true)
2863 MINMAX(32, max, false, false, false)
2864 MINMAX(32, maxnum, false, true, false)
2865 MINMAX(32, maxnummag, false, true, true)
2866
2867 MINMAX(64, min, true, false, false)
2868 MINMAX(64, minnum, true, true, false)
2869 MINMAX(64, minnummag, true, true, true)
2870 MINMAX(64, max, false, false, false)
2871 MINMAX(64, maxnum, false, true, false)
2872 MINMAX(64, maxnummag, false, true, true)
2873
2874 #undef MINMAX
2875
2876 /* Floating point compare */
2877 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2878                           float_status *s)
2879 {
2880     if (is_nan(a.cls) || is_nan(b.cls)) {
2881         if (!is_quiet ||
2882             a.cls == float_class_snan ||
2883             b.cls == float_class_snan) {
2884             s->float_exception_flags |= float_flag_invalid;
2885         }
2886         return float_relation_unordered;
2887     }
2888
2889     if (a.cls == float_class_zero) {
2890         if (b.cls == float_class_zero) {
2891             return float_relation_equal;
2892         }
2893         return b.sign ? float_relation_greater : float_relation_less;
2894     } else if (b.cls == float_class_zero) {
2895         return a.sign ? float_relation_less : float_relation_greater;
2896     }
2897
2898     /* The only really important thing about infinity is its sign. If
2899      * both are infinities the sign marks the smallest of the two.
2900      */
2901     if (a.cls == float_class_inf) {
2902         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2903             return float_relation_equal;
2904         }
2905         return a.sign ? float_relation_less : float_relation_greater;
2906     } else if (b.cls == float_class_inf) {
2907         return b.sign ? float_relation_greater : float_relation_less;
2908     }
2909
2910     if (a.sign != b.sign) {
2911         return a.sign ? float_relation_less : float_relation_greater;
2912     }
2913
2914     if (a.exp == b.exp) {
2915         if (a.frac == b.frac) {
2916             return float_relation_equal;
2917         }
2918         if (a.sign) {
2919             return a.frac > b.frac ?
2920                 float_relation_less : float_relation_greater;
2921         } else {
2922             return a.frac > b.frac ?
2923                 float_relation_greater : float_relation_less;
2924         }
2925     } else {
2926         if (a.sign) {
2927             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2928         } else {
2929             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2930         }
2931     }
2932 }
2933
2934 #define COMPARE(name, attr, sz)                                         \
2935 static int attr                                                         \
2936 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2937 {                                                                       \
2938     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2939     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2940     return compare_floats(pa, pb, is_quiet, s);                         \
2941 }
2942
2943 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2944 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2945 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2946
2947 #undef COMPARE
2948
2949 int float16_compare(float16 a, float16 b, float_status *s)
2950 {
2951     return soft_f16_compare(a, b, false, s);
2952 }
2953
2954 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2955 {
2956     return soft_f16_compare(a, b, true, s);
2957 }
2958
2959 static int QEMU_FLATTEN
2960 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2961 {
2962     union_float32 ua, ub;
2963
2964     ua.s = xa;
2965     ub.s = xb;
2966
2967     if (QEMU_NO_HARDFLOAT) {
2968         goto soft;
2969     }
2970
2971     float32_input_flush2(&ua.s, &ub.s, s);
2972     if (isgreaterequal(ua.h, ub.h)) {
2973         if (isgreater(ua.h, ub.h)) {
2974             return float_relation_greater;
2975         }
2976         return float_relation_equal;
2977     }
2978     if (likely(isless(ua.h, ub.h))) {
2979         return float_relation_less;
2980     }
2981     /* The only condition remaining is unordered.
2982      * Fall through to set flags.
2983      */
2984  soft:
2985     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2986 }
2987
2988 int float32_compare(float32 a, float32 b, float_status *s)
2989 {
2990     return f32_compare(a, b, false, s);
2991 }
2992
2993 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2994 {
2995     return f32_compare(a, b, true, s);
2996 }
2997
2998 static int QEMU_FLATTEN
2999 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3000 {
3001     union_float64 ua, ub;
3002
3003     ua.s = xa;
3004     ub.s = xb;
3005
3006     if (QEMU_NO_HARDFLOAT) {
3007         goto soft;
3008     }
3009
3010     float64_input_flush2(&ua.s, &ub.s, s);
3011     if (isgreaterequal(ua.h, ub.h)) {
3012         if (isgreater(ua.h, ub.h)) {
3013             return float_relation_greater;
3014         }
3015         return float_relation_equal;
3016     }
3017     if (likely(isless(ua.h, ub.h))) {
3018         return float_relation_less;
3019     }
3020     /* The only condition remaining is unordered.
3021      * Fall through to set flags.
3022      */
3023  soft:
3024     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3025 }
3026
3027 int float64_compare(float64 a, float64 b, float_status *s)
3028 {
3029     return f64_compare(a, b, false, s);
3030 }
3031
3032 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3033 {
3034     return f64_compare(a, b, true, s);
3035 }
3036
3037 /* Multiply A by 2 raised to the power N.  */
3038 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3039 {
3040     if (unlikely(is_nan(a.cls))) {
3041         return return_nan(a, s);
3042     }
3043     if (a.cls == float_class_normal) {
3044         /* The largest float type (even though not supported by FloatParts)
3045          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3046          * still allows rounding to infinity, without allowing overflow
3047          * within the int32_t that backs FloatParts.exp.
3048          */
3049         n = MIN(MAX(n, -0x10000), 0x10000);
3050         a.exp += n;
3051     }
3052     return a;
3053 }
3054
3055 float16 float16_scalbn(float16 a, int n, float_status *status)
3056 {
3057     FloatParts pa = float16_unpack_canonical(a, status);
3058     FloatParts pr = scalbn_decomposed(pa, n, status);
3059     return float16_round_pack_canonical(pr, status);
3060 }
3061
3062 float32 float32_scalbn(float32 a, int n, float_status *status)
3063 {
3064     FloatParts pa = float32_unpack_canonical(a, status);
3065     FloatParts pr = scalbn_decomposed(pa, n, status);
3066     return float32_round_pack_canonical(pr, status);
3067 }
3068
3069 float64 float64_scalbn(float64 a, int n, float_status *status)
3070 {
3071     FloatParts pa = float64_unpack_canonical(a, status);
3072     FloatParts pr = scalbn_decomposed(pa, n, status);
3073     return float64_round_pack_canonical(pr, status);
3074 }
3075
3076 /*
3077  * Square Root
3078  *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3081  * square root by iterating down from the implicit bit to enough extra
3082  * bits to ensure we get a correctly rounded result.
3083  *
3084  * This does mean however the calculation is slower than before,
3085  * especially for 64 bit floats.
3086  */
3087
3088 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3089 {
3090     uint64_t a_frac, r_frac, s_frac;
3091     int bit, last_bit;
3092
3093     if (is_nan(a.cls)) {
3094         return return_nan(a, s);
3095     }
3096     if (a.cls == float_class_zero) {
3097         return a;  /* sqrt(+-0) = +-0 */
3098     }
3099     if (a.sign) {
3100         s->float_exception_flags |= float_flag_invalid;
3101         return parts_default_nan(s);
3102     }
3103     if (a.cls == float_class_inf) {
3104         return a;  /* sqrt(+inf) = +inf */
3105     }
3106
3107     assert(a.cls == float_class_normal);
3108
3109     /* We need two overflow bits at the top. Adding room for that is a
3110      * right shift. If the exponent is odd, we can discard the low bit
3111      * by multiplying the fraction by 2; that's a left shift. Combine
3112      * those and we shift right if the exponent is even.
3113      */
3114     a_frac = a.frac;
3115     if (!(a.exp & 1)) {
3116         a_frac >>= 1;
3117     }
3118     a.exp >>= 1;
3119
3120     /* Bit-by-bit computation of sqrt.  */
3121     r_frac = 0;
3122     s_frac = 0;
3123
3124     /* Iterate from implicit bit down to the 3 extra bits to compute a
3125      * properly rounded result. Remember we've inserted one more bit
3126      * at the top, so these positions are one less.
3127      */
3128     bit = DECOMPOSED_BINARY_POINT - 1;
3129     last_bit = MAX(p->frac_shift - 4, 0);
3130     do {
3131         uint64_t q = 1ULL << bit;
3132         uint64_t t_frac = s_frac + q;
3133         if (t_frac <= a_frac) {
3134             s_frac = t_frac + q;
3135             a_frac -= t_frac;
3136             r_frac += q;
3137         }
3138         a_frac <<= 1;
3139     } while (--bit >= last_bit);
3140
3141     /* Undo the right shift done above. If there is any remaining
3142      * fraction, the result is inexact. Set the sticky bit.
3143      */
3144     a.frac = (r_frac << 1) + (a_frac != 0);
3145
3146     return a;
3147 }
3148
3149 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3150 {
3151     FloatParts pa = float16_unpack_canonical(a, status);
3152     FloatParts pr = sqrt_float(pa, status, &float16_params);
3153     return float16_round_pack_canonical(pr, status);
3154 }
3155
3156 static float32 QEMU_SOFTFLOAT_ATTR
3157 soft_f32_sqrt(float32 a, float_status *status)
3158 {
3159     FloatParts pa = float32_unpack_canonical(a, status);
3160     FloatParts pr = sqrt_float(pa, status, &float32_params);
3161     return float32_round_pack_canonical(pr, status);
3162 }
3163
3164 static float64 QEMU_SOFTFLOAT_ATTR
3165 soft_f64_sqrt(float64 a, float_status *status)
3166 {
3167     FloatParts pa = float64_unpack_canonical(a, status);
3168     FloatParts pr = sqrt_float(pa, status, &float64_params);
3169     return float64_round_pack_canonical(pr, status);
3170 }
3171
3172 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3173 {
3174     union_float32 ua, ur;
3175
3176     ua.s = xa;
3177     if (unlikely(!can_use_fpu(s))) {
3178         goto soft;
3179     }
3180
3181     float32_input_flush1(&ua.s, s);
3182     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3183         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3184                        fpclassify(ua.h) == FP_ZERO) ||
3185                      signbit(ua.h))) {
3186             goto soft;
3187         }
3188     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3189                         float32_is_neg(ua.s))) {
3190         goto soft;
3191     }
3192     ur.h = sqrtf(ua.h);
3193     return ur.s;
3194
3195  soft:
3196     return soft_f32_sqrt(ua.s, s);
3197 }
3198
3199 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3200 {
3201     union_float64 ua, ur;
3202
3203     ua.s = xa;
3204     if (unlikely(!can_use_fpu(s))) {
3205         goto soft;
3206     }
3207
3208     float64_input_flush1(&ua.s, s);
3209     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3210         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3211                        fpclassify(ua.h) == FP_ZERO) ||
3212                      signbit(ua.h))) {
3213             goto soft;
3214         }
3215     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3216                         float64_is_neg(ua.s))) {
3217         goto soft;
3218     }
3219     ur.h = sqrt(ua.h);
3220     return ur.s;
3221
3222  soft:
3223     return soft_f64_sqrt(ua.s, s);
3224 }
3225
3226 /*----------------------------------------------------------------------------
3227 | The pattern for a default generated NaN.
3228 *----------------------------------------------------------------------------*/
3229
3230 float16 float16_default_nan(float_status *status)
3231 {
3232     FloatParts p = parts_default_nan(status);
3233     p.frac >>= float16_params.frac_shift;
3234     return float16_pack_raw(p);
3235 }
3236
3237 float32 float32_default_nan(float_status *status)
3238 {
3239     FloatParts p = parts_default_nan(status);
3240     p.frac >>= float32_params.frac_shift;
3241     return float32_pack_raw(p);
3242 }
3243
3244 float64 float64_default_nan(float_status *status)
3245 {
3246     FloatParts p = parts_default_nan(status);
3247     p.frac >>= float64_params.frac_shift;
3248     return float64_pack_raw(p);
3249 }
3250
3251 float128 float128_default_nan(float_status *status)
3252 {
3253     FloatParts p = parts_default_nan(status);
3254     float128 r;
3255
3256     /* Extrapolate from the choices made by parts_default_nan to fill
3257      * in the quad-floating format.  If the low bit is set, assume we
3258      * want to set all non-snan bits.
3259      */
3260     r.low = -(p.frac & 1);
3261     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3262     r.high |= LIT64(0x7FFF000000000000);
3263     r.high |= (uint64_t)p.sign << 63;
3264
3265     return r;
3266 }
3267
3268 /*----------------------------------------------------------------------------
3269 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3270 *----------------------------------------------------------------------------*/
3271
3272 float16 float16_silence_nan(float16 a, float_status *status)
3273 {
3274     FloatParts p = float16_unpack_raw(a);
3275     p.frac <<= float16_params.frac_shift;
3276     p = parts_silence_nan(p, status);
3277     p.frac >>= float16_params.frac_shift;
3278     return float16_pack_raw(p);
3279 }
3280
3281 float32 float32_silence_nan(float32 a, float_status *status)
3282 {
3283     FloatParts p = float32_unpack_raw(a);
3284     p.frac <<= float32_params.frac_shift;
3285     p = parts_silence_nan(p, status);
3286     p.frac >>= float32_params.frac_shift;
3287     return float32_pack_raw(p);
3288 }
3289
3290 float64 float64_silence_nan(float64 a, float_status *status)
3291 {
3292     FloatParts p = float64_unpack_raw(a);
3293     p.frac <<= float64_params.frac_shift;
3294     p = parts_silence_nan(p, status);
3295     p.frac >>= float64_params.frac_shift;
3296     return float64_pack_raw(p);
3297 }
3298
3299 /*----------------------------------------------------------------------------
3300 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3301 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3302 | input.  If `zSign' is 1, the input is negated before being converted to an
3303 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3304 | is simply rounded to an integer, with the inexact exception raised if the
3305 | input cannot be represented exactly as an integer.  However, if the fixed-
3306 | point input is too large, the invalid exception is raised and the largest
3307 | positive or negative integer is returned.
3308 *----------------------------------------------------------------------------*/
3309
3310 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3311 {
3312     int8_t roundingMode;
3313     flag roundNearestEven;
3314     int8_t roundIncrement, roundBits;
3315     int32_t z;
3316
3317     roundingMode = status->float_rounding_mode;
3318     roundNearestEven = ( roundingMode == float_round_nearest_even );
3319     switch (roundingMode) {
3320     case float_round_nearest_even:
3321     case float_round_ties_away:
3322         roundIncrement = 0x40;
3323         break;
3324     case float_round_to_zero:
3325         roundIncrement = 0;
3326         break;
3327     case float_round_up:
3328         roundIncrement = zSign ? 0 : 0x7f;
3329         break;
3330     case float_round_down:
3331         roundIncrement = zSign ? 0x7f : 0;
3332         break;
3333     case float_round_to_odd:
3334         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3335         break;
3336     default:
3337         abort();
3338     }
3339     roundBits = absZ & 0x7F;
3340     absZ = ( absZ + roundIncrement )>>7;
3341     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3342     z = absZ;
3343     if ( zSign ) z = - z;
3344     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3345         float_raise(float_flag_invalid, status);
3346         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3347     }
3348     if (roundBits) {
3349         status->float_exception_flags |= float_flag_inexact;
3350     }
3351     return z;
3352
3353 }
3354
3355 /*----------------------------------------------------------------------------
3356 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3357 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3358 | and returns the properly rounded 64-bit integer corresponding to the input.
3359 | If `zSign' is 1, the input is negated before being converted to an integer.
3360 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3361 | the inexact exception raised if the input cannot be represented exactly as
3362 | an integer.  However, if the fixed-point input is too large, the invalid
3363 | exception is raised and the largest positive or negative integer is
3364 | returned.
3365 *----------------------------------------------------------------------------*/
3366
3367 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3368                                float_status *status)
3369 {
3370     int8_t roundingMode;
3371     flag roundNearestEven, increment;
3372     int64_t z;
3373
3374     roundingMode = status->float_rounding_mode;
3375     roundNearestEven = ( roundingMode == float_round_nearest_even );
3376     switch (roundingMode) {
3377     case float_round_nearest_even:
3378     case float_round_ties_away:
3379         increment = ((int64_t) absZ1 < 0);
3380         break;
3381     case float_round_to_zero:
3382         increment = 0;
3383         break;
3384     case float_round_up:
3385         increment = !zSign && absZ1;
3386         break;
3387     case float_round_down:
3388         increment = zSign && absZ1;
3389         break;
3390     case float_round_to_odd:
3391         increment = !(absZ0 & 1) && absZ1;
3392         break;
3393     default:
3394         abort();
3395     }
3396     if ( increment ) {
3397         ++absZ0;
3398         if ( absZ0 == 0 ) goto overflow;
3399         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3400     }
3401     z = absZ0;
3402     if ( zSign ) z = - z;
3403     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3404  overflow:
3405         float_raise(float_flag_invalid, status);
3406         return
3407               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3408             : LIT64( 0x7FFFFFFFFFFFFFFF );
3409     }
3410     if (absZ1) {
3411         status->float_exception_flags |= float_flag_inexact;
3412     }
3413     return z;
3414
3415 }
3416
3417 /*----------------------------------------------------------------------------
3418 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3419 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3420 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3421 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3422 | with the inexact exception raised if the input cannot be represented exactly
3423 | as an integer.  However, if the fixed-point input is too large, the invalid
3424 | exception is raised and the largest unsigned integer is returned.
3425 *----------------------------------------------------------------------------*/
3426
3427 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3428                                 uint64_t absZ1, float_status *status)
3429 {
3430     int8_t roundingMode;
3431     flag roundNearestEven, increment;
3432
3433     roundingMode = status->float_rounding_mode;
3434     roundNearestEven = (roundingMode == float_round_nearest_even);
3435     switch (roundingMode) {
3436     case float_round_nearest_even:
3437     case float_round_ties_away:
3438         increment = ((int64_t)absZ1 < 0);
3439         break;
3440     case float_round_to_zero:
3441         increment = 0;
3442         break;
3443     case float_round_up:
3444         increment = !zSign && absZ1;
3445         break;
3446     case float_round_down:
3447         increment = zSign && absZ1;
3448         break;
3449     case float_round_to_odd:
3450         increment = !(absZ0 & 1) && absZ1;
3451         break;
3452     default:
3453         abort();
3454     }
3455     if (increment) {
3456         ++absZ0;
3457         if (absZ0 == 0) {
3458             float_raise(float_flag_invalid, status);
3459             return LIT64(0xFFFFFFFFFFFFFFFF);
3460         }
3461         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3462     }
3463
3464     if (zSign && absZ0) {
3465         float_raise(float_flag_invalid, status);
3466         return 0;
3467     }
3468
3469     if (absZ1) {
3470         status->float_exception_flags |= float_flag_inexact;
3471     }
3472     return absZ0;
3473 }
3474
3475 /*----------------------------------------------------------------------------
3476 | If `a' is denormal and we are in flush-to-zero mode then set the
3477 | input-denormal exception and return zero. Otherwise just return the value.
3478 *----------------------------------------------------------------------------*/
3479 float32 float32_squash_input_denormal(float32 a, float_status *status)
3480 {
3481     if (status->flush_inputs_to_zero) {
3482         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3483             float_raise(float_flag_input_denormal, status);
3484             return make_float32(float32_val(a) & 0x80000000);
3485         }
3486     }
3487     return a;
3488 }
3489
3490 /*----------------------------------------------------------------------------
3491 | Normalizes the subnormal single-precision floating-point value represented
3492 | by the denormalized significand `aSig'.  The normalized exponent and
3493 | significand are stored at the locations pointed to by `zExpPtr' and
3494 | `zSigPtr', respectively.
3495 *----------------------------------------------------------------------------*/
3496
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Distance to move the leading set bit up to bit 23 (31 - 8). */
    const int8_t lshift = clz32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    /* Each position shifted lowers the exponent by one, starting at 1. */
    *zExpPtr = 1 - lshift;
}
3507
3508 /*----------------------------------------------------------------------------
3509 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3510 | and significand `zSig', and returns the proper single-precision floating-
3511 | point value corresponding to the abstract input.  Ordinarily, the abstract
3512 | value is simply rounded and packed into the single-precision format, with
3513 | the inexact exception raised if the abstract input cannot be represented
3514 | exactly.  However, if the abstract value is too large, the overflow and
3515 | inexact exceptions are raised and an infinity or maximal finite value is
3516 | returned.  If the abstract value is too small, the input value is rounded to
3517 | a subnormal number, and the underflow and inexact exceptions are raised if
3518 | the abstract input cannot be represented exactly as a subnormal single-
3519 | precision floating-point number.
3520 |     The input significand `zSig' has its binary point between bits 30
3521 | and 29, which is 7 bits to the left of the usual location.  This shifted
3522 | significand must be normalized or smaller.  If `zSig' is not normalized,
3523 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3524 | and it must not require rounding.  In the usual case that `zSig' is
3525 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3526 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3527 | Binary Floating-Point Arithmetic.
3528 *----------------------------------------------------------------------------*/
3529
3530 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3531                                    float_status *status)
3532 {
3533     int8_t roundingMode;
3534     flag roundNearestEven;
3535     int8_t roundIncrement, roundBits;
3536     flag isTiny;
3537
3538     roundingMode = status->float_rounding_mode;
3539     roundNearestEven = ( roundingMode == float_round_nearest_even );
3540     switch (roundingMode) {
3541     case float_round_nearest_even:
3542     case float_round_ties_away:
3543         roundIncrement = 0x40;
3544         break;
3545     case float_round_to_zero:
3546         roundIncrement = 0;
3547         break;
3548     case float_round_up:
3549         roundIncrement = zSign ? 0 : 0x7f;
3550         break;
3551     case float_round_down:
3552         roundIncrement = zSign ? 0x7f : 0;
3553         break;
3554     case float_round_to_odd:
3555         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3556         break;
3557     default:
3558         abort();
3559         break;
3560     }
3561     roundBits = zSig & 0x7F;
3562     if ( 0xFD <= (uint16_t) zExp ) {
3563         if (    ( 0xFD < zExp )
3564              || (    ( zExp == 0xFD )
3565                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3566            ) {
3567             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3568                                    roundIncrement != 0;
3569             float_raise(float_flag_overflow | float_flag_inexact, status);
3570             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3571         }
3572         if ( zExp < 0 ) {
3573             if (status->flush_to_zero) {
3574                 float_raise(float_flag_output_denormal, status);
3575                 return packFloat32(zSign, 0, 0);
3576             }
3577             isTiny =
3578                 (status->float_detect_tininess
3579                  == float_tininess_before_rounding)
3580                 || ( zExp < -1 )
3581                 || ( zSig + roundIncrement < 0x80000000 );
3582             shift32RightJamming( zSig, - zExp, &zSig );
3583             zExp = 0;
3584             roundBits = zSig & 0x7F;
3585             if (isTiny && roundBits) {
3586                 float_raise(float_flag_underflow, status);
3587             }
3588             if (roundingMode == float_round_to_odd) {
3589                 /*
3590                  * For round-to-odd case, the roundIncrement depends on
3591                  * zSig which just changed.
3592                  */
3593                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3594             }
3595         }
3596     }
3597     if (roundBits) {
3598         status->float_exception_flags |= float_flag_inexact;
3599     }
3600     zSig = ( zSig + roundIncrement )>>7;
3601     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3602     if ( zSig == 0 ) zExp = 0;
3603     return packFloat32( zSign, zExp, zSig );
3604
3605 }
3606
3607 /*----------------------------------------------------------------------------
3608 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3609 | and significand `zSig', and returns the proper single-precision floating-
3610 | point value corresponding to the abstract input.  This routine is just like
3611 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3612 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3613 | floating-point exponent.
3614 *----------------------------------------------------------------------------*/
3615
3616 static float32
3617  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3618                               float_status *status)
3619 {
3620     int8_t shiftCount;
3621
3622     shiftCount = clz32(zSig) - 1;
3623     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3624                                status);
3625
3626 }
3627
3628 /*----------------------------------------------------------------------------
3629 | If `a' is denormal and we are in flush-to-zero mode then set the
3630 | input-denormal exception and return zero. Otherwise just return the value.
3631 *----------------------------------------------------------------------------*/
3632 float64 float64_squash_input_denormal(float64 a, float_status *status)
3633 {
3634     if (status->flush_inputs_to_zero) {
3635         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3636             float_raise(float_flag_input_denormal, status);
3637             return make_float64(float64_val(a) & (1ULL << 63));
3638         }
3639     }
3640     return a;
3641 }
3642
/*----------------------------------------------------------------------------
| Normalizes a subnormal double-precision significand.  `aSig' is shifted
| left by `clz64(aSig) - 11' bit positions and the shifted significand is
| written to `*zSigPtr'; the matching exponent, 1 minus that shift distance,
| is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr,
                                      uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig) - 11;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3660
3661 /*----------------------------------------------------------------------------
3662 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3663 | double-precision floating-point value, returning the result.  After being
3664 | shifted into the proper positions, the three fields are simply added
3665 | together to form the result.  This means that any integer portion of `zSig'
3666 | will be added into the exponent.  Since a properly normalized significand
3667 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3668 | than the desired result exponent whenever `zSig' is a complete, normalized
3669 | significand.
3670 *----------------------------------------------------------------------------*/
3671
3672 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3673 {
3674
3675     return make_float64(
3676         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3677
3678 }
3679
3680 /*----------------------------------------------------------------------------
3681 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3682 | and significand `zSig', and returns the proper double-precision floating-
3683 | point value corresponding to the abstract input.  Ordinarily, the abstract
3684 | value is simply rounded and packed into the double-precision format, with
3685 | the inexact exception raised if the abstract input cannot be represented
3686 | exactly.  However, if the abstract value is too large, the overflow and
3687 | inexact exceptions are raised and an infinity or maximal finite value is
3688 | returned.  If the abstract value is too small, the input value is rounded to
3689 | a subnormal number, and the underflow and inexact exceptions are raised if
3690 | the abstract input cannot be represented exactly as a subnormal double-
3691 | precision floating-point number.
3692 |     The input significand `zSig' has its binary point between bits 62
3693 | and 61, which is 10 bits to the left of the usual location.  This shifted
3694 | significand must be normalized or smaller.  If `zSig' is not normalized,
3695 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3696 | and it must not require rounding.  In the usual case that `zSig' is
3697 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3698 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3699 | Binary Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3701
3702 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3703                                    float_status *status)
3704 {
3705     int8_t roundingMode;
3706     flag roundNearestEven;
3707     int roundIncrement, roundBits;
3708     flag isTiny;
3709
3710     roundingMode = status->float_rounding_mode;
3711     roundNearestEven = ( roundingMode == float_round_nearest_even );
3712     switch (roundingMode) {
3713     case float_round_nearest_even:
3714     case float_round_ties_away:
3715         roundIncrement = 0x200;
3716         break;
3717     case float_round_to_zero:
3718         roundIncrement = 0;
3719         break;
3720     case float_round_up:
3721         roundIncrement = zSign ? 0 : 0x3ff;
3722         break;
3723     case float_round_down:
3724         roundIncrement = zSign ? 0x3ff : 0;
3725         break;
3726     case float_round_to_odd:
3727         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3728         break;
3729     default:
3730         abort();
3731     }
3732     roundBits = zSig & 0x3FF;
3733     if ( 0x7FD <= (uint16_t) zExp ) {
3734         if (    ( 0x7FD < zExp )
3735              || (    ( zExp == 0x7FD )
3736                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3737            ) {
3738             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3739                                    roundIncrement != 0;
3740             float_raise(float_flag_overflow | float_flag_inexact, status);
3741             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3742         }
3743         if ( zExp < 0 ) {
3744             if (status->flush_to_zero) {
3745                 float_raise(float_flag_output_denormal, status);
3746                 return packFloat64(zSign, 0, 0);
3747             }
3748             isTiny =
3749                    (status->float_detect_tininess
3750                     == float_tininess_before_rounding)
3751                 || ( zExp < -1 )
3752                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3753             shift64RightJamming( zSig, - zExp, &zSig );
3754             zExp = 0;
3755             roundBits = zSig & 0x3FF;
3756             if (isTiny && roundBits) {
3757                 float_raise(float_flag_underflow, status);
3758             }
3759             if (roundingMode == float_round_to_odd) {
3760                 /*
3761                  * For round-to-odd case, the roundIncrement depends on
3762                  * zSig which just changed.
3763                  */
3764                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3765             }
3766         }
3767     }
3768     if (roundBits) {
3769         status->float_exception_flags |= float_flag_inexact;
3770     }
3771     zSig = ( zSig + roundIncrement )>>10;
3772     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3773     if ( zSig == 0 ) zExp = 0;
3774     return packFloat64( zSign, zExp, zSig );
3775
3776 }
3777
3778 /*----------------------------------------------------------------------------
3779 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3780 | and significand `zSig', and returns the proper double-precision floating-
3781 | point value corresponding to the abstract input.  This routine is just like
3782 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3783 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3784 | floating-point exponent.
3785 *----------------------------------------------------------------------------*/
3786
3787 static float64
3788  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3789                               float_status *status)
3790 {
3791     int8_t shiftCount;
3792
3793     shiftCount = clz64(zSig) - 1;
3794     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3795                                status);
3796
3797 }
3798
/*----------------------------------------------------------------------------
| Normalizes a subnormal extended double-precision significand.  `aSig' is
| shifted left by `clz64(aSig)' bit positions and the shifted significand is
| written to `*zSigPtr'; the matching exponent, 1 minus that shift distance,
| is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig);

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3815
3816 /*----------------------------------------------------------------------------
3817 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3818 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3819 | and returns the proper extended double-precision floating-point value
3820 | corresponding to the abstract input.  Ordinarily, the abstract value is
3821 | rounded and packed into the extended double-precision format, with the
3822 | inexact exception raised if the abstract input cannot be represented
3823 | exactly.  However, if the abstract value is too large, the overflow and
3824 | inexact exceptions are raised and an infinity or maximal finite value is
3825 | returned.  If the abstract value is too small, the input value is rounded to
3826 | a subnormal number, and the underflow and inexact exceptions are raised if
3827 | the abstract input cannot be represented exactly as a subnormal extended
3828 | double-precision floating-point number.
3829 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3830 | number of bits as single or double precision, respectively.  Otherwise, the
3831 | result is rounded to the full precision of the extended double-precision
3832 | format.
3833 |     The input significand must be normalized or smaller.  If the input
3834 | significand is not normalized, `zExp' must be 0; in that case, the result
3835 | returned is a subnormal number, and it must not require rounding.  The
3836 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3837 | Floating-Point Arithmetic.
3838 *----------------------------------------------------------------------------*/
3839
3840 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3841                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3842                               float_status *status)
3843 {
3844     int8_t roundingMode;
3845     flag roundNearestEven, increment, isTiny;
3846     int64_t roundIncrement, roundMask, roundBits;
3847
3848     roundingMode = status->float_rounding_mode;
3849     roundNearestEven = ( roundingMode == float_round_nearest_even );
3850     if ( roundingPrecision == 80 ) goto precision80;
3851     if ( roundingPrecision == 64 ) {
3852         roundIncrement = LIT64( 0x0000000000000400 );
3853         roundMask = LIT64( 0x00000000000007FF );
3854     }
3855     else if ( roundingPrecision == 32 ) {
3856         roundIncrement = LIT64( 0x0000008000000000 );
3857         roundMask = LIT64( 0x000000FFFFFFFFFF );
3858     }
3859     else {
3860         goto precision80;
3861     }
3862     zSig0 |= ( zSig1 != 0 );
3863     switch (roundingMode) {
3864     case float_round_nearest_even:
3865     case float_round_ties_away:
3866         break;
3867     case float_round_to_zero:
3868         roundIncrement = 0;
3869         break;
3870     case float_round_up:
3871         roundIncrement = zSign ? 0 : roundMask;
3872         break;
3873     case float_round_down:
3874         roundIncrement = zSign ? roundMask : 0;
3875         break;
3876     default:
3877         abort();
3878     }
3879     roundBits = zSig0 & roundMask;
3880     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3881         if (    ( 0x7FFE < zExp )
3882              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3883            ) {
3884             goto overflow;
3885         }
3886         if ( zExp <= 0 ) {
3887             if (status->flush_to_zero) {
3888                 float_raise(float_flag_output_denormal, status);
3889                 return packFloatx80(zSign, 0, 0);
3890             }
3891             isTiny =
3892                    (status->float_detect_tininess
3893                     == float_tininess_before_rounding)
3894                 || ( zExp < 0 )
3895                 || ( zSig0 <= zSig0 + roundIncrement );
3896             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3897             zExp = 0;
3898             roundBits = zSig0 & roundMask;
3899             if (isTiny && roundBits) {
3900                 float_raise(float_flag_underflow, status);
3901             }
3902             if (roundBits) {
3903                 status->float_exception_flags |= float_flag_inexact;
3904             }
3905             zSig0 += roundIncrement;
3906             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3907             roundIncrement = roundMask + 1;
3908             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3909                 roundMask |= roundIncrement;
3910             }
3911             zSig0 &= ~ roundMask;
3912             return packFloatx80( zSign, zExp, zSig0 );
3913         }
3914     }
3915     if (roundBits) {
3916         status->float_exception_flags |= float_flag_inexact;
3917     }
3918     zSig0 += roundIncrement;
3919     if ( zSig0 < roundIncrement ) {
3920         ++zExp;
3921         zSig0 = LIT64( 0x8000000000000000 );
3922     }
3923     roundIncrement = roundMask + 1;
3924     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3925         roundMask |= roundIncrement;
3926     }
3927     zSig0 &= ~ roundMask;
3928     if ( zSig0 == 0 ) zExp = 0;
3929     return packFloatx80( zSign, zExp, zSig0 );
3930  precision80:
3931     switch (roundingMode) {
3932     case float_round_nearest_even:
3933     case float_round_ties_away:
3934         increment = ((int64_t)zSig1 < 0);
3935         break;
3936     case float_round_to_zero:
3937         increment = 0;
3938         break;
3939     case float_round_up:
3940         increment = !zSign && zSig1;
3941         break;
3942     case float_round_down:
3943         increment = zSign && zSig1;
3944         break;
3945     default:
3946         abort();
3947     }
3948     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3949         if (    ( 0x7FFE < zExp )
3950              || (    ( zExp == 0x7FFE )
3951                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3952                   && increment
3953                 )
3954            ) {
3955             roundMask = 0;
3956  overflow:
3957             float_raise(float_flag_overflow | float_flag_inexact, status);
3958             if (    ( roundingMode == float_round_to_zero )
3959                  || ( zSign && ( roundingMode == float_round_up ) )
3960                  || ( ! zSign && ( roundingMode == float_round_down ) )
3961                ) {
3962                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3963             }
3964             return packFloatx80(zSign,
3965                                 floatx80_infinity_high,
3966                                 floatx80_infinity_low);
3967         }
3968         if ( zExp <= 0 ) {
3969             isTiny =
3970                    (status->float_detect_tininess
3971                     == float_tininess_before_rounding)
3972                 || ( zExp < 0 )
3973                 || ! increment
3974                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3975             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3976             zExp = 0;
3977             if (isTiny && zSig1) {
3978                 float_raise(float_flag_underflow, status);
3979             }
3980             if (zSig1) {
3981                 status->float_exception_flags |= float_flag_inexact;
3982             }
3983             switch (roundingMode) {
3984             case float_round_nearest_even:
3985             case float_round_ties_away:
3986                 increment = ((int64_t)zSig1 < 0);
3987                 break;
3988             case float_round_to_zero:
3989                 increment = 0;
3990                 break;
3991             case float_round_up:
3992                 increment = !zSign && zSig1;
3993                 break;
3994             case float_round_down:
3995                 increment = zSign && zSig1;
3996                 break;
3997             default:
3998                 abort();
3999             }
4000             if ( increment ) {
4001                 ++zSig0;
4002                 zSig0 &=
4003                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4004                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4005             }
4006             return packFloatx80( zSign, zExp, zSig0 );
4007         }
4008     }
4009     if (zSig1) {
4010         status->float_exception_flags |= float_flag_inexact;
4011     }
4012     if ( increment ) {
4013         ++zSig0;
4014         if ( zSig0 == 0 ) {
4015             ++zExp;
4016             zSig0 = LIT64( 0x8000000000000000 );
4017         }
4018         else {
4019             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4020         }
4021     }
4022     else {
4023         if ( zSig0 == 0 ) zExp = 0;
4024     }
4025     return packFloatx80( zSign, zExp, zSig0 );
4026
4027 }
4028
4029 /*----------------------------------------------------------------------------
4030 | Takes an abstract floating-point value having sign `zSign', exponent
4031 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4032 | and returns the proper extended double-precision floating-point value
4033 | corresponding to the abstract input.  This routine is just like
4034 | `roundAndPackFloatx80' except that the input significand does not have to be
4035 | normalized.
4036 *----------------------------------------------------------------------------*/
4037
4038 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4039                                        flag zSign, int32_t zExp,
4040                                        uint64_t zSig0, uint64_t zSig1,
4041                                        float_status *status)
4042 {
4043     int8_t shiftCount;
4044
4045     if ( zSig0 == 0 ) {
4046         zSig0 = zSig1;
4047         zSig1 = 0;
4048         zExp -= 64;
4049     }
4050     shiftCount = clz64(zSig0);
4051     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4052     zExp -= shiftCount;
4053     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4054                                 zSig0, zSig1, status);
4055
4056 }
4057
4058 /*----------------------------------------------------------------------------
4059 | Returns the least-significant 64 fraction bits of the quadruple-precision
4060 | floating-point value `a'.
4061 *----------------------------------------------------------------------------*/
4062
4063 static inline uint64_t extractFloat128Frac1( float128 a )
4064 {
4065
4066     return a.low;
4067
4068 }
4069
4070 /*----------------------------------------------------------------------------
4071 | Returns the most-significant 48 fraction bits of the quadruple-precision
4072 | floating-point value `a'.
4073 *----------------------------------------------------------------------------*/
4074
4075 static inline uint64_t extractFloat128Frac0( float128 a )
4076 {
4077
4078     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4079
4080 }
4081
4082 /*----------------------------------------------------------------------------
4083 | Returns the exponent bits of the quadruple-precision floating-point value
4084 | `a'.
4085 *----------------------------------------------------------------------------*/
4086
4087 static inline int32_t extractFloat128Exp( float128 a )
4088 {
4089
4090     return ( a.high>>48 ) & 0x7FFF;
4091
4092 }
4093
4094 /*----------------------------------------------------------------------------
4095 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4096 *----------------------------------------------------------------------------*/
4097
4098 static inline flag extractFloat128Sign( float128 a )
4099 {
4100
4101     return a.high>>63;
4102
4103 }
4104
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' (upper word) and `aSig1' (lower word).  The
| normalized exponent is written to `*zExpPtr', the most significant 49 bits
| of the normalized significand to `*zSig0Ptr', and the least significant 64
| bits to `*zSig1Ptr'.
|     Two cases are distinguished.  With a non-zero upper word the pair is
| shifted left by `clz64(aSig0) - 15'.  With a zero upper word the shift is
| computed from the lower word instead, and that word is split across the two
| outputs (or moved entirely into the upper output) accordingly.
*----------------------------------------------------------------------------*/

static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t leftShift;

    if (aSig0 != 0) {
        leftShift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, leftShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - leftShift;
        return;
    }

    /* Upper word is empty: every significant bit lives in aSig1. */
    leftShift = clz64(aSig1) - 15;
    if (leftShift >= 0) {
        /* All of aSig1 fits in the upper output word. */
        *zSig0Ptr = aSig1 << leftShift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: aSig1 straddles both words. */
        *zSig0Ptr = aSig1 >> (-leftShift);
        *zSig1Ptr = aSig1 << (leftShift & 63);
    }
    *zExpPtr = -leftShift - 63;
}
4145
4146 /*----------------------------------------------------------------------------
4147 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4148 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4149 | floating-point value, returning the result.  After being shifted into the
4150 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4151 | added together to form the most significant 32 bits of the result.  This
4152 | means that any integer portion of `zSig0' will be added into the exponent.
4153 | Since a properly normalized significand will have an integer portion equal
4154 | to 1, the `zExp' input should be 1 less than the desired result exponent
4155 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4156 | significand.
4157 *----------------------------------------------------------------------------*/
4158
4159 static inline float128
4160  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4161 {
4162     float128 z;
4163
4164     z.low = zSig1;
4165     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4166     return z;
4167
4168 }
4169
4170 /*----------------------------------------------------------------------------
4171 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4172 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4173 | and `zSig2', and returns the proper quadruple-precision floating-point value
4174 | corresponding to the abstract input.  Ordinarily, the abstract value is
4175 | simply rounded and packed into the quadruple-precision format, with the
4176 | inexact exception raised if the abstract input cannot be represented
4177 | exactly.  However, if the abstract value is too large, the overflow and
4178 | inexact exceptions are raised and an infinity or maximal finite value is
4179 | returned.  If the abstract value is too small, the input value is rounded to
4180 | a subnormal number, and the underflow and inexact exceptions are raised if
4181 | the abstract input cannot be represented exactly as a subnormal quadruple-
4182 | precision floating-point number.
4183 |     The input significand must be normalized or smaller.  If the input
4184 | significand is not normalized, `zExp' must be 0; in that case, the result
4185 | returned is a subnormal number, and it must not require rounding.  In the
4186 | usual case that the input significand is normalized, `zExp' must be 1 less
4187 | than the ``true'' floating-point exponent.  The handling of underflow and
4188 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4189 *----------------------------------------------------------------------------*/
4190
4191 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4192                                      uint64_t zSig0, uint64_t zSig1,
4193                                      uint64_t zSig2, float_status *status)
4194 {
4195     int8_t roundingMode;
4196     flag roundNearestEven, increment, isTiny;
4197
4198     roundingMode = status->float_rounding_mode;
4199     roundNearestEven = ( roundingMode == float_round_nearest_even );
4200     switch (roundingMode) {
4201     case float_round_nearest_even:
4202     case float_round_ties_away:
4203         increment = ((int64_t)zSig2 < 0);
4204         break;
4205     case float_round_to_zero:
4206         increment = 0;
4207         break;
4208     case float_round_up:
4209         increment = !zSign && zSig2;
4210         break;
4211     case float_round_down:
4212         increment = zSign && zSig2;
4213         break;
4214     case float_round_to_odd:
4215         increment = !(zSig1 & 0x1) && zSig2;
4216         break;
4217     default:
4218         abort();
4219     }
4220     if ( 0x7FFD <= (uint32_t) zExp ) {
4221         if (    ( 0x7FFD < zExp )
4222              || (    ( zExp == 0x7FFD )
4223                   && eq128(
4224                          LIT64( 0x0001FFFFFFFFFFFF ),
4225                          LIT64( 0xFFFFFFFFFFFFFFFF ),
4226                          zSig0,
4227                          zSig1
4228                      )
4229                   && increment
4230                 )
4231            ) {
4232             float_raise(float_flag_overflow | float_flag_inexact, status);
4233             if (    ( roundingMode == float_round_to_zero )
4234                  || ( zSign && ( roundingMode == float_round_up ) )
4235                  || ( ! zSign && ( roundingMode == float_round_down ) )
4236                  || (roundingMode == float_round_to_odd)
4237                ) {
4238                 return
4239                     packFloat128(
4240                         zSign,
4241                         0x7FFE,
4242                         LIT64( 0x0000FFFFFFFFFFFF ),
4243                         LIT64( 0xFFFFFFFFFFFFFFFF )
4244                     );
4245             }
4246             return packFloat128( zSign, 0x7FFF, 0, 0 );
4247         }
4248         if ( zExp < 0 ) {
4249             if (status->flush_to_zero) {
4250                 float_raise(float_flag_output_denormal, status);
4251                 return packFloat128(zSign, 0, 0, 0);
4252             }
4253             isTiny =
4254                    (status->float_detect_tininess
4255                     == float_tininess_before_rounding)
4256                 || ( zExp < -1 )
4257                 || ! increment
4258                 || lt128(
4259                        zSig0,
4260                        zSig1,
4261                        LIT64( 0x0001FFFFFFFFFFFF ),
4262                        LIT64( 0xFFFFFFFFFFFFFFFF )
4263                    );
4264             shift128ExtraRightJamming(
4265                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4266             zExp = 0;
4267             if (isTiny && zSig2) {
4268                 float_raise(float_flag_underflow, status);
4269             }
4270             switch (roundingMode) {
4271             case float_round_nearest_even:
4272             case float_round_ties_away:
4273                 increment = ((int64_t)zSig2 < 0);
4274                 break;
4275             case float_round_to_zero:
4276                 increment = 0;
4277                 break;
4278             case float_round_up:
4279                 increment = !zSign && zSig2;
4280                 break;
4281             case float_round_down:
4282                 increment = zSign && zSig2;
4283                 break;
4284             case float_round_to_odd:
4285                 increment = !(zSig1 & 0x1) && zSig2;
4286                 break;
4287             default:
4288                 abort();
4289             }
4290         }
4291     }
4292     if (zSig2) {
4293         status->float_exception_flags |= float_flag_inexact;
4294     }
4295     if ( increment ) {
4296         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4297         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4298     }
4299     else {
4300         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4301     }
4302     return packFloat128( zSign, zExp, zSig0, zSig1 );
4303
4304 }
4305
4306 /*----------------------------------------------------------------------------
4307 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4308 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4309 | returns the proper quadruple-precision floating-point value corresponding
4310 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4311 | except that the input significand has fewer bits and does not have to be
4312 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4313 | point exponent.
4314 *----------------------------------------------------------------------------*/
4315
4316 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4317                                               uint64_t zSig0, uint64_t zSig1,
4318                                               float_status *status)
4319 {
4320     int8_t shiftCount;
4321     uint64_t zSig2;
4322
4323     if ( zSig0 == 0 ) {
4324         zSig0 = zSig1;
4325         zSig1 = 0;
4326         zExp -= 64;
4327     }
4328     shiftCount = clz64(zSig0) - 15;
4329     if ( 0 <= shiftCount ) {
4330         zSig2 = 0;
4331         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4332     }
4333     else {
4334         shift128ExtraRightJamming(
4335             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4336     }
4337     zExp -= shiftCount;
4338     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4339
4340 }
4341
4342
4343 /*----------------------------------------------------------------------------
4344 | Returns the result of converting the 32-bit two's complement integer `a'
4345 | to the extended double-precision floating-point format.  The conversion
4346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4347 | Arithmetic.
4348 *----------------------------------------------------------------------------*/
4349
4350 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4351 {
4352     flag zSign;
4353     uint32_t absA;
4354     int8_t shiftCount;
4355     uint64_t zSig;
4356
4357     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4358     zSign = ( a < 0 );
4359     absA = zSign ? - a : a;
4360     shiftCount = clz32(absA) + 32;
4361     zSig = absA;
4362     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4363
4364 }
4365
4366 /*----------------------------------------------------------------------------
4367 | Returns the result of converting the 32-bit two's complement integer `a' to
4368 | the quadruple-precision floating-point format.  The conversion is performed
4369 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4370 *----------------------------------------------------------------------------*/
4371
4372 float128 int32_to_float128(int32_t a, float_status *status)
4373 {
4374     flag zSign;
4375     uint32_t absA;
4376     int8_t shiftCount;
4377     uint64_t zSig0;
4378
4379     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4380     zSign = ( a < 0 );
4381     absA = zSign ? - a : a;
4382     shiftCount = clz32(absA) + 17;
4383     zSig0 = absA;
4384     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4385
4386 }
4387
4388 /*----------------------------------------------------------------------------
4389 | Returns the result of converting the 64-bit two's complement integer `a'
4390 | to the extended double-precision floating-point format.  The conversion
4391 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4392 | Arithmetic.
4393 *----------------------------------------------------------------------------*/
4394
4395 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4396 {
4397     flag zSign;
4398     uint64_t absA;
4399     int8_t shiftCount;
4400
4401     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4402     zSign = ( a < 0 );
4403     absA = zSign ? - a : a;
4404     shiftCount = clz64(absA);
4405     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4406
4407 }
4408
4409 /*----------------------------------------------------------------------------
4410 | Returns the result of converting the 64-bit two's complement integer `a' to
4411 | the quadruple-precision floating-point format.  The conversion is performed
4412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4413 *----------------------------------------------------------------------------*/
4414
4415 float128 int64_to_float128(int64_t a, float_status *status)
4416 {
4417     flag zSign;
4418     uint64_t absA;
4419     int8_t shiftCount;
4420     int32_t zExp;
4421     uint64_t zSig0, zSig1;
4422
4423     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4424     zSign = ( a < 0 );
4425     absA = zSign ? - a : a;
4426     shiftCount = clz64(absA) + 49;
4427     zExp = 0x406E - shiftCount;
4428     if ( 64 <= shiftCount ) {
4429         zSig1 = 0;
4430         zSig0 = absA;
4431         shiftCount -= 64;
4432     }
4433     else {
4434         zSig1 = absA;
4435         zSig0 = 0;
4436     }
4437     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4438     return packFloat128( zSign, zExp, zSig0, zSig1 );
4439
4440 }
4441
4442 /*----------------------------------------------------------------------------
4443 | Returns the result of converting the 64-bit unsigned integer `a'
4444 | to the quadruple-precision floating-point format.  The conversion is performed
4445 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4446 *----------------------------------------------------------------------------*/
4447
4448 float128 uint64_to_float128(uint64_t a, float_status *status)
4449 {
4450     if (a == 0) {
4451         return float128_zero;
4452     }
4453     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4454 }
4455
4456 /*----------------------------------------------------------------------------
4457 | Returns the result of converting the single-precision floating-point value
4458 | `a' to the extended double-precision floating-point format.  The conversion
4459 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4460 | Arithmetic.
4461 *----------------------------------------------------------------------------*/
4462
4463 floatx80 float32_to_floatx80(float32 a, float_status *status)
4464 {
4465     flag aSign;
4466     int aExp;
4467     uint32_t aSig;
4468
4469     a = float32_squash_input_denormal(a, status);
4470     aSig = extractFloat32Frac( a );
4471     aExp = extractFloat32Exp( a );
4472     aSign = extractFloat32Sign( a );
4473     if ( aExp == 0xFF ) {
4474         if (aSig) {
4475             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4476         }
4477         return packFloatx80(aSign,
4478                             floatx80_infinity_high,
4479                             floatx80_infinity_low);
4480     }
4481     if ( aExp == 0 ) {
4482         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4483         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4484     }
4485     aSig |= 0x00800000;
4486     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4487
4488 }
4489
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
4496
4497 float128 float32_to_float128(float32 a, float_status *status)
4498 {
4499     flag aSign;
4500     int aExp;
4501     uint32_t aSig;
4502
4503     a = float32_squash_input_denormal(a, status);
4504     aSig = extractFloat32Frac( a );
4505     aExp = extractFloat32Exp( a );
4506     aSign = extractFloat32Sign( a );
4507     if ( aExp == 0xFF ) {
4508         if (aSig) {
4509             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4510         }
4511         return packFloat128( aSign, 0x7FFF, 0, 0 );
4512     }
4513     if ( aExp == 0 ) {
4514         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4515         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4516         --aExp;
4517     }
4518     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4519
4520 }
4521
4522 /*----------------------------------------------------------------------------
4523 | Returns the remainder of the single-precision floating-point value `a'
4524 | with respect to the corresponding value `b'.  The operation is performed
4525 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4526 *----------------------------------------------------------------------------*/
4527
4528 float32 float32_rem(float32 a, float32 b, float_status *status)
4529 {
4530     flag aSign, zSign;
4531     int aExp, bExp, expDiff;
4532     uint32_t aSig, bSig;
4533     uint32_t q;
4534     uint64_t aSig64, bSig64, q64;
4535     uint32_t alternateASig;
4536     int32_t sigMean;
4537     a = float32_squash_input_denormal(a, status);
4538     b = float32_squash_input_denormal(b, status);
4539
4540     aSig = extractFloat32Frac( a );
4541     aExp = extractFloat32Exp( a );
4542     aSign = extractFloat32Sign( a );
4543     bSig = extractFloat32Frac( b );
4544     bExp = extractFloat32Exp( b );
4545     if ( aExp == 0xFF ) {
4546         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4547             return propagateFloat32NaN(a, b, status);
4548         }
4549         float_raise(float_flag_invalid, status);
4550         return float32_default_nan(status);
4551     }
4552     if ( bExp == 0xFF ) {
4553         if (bSig) {
4554             return propagateFloat32NaN(a, b, status);
4555         }
4556         return a;
4557     }
4558     if ( bExp == 0 ) {
4559         if ( bSig == 0 ) {
4560             float_raise(float_flag_invalid, status);
4561             return float32_default_nan(status);
4562         }
4563         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4564     }
4565     if ( aExp == 0 ) {
4566         if ( aSig == 0 ) return a;
4567         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4568     }
4569     expDiff = aExp - bExp;
4570     aSig |= 0x00800000;
4571     bSig |= 0x00800000;
4572     if ( expDiff < 32 ) {
4573         aSig <<= 8;
4574         bSig <<= 8;
4575         if ( expDiff < 0 ) {
4576             if ( expDiff < -1 ) return a;
4577             aSig >>= 1;
4578         }
4579         q = ( bSig <= aSig );
4580         if ( q ) aSig -= bSig;
4581         if ( 0 < expDiff ) {
4582             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4583             q >>= 32 - expDiff;
4584             bSig >>= 2;
4585             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4586         }
4587         else {
4588             aSig >>= 2;
4589             bSig >>= 2;
4590         }
4591     }
4592     else {
4593         if ( bSig <= aSig ) aSig -= bSig;
4594         aSig64 = ( (uint64_t) aSig )<<40;
4595         bSig64 = ( (uint64_t) bSig )<<40;
4596         expDiff -= 64;
4597         while ( 0 < expDiff ) {
4598             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4599             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4600             aSig64 = - ( ( bSig * q64 )<<38 );
4601             expDiff -= 62;
4602         }
4603         expDiff += 64;
4604         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4605         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4606         q = q64>>( 64 - expDiff );
4607         bSig <<= 6;
4608         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4609     }
4610     do {
4611         alternateASig = aSig;
4612         ++q;
4613         aSig -= bSig;
4614     } while ( 0 <= (int32_t) aSig );
4615     sigMean = aSig + alternateASig;
4616     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4617         aSig = alternateASig;
4618     }
4619     zSign = ( (int32_t) aSig < 0 );
4620     if ( zSign ) aSig = - aSig;
4621     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4622 }
4623
4624
4625
4626 /*----------------------------------------------------------------------------
4627 | Returns the binary exponential of the single-precision floating-point value
4628 | `a'. The operation is performed according to the IEC/IEEE Standard for
4629 | Binary Floating-Point Arithmetic.
4630 |
4631 | Uses the following identities:
4632 |
4633 | 1. -------------------------------------------------------------------------
4634 |      x    x*ln(2)
4635 |     2  = e
4636 |
4637 | 2. -------------------------------------------------------------------------
4638 |                      2     3     4     5           n
4639 |      x        x     x     x     x     x           x
4640 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4641 |               1!    2!    3!    4!    5!          n!
4642 *----------------------------------------------------------------------------*/
4643
4644 static const float64 float32_exp2_coefficients[15] =
4645 {
4646     const_float64( 0x3ff0000000000000ll ), /*  1 */
4647     const_float64( 0x3fe0000000000000ll ), /*  2 */
4648     const_float64( 0x3fc5555555555555ll ), /*  3 */
4649     const_float64( 0x3fa5555555555555ll ), /*  4 */
4650     const_float64( 0x3f81111111111111ll ), /*  5 */
4651     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4652     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4653     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4654     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4655     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4656     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4657     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4658     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4659     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4660     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4661 };
4662
4663 float32 float32_exp2(float32 a, float_status *status)
4664 {
4665     flag aSign;
4666     int aExp;
4667     uint32_t aSig;
4668     float64 r, x, xn;
4669     int i;
4670     a = float32_squash_input_denormal(a, status);
4671
4672     aSig = extractFloat32Frac( a );
4673     aExp = extractFloat32Exp( a );
4674     aSign = extractFloat32Sign( a );
4675
4676     if ( aExp == 0xFF) {
4677         if (aSig) {
4678             return propagateFloat32NaN(a, float32_zero, status);
4679         }
4680         return (aSign) ? float32_zero : a;
4681     }
4682     if (aExp == 0) {
4683         if (aSig == 0) return float32_one;
4684     }
4685
4686     float_raise(float_flag_inexact, status);
4687
4688     /* ******************************* */
4689     /* using float64 for approximation */
4690     /* ******************************* */
4691     x = float32_to_float64(a, status);
4692     x = float64_mul(x, float64_ln2, status);
4693
4694     xn = x;
4695     r = float64_one;
4696     for (i = 0 ; i < 15 ; i++) {
4697         float64 f;
4698
4699         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4700         r = float64_add(r, f, status);
4701
4702         xn = float64_mul(xn, x, status);
4703     }
4704
4705     return float64_to_float32(r, status);
4706 }
4707
4708 /*----------------------------------------------------------------------------
4709 | Returns the binary log of the single-precision floating-point value `a'.
4710 | The operation is performed according to the IEC/IEEE Standard for Binary
4711 | Floating-Point Arithmetic.
4712 *----------------------------------------------------------------------------*/
4713 float32 float32_log2(float32 a, float_status *status)
4714 {
4715     flag aSign, zSign;
4716     int aExp;
4717     uint32_t aSig, zSig, i;
4718
4719     a = float32_squash_input_denormal(a, status);
4720     aSig = extractFloat32Frac( a );
4721     aExp = extractFloat32Exp( a );
4722     aSign = extractFloat32Sign( a );
4723
4724     if ( aExp == 0 ) {
4725         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4726         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4727     }
4728     if ( aSign ) {
4729         float_raise(float_flag_invalid, status);
4730         return float32_default_nan(status);
4731     }
4732     if ( aExp == 0xFF ) {
4733         if (aSig) {
4734             return propagateFloat32NaN(a, float32_zero, status);
4735         }
4736         return a;
4737     }
4738
4739     aExp -= 0x7F;
4740     aSig |= 0x00800000;
4741     zSign = aExp < 0;
4742     zSig = aExp << 23;
4743
4744     for (i = 1 << 22; i > 0; i >>= 1) {
4745         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4746         if ( aSig & 0x01000000 ) {
4747             aSig >>= 1;
4748             zSig |= i;
4749         }
4750     }
4751
4752     if ( zSign )
4753         zSig = -zSig;
4754
4755     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4756 }
4757
4758 /*----------------------------------------------------------------------------
4759 | Returns 1 if the single-precision floating-point value `a' is equal to
4760 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4761 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4762 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4763 *----------------------------------------------------------------------------*/
4764
4765 int float32_eq(float32 a, float32 b, float_status *status)
4766 {
4767     uint32_t av, bv;
4768     a = float32_squash_input_denormal(a, status);
4769     b = float32_squash_input_denormal(b, status);
4770
4771     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4772          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4773        ) {
4774         float_raise(float_flag_invalid, status);
4775         return 0;
4776     }
4777     av = float32_val(a);
4778     bv = float32_val(b);
4779     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4780 }
4781
4782 /*----------------------------------------------------------------------------
4783 | Returns 1 if the single-precision floating-point value `a' is less than
4784 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4785 | exception is raised if either operand is a NaN.  The comparison is performed
4786 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4788
4789 int float32_le(float32 a, float32 b, float_status *status)
4790 {
4791     flag aSign, bSign;
4792     uint32_t av, bv;
4793     a = float32_squash_input_denormal(a, status);
4794     b = float32_squash_input_denormal(b, status);
4795
4796     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4797          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4798        ) {
4799         float_raise(float_flag_invalid, status);
4800         return 0;
4801     }
4802     aSign = extractFloat32Sign( a );
4803     bSign = extractFloat32Sign( b );
4804     av = float32_val(a);
4805     bv = float32_val(b);
4806     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4807     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4808
4809 }
4810
4811 /*----------------------------------------------------------------------------
4812 | Returns 1 if the single-precision floating-point value `a' is less than
4813 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4814 | raised if either operand is a NaN.  The comparison is performed according
4815 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4817
4818 int float32_lt(float32 a, float32 b, float_status *status)
4819 {
4820     flag aSign, bSign;
4821     uint32_t av, bv;
4822     a = float32_squash_input_denormal(a, status);
4823     b = float32_squash_input_denormal(b, status);
4824
4825     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4826          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4827        ) {
4828         float_raise(float_flag_invalid, status);
4829         return 0;
4830     }
4831     aSign = extractFloat32Sign( a );
4832     bSign = extractFloat32Sign( b );
4833     av = float32_val(a);
4834     bv = float32_val(b);
4835     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4836     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4837
4838 }
4839
4840 /*----------------------------------------------------------------------------
4841 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4842 | be compared, and 0 otherwise.  The invalid exception is raised if either
4843 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4844 | Standard for Binary Floating-Point Arithmetic.
4845 *----------------------------------------------------------------------------*/
4846
4847 int float32_unordered(float32 a, float32 b, float_status *status)
4848 {
4849     a = float32_squash_input_denormal(a, status);
4850     b = float32_squash_input_denormal(b, status);
4851
4852     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4853          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4854        ) {
4855         float_raise(float_flag_invalid, status);
4856         return 1;
4857     }
4858     return 0;
4859 }
4860
4861 /*----------------------------------------------------------------------------
4862 | Returns 1 if the single-precision floating-point value `a' is equal to
4863 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4864 | exception.  The comparison is performed according to the IEC/IEEE Standard
4865 | for Binary Floating-Point Arithmetic.
4866 *----------------------------------------------------------------------------*/
4867
4868 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4869 {
4870     a = float32_squash_input_denormal(a, status);
4871     b = float32_squash_input_denormal(b, status);
4872
4873     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4874          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4875        ) {
4876         if (float32_is_signaling_nan(a, status)
4877          || float32_is_signaling_nan(b, status)) {
4878             float_raise(float_flag_invalid, status);
4879         }
4880         return 0;
4881     }
4882     return ( float32_val(a) == float32_val(b) ) ||
4883             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4884 }
4885
4886 /*----------------------------------------------------------------------------
4887 | Returns 1 if the single-precision floating-point value `a' is less than or
4888 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4889 | cause an exception.  Otherwise, the comparison is performed according to the
4890 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4891 *----------------------------------------------------------------------------*/
4892
4893 int float32_le_quiet(float32 a, float32 b, float_status *status)
4894 {
4895     flag aSign, bSign;
4896     uint32_t av, bv;
4897     a = float32_squash_input_denormal(a, status);
4898     b = float32_squash_input_denormal(b, status);
4899
4900     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4901          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4902        ) {
4903         if (float32_is_signaling_nan(a, status)
4904          || float32_is_signaling_nan(b, status)) {
4905             float_raise(float_flag_invalid, status);
4906         }
4907         return 0;
4908     }
4909     aSign = extractFloat32Sign( a );
4910     bSign = extractFloat32Sign( b );
4911     av = float32_val(a);
4912     bv = float32_val(b);
4913     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4914     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4915
4916 }
4917
4918 /*----------------------------------------------------------------------------
4919 | Returns 1 if the single-precision floating-point value `a' is less than
4920 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4921 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4922 | Standard for Binary Floating-Point Arithmetic.
4923 *----------------------------------------------------------------------------*/
4924
4925 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4926 {
4927     flag aSign, bSign;
4928     uint32_t av, bv;
4929     a = float32_squash_input_denormal(a, status);
4930     b = float32_squash_input_denormal(b, status);
4931
4932     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4933          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4934        ) {
4935         if (float32_is_signaling_nan(a, status)
4936          || float32_is_signaling_nan(b, status)) {
4937             float_raise(float_flag_invalid, status);
4938         }
4939         return 0;
4940     }
4941     aSign = extractFloat32Sign( a );
4942     bSign = extractFloat32Sign( b );
4943     av = float32_val(a);
4944     bv = float32_val(b);
4945     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4946     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4947
4948 }
4949
4950 /*----------------------------------------------------------------------------
4951 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4952 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4953 | comparison is performed according to the IEC/IEEE Standard for Binary
4954 | Floating-Point Arithmetic.
4955 *----------------------------------------------------------------------------*/
4956
4957 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4958 {
4959     a = float32_squash_input_denormal(a, status);
4960     b = float32_squash_input_denormal(b, status);
4961
4962     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4963          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4964        ) {
4965         if (float32_is_signaling_nan(a, status)
4966          || float32_is_signaling_nan(b, status)) {
4967             float_raise(float_flag_invalid, status);
4968         }
4969         return 1;
4970     }
4971     return 0;
4972 }
4973
4974 /*----------------------------------------------------------------------------
4975 | If `a' is denormal and we are in flush-to-zero mode then set the
4976 | input-denormal exception and return zero. Otherwise just return the value.
4977 *----------------------------------------------------------------------------*/
4978 float16 float16_squash_input_denormal(float16 a, float_status *status)
4979 {
4980     if (status->flush_inputs_to_zero) {
4981         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4982             float_raise(float_flag_input_denormal, status);
4983             return make_float16(float16_val(a) & 0x8000);
4984         }
4985     }
4986     return a;
4987 }
4988
4989 /*----------------------------------------------------------------------------
4990 | Returns the result of converting the double-precision floating-point value
4991 | `a' to the extended double-precision floating-point format.  The conversion
4992 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4993 | Arithmetic.
4994 *----------------------------------------------------------------------------*/
4995
4996 floatx80 float64_to_floatx80(float64 a, float_status *status)
4997 {
4998     flag aSign;
4999     int aExp;
5000     uint64_t aSig;
5001
5002     a = float64_squash_input_denormal(a, status);
5003     aSig = extractFloat64Frac( a );
5004     aExp = extractFloat64Exp( a );
5005     aSign = extractFloat64Sign( a );
5006     if ( aExp == 0x7FF ) {
5007         if (aSig) {
5008             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5009         }
5010         return packFloatx80(aSign,
5011                             floatx80_infinity_high,
5012                             floatx80_infinity_low);
5013     }
5014     if ( aExp == 0 ) {
5015         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5016         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5017     }
5018     return
5019         packFloatx80(
5020             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
5021
5022 }
5023
5024 /*----------------------------------------------------------------------------
5025 | Returns the result of converting the double-precision floating-point value
5026 | `a' to the quadruple-precision floating-point format.  The conversion is
5027 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5028 | Arithmetic.
5029 *----------------------------------------------------------------------------*/
5030
5031 float128 float64_to_float128(float64 a, float_status *status)
5032 {
5033     flag aSign;
5034     int aExp;
5035     uint64_t aSig, zSig0, zSig1;
5036
5037     a = float64_squash_input_denormal(a, status);
5038     aSig = extractFloat64Frac( a );
5039     aExp = extractFloat64Exp( a );
5040     aSign = extractFloat64Sign( a );
5041     if ( aExp == 0x7FF ) {
5042         if (aSig) {
5043             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5044         }
5045         return packFloat128( aSign, 0x7FFF, 0, 0 );
5046     }
5047     if ( aExp == 0 ) {
5048         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5049         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5050         --aExp;
5051     }
5052     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5053     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5054
5055 }
5056
5057
5058 /*----------------------------------------------------------------------------
5059 | Returns the remainder of the double-precision floating-point value `a'
5060 | with respect to the corresponding value `b'.  The operation is performed
5061 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5063
5064 float64 float64_rem(float64 a, float64 b, float_status *status)
5065 {
5066     flag aSign, zSign;
5067     int aExp, bExp, expDiff;
5068     uint64_t aSig, bSig;
5069     uint64_t q, alternateASig;
5070     int64_t sigMean;
5071
5072     a = float64_squash_input_denormal(a, status);
5073     b = float64_squash_input_denormal(b, status);
5074     aSig = extractFloat64Frac( a );
5075     aExp = extractFloat64Exp( a );
5076     aSign = extractFloat64Sign( a );
5077     bSig = extractFloat64Frac( b );
5078     bExp = extractFloat64Exp( b );
5079     if ( aExp == 0x7FF ) {
5080         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5081             return propagateFloat64NaN(a, b, status);
5082         }
5083         float_raise(float_flag_invalid, status);
5084         return float64_default_nan(status);
5085     }
5086     if ( bExp == 0x7FF ) {
5087         if (bSig) {
5088             return propagateFloat64NaN(a, b, status);
5089         }
5090         return a;
5091     }
5092     if ( bExp == 0 ) {
5093         if ( bSig == 0 ) {
5094             float_raise(float_flag_invalid, status);
5095             return float64_default_nan(status);
5096         }
5097         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5098     }
5099     if ( aExp == 0 ) {
5100         if ( aSig == 0 ) return a;
5101         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5102     }
5103     expDiff = aExp - bExp;
5104     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
5105     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
5106     if ( expDiff < 0 ) {
5107         if ( expDiff < -1 ) return a;
5108         aSig >>= 1;
5109     }
5110     q = ( bSig <= aSig );
5111     if ( q ) aSig -= bSig;
5112     expDiff -= 64;
5113     while ( 0 < expDiff ) {
5114         q = estimateDiv128To64( aSig, 0, bSig );
5115         q = ( 2 < q ) ? q - 2 : 0;
5116         aSig = - ( ( bSig>>2 ) * q );
5117         expDiff -= 62;
5118     }
5119     expDiff += 64;
5120     if ( 0 < expDiff ) {
5121         q = estimateDiv128To64( aSig, 0, bSig );
5122         q = ( 2 < q ) ? q - 2 : 0;
5123         q >>= 64 - expDiff;
5124         bSig >>= 2;
5125         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5126     }
5127     else {
5128         aSig >>= 2;
5129         bSig >>= 2;
5130     }
5131     do {
5132         alternateASig = aSig;
5133         ++q;
5134         aSig -= bSig;
5135     } while ( 0 <= (int64_t) aSig );
5136     sigMean = aSig + alternateASig;
5137     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5138         aSig = alternateASig;
5139     }
5140     zSign = ( (int64_t) aSig < 0 );
5141     if ( zSign ) aSig = - aSig;
5142     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5143
5144 }
5145
5146 /*----------------------------------------------------------------------------
5147 | Returns the binary log of the double-precision floating-point value `a'.
5148 | The operation is performed according to the IEC/IEEE Standard for Binary
5149 | Floating-Point Arithmetic.
5150 *----------------------------------------------------------------------------*/
5151 float64 float64_log2(float64 a, float_status *status)
5152 {
5153     flag aSign, zSign;
5154     int aExp;
5155     uint64_t aSig, aSig0, aSig1, zSig, i;
5156     a = float64_squash_input_denormal(a, status);
5157
5158     aSig = extractFloat64Frac( a );
5159     aExp = extractFloat64Exp( a );
5160     aSign = extractFloat64Sign( a );
5161
5162     if ( aExp == 0 ) {
5163         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5164         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5165     }
5166     if ( aSign ) {
5167         float_raise(float_flag_invalid, status);
5168         return float64_default_nan(status);
5169     }
5170     if ( aExp == 0x7FF ) {
5171         if (aSig) {
5172             return propagateFloat64NaN(a, float64_zero, status);
5173         }
5174         return a;
5175     }
5176
5177     aExp -= 0x3FF;
5178     aSig |= LIT64( 0x0010000000000000 );
5179     zSign = aExp < 0;
5180     zSig = (uint64_t)aExp << 52;
5181     for (i = 1LL << 51; i > 0; i >>= 1) {
5182         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5183         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5184         if ( aSig & LIT64( 0x0020000000000000 ) ) {
5185             aSig >>= 1;
5186             zSig |= i;
5187         }
5188     }
5189
5190     if ( zSign )
5191         zSig = -zSig;
5192     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5193 }
5194
5195 /*----------------------------------------------------------------------------
5196 | Returns 1 if the double-precision floating-point value `a' is equal to the
5197 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5198 | if either operand is a NaN.  Otherwise, the comparison is performed
5199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5200 *----------------------------------------------------------------------------*/
5201
5202 int float64_eq(float64 a, float64 b, float_status *status)
5203 {
5204     uint64_t av, bv;
5205     a = float64_squash_input_denormal(a, status);
5206     b = float64_squash_input_denormal(b, status);
5207
5208     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5209          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5210        ) {
5211         float_raise(float_flag_invalid, status);
5212         return 0;
5213     }
5214     av = float64_val(a);
5215     bv = float64_val(b);
5216     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5217
5218 }
5219
5220 /*----------------------------------------------------------------------------
5221 | Returns 1 if the double-precision floating-point value `a' is less than or
5222 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5223 | exception is raised if either operand is a NaN.  The comparison is performed
5224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5225 *----------------------------------------------------------------------------*/
5226
5227 int float64_le(float64 a, float64 b, float_status *status)
5228 {
5229     flag aSign, bSign;
5230     uint64_t av, bv;
5231     a = float64_squash_input_denormal(a, status);
5232     b = float64_squash_input_denormal(b, status);
5233
5234     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5235          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5236        ) {
5237         float_raise(float_flag_invalid, status);
5238         return 0;
5239     }
5240     aSign = extractFloat64Sign( a );
5241     bSign = extractFloat64Sign( b );
5242     av = float64_val(a);
5243     bv = float64_val(b);
5244     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5245     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5246
5247 }
5248
5249 /*----------------------------------------------------------------------------
5250 | Returns 1 if the double-precision floating-point value `a' is less than
5251 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5252 | raised if either operand is a NaN.  The comparison is performed according
5253 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5254 *----------------------------------------------------------------------------*/
5255
5256 int float64_lt(float64 a, float64 b, float_status *status)
5257 {
5258     flag aSign, bSign;
5259     uint64_t av, bv;
5260
5261     a = float64_squash_input_denormal(a, status);
5262     b = float64_squash_input_denormal(b, status);
5263     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5264          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5265        ) {
5266         float_raise(float_flag_invalid, status);
5267         return 0;
5268     }
5269     aSign = extractFloat64Sign( a );
5270     bSign = extractFloat64Sign( b );
5271     av = float64_val(a);
5272     bv = float64_val(b);
5273     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5274     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5275
5276 }
5277
5278 /*----------------------------------------------------------------------------
5279 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5280 | be compared, and 0 otherwise.  The invalid exception is raised if either
5281 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5284
5285 int float64_unordered(float64 a, float64 b, float_status *status)
5286 {
5287     a = float64_squash_input_denormal(a, status);
5288     b = float64_squash_input_denormal(b, status);
5289
5290     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5291          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5292        ) {
5293         float_raise(float_flag_invalid, status);
5294         return 1;
5295     }
5296     return 0;
5297 }
5298
5299 /*----------------------------------------------------------------------------
5300 | Returns 1 if the double-precision floating-point value `a' is equal to the
5301 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5302 | exception.The comparison is performed according to the IEC/IEEE Standard
5303 | for Binary Floating-Point Arithmetic.
5304 *----------------------------------------------------------------------------*/
5305
5306 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5307 {
5308     uint64_t av, bv;
5309     a = float64_squash_input_denormal(a, status);
5310     b = float64_squash_input_denormal(b, status);
5311
5312     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5313          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5314        ) {
5315         if (float64_is_signaling_nan(a, status)
5316          || float64_is_signaling_nan(b, status)) {
5317             float_raise(float_flag_invalid, status);
5318         }
5319         return 0;
5320     }
5321     av = float64_val(a);
5322     bv = float64_val(b);
5323     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5324
5325 }
5326
5327 /*----------------------------------------------------------------------------
5328 | Returns 1 if the double-precision floating-point value `a' is less than or
5329 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5330 | cause an exception.  Otherwise, the comparison is performed according to the
5331 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5332 *----------------------------------------------------------------------------*/
5333
5334 int float64_le_quiet(float64 a, float64 b, float_status *status)
5335 {
5336     flag aSign, bSign;
5337     uint64_t av, bv;
5338     a = float64_squash_input_denormal(a, status);
5339     b = float64_squash_input_denormal(b, status);
5340
5341     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5342          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5343        ) {
5344         if (float64_is_signaling_nan(a, status)
5345          || float64_is_signaling_nan(b, status)) {
5346             float_raise(float_flag_invalid, status);
5347         }
5348         return 0;
5349     }
5350     aSign = extractFloat64Sign( a );
5351     bSign = extractFloat64Sign( b );
5352     av = float64_val(a);
5353     bv = float64_val(b);
5354     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5355     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5356
5357 }
5358
5359 /*----------------------------------------------------------------------------
5360 | Returns 1 if the double-precision floating-point value `a' is less than
5361 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5362 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5363 | Standard for Binary Floating-Point Arithmetic.
5364 *----------------------------------------------------------------------------*/
5365
5366 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5367 {
5368     flag aSign, bSign;
5369     uint64_t av, bv;
5370     a = float64_squash_input_denormal(a, status);
5371     b = float64_squash_input_denormal(b, status);
5372
5373     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5374          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5375        ) {
5376         if (float64_is_signaling_nan(a, status)
5377          || float64_is_signaling_nan(b, status)) {
5378             float_raise(float_flag_invalid, status);
5379         }
5380         return 0;
5381     }
5382     aSign = extractFloat64Sign( a );
5383     bSign = extractFloat64Sign( b );
5384     av = float64_val(a);
5385     bv = float64_val(b);
5386     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5387     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5388
5389 }
5390
5391 /*----------------------------------------------------------------------------
5392 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5393 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5394 | comparison is performed according to the IEC/IEEE Standard for Binary
5395 | Floating-Point Arithmetic.
5396 *----------------------------------------------------------------------------*/
5397
5398 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5399 {
5400     a = float64_squash_input_denormal(a, status);
5401     b = float64_squash_input_denormal(b, status);
5402
5403     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5404          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5405        ) {
5406         if (float64_is_signaling_nan(a, status)
5407          || float64_is_signaling_nan(b, status)) {
5408             float_raise(float_flag_invalid, status);
5409         }
5410         return 1;
5411     }
5412     return 0;
5413 }
5414
5415 /*----------------------------------------------------------------------------
5416 | Returns the result of converting the extended double-precision floating-
5417 | point value `a' to the 32-bit two's complement integer format.  The
5418 | conversion is performed according to the IEC/IEEE Standard for Binary
5419 | Floating-Point Arithmetic---which means in particular that the conversion
5420 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5421 | largest positive integer is returned.  Otherwise, if the conversion
5422 | overflows, the largest integer with the same sign as `a' is returned.
5423 *----------------------------------------------------------------------------*/
5424
5425 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5426 {
5427     flag aSign;
5428     int32_t aExp, shiftCount;
5429     uint64_t aSig;
5430
5431     if (floatx80_invalid_encoding(a)) {
5432         float_raise(float_flag_invalid, status);
5433         return 1 << 31;
5434     }
5435     aSig = extractFloatx80Frac( a );
5436     aExp = extractFloatx80Exp( a );
5437     aSign = extractFloatx80Sign( a );
5438     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5439     shiftCount = 0x4037 - aExp;
5440     if ( shiftCount <= 0 ) shiftCount = 1;
5441     shift64RightJamming( aSig, shiftCount, &aSig );
5442     return roundAndPackInt32(aSign, aSig, status);
5443
5444 }
5445
5446 /*----------------------------------------------------------------------------
5447 | Returns the result of converting the extended double-precision floating-
5448 | point value `a' to the 32-bit two's complement integer format.  The
5449 | conversion is performed according to the IEC/IEEE Standard for Binary
5450 | Floating-Point Arithmetic, except that the conversion is always rounded
5451 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5452 | Otherwise, if the conversion overflows, the largest integer with the same
5453 | sign as `a' is returned.
5454 *----------------------------------------------------------------------------*/
5455
5456 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5457 {
5458     flag aSign;
5459     int32_t aExp, shiftCount;
5460     uint64_t aSig, savedASig;
5461     int32_t z;
5462
5463     if (floatx80_invalid_encoding(a)) {
5464         float_raise(float_flag_invalid, status);
5465         return 1 << 31;
5466     }
5467     aSig = extractFloatx80Frac( a );
5468     aExp = extractFloatx80Exp( a );
5469     aSign = extractFloatx80Sign( a );
5470     if ( 0x401E < aExp ) {
5471         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5472         goto invalid;
5473     }
5474     else if ( aExp < 0x3FFF ) {
5475         if (aExp || aSig) {
5476             status->float_exception_flags |= float_flag_inexact;
5477         }
5478         return 0;
5479     }
5480     shiftCount = 0x403E - aExp;
5481     savedASig = aSig;
5482     aSig >>= shiftCount;
5483     z = aSig;
5484     if ( aSign ) z = - z;
5485     if ( ( z < 0 ) ^ aSign ) {
5486  invalid:
5487         float_raise(float_flag_invalid, status);
5488         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5489     }
5490     if ( ( aSig<<shiftCount ) != savedASig ) {
5491         status->float_exception_flags |= float_flag_inexact;
5492     }
5493     return z;
5494
5495 }
5496
5497 /*----------------------------------------------------------------------------
5498 | Returns the result of converting the extended double-precision floating-
5499 | point value `a' to the 64-bit two's complement integer format.  The
5500 | conversion is performed according to the IEC/IEEE Standard for Binary
5501 | Floating-Point Arithmetic---which means in particular that the conversion
5502 | is rounded according to the current rounding mode.  If `a' is a NaN,
5503 | the largest positive integer is returned.  Otherwise, if the conversion
5504 | overflows, the largest integer with the same sign as `a' is returned.
5505 *----------------------------------------------------------------------------*/
5506
5507 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5508 {
5509     flag aSign;
5510     int32_t aExp, shiftCount;
5511     uint64_t aSig, aSigExtra;
5512
5513     if (floatx80_invalid_encoding(a)) {
5514         float_raise(float_flag_invalid, status);
5515         return 1ULL << 63;
5516     }
5517     aSig = extractFloatx80Frac( a );
5518     aExp = extractFloatx80Exp( a );
5519     aSign = extractFloatx80Sign( a );
5520     shiftCount = 0x403E - aExp;
5521     if ( shiftCount <= 0 ) {
5522         if ( shiftCount ) {
5523             float_raise(float_flag_invalid, status);
5524             if (!aSign || floatx80_is_any_nan(a)) {
5525                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5526             }
5527             return (int64_t) LIT64( 0x8000000000000000 );
5528         }
5529         aSigExtra = 0;
5530     }
5531     else {
5532         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5533     }
5534     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5535
5536 }
5537
5538 /*----------------------------------------------------------------------------
5539 | Returns the result of converting the extended double-precision floating-
5540 | point value `a' to the 64-bit two's complement integer format.  The
5541 | conversion is performed according to the IEC/IEEE Standard for Binary
5542 | Floating-Point Arithmetic, except that the conversion is always rounded
5543 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5544 | Otherwise, if the conversion overflows, the largest integer with the same
5545 | sign as `a' is returned.
5546 *----------------------------------------------------------------------------*/
5547
5548 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5549 {
5550     flag aSign;
5551     int32_t aExp, shiftCount;
5552     uint64_t aSig;
5553     int64_t z;
5554
5555     if (floatx80_invalid_encoding(a)) {
5556         float_raise(float_flag_invalid, status);
5557         return 1ULL << 63;
5558     }
5559     aSig = extractFloatx80Frac( a );
5560     aExp = extractFloatx80Exp( a );
5561     aSign = extractFloatx80Sign( a );
5562     shiftCount = aExp - 0x403E;
5563     if ( 0 <= shiftCount ) {
5564         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5565         if ( ( a.high != 0xC03E ) || aSig ) {
5566             float_raise(float_flag_invalid, status);
5567             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5568                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5569             }
5570         }
5571         return (int64_t) LIT64( 0x8000000000000000 );
5572     }
5573     else if ( aExp < 0x3FFF ) {
5574         if (aExp | aSig) {
5575             status->float_exception_flags |= float_flag_inexact;
5576         }
5577         return 0;
5578     }
5579     z = aSig>>( - shiftCount );
5580     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5581         status->float_exception_flags |= float_flag_inexact;
5582     }
5583     if ( aSign ) z = - z;
5584     return z;
5585
5586 }
5587
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the single-precision floating-point format.  The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5594
5595 float32 floatx80_to_float32(floatx80 a, float_status *status)
5596 {
5597     flag aSign;
5598     int32_t aExp;
5599     uint64_t aSig;
5600
5601     if (floatx80_invalid_encoding(a)) {
5602         float_raise(float_flag_invalid, status);
5603         return float32_default_nan(status);
5604     }
5605     aSig = extractFloatx80Frac( a );
5606     aExp = extractFloatx80Exp( a );
5607     aSign = extractFloatx80Sign( a );
5608     if ( aExp == 0x7FFF ) {
5609         if ( (uint64_t) ( aSig<<1 ) ) {
5610             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5611         }
5612         return packFloat32( aSign, 0xFF, 0 );
5613     }
5614     shift64RightJamming( aSig, 33, &aSig );
5615     if ( aExp || aSig ) aExp -= 0x3F81;
5616     return roundAndPackFloat32(aSign, aExp, aSig, status);
5617
5618 }
5619
5620 /*----------------------------------------------------------------------------
5621 | Returns the result of converting the extended double-precision floating-
5622 | point value `a' to the double-precision floating-point format.  The
5623 | conversion is performed according to the IEC/IEEE Standard for Binary
5624 | Floating-Point Arithmetic.
5625 *----------------------------------------------------------------------------*/
5626
5627 float64 floatx80_to_float64(floatx80 a, float_status *status)
5628 {
5629     flag aSign;
5630     int32_t aExp;
5631     uint64_t aSig, zSig;
5632
5633     if (floatx80_invalid_encoding(a)) {
5634         float_raise(float_flag_invalid, status);
5635         return float64_default_nan(status);
5636     }
5637     aSig = extractFloatx80Frac( a );
5638     aExp = extractFloatx80Exp( a );
5639     aSign = extractFloatx80Sign( a );
5640     if ( aExp == 0x7FFF ) {
5641         if ( (uint64_t) ( aSig<<1 ) ) {
5642             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5643         }
5644         return packFloat64( aSign, 0x7FF, 0 );
5645     }
5646     shift64RightJamming( aSig, 1, &zSig );
5647     if ( aExp || aSig ) aExp -= 0x3C01;
5648     return roundAndPackFloat64(aSign, aExp, zSig, status);
5649
5650 }
5651
5652 /*----------------------------------------------------------------------------
5653 | Returns the result of converting the extended double-precision floating-
5654 | point value `a' to the quadruple-precision floating-point format.  The
5655 | conversion is performed according to the IEC/IEEE Standard for Binary
5656 | Floating-Point Arithmetic.
5657 *----------------------------------------------------------------------------*/
5658
5659 float128 floatx80_to_float128(floatx80 a, float_status *status)
5660 {
5661     flag aSign;
5662     int aExp;
5663     uint64_t aSig, zSig0, zSig1;
5664
5665     if (floatx80_invalid_encoding(a)) {
5666         float_raise(float_flag_invalid, status);
5667         return float128_default_nan(status);
5668     }
5669     aSig = extractFloatx80Frac( a );
5670     aExp = extractFloatx80Exp( a );
5671     aSign = extractFloatx80Sign( a );
5672     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5673         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5674     }
5675     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5676     return packFloat128( aSign, aExp, zSig0, zSig1 );
5677
5678 }
5679
5680 /*----------------------------------------------------------------------------
5681 | Rounds the extended double-precision floating-point value `a'
5682 | to the precision provided by floatx80_rounding_precision and returns the
5683 | result as an extended double-precision floating-point value.
5684 | The operation is performed according to the IEC/IEEE Standard for Binary
5685 | Floating-Point Arithmetic.
5686 *----------------------------------------------------------------------------*/
5687
5688 floatx80 floatx80_round(floatx80 a, float_status *status)
5689 {
5690     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5691                                 extractFloatx80Sign(a),
5692                                 extractFloatx80Exp(a),
5693                                 extractFloatx80Frac(a), 0, status);
5694 }
5695
5696 /*----------------------------------------------------------------------------
5697 | Rounds the extended double-precision floating-point value `a' to an integer,
5698 | and returns the result as an extended quadruple-precision floating-point
5699 | value.  The operation is performed according to the IEC/IEEE Standard for
5700 | Binary Floating-Point Arithmetic.
5701 *----------------------------------------------------------------------------*/
5702
5703 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5704 {
5705     flag aSign;
5706     int32_t aExp;
5707     uint64_t lastBitMask, roundBitsMask;
5708     floatx80 z;
5709
5710     if (floatx80_invalid_encoding(a)) {
5711         float_raise(float_flag_invalid, status);
5712         return floatx80_default_nan(status);
5713     }
5714     aExp = extractFloatx80Exp( a );
5715     if ( 0x403E <= aExp ) {
5716         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5717             return propagateFloatx80NaN(a, a, status);
5718         }
5719         return a;
5720     }
5721     if ( aExp < 0x3FFF ) {
5722         if (    ( aExp == 0 )
5723              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5724             return a;
5725         }
5726         status->float_exception_flags |= float_flag_inexact;
5727         aSign = extractFloatx80Sign( a );
5728         switch (status->float_rounding_mode) {
5729          case float_round_nearest_even:
5730             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5731                ) {
5732                 return
5733                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5734             }
5735             break;
5736         case float_round_ties_away:
5737             if (aExp == 0x3FFE) {
5738                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5739             }
5740             break;
5741          case float_round_down:
5742             return
5743                   aSign ?
5744                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5745                 : packFloatx80( 0, 0, 0 );
5746          case float_round_up:
5747             return
5748                   aSign ? packFloatx80( 1, 0, 0 )
5749                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5750         }
5751         return packFloatx80( aSign, 0, 0 );
5752     }
5753     lastBitMask = 1;
5754     lastBitMask <<= 0x403E - aExp;
5755     roundBitsMask = lastBitMask - 1;
5756     z = a;
5757     switch (status->float_rounding_mode) {
5758     case float_round_nearest_even:
5759         z.low += lastBitMask>>1;
5760         if ((z.low & roundBitsMask) == 0) {
5761             z.low &= ~lastBitMask;
5762         }
5763         break;
5764     case float_round_ties_away:
5765         z.low += lastBitMask >> 1;
5766         break;
5767     case float_round_to_zero:
5768         break;
5769     case float_round_up:
5770         if (!extractFloatx80Sign(z)) {
5771             z.low += roundBitsMask;
5772         }
5773         break;
5774     case float_round_down:
5775         if (extractFloatx80Sign(z)) {
5776             z.low += roundBitsMask;
5777         }
5778         break;
5779     default:
5780         abort();
5781     }
5782     z.low &= ~ roundBitsMask;
5783     if ( z.low == 0 ) {
5784         ++z.high;
5785         z.low = LIT64( 0x8000000000000000 );
5786     }
5787     if (z.low != a.low) {
5788         status->float_exception_flags |= float_flag_inexact;
5789     }
5790     return z;
5791
5792 }
5793
5794 /*----------------------------------------------------------------------------
5795 | Returns the result of adding the absolute values of the extended double-
5796 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5797 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5798 | The addition is performed according to the IEC/IEEE Standard for Binary
5799 | Floating-Point Arithmetic.
5800 *----------------------------------------------------------------------------*/
5801
5802 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5803                                 float_status *status)
5804 {
5805     int32_t aExp, bExp, zExp;
5806     uint64_t aSig, bSig, zSig0, zSig1;
5807     int32_t expDiff;
5808
5809     aSig = extractFloatx80Frac( a );
5810     aExp = extractFloatx80Exp( a );
5811     bSig = extractFloatx80Frac( b );
5812     bExp = extractFloatx80Exp( b );
5813     expDiff = aExp - bExp;
5814     if ( 0 < expDiff ) {
5815         if ( aExp == 0x7FFF ) {
5816             if ((uint64_t)(aSig << 1)) {
5817                 return propagateFloatx80NaN(a, b, status);
5818             }
5819             return a;
5820         }
5821         if ( bExp == 0 ) --expDiff;
5822         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5823         zExp = aExp;
5824     }
5825     else if ( expDiff < 0 ) {
5826         if ( bExp == 0x7FFF ) {
5827             if ((uint64_t)(bSig << 1)) {
5828                 return propagateFloatx80NaN(a, b, status);
5829             }
5830             return packFloatx80(zSign,
5831                                 floatx80_infinity_high,
5832                                 floatx80_infinity_low);
5833         }
5834         if ( aExp == 0 ) ++expDiff;
5835         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5836         zExp = bExp;
5837     }
5838     else {
5839         if ( aExp == 0x7FFF ) {
5840             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5841                 return propagateFloatx80NaN(a, b, status);
5842             }
5843             return a;
5844         }
5845         zSig1 = 0;
5846         zSig0 = aSig + bSig;
5847         if ( aExp == 0 ) {
5848             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5849             goto roundAndPack;
5850         }
5851         zExp = aExp;
5852         goto shiftRight1;
5853     }
5854     zSig0 = aSig + bSig;
5855     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5856  shiftRight1:
5857     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5858     zSig0 |= LIT64( 0x8000000000000000 );
5859     ++zExp;
5860  roundAndPack:
5861     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5862                                 zSign, zExp, zSig0, zSig1, status);
5863 }
5864
5865 /*----------------------------------------------------------------------------
5866 | Returns the result of subtracting the absolute values of the extended
5867 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5868 | difference is negated before being returned.  `zSign' is ignored if the
5869 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5870 | Standard for Binary Floating-Point Arithmetic.
5871 *----------------------------------------------------------------------------*/
5872
5873 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5874                                 float_status *status)
5875 {
5876     int32_t aExp, bExp, zExp;
5877     uint64_t aSig, bSig, zSig0, zSig1;
5878     int32_t expDiff;
5879
5880     aSig = extractFloatx80Frac( a );
5881     aExp = extractFloatx80Exp( a );
5882     bSig = extractFloatx80Frac( b );
5883     bExp = extractFloatx80Exp( b );
5884     expDiff = aExp - bExp;
5885     if ( 0 < expDiff ) goto aExpBigger;
5886     if ( expDiff < 0 ) goto bExpBigger;
5887     if ( aExp == 0x7FFF ) {
5888         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5889             return propagateFloatx80NaN(a, b, status);
5890         }
5891         float_raise(float_flag_invalid, status);
5892         return floatx80_default_nan(status);
5893     }
5894     if ( aExp == 0 ) {
5895         aExp = 1;
5896         bExp = 1;
5897     }
5898     zSig1 = 0;
5899     if ( bSig < aSig ) goto aBigger;
5900     if ( aSig < bSig ) goto bBigger;
5901     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5902  bExpBigger:
5903     if ( bExp == 0x7FFF ) {
5904         if ((uint64_t)(bSig << 1)) {
5905             return propagateFloatx80NaN(a, b, status);
5906         }
5907         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5908                             floatx80_infinity_low);
5909     }
5910     if ( aExp == 0 ) ++expDiff;
5911     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5912  bBigger:
5913     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5914     zExp = bExp;
5915     zSign ^= 1;
5916     goto normalizeRoundAndPack;
5917  aExpBigger:
5918     if ( aExp == 0x7FFF ) {
5919         if ((uint64_t)(aSig << 1)) {
5920             return propagateFloatx80NaN(a, b, status);
5921         }
5922         return a;
5923     }
5924     if ( bExp == 0 ) --expDiff;
5925     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5926  aBigger:
5927     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5928     zExp = aExp;
5929  normalizeRoundAndPack:
5930     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5931                                          zSign, zExp, zSig0, zSig1, status);
5932 }
5933
5934 /*----------------------------------------------------------------------------
5935 | Returns the result of adding the extended double-precision floating-point
5936 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5937 | Standard for Binary Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5939
5940 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5941 {
5942     flag aSign, bSign;
5943
5944     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5945         float_raise(float_flag_invalid, status);
5946         return floatx80_default_nan(status);
5947     }
5948     aSign = extractFloatx80Sign( a );
5949     bSign = extractFloatx80Sign( b );
5950     if ( aSign == bSign ) {
5951         return addFloatx80Sigs(a, b, aSign, status);
5952     }
5953     else {
5954         return subFloatx80Sigs(a, b, aSign, status);
5955     }
5956
5957 }
5958
5959 /*----------------------------------------------------------------------------
5960 | Returns the result of subtracting the extended double-precision floating-
5961 | point values `a' and `b'.  The operation is performed according to the
5962 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5963 *----------------------------------------------------------------------------*/
5964
5965 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5966 {
5967     flag aSign, bSign;
5968
5969     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5970         float_raise(float_flag_invalid, status);
5971         return floatx80_default_nan(status);
5972     }
5973     aSign = extractFloatx80Sign( a );
5974     bSign = extractFloatx80Sign( b );
5975     if ( aSign == bSign ) {
5976         return subFloatx80Sigs(a, b, aSign, status);
5977     }
5978     else {
5979         return addFloatx80Sigs(a, b, aSign, status);
5980     }
5981
5982 }
5983
5984 /*----------------------------------------------------------------------------
5985 | Returns the result of multiplying the extended double-precision floating-
5986 | point values `a' and `b'.  The operation is performed according to the
5987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5988 *----------------------------------------------------------------------------*/
5989
5990 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5991 {
5992     flag aSign, bSign, zSign;
5993     int32_t aExp, bExp, zExp;
5994     uint64_t aSig, bSig, zSig0, zSig1;
5995
5996     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5997         float_raise(float_flag_invalid, status);
5998         return floatx80_default_nan(status);
5999     }
6000     aSig = extractFloatx80Frac( a );
6001     aExp = extractFloatx80Exp( a );
6002     aSign = extractFloatx80Sign( a );
6003     bSig = extractFloatx80Frac( b );
6004     bExp = extractFloatx80Exp( b );
6005     bSign = extractFloatx80Sign( b );
6006     zSign = aSign ^ bSign;
6007     if ( aExp == 0x7FFF ) {
6008         if (    (uint64_t) ( aSig<<1 )
6009              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6010             return propagateFloatx80NaN(a, b, status);
6011         }
6012         if ( ( bExp | bSig ) == 0 ) goto invalid;
6013         return packFloatx80(zSign, floatx80_infinity_high,
6014                                    floatx80_infinity_low);
6015     }
6016     if ( bExp == 0x7FFF ) {
6017         if ((uint64_t)(bSig << 1)) {
6018             return propagateFloatx80NaN(a, b, status);
6019         }
6020         if ( ( aExp | aSig ) == 0 ) {
6021  invalid:
6022             float_raise(float_flag_invalid, status);
6023             return floatx80_default_nan(status);
6024         }
6025         return packFloatx80(zSign, floatx80_infinity_high,
6026                                    floatx80_infinity_low);
6027     }
6028     if ( aExp == 0 ) {
6029         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6030         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6031     }
6032     if ( bExp == 0 ) {
6033         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6034         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6035     }
6036     zExp = aExp + bExp - 0x3FFE;
6037     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6038     if ( 0 < (int64_t) zSig0 ) {
6039         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6040         --zExp;
6041     }
6042     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6043                                 zSign, zExp, zSig0, zSig1, status);
6044 }
6045
6046 /*----------------------------------------------------------------------------
6047 | Returns the result of dividing the extended double-precision floating-point
6048 | value `a' by the corresponding value `b'.  The operation is performed
6049 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6050 *----------------------------------------------------------------------------*/
6051
6052 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6053 {
6054     flag aSign, bSign, zSign;
6055     int32_t aExp, bExp, zExp;
6056     uint64_t aSig, bSig, zSig0, zSig1;
6057     uint64_t rem0, rem1, rem2, term0, term1, term2;
6058
6059     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6060         float_raise(float_flag_invalid, status);
6061         return floatx80_default_nan(status);
6062     }
6063     aSig = extractFloatx80Frac( a );
6064     aExp = extractFloatx80Exp( a );
6065     aSign = extractFloatx80Sign( a );
6066     bSig = extractFloatx80Frac( b );
6067     bExp = extractFloatx80Exp( b );
6068     bSign = extractFloatx80Sign( b );
6069     zSign = aSign ^ bSign;
6070     if ( aExp == 0x7FFF ) {
6071         if ((uint64_t)(aSig << 1)) {
6072             return propagateFloatx80NaN(a, b, status);
6073         }
6074         if ( bExp == 0x7FFF ) {
6075             if ((uint64_t)(bSig << 1)) {
6076                 return propagateFloatx80NaN(a, b, status);
6077             }
6078             goto invalid;
6079         }
6080         return packFloatx80(zSign, floatx80_infinity_high,
6081                                    floatx80_infinity_low);
6082     }
6083     if ( bExp == 0x7FFF ) {
6084         if ((uint64_t)(bSig << 1)) {
6085             return propagateFloatx80NaN(a, b, status);
6086         }
6087         return packFloatx80( zSign, 0, 0 );
6088     }
6089     if ( bExp == 0 ) {
6090         if ( bSig == 0 ) {
6091             if ( ( aExp | aSig ) == 0 ) {
6092  invalid:
6093                 float_raise(float_flag_invalid, status);
6094                 return floatx80_default_nan(status);
6095             }
6096             float_raise(float_flag_divbyzero, status);
6097             return packFloatx80(zSign, floatx80_infinity_high,
6098                                        floatx80_infinity_low);
6099         }
6100         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6101     }
6102     if ( aExp == 0 ) {
6103         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6104         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6105     }
6106     zExp = aExp - bExp + 0x3FFE;
6107     rem1 = 0;
6108     if ( bSig <= aSig ) {
6109         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6110         ++zExp;
6111     }
6112     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6113     mul64To128( bSig, zSig0, &term0, &term1 );
6114     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6115     while ( (int64_t) rem0 < 0 ) {
6116         --zSig0;
6117         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6118     }
6119     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6120     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6121         mul64To128( bSig, zSig1, &term1, &term2 );
6122         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6123         while ( (int64_t) rem1 < 0 ) {
6124             --zSig1;
6125             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6126         }
6127         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6128     }
6129     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6130                                 zSign, zExp, zSig0, zSig1, status);
6131 }
6132
6133 /*----------------------------------------------------------------------------
6134 | Returns the remainder of the extended double-precision floating-point value
6135 | `a' with respect to the corresponding value `b'.  The operation is performed
6136 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6137 *----------------------------------------------------------------------------*/
6138
6139 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6140 {
6141     flag aSign, zSign;
6142     int32_t aExp, bExp, expDiff;
6143     uint64_t aSig0, aSig1, bSig;
6144     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6145
6146     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6147         float_raise(float_flag_invalid, status);
6148         return floatx80_default_nan(status);
6149     }
6150     aSig0 = extractFloatx80Frac( a );
6151     aExp = extractFloatx80Exp( a );
6152     aSign = extractFloatx80Sign( a );
6153     bSig = extractFloatx80Frac( b );
6154     bExp = extractFloatx80Exp( b );
6155     if ( aExp == 0x7FFF ) {
6156         if (    (uint64_t) ( aSig0<<1 )
6157              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6158             return propagateFloatx80NaN(a, b, status);
6159         }
6160         goto invalid;
6161     }
6162     if ( bExp == 0x7FFF ) {
6163         if ((uint64_t)(bSig << 1)) {
6164             return propagateFloatx80NaN(a, b, status);
6165         }
6166         return a;
6167     }
6168     if ( bExp == 0 ) {
6169         if ( bSig == 0 ) {
6170  invalid:
6171             float_raise(float_flag_invalid, status);
6172             return floatx80_default_nan(status);
6173         }
6174         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6175     }
6176     if ( aExp == 0 ) {
6177         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6178         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6179     }
6180     bSig |= LIT64( 0x8000000000000000 );
6181     zSign = aSign;
6182     expDiff = aExp - bExp;
6183     aSig1 = 0;
6184     if ( expDiff < 0 ) {
6185         if ( expDiff < -1 ) return a;
6186         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6187         expDiff = 0;
6188     }
6189     q = ( bSig <= aSig0 );
6190     if ( q ) aSig0 -= bSig;
6191     expDiff -= 64;
6192     while ( 0 < expDiff ) {
6193         q = estimateDiv128To64( aSig0, aSig1, bSig );
6194         q = ( 2 < q ) ? q - 2 : 0;
6195         mul64To128( bSig, q, &term0, &term1 );
6196         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6197         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6198         expDiff -= 62;
6199     }
6200     expDiff += 64;
6201     if ( 0 < expDiff ) {
6202         q = estimateDiv128To64( aSig0, aSig1, bSig );
6203         q = ( 2 < q ) ? q - 2 : 0;
6204         q >>= 64 - expDiff;
6205         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6206         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6207         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6208         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6209             ++q;
6210             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6211         }
6212     }
6213     else {
6214         term1 = 0;
6215         term0 = bSig;
6216     }
6217     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6218     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6219          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6220               && ( q & 1 ) )
6221        ) {
6222         aSig0 = alternateASig0;
6223         aSig1 = alternateASig1;
6224         zSign = ! zSign;
6225     }
6226     return
6227         normalizeRoundAndPackFloatx80(
6228             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6229
6230 }
6231
6232 /*----------------------------------------------------------------------------
6233 | Returns the square root of the extended double-precision floating-point
6234 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6235 | for Binary Floating-Point Arithmetic.
6236 *----------------------------------------------------------------------------*/
6237
6238 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6239 {
6240     flag aSign;
6241     int32_t aExp, zExp;
6242     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6243     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6244
6245     if (floatx80_invalid_encoding(a)) {
6246         float_raise(float_flag_invalid, status);
6247         return floatx80_default_nan(status);
6248     }
6249     aSig0 = extractFloatx80Frac( a );
6250     aExp = extractFloatx80Exp( a );
6251     aSign = extractFloatx80Sign( a );
6252     if ( aExp == 0x7FFF ) {
6253         if ((uint64_t)(aSig0 << 1)) {
6254             return propagateFloatx80NaN(a, a, status);
6255         }
6256         if ( ! aSign ) return a;
6257         goto invalid;
6258     }
6259     if ( aSign ) {
6260         if ( ( aExp | aSig0 ) == 0 ) return a;
6261  invalid:
6262         float_raise(float_flag_invalid, status);
6263         return floatx80_default_nan(status);
6264     }
6265     if ( aExp == 0 ) {
6266         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6267         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6268     }
6269     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6270     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6271     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6272     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6273     doubleZSig0 = zSig0<<1;
6274     mul64To128( zSig0, zSig0, &term0, &term1 );
6275     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6276     while ( (int64_t) rem0 < 0 ) {
6277         --zSig0;
6278         doubleZSig0 -= 2;
6279         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6280     }
6281     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6282     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6283         if ( zSig1 == 0 ) zSig1 = 1;
6284         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6285         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6286         mul64To128( zSig1, zSig1, &term2, &term3 );
6287         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6288         while ( (int64_t) rem1 < 0 ) {
6289             --zSig1;
6290             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6291             term3 |= 1;
6292             term2 |= doubleZSig0;
6293             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6294         }
6295         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6296     }
6297     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6298     zSig0 |= doubleZSig0;
6299     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6300                                 0, zExp, zSig0, zSig1, status);
6301 }
6302
6303 /*----------------------------------------------------------------------------
6304 | Returns 1 if the extended double-precision floating-point value `a' is equal
6305 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6306 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6308 *----------------------------------------------------------------------------*/
6309
6310 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6311 {
6312
6313     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6314         || (extractFloatx80Exp(a) == 0x7FFF
6315             && (uint64_t) (extractFloatx80Frac(a) << 1))
6316         || (extractFloatx80Exp(b) == 0x7FFF
6317             && (uint64_t) (extractFloatx80Frac(b) << 1))
6318        ) {
6319         float_raise(float_flag_invalid, status);
6320         return 0;
6321     }
6322     return
6323            ( a.low == b.low )
6324         && (    ( a.high == b.high )
6325              || (    ( a.low == 0 )
6326                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6327            );
6328
6329 }
6330
6331 /*----------------------------------------------------------------------------
6332 | Returns 1 if the extended double-precision floating-point value `a' is
6333 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6334 | invalid exception is raised if either operand is a NaN.  The comparison is
6335 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6336 | Arithmetic.
6337 *----------------------------------------------------------------------------*/
6338
6339 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6340 {
6341     flag aSign, bSign;
6342
6343     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6344         || (extractFloatx80Exp(a) == 0x7FFF
6345             && (uint64_t) (extractFloatx80Frac(a) << 1))
6346         || (extractFloatx80Exp(b) == 0x7FFF
6347             && (uint64_t) (extractFloatx80Frac(b) << 1))
6348        ) {
6349         float_raise(float_flag_invalid, status);
6350         return 0;
6351     }
6352     aSign = extractFloatx80Sign( a );
6353     bSign = extractFloatx80Sign( b );
6354     if ( aSign != bSign ) {
6355         return
6356                aSign
6357             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6358                  == 0 );
6359     }
6360     return
6361           aSign ? le128( b.high, b.low, a.high, a.low )
6362         : le128( a.high, a.low, b.high, b.low );
6363
6364 }
6365
6366 /*----------------------------------------------------------------------------
6367 | Returns 1 if the extended double-precision floating-point value `a' is
6368 | less than the corresponding value `b', and 0 otherwise.  The invalid
6369 | exception is raised if either operand is a NaN.  The comparison is performed
6370 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6371 *----------------------------------------------------------------------------*/
6372
6373 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6374 {
6375     flag aSign, bSign;
6376
6377     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6378         || (extractFloatx80Exp(a) == 0x7FFF
6379             && (uint64_t) (extractFloatx80Frac(a) << 1))
6380         || (extractFloatx80Exp(b) == 0x7FFF
6381             && (uint64_t) (extractFloatx80Frac(b) << 1))
6382        ) {
6383         float_raise(float_flag_invalid, status);
6384         return 0;
6385     }
6386     aSign = extractFloatx80Sign( a );
6387     bSign = extractFloatx80Sign( b );
6388     if ( aSign != bSign ) {
6389         return
6390                aSign
6391             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6392                  != 0 );
6393     }
6394     return
6395           aSign ? lt128( b.high, b.low, a.high, a.low )
6396         : lt128( a.high, a.low, b.high, b.low );
6397
6398 }
6399
6400 /*----------------------------------------------------------------------------
6401 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6402 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6403 | either operand is a NaN.   The comparison is performed according to the
6404 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6405 *----------------------------------------------------------------------------*/
6406 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6407 {
6408     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6409         || (extractFloatx80Exp(a) == 0x7FFF
6410             && (uint64_t) (extractFloatx80Frac(a) << 1))
6411         || (extractFloatx80Exp(b) == 0x7FFF
6412             && (uint64_t) (extractFloatx80Frac(b) << 1))
6413        ) {
6414         float_raise(float_flag_invalid, status);
6415         return 1;
6416     }
6417     return 0;
6418 }
6419
6420 /*----------------------------------------------------------------------------
6421 | Returns 1 if the extended double-precision floating-point value `a' is
6422 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6423 | cause an exception.  The comparison is performed according to the IEC/IEEE
6424 | Standard for Binary Floating-Point Arithmetic.
6425 *----------------------------------------------------------------------------*/
6426
6427 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6428 {
6429
6430     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6431         float_raise(float_flag_invalid, status);
6432         return 0;
6433     }
6434     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6435               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6436          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6437               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6438        ) {
6439         if (floatx80_is_signaling_nan(a, status)
6440          || floatx80_is_signaling_nan(b, status)) {
6441             float_raise(float_flag_invalid, status);
6442         }
6443         return 0;
6444     }
6445     return
6446            ( a.low == b.low )
6447         && (    ( a.high == b.high )
6448              || (    ( a.low == 0 )
6449                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6450            );
6451
6452 }
6453
6454 /*----------------------------------------------------------------------------
6455 | Returns 1 if the extended double-precision floating-point value `a' is less
6456 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6457 | do not cause an exception.  Otherwise, the comparison is performed according
6458 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6459 *----------------------------------------------------------------------------*/
6460
6461 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6462 {
6463     flag aSign, bSign;
6464
6465     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6466         float_raise(float_flag_invalid, status);
6467         return 0;
6468     }
6469     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6470               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6471          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6472               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6473        ) {
6474         if (floatx80_is_signaling_nan(a, status)
6475          || floatx80_is_signaling_nan(b, status)) {
6476             float_raise(float_flag_invalid, status);
6477         }
6478         return 0;
6479     }
6480     aSign = extractFloatx80Sign( a );
6481     bSign = extractFloatx80Sign( b );
6482     if ( aSign != bSign ) {
6483         return
6484                aSign
6485             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6486                  == 0 );
6487     }
6488     return
6489           aSign ? le128( b.high, b.low, a.high, a.low )
6490         : le128( a.high, a.low, b.high, b.low );
6491
6492 }
6493
6494 /*----------------------------------------------------------------------------
6495 | Returns 1 if the extended double-precision floating-point value `a' is less
6496 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6497 | an exception.  Otherwise, the comparison is performed according to the
6498 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6499 *----------------------------------------------------------------------------*/
6500
6501 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6502 {
6503     flag aSign, bSign;
6504
6505     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6506         float_raise(float_flag_invalid, status);
6507         return 0;
6508     }
6509     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6510               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6511          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6512               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6513        ) {
6514         if (floatx80_is_signaling_nan(a, status)
6515          || floatx80_is_signaling_nan(b, status)) {
6516             float_raise(float_flag_invalid, status);
6517         }
6518         return 0;
6519     }
6520     aSign = extractFloatx80Sign( a );
6521     bSign = extractFloatx80Sign( b );
6522     if ( aSign != bSign ) {
6523         return
6524                aSign
6525             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6526                  != 0 );
6527     }
6528     return
6529           aSign ? lt128( b.high, b.low, a.high, a.low )
6530         : lt128( a.high, a.low, b.high, b.low );
6531
6532 }
6533
6534 /*----------------------------------------------------------------------------
6535 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6536 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6537 | The comparison is performed according to the IEC/IEEE Standard for Binary
6538 | Floating-Point Arithmetic.
6539 *----------------------------------------------------------------------------*/
6540 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6541 {
6542     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6543         float_raise(float_flag_invalid, status);
6544         return 1;
6545     }
6546     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6547               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6548          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6549               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6550        ) {
6551         if (floatx80_is_signaling_nan(a, status)
6552          || floatx80_is_signaling_nan(b, status)) {
6553             float_raise(float_flag_invalid, status);
6554         }
6555         return 1;
6556     }
6557     return 0;
6558 }
6559
6560 /*----------------------------------------------------------------------------
6561 | Returns the result of converting the quadruple-precision floating-point
6562 | value `a' to the 32-bit two's complement integer format.  The conversion
6563 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6564 | Arithmetic---which means in particular that the conversion is rounded
6565 | according to the current rounding mode.  If `a' is a NaN, the largest
6566 | positive integer is returned.  Otherwise, if the conversion overflows, the
6567 | largest integer with the same sign as `a' is returned.
6568 *----------------------------------------------------------------------------*/
6569
6570 int32_t float128_to_int32(float128 a, float_status *status)
6571 {
6572     flag aSign;
6573     int32_t aExp, shiftCount;
6574     uint64_t aSig0, aSig1;
6575
6576     aSig1 = extractFloat128Frac1( a );
6577     aSig0 = extractFloat128Frac0( a );
6578     aExp = extractFloat128Exp( a );
6579     aSign = extractFloat128Sign( a );
6580     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6581     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6582     aSig0 |= ( aSig1 != 0 );
6583     shiftCount = 0x4028 - aExp;
6584     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6585     return roundAndPackInt32(aSign, aSig0, status);
6586
6587 }
6588
6589 /*----------------------------------------------------------------------------
6590 | Returns the result of converting the quadruple-precision floating-point
6591 | value `a' to the 32-bit two's complement integer format.  The conversion
6592 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6593 | Arithmetic, except that the conversion is always rounded toward zero.  If
6594 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6595 | conversion overflows, the largest integer with the same sign as `a' is
6596 | returned.
6597 *----------------------------------------------------------------------------*/
6598
6599 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6600 {
6601     flag aSign;
6602     int32_t aExp, shiftCount;
6603     uint64_t aSig0, aSig1, savedASig;
6604     int32_t z;
6605
6606     aSig1 = extractFloat128Frac1( a );
6607     aSig0 = extractFloat128Frac0( a );
6608     aExp = extractFloat128Exp( a );
6609     aSign = extractFloat128Sign( a );
6610     aSig0 |= ( aSig1 != 0 );
6611     if ( 0x401E < aExp ) {
6612         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6613         goto invalid;
6614     }
6615     else if ( aExp < 0x3FFF ) {
6616         if (aExp || aSig0) {
6617             status->float_exception_flags |= float_flag_inexact;
6618         }
6619         return 0;
6620     }
6621     aSig0 |= LIT64( 0x0001000000000000 );
6622     shiftCount = 0x402F - aExp;
6623     savedASig = aSig0;
6624     aSig0 >>= shiftCount;
6625     z = aSig0;
6626     if ( aSign ) z = - z;
6627     if ( ( z < 0 ) ^ aSign ) {
6628  invalid:
6629         float_raise(float_flag_invalid, status);
6630         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6631     }
6632     if ( ( aSig0<<shiftCount ) != savedASig ) {
6633         status->float_exception_flags |= float_flag_inexact;
6634     }
6635     return z;
6636
6637 }
6638
6639 /*----------------------------------------------------------------------------
6640 | Returns the result of converting the quadruple-precision floating-point
6641 | value `a' to the 64-bit two's complement integer format.  The conversion
6642 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6643 | Arithmetic---which means in particular that the conversion is rounded
6644 | according to the current rounding mode.  If `a' is a NaN, the largest
6645 | positive integer is returned.  Otherwise, if the conversion overflows, the
6646 | largest integer with the same sign as `a' is returned.
6647 *----------------------------------------------------------------------------*/
6648
6649 int64_t float128_to_int64(float128 a, float_status *status)
6650 {
6651     flag aSign;
6652     int32_t aExp, shiftCount;
6653     uint64_t aSig0, aSig1;
6654
6655     aSig1 = extractFloat128Frac1( a );
6656     aSig0 = extractFloat128Frac0( a );
6657     aExp = extractFloat128Exp( a );
6658     aSign = extractFloat128Sign( a );
6659     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6660     shiftCount = 0x402F - aExp;
6661     if ( shiftCount <= 0 ) {
6662         if ( 0x403E < aExp ) {
6663             float_raise(float_flag_invalid, status);
6664             if (    ! aSign
6665                  || (    ( aExp == 0x7FFF )
6666                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6667                     )
6668                ) {
6669                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6670             }
6671             return (int64_t) LIT64( 0x8000000000000000 );
6672         }
6673         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6674     }
6675     else {
6676         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6677     }
6678     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6679
6680 }
6681
6682 /*----------------------------------------------------------------------------
6683 | Returns the result of converting the quadruple-precision floating-point
6684 | value `a' to the 64-bit two's complement integer format.  The conversion
6685 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6686 | Arithmetic, except that the conversion is always rounded toward zero.
6687 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6688 | the conversion overflows, the largest integer with the same sign as `a' is
6689 | returned.
6690 *----------------------------------------------------------------------------*/
6691
6692 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6693 {
6694     flag aSign;
6695     int32_t aExp, shiftCount;
6696     uint64_t aSig0, aSig1;
6697     int64_t z;
6698
6699     aSig1 = extractFloat128Frac1( a );
6700     aSig0 = extractFloat128Frac0( a );
6701     aExp = extractFloat128Exp( a );
6702     aSign = extractFloat128Sign( a );
6703     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6704     shiftCount = aExp - 0x402F;
6705     if ( 0 < shiftCount ) {
6706         if ( 0x403E <= aExp ) {
6707             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6708             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6709                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6710                 if (aSig1) {
6711                     status->float_exception_flags |= float_flag_inexact;
6712                 }
6713             }
6714             else {
6715                 float_raise(float_flag_invalid, status);
6716                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6717                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6718                 }
6719             }
6720             return (int64_t) LIT64( 0x8000000000000000 );
6721         }
6722         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6723         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6724             status->float_exception_flags |= float_flag_inexact;
6725         }
6726     }
6727     else {
6728         if ( aExp < 0x3FFF ) {
6729             if ( aExp | aSig0 | aSig1 ) {
6730                 status->float_exception_flags |= float_flag_inexact;
6731             }
6732             return 0;
6733         }
6734         z = aSig0>>( - shiftCount );
6735         if (    aSig1
6736              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6737             status->float_exception_flags |= float_flag_inexact;
6738         }
6739     }
6740     if ( aSign ) z = - z;
6741     return z;
6742
6743 }
6744
6745 /*----------------------------------------------------------------------------
6746 | Returns the result of converting the quadruple-precision floating-point value
6747 | `a' to the 64-bit unsigned integer format.  The conversion is
6748 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6749 | Arithmetic---which means in particular that the conversion is rounded
6750 | according to the current rounding mode.  If `a' is a NaN, the largest
6751 | positive integer is returned.  If the conversion overflows, the
6752 | largest unsigned integer is returned.  If 'a' is negative, the value is
6753 | rounded and zero is returned; negative values that do not round to zero
6754 | will raise the inexact exception.
6755 *----------------------------------------------------------------------------*/
6756
6757 uint64_t float128_to_uint64(float128 a, float_status *status)
6758 {
6759     flag aSign;
6760     int aExp;
6761     int shiftCount;
6762     uint64_t aSig0, aSig1;
6763
6764     aSig0 = extractFloat128Frac0(a);
6765     aSig1 = extractFloat128Frac1(a);
6766     aExp = extractFloat128Exp(a);
6767     aSign = extractFloat128Sign(a);
6768     if (aSign && (aExp > 0x3FFE)) {
6769         float_raise(float_flag_invalid, status);
6770         if (float128_is_any_nan(a)) {
6771             return LIT64(0xFFFFFFFFFFFFFFFF);
6772         } else {
6773             return 0;
6774         }
6775     }
6776     if (aExp) {
6777         aSig0 |= LIT64(0x0001000000000000);
6778     }
6779     shiftCount = 0x402F - aExp;
6780     if (shiftCount <= 0) {
6781         if (0x403E < aExp) {
6782             float_raise(float_flag_invalid, status);
6783             return LIT64(0xFFFFFFFFFFFFFFFF);
6784         }
6785         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6786     } else {
6787         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6788     }
6789     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6790 }
6791
6792 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6793 {
6794     uint64_t v;
6795     signed char current_rounding_mode = status->float_rounding_mode;
6796
6797     set_float_rounding_mode(float_round_to_zero, status);
6798     v = float128_to_uint64(a, status);
6799     set_float_rounding_mode(current_rounding_mode, status);
6800
6801     return v;
6802 }
6803
6804 /*----------------------------------------------------------------------------
6805 | Returns the result of converting the quadruple-precision floating-point
6806 | value `a' to the 32-bit unsigned integer format.  The conversion
6807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6808 | Arithmetic except that the conversion is always rounded toward zero.
6809 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6810 | if the conversion overflows, the largest unsigned integer is returned.
6811 | If 'a' is negative, the value is rounded and zero is returned; negative
6812 | values that do not round to zero will raise the inexact exception.
6813 *----------------------------------------------------------------------------*/
6814
6815 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6816 {
6817     uint64_t v;
6818     uint32_t res;
6819     int old_exc_flags = get_float_exception_flags(status);
6820
6821     v = float128_to_uint64_round_to_zero(a, status);
6822     if (v > 0xffffffff) {
6823         res = 0xffffffff;
6824     } else {
6825         return v;
6826     }
6827     set_float_exception_flags(old_exc_flags, status);
6828     float_raise(float_flag_invalid, status);
6829     return res;
6830 }
6831
6832 /*----------------------------------------------------------------------------
6833 | Returns the result of converting the quadruple-precision floating-point value
6834 | `a' to the 32-bit unsigned integer format.  The conversion is
6835 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6836 | Arithmetic---which means in particular that the conversion is rounded
6837 | according to the current rounding mode.  If `a' is a NaN, the largest
6838 | positive integer is returned.  If the conversion overflows, the
6839 | largest unsigned integer is returned.  If 'a' is negative, the value is
6840 | rounded and zero is returned; negative values that do not round to zero
6841 | will raise the inexact exception.
6842 *----------------------------------------------------------------------------*/
6843
6844 uint32_t float128_to_uint32(float128 a, float_status *status)
6845 {
6846     uint64_t v;
6847     uint32_t res;
6848     int old_exc_flags = get_float_exception_flags(status);
6849
6850     v = float128_to_uint64(a, status);
6851     if (v > 0xffffffff) {
6852         res = 0xffffffff;
6853     } else {
6854         return v;
6855     }
6856     set_float_exception_flags(old_exc_flags, status);
6857     float_raise(float_flag_invalid, status);
6858     return res;
6859 }
6860
6861 /*----------------------------------------------------------------------------
6862 | Returns the result of converting the quadruple-precision floating-point
6863 | value `a' to the single-precision floating-point format.  The conversion
6864 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6865 | Arithmetic.
6866 *----------------------------------------------------------------------------*/
6867
6868 float32 float128_to_float32(float128 a, float_status *status)
6869 {
6870     flag aSign;
6871     int32_t aExp;
6872     uint64_t aSig0, aSig1;
6873     uint32_t zSig;
6874
6875     aSig1 = extractFloat128Frac1( a );
6876     aSig0 = extractFloat128Frac0( a );
6877     aExp = extractFloat128Exp( a );
6878     aSign = extractFloat128Sign( a );
6879     if ( aExp == 0x7FFF ) {
6880         if ( aSig0 | aSig1 ) {
6881             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6882         }
6883         return packFloat32( aSign, 0xFF, 0 );
6884     }
6885     aSig0 |= ( aSig1 != 0 );
6886     shift64RightJamming( aSig0, 18, &aSig0 );
6887     zSig = aSig0;
6888     if ( aExp || zSig ) {
6889         zSig |= 0x40000000;
6890         aExp -= 0x3F81;
6891     }
6892     return roundAndPackFloat32(aSign, aExp, zSig, status);
6893
6894 }
6895
6896 /*----------------------------------------------------------------------------
6897 | Returns the result of converting the quadruple-precision floating-point
6898 | value `a' to the double-precision floating-point format.  The conversion
6899 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6900 | Arithmetic.
6901 *----------------------------------------------------------------------------*/
6902
6903 float64 float128_to_float64(float128 a, float_status *status)
6904 {
6905     flag aSign;
6906     int32_t aExp;
6907     uint64_t aSig0, aSig1;
6908
6909     aSig1 = extractFloat128Frac1( a );
6910     aSig0 = extractFloat128Frac0( a );
6911     aExp = extractFloat128Exp( a );
6912     aSign = extractFloat128Sign( a );
6913     if ( aExp == 0x7FFF ) {
6914         if ( aSig0 | aSig1 ) {
6915             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6916         }
6917         return packFloat64( aSign, 0x7FF, 0 );
6918     }
6919     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6920     aSig0 |= ( aSig1 != 0 );
6921     if ( aExp || aSig0 ) {
6922         aSig0 |= LIT64( 0x4000000000000000 );
6923         aExp -= 0x3C01;
6924     }
6925     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6926
6927 }
6928
6929 /*----------------------------------------------------------------------------
6930 | Returns the result of converting the quadruple-precision floating-point
6931 | value `a' to the extended double-precision floating-point format.  The
6932 | conversion is performed according to the IEC/IEEE Standard for Binary
6933 | Floating-Point Arithmetic.
6934 *----------------------------------------------------------------------------*/
6935
6936 floatx80 float128_to_floatx80(float128 a, float_status *status)
6937 {
6938     flag aSign;
6939     int32_t aExp;
6940     uint64_t aSig0, aSig1;
6941
6942     aSig1 = extractFloat128Frac1( a );
6943     aSig0 = extractFloat128Frac0( a );
6944     aExp = extractFloat128Exp( a );
6945     aSign = extractFloat128Sign( a );
6946     if ( aExp == 0x7FFF ) {
6947         if ( aSig0 | aSig1 ) {
6948             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6949         }
6950         return packFloatx80(aSign, floatx80_infinity_high,
6951                                    floatx80_infinity_low);
6952     }
6953     if ( aExp == 0 ) {
6954         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6955         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6956     }
6957     else {
6958         aSig0 |= LIT64( 0x0001000000000000 );
6959     }
6960     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6961     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6962
6963 }
6964
6965 /*----------------------------------------------------------------------------
6966 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6967 | returns the result as a quadruple-precision floating-point value.  The
6968 | operation is performed according to the IEC/IEEE Standard for Binary
6969 | Floating-Point Arithmetic.
6970 *----------------------------------------------------------------------------*/
6971
6972 float128 float128_round_to_int(float128 a, float_status *status)
6973 {
6974     flag aSign;
6975     int32_t aExp;
6976     uint64_t lastBitMask, roundBitsMask;
6977     float128 z;
6978
6979     aExp = extractFloat128Exp( a );
6980     if ( 0x402F <= aExp ) {
6981         if ( 0x406F <= aExp ) {
6982             if (    ( aExp == 0x7FFF )
6983                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6984                ) {
6985                 return propagateFloat128NaN(a, a, status);
6986             }
6987             return a;
6988         }
6989         lastBitMask = 1;
6990         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6991         roundBitsMask = lastBitMask - 1;
6992         z = a;
6993         switch (status->float_rounding_mode) {
6994         case float_round_nearest_even:
6995             if ( lastBitMask ) {
6996                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6997                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6998             }
6999             else {
7000                 if ( (int64_t) z.low < 0 ) {
7001                     ++z.high;
7002                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7003                 }
7004             }
7005             break;
7006         case float_round_ties_away:
7007             if (lastBitMask) {
7008                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7009             } else {
7010                 if ((int64_t) z.low < 0) {
7011                     ++z.high;
7012                 }
7013             }
7014             break;
7015         case float_round_to_zero:
7016             break;
7017         case float_round_up:
7018             if (!extractFloat128Sign(z)) {
7019                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7020             }
7021             break;
7022         case float_round_down:
7023             if (extractFloat128Sign(z)) {
7024                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7025             }
7026             break;
7027         case float_round_to_odd:
7028             /*
7029              * Note that if lastBitMask == 0, the last bit is the lsb
7030              * of high, and roundBitsMask == -1.
7031              */
7032             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7033                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7034             }
7035             break;
7036         default:
7037             abort();
7038         }
7039         z.low &= ~ roundBitsMask;
7040     }
7041     else {
7042         if ( aExp < 0x3FFF ) {
7043             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7044             status->float_exception_flags |= float_flag_inexact;
7045             aSign = extractFloat128Sign( a );
7046             switch (status->float_rounding_mode) {
7047             case float_round_nearest_even:
7048                 if (    ( aExp == 0x3FFE )
7049                      && (   extractFloat128Frac0( a )
7050                           | extractFloat128Frac1( a ) )
7051                    ) {
7052                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7053                 }
7054                 break;
7055             case float_round_ties_away:
7056                 if (aExp == 0x3FFE) {
7057                     return packFloat128(aSign, 0x3FFF, 0, 0);
7058                 }
7059                 break;
7060             case float_round_down:
7061                 return
7062                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7063                     : packFloat128( 0, 0, 0, 0 );
7064             case float_round_up:
7065                 return
7066                       aSign ? packFloat128( 1, 0, 0, 0 )
7067                     : packFloat128( 0, 0x3FFF, 0, 0 );
7068
7069             case float_round_to_odd:
7070                 return packFloat128(aSign, 0x3FFF, 0, 0);
7071             }
7072             return packFloat128( aSign, 0, 0, 0 );
7073         }
7074         lastBitMask = 1;
7075         lastBitMask <<= 0x402F - aExp;
7076         roundBitsMask = lastBitMask - 1;
7077         z.low = 0;
7078         z.high = a.high;
7079         switch (status->float_rounding_mode) {
7080         case float_round_nearest_even:
7081             z.high += lastBitMask>>1;
7082             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7083                 z.high &= ~ lastBitMask;
7084             }
7085             break;
7086         case float_round_ties_away:
7087             z.high += lastBitMask>>1;
7088             break;
7089         case float_round_to_zero:
7090             break;
7091         case float_round_up:
7092             if (!extractFloat128Sign(z)) {
7093                 z.high |= ( a.low != 0 );
7094                 z.high += roundBitsMask;
7095             }
7096             break;
7097         case float_round_down:
7098             if (extractFloat128Sign(z)) {
7099                 z.high |= (a.low != 0);
7100                 z.high += roundBitsMask;
7101             }
7102             break;
7103         case float_round_to_odd:
7104             if ((z.high & lastBitMask) == 0) {
7105                 z.high |= (a.low != 0);
7106                 z.high += roundBitsMask;
7107             }
7108             break;
7109         default:
7110             abort();
7111         }
7112         z.high &= ~ roundBitsMask;
7113     }
7114     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7115         status->float_exception_flags |= float_flag_inexact;
7116     }
7117     return z;
7118
7119 }
7120
7121 /*----------------------------------------------------------------------------
7122 | Returns the result of adding the absolute values of the quadruple-precision
7123 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7124 | before being returned.  `zSign' is ignored if the result is a NaN.
7125 | The addition is performed according to the IEC/IEEE Standard for Binary
7126 | Floating-Point Arithmetic.
7127 *----------------------------------------------------------------------------*/
7128
7129 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7130                                 float_status *status)
7131 {
7132     int32_t aExp, bExp, zExp;
7133     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7134     int32_t expDiff;
7135
7136     aSig1 = extractFloat128Frac1( a );
7137     aSig0 = extractFloat128Frac0( a );
7138     aExp = extractFloat128Exp( a );
7139     bSig1 = extractFloat128Frac1( b );
7140     bSig0 = extractFloat128Frac0( b );
7141     bExp = extractFloat128Exp( b );
7142     expDiff = aExp - bExp;
7143     if ( 0 < expDiff ) {
7144         if ( aExp == 0x7FFF ) {
7145             if (aSig0 | aSig1) {
7146                 return propagateFloat128NaN(a, b, status);
7147             }
7148             return a;
7149         }
7150         if ( bExp == 0 ) {
7151             --expDiff;
7152         }
7153         else {
7154             bSig0 |= LIT64( 0x0001000000000000 );
7155         }
7156         shift128ExtraRightJamming(
7157             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7158         zExp = aExp;
7159     }
7160     else if ( expDiff < 0 ) {
7161         if ( bExp == 0x7FFF ) {
7162             if (bSig0 | bSig1) {
7163                 return propagateFloat128NaN(a, b, status);
7164             }
7165             return packFloat128( zSign, 0x7FFF, 0, 0 );
7166         }
7167         if ( aExp == 0 ) {
7168             ++expDiff;
7169         }
7170         else {
7171             aSig0 |= LIT64( 0x0001000000000000 );
7172         }
7173         shift128ExtraRightJamming(
7174             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7175         zExp = bExp;
7176     }
7177     else {
7178         if ( aExp == 0x7FFF ) {
7179             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7180                 return propagateFloat128NaN(a, b, status);
7181             }
7182             return a;
7183         }
7184         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7185         if ( aExp == 0 ) {
7186             if (status->flush_to_zero) {
7187                 if (zSig0 | zSig1) {
7188                     float_raise(float_flag_output_denormal, status);
7189                 }
7190                 return packFloat128(zSign, 0, 0, 0);
7191             }
7192             return packFloat128( zSign, 0, zSig0, zSig1 );
7193         }
7194         zSig2 = 0;
7195         zSig0 |= LIT64( 0x0002000000000000 );
7196         zExp = aExp;
7197         goto shiftRight1;
7198     }
7199     aSig0 |= LIT64( 0x0001000000000000 );
7200     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7201     --zExp;
7202     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7203     ++zExp;
7204  shiftRight1:
7205     shift128ExtraRightJamming(
7206         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7207  roundAndPack:
7208     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7209
7210 }
7211
7212 /*----------------------------------------------------------------------------
7213 | Returns the result of subtracting the absolute values of the quadruple-
7214 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7215 | difference is negated before being returned.  `zSign' is ignored if the
7216 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7217 | Standard for Binary Floating-Point Arithmetic.
7218 *----------------------------------------------------------------------------*/
7219
7220 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7221                                 float_status *status)
7222 {
7223     int32_t aExp, bExp, zExp;
7224     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7225     int32_t expDiff;
7226
7227     aSig1 = extractFloat128Frac1( a );
7228     aSig0 = extractFloat128Frac0( a );
7229     aExp = extractFloat128Exp( a );
7230     bSig1 = extractFloat128Frac1( b );
7231     bSig0 = extractFloat128Frac0( b );
7232     bExp = extractFloat128Exp( b );
7233     expDiff = aExp - bExp;
7234     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7235     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7236     if ( 0 < expDiff ) goto aExpBigger;
7237     if ( expDiff < 0 ) goto bExpBigger;
7238     if ( aExp == 0x7FFF ) {
7239         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7240             return propagateFloat128NaN(a, b, status);
7241         }
7242         float_raise(float_flag_invalid, status);
7243         return float128_default_nan(status);
7244     }
7245     if ( aExp == 0 ) {
7246         aExp = 1;
7247         bExp = 1;
7248     }
7249     if ( bSig0 < aSig0 ) goto aBigger;
7250     if ( aSig0 < bSig0 ) goto bBigger;
7251     if ( bSig1 < aSig1 ) goto aBigger;
7252     if ( aSig1 < bSig1 ) goto bBigger;
7253     return packFloat128(status->float_rounding_mode == float_round_down,
7254                         0, 0, 0);
7255  bExpBigger:
7256     if ( bExp == 0x7FFF ) {
7257         if (bSig0 | bSig1) {
7258             return propagateFloat128NaN(a, b, status);
7259         }
7260         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7261     }
7262     if ( aExp == 0 ) {
7263         ++expDiff;
7264     }
7265     else {
7266         aSig0 |= LIT64( 0x4000000000000000 );
7267     }
7268     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7269     bSig0 |= LIT64( 0x4000000000000000 );
7270  bBigger:
7271     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7272     zExp = bExp;
7273     zSign ^= 1;
7274     goto normalizeRoundAndPack;
7275  aExpBigger:
7276     if ( aExp == 0x7FFF ) {
7277         if (aSig0 | aSig1) {
7278             return propagateFloat128NaN(a, b, status);
7279         }
7280         return a;
7281     }
7282     if ( bExp == 0 ) {
7283         --expDiff;
7284     }
7285     else {
7286         bSig0 |= LIT64( 0x4000000000000000 );
7287     }
7288     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7289     aSig0 |= LIT64( 0x4000000000000000 );
7290  aBigger:
7291     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7292     zExp = aExp;
7293  normalizeRoundAndPack:
7294     --zExp;
7295     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7296                                          status);
7297
7298 }
7299
7300 /*----------------------------------------------------------------------------
7301 | Returns the result of adding the quadruple-precision floating-point values
7302 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7303 | for Binary Floating-Point Arithmetic.
7304 *----------------------------------------------------------------------------*/
7305
7306 float128 float128_add(float128 a, float128 b, float_status *status)
7307 {
7308     flag aSign, bSign;
7309
7310     aSign = extractFloat128Sign( a );
7311     bSign = extractFloat128Sign( b );
7312     if ( aSign == bSign ) {
7313         return addFloat128Sigs(a, b, aSign, status);
7314     }
7315     else {
7316         return subFloat128Sigs(a, b, aSign, status);
7317     }
7318
7319 }
7320
7321 /*----------------------------------------------------------------------------
7322 | Returns the result of subtracting the quadruple-precision floating-point
7323 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7324 | Standard for Binary Floating-Point Arithmetic.
7325 *----------------------------------------------------------------------------*/
7326
7327 float128 float128_sub(float128 a, float128 b, float_status *status)
7328 {
7329     flag aSign, bSign;
7330
7331     aSign = extractFloat128Sign( a );
7332     bSign = extractFloat128Sign( b );
7333     if ( aSign == bSign ) {
7334         return subFloat128Sigs(a, b, aSign, status);
7335     }
7336     else {
7337         return addFloat128Sigs(a, b, aSign, status);
7338     }
7339
7340 }
7341
7342 /*----------------------------------------------------------------------------
7343 | Returns the result of multiplying the quadruple-precision floating-point
7344 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7345 | Standard for Binary Floating-Point Arithmetic.
7346 *----------------------------------------------------------------------------*/
7347
7348 float128 float128_mul(float128 a, float128 b, float_status *status)
7349 {
7350     flag aSign, bSign, zSign;
7351     int32_t aExp, bExp, zExp;
7352     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7353
7354     aSig1 = extractFloat128Frac1( a );
7355     aSig0 = extractFloat128Frac0( a );
7356     aExp = extractFloat128Exp( a );
7357     aSign = extractFloat128Sign( a );
7358     bSig1 = extractFloat128Frac1( b );
7359     bSig0 = extractFloat128Frac0( b );
7360     bExp = extractFloat128Exp( b );
7361     bSign = extractFloat128Sign( b );
7362     zSign = aSign ^ bSign;
7363     if ( aExp == 0x7FFF ) {
7364         if (    ( aSig0 | aSig1 )
7365              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7366             return propagateFloat128NaN(a, b, status);
7367         }
7368         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7369         return packFloat128( zSign, 0x7FFF, 0, 0 );
7370     }
7371     if ( bExp == 0x7FFF ) {
7372         if (bSig0 | bSig1) {
7373             return propagateFloat128NaN(a, b, status);
7374         }
7375         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7376  invalid:
7377             float_raise(float_flag_invalid, status);
7378             return float128_default_nan(status);
7379         }
7380         return packFloat128( zSign, 0x7FFF, 0, 0 );
7381     }
7382     if ( aExp == 0 ) {
7383         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7384         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7385     }
7386     if ( bExp == 0 ) {
7387         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7388         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7389     }
7390     zExp = aExp + bExp - 0x4000;
7391     aSig0 |= LIT64( 0x0001000000000000 );
7392     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7393     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7394     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7395     zSig2 |= ( zSig3 != 0 );
7396     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7397         shift128ExtraRightJamming(
7398             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7399         ++zExp;
7400     }
7401     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7402
7403 }
7404
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special operands are handled before the main computation, as coded below:
|   - a NaN operand is handed to propagateFloat128NaN();
|   - infinity / infinity and zero / zero raise float_flag_invalid and return
|     the default NaN;
|   - infinity / (anything else) returns a signed infinity;
|   - (non-infinity) / infinity returns a signed zero;
|   - (nonzero) / zero raises float_flag_divbyzero and returns a signed
|     infinity;
|   - zero / (nonzero) returns a signed zero.
| The sign of the result is the exclusive OR of the operand signs.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* `b' is an infinity or a NaN while `a' is neither. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero: zero / zero is invalid, otherwise div-by-0. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /*
     * Set the integer bit (bit 48 of the high word) on both significands
     * and shift them left by 15, which places that bit at bit 63.
     */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /*
     * When the divisor significand does not exceed the dividend's, halve
     * the dividend and compensate in the exponent, so the dividend
     * significand is smaller than the divisor's from here on.
     */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /*
     * Upper 64 quotient bits: take an estimate, form the remainder
     * a - b * zSig0, and while that remainder is negative step the estimate
     * down by one and add the divisor back.
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /*
     * Lower 64 quotient bits.  The estimate is only corrected against the
     * exact remainder when its low 14 bits are <= 4.
     * NOTE(review): presumably the estimate's error bound means rounding
     * cannot be affected otherwise -- confirm against estimateDiv128To64.
     */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Record a nonzero remainder in the lowest quotient bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the quotient right by 15; zSig2 receives the extra word. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7491
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special operands, as coded below:
|   - a NaN operand is handed to propagateFloat128NaN();
|   - an infinite `a' or a zero `b' raises float_flag_invalid and returns the
|     default NaN;
|   - an infinite `b' (with finite `a') or a zero `a' returns `a' unchanged.
| The sign of `b' is never read; the result sign is derived from the sign of
| `a' and from whether the selected remainder is negative.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* `a' is an infinity and `b' is not a NaN. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite `a' with infinite `b': `a' is already the remainder. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| is below half of |b|, so `a' itself is the remainder. */
    if ( expDiff < -1 ) return a;
    /*
     * Set the integer bit on both significands.  `b' is shifted left by 15;
     * `a' by 15, or by 14 when expDiff is -1.
     */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract `b' once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Main reduction: each pass takes a quotient estimate lowered by 4
     * (clamped at 0), removes q * b from the running remainder, and
     * accounts for 61 bits of exponent difference.
     * NOTE(review): the "- 4" presumably guarantees the estimate never
     * exceeds the true quotient -- confirm against estimateDiv128To64.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /* Final partial step, covering the remaining (fewer than 64) bits. */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Keep subtracting `b' (counting in q) until the remainder turns
     * negative; alternateASig0/1 hold the last non-negative value.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * Choose between the negative remainder and the last non-negative one.
     * Their sum is negative when the non-negative candidate has the smaller
     * magnitude; on an exact tie (sum zero) the parity of q decides.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Convert a negative remainder to sign + magnitude form. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7598
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special operands, as coded below:
|   - a NaN is handed to propagateFloat128NaN();
|   - positive infinity and negative zero are returned unchanged;
|   - positive zero returns a freshly packed positive zero;
|   - any other negative operand (including negative infinity) raises
|     float_flag_invalid and returns the default NaN.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* +infinity is its own square root; -infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* -0 is returned as is; every other negative value is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias it for the result. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    /* Make the integer bit explicit (bit 48 of the high word). */
    aSig0 |= LIT64( 0x0001000000000000 );
    /*
     * Upper 64 result bits: start from the estimateSqrt32() value, refine
     * it with one estimateDiv128To64() step, then step the root down while
     * the remainder a - zSig0 * zSig0 is negative.  The shift is 12 for an
     * odd aExp and 13 for an even one.
     */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /*
     * Lower 64 result bits.  The estimate is only corrected against the
     * exact remainder when its low 13 bits are <= 5.
     * NOTE(review): presumably the estimate's error bound means rounding
     * cannot be affected otherwise -- confirm against estimateDiv128To64.
     */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Record a nonzero remainder in the lowest result bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the root right by 14; zSig2 receives the extra word. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7666
7667 /*----------------------------------------------------------------------------
7668 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7669 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7670 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7671 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7672 *----------------------------------------------------------------------------*/
7673
7674 int float128_eq(float128 a, float128 b, float_status *status)
7675 {
7676
7677     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7678               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7679          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7680               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7681        ) {
7682         float_raise(float_flag_invalid, status);
7683         return 0;
7684     }
7685     return
7686            ( a.low == b.low )
7687         && (    ( a.high == b.high )
7688              || (    ( a.low == 0 )
7689                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7690            );
7691
7692 }
7693
7694 /*----------------------------------------------------------------------------
7695 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7696 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7697 | exception is raised if either operand is a NaN.  The comparison is performed
7698 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7699 *----------------------------------------------------------------------------*/
7700
7701 int float128_le(float128 a, float128 b, float_status *status)
7702 {
7703     flag aSign, bSign;
7704
7705     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7706               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7707          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7708               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7709        ) {
7710         float_raise(float_flag_invalid, status);
7711         return 0;
7712     }
7713     aSign = extractFloat128Sign( a );
7714     bSign = extractFloat128Sign( b );
7715     if ( aSign != bSign ) {
7716         return
7717                aSign
7718             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7719                  == 0 );
7720     }
7721     return
7722           aSign ? le128( b.high, b.low, a.high, a.low )
7723         : le128( a.high, a.low, b.high, b.low );
7724
7725 }
7726
7727 /*----------------------------------------------------------------------------
7728 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7729 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7730 | raised if either operand is a NaN.  The comparison is performed according
7731 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7732 *----------------------------------------------------------------------------*/
7733
7734 int float128_lt(float128 a, float128 b, float_status *status)
7735 {
7736     flag aSign, bSign;
7737
7738     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7739               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7740          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7741               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7742        ) {
7743         float_raise(float_flag_invalid, status);
7744         return 0;
7745     }
7746     aSign = extractFloat128Sign( a );
7747     bSign = extractFloat128Sign( b );
7748     if ( aSign != bSign ) {
7749         return
7750                aSign
7751             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7752                  != 0 );
7753     }
7754     return
7755           aSign ? lt128( b.high, b.low, a.high, a.low )
7756         : lt128( a.high, a.low, b.high, b.low );
7757
7758 }
7759
7760 /*----------------------------------------------------------------------------
7761 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7762 | be compared, and 0 otherwise.  The invalid exception is raised if either
7763 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7764 | Standard for Binary Floating-Point Arithmetic.
7765 *----------------------------------------------------------------------------*/
7766
7767 int float128_unordered(float128 a, float128 b, float_status *status)
7768 {
7769     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7770               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7771          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7772               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7773        ) {
7774         float_raise(float_flag_invalid, status);
7775         return 1;
7776     }
7777     return 0;
7778 }
7779
7780 /*----------------------------------------------------------------------------
7781 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7782 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7783 | exception.  The comparison is performed according to the IEC/IEEE Standard
7784 | for Binary Floating-Point Arithmetic.
7785 *----------------------------------------------------------------------------*/
7786
7787 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7788 {
7789
7790     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7791               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7792          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7793               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7794        ) {
7795         if (float128_is_signaling_nan(a, status)
7796          || float128_is_signaling_nan(b, status)) {
7797             float_raise(float_flag_invalid, status);
7798         }
7799         return 0;
7800     }
7801     return
7802            ( a.low == b.low )
7803         && (    ( a.high == b.high )
7804              || (    ( a.low == 0 )
7805                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7806            );
7807
7808 }
7809
7810 /*----------------------------------------------------------------------------
7811 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7812 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7813 | cause an exception.  Otherwise, the comparison is performed according to the
7814 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7815 *----------------------------------------------------------------------------*/
7816
7817 int float128_le_quiet(float128 a, float128 b, float_status *status)
7818 {
7819     flag aSign, bSign;
7820
7821     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7822               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7823          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7824               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7825        ) {
7826         if (float128_is_signaling_nan(a, status)
7827          || float128_is_signaling_nan(b, status)) {
7828             float_raise(float_flag_invalid, status);
7829         }
7830         return 0;
7831     }
7832     aSign = extractFloat128Sign( a );
7833     bSign = extractFloat128Sign( b );
7834     if ( aSign != bSign ) {
7835         return
7836                aSign
7837             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7838                  == 0 );
7839     }
7840     return
7841           aSign ? le128( b.high, b.low, a.high, a.low )
7842         : le128( a.high, a.low, b.high, b.low );
7843
7844 }
7845
7846 /*----------------------------------------------------------------------------
7847 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7848 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7849 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7850 | Standard for Binary Floating-Point Arithmetic.
7851 *----------------------------------------------------------------------------*/
7852
7853 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7854 {
7855     flag aSign, bSign;
7856
7857     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7858               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7859          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7860               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7861        ) {
7862         if (float128_is_signaling_nan(a, status)
7863          || float128_is_signaling_nan(b, status)) {
7864             float_raise(float_flag_invalid, status);
7865         }
7866         return 0;
7867     }
7868     aSign = extractFloat128Sign( a );
7869     bSign = extractFloat128Sign( b );
7870     if ( aSign != bSign ) {
7871         return
7872                aSign
7873             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7874                  != 0 );
7875     }
7876     return
7877           aSign ? lt128( b.high, b.low, a.high, a.low )
7878         : lt128( a.high, a.low, b.high, b.low );
7879
7880 }
7881
7882 /*----------------------------------------------------------------------------
7883 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7884 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7885 | comparison is performed according to the IEC/IEEE Standard for Binary
7886 | Floating-Point Arithmetic.
7887 *----------------------------------------------------------------------------*/
7888
7889 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7890 {
7891     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7892               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7893          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7894               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7895        ) {
7896         if (float128_is_signaling_nan(a, status)
7897          || float128_is_signaling_nan(b, status)) {
7898             float_raise(float_flag_invalid, status);
7899         }
7900         return 1;
7901     }
7902     return 0;
7903 }
7904
7905 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7906                                             int is_quiet, float_status *status)
7907 {
7908     flag aSign, bSign;
7909
7910     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7911         float_raise(float_flag_invalid, status);
7912         return float_relation_unordered;
7913     }
7914     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7915           ( extractFloatx80Frac( a )<<1 ) ) ||
7916         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7917           ( extractFloatx80Frac( b )<<1 ) )) {
7918         if (!is_quiet ||
7919             floatx80_is_signaling_nan(a, status) ||
7920             floatx80_is_signaling_nan(b, status)) {
7921             float_raise(float_flag_invalid, status);
7922         }
7923         return float_relation_unordered;
7924     }
7925     aSign = extractFloatx80Sign( a );
7926     bSign = extractFloatx80Sign( b );
7927     if ( aSign != bSign ) {
7928
7929         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7930              ( ( a.low | b.low ) == 0 ) ) {
7931             /* zero case */
7932             return float_relation_equal;
7933         } else {
7934             return 1 - (2 * aSign);
7935         }
7936     } else {
7937         if (a.low == b.low && a.high == b.high) {
7938             return float_relation_equal;
7939         } else {
7940             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7941         }
7942     }
7943 }
7944
7945 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7946 {
7947     return floatx80_compare_internal(a, b, 0, status);
7948 }
7949
7950 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7951 {
7952     return floatx80_compare_internal(a, b, 1, status);
7953 }
7954
7955 static inline int float128_compare_internal(float128 a, float128 b,
7956                                             int is_quiet, float_status *status)
7957 {
7958     flag aSign, bSign;
7959
7960     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7961           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7962         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7963           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7964         if (!is_quiet ||
7965             float128_is_signaling_nan(a, status) ||
7966             float128_is_signaling_nan(b, status)) {
7967             float_raise(float_flag_invalid, status);
7968         }
7969         return float_relation_unordered;
7970     }
7971     aSign = extractFloat128Sign( a );
7972     bSign = extractFloat128Sign( b );
7973     if ( aSign != bSign ) {
7974         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7975             /* zero case */
7976             return float_relation_equal;
7977         } else {
7978             return 1 - (2 * aSign);
7979         }
7980     } else {
7981         if (a.low == b.low && a.high == b.high) {
7982             return float_relation_equal;
7983         } else {
7984             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7985         }
7986     }
7987 }
7988
7989 int float128_compare(float128 a, float128 b, float_status *status)
7990 {
7991     return float128_compare_internal(a, b, 0, status);
7992 }
7993
7994 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7995 {
7996     return float128_compare_internal(a, b, 1, status);
7997 }
7998
7999 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
8000 {
8001     flag aSign;
8002     int32_t aExp;
8003     uint64_t aSig;
8004
8005     if (floatx80_invalid_encoding(a)) {
8006         float_raise(float_flag_invalid, status);
8007         return floatx80_default_nan(status);
8008     }
8009     aSig = extractFloatx80Frac( a );
8010     aExp = extractFloatx80Exp( a );
8011     aSign = extractFloatx80Sign( a );
8012
8013     if ( aExp == 0x7FFF ) {
8014         if ( aSig<<1 ) {
8015             return propagateFloatx80NaN(a, a, status);
8016         }
8017         return a;
8018     }
8019
8020     if (aExp == 0) {
8021         if (aSig == 0) {
8022             return a;
8023         }
8024         aExp++;
8025     }
8026
8027     if (n > 0x10000) {
8028         n = 0x10000;
8029     } else if (n < -0x10000) {
8030         n = -0x10000;
8031     }
8032
8033     aExp += n;
8034     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8035                                          aSign, aExp, aSig, 0, status);
8036 }
8037
8038 float128 float128_scalbn(float128 a, int n, float_status *status)
8039 {
8040     flag aSign;
8041     int32_t aExp;
8042     uint64_t aSig0, aSig1;
8043
8044     aSig1 = extractFloat128Frac1( a );
8045     aSig0 = extractFloat128Frac0( a );
8046     aExp = extractFloat128Exp( a );
8047     aSign = extractFloat128Sign( a );
8048     if ( aExp == 0x7FFF ) {
8049         if ( aSig0 | aSig1 ) {
8050             return propagateFloat128NaN(a, a, status);
8051         }
8052         return a;
8053     }
8054     if (aExp != 0) {
8055         aSig0 |= LIT64( 0x0001000000000000 );
8056     } else if (aSig0 == 0 && aSig1 == 0) {
8057         return a;
8058     } else {
8059         aExp++;
8060     }
8061
8062     if (n > 0x10000) {
8063         n = 0x10000;
8064     } else if (n < -0x10000) {
8065         n = -0x10000;
8066     }
8067
8068     aExp += n - 1;
8069     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8070                                          , status);
8071
8072 }
8073
8074 static void __attribute__((constructor)) softfloat_init(void)
8075 {
8076     union_float64 ua, ub, uc, ur;
8077
8078     if (QEMU_NO_HARDFLOAT) {
8079         return;
8080     }
8081     /*
8082      * Test that the host's FMA is not obviously broken. For example,
8083      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8084      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8085      */
8086     ua.s = 0x0020000000000001ULL;
8087     ub.s = 0x3ca0000000000000ULL;
8088     uc.s = 0x0020000000000000ULL;
8089     ur.h = fma(ua.h, ub.h, uc.h);
8090     if (ur.s != 0x0020000000000001ULL) {
8091         force_soft_fma = true;
8092     }
8093 }