fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107  * and trapping on every FP exception is not fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             s->float_exception_flags |= float_flag_input_denormal;      \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
 256 typedef union {
 257     float32 s;
 258     float h;
 259 } union_float32;
 260
 261 typedef union {
 262     float64 s;
 263     double h;
 264 } union_float64;
 265
 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271 typedef float   (*hard_f32_op2_fn)(float a, float b);
 272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 /* Note: @fast_test and @post can be NULL */
 343 static inline float32
 344 float32_gen2(float32 xa, float32 xb, float_status *s,
 345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 346              f32_check_fn pre, f32_check_fn post,
 347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
 348 {
 349     union_float32 ua, ub, ur;
 350
 351     ua.s = xa;
 352     ub.s = xb;
 353
 354     if (unlikely(!can_use_fpu(s))) {
 355         goto soft;
 356     }
 357
 358     float32_input_flush2(&ua.s, &ub.s, s);
 359     if (unlikely(!pre(ua, ub))) {
 360         goto soft;
 361     }
 362     if (fast_test && fast_test(ua, ub)) {
 363         return fast_op(ua.s, ub.s, s);
 364     }
 365
 366     ur.h = hard(ua.h, ub.h);
 367     if (unlikely(f32_is_inf(ur))) {
 368         s->float_exception_flags |= float_flag_overflow;
 369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
 370         if (post == NULL || post(ua, ub)) {
 371             goto soft;
 372         }
 373     }
 374     return ur.s;
 375
 376  soft:
 377     return soft(ua.s, ub.s, s);
 378 }
 379
 380 static inline float64
 381 float64_gen2(float64 xa, float64 xb, float_status *s,
 382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 383              f64_check_fn pre, f64_check_fn post,
 384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
 385 {
 386     union_float64 ua, ub, ur;
 387
 388     ua.s = xa;
 389     ub.s = xb;
 390
 391     if (unlikely(!can_use_fpu(s))) {
 392         goto soft;
 393     }
 394
 395     float64_input_flush2(&ua.s, &ub.s, s);
 396     if (unlikely(!pre(ua, ub))) {
 397         goto soft;
 398     }
 399     if (fast_test && fast_test(ua, ub)) {
 400         return fast_op(ua.s, ub.s, s);
 401     }
 402
 403     ur.h = hard(ua.h, ub.h);
 404     if (unlikely(f64_is_inf(ur))) {
 405         s->float_exception_flags |= float_flag_overflow;
 406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
 407         if (post == NULL || post(ua, ub)) {
 408             goto soft;
 409         }
 410     }
 411     return ur.s;
 412
 413  soft:
 414     return soft(ua.s, ub.s, s);
 415 }
 416
 417 /*----------------------------------------------------------------------------
 418 | Returns the fraction bits of the half-precision floating-point value `a'.
 419 *----------------------------------------------------------------------------*/
 420
 421 static inline uint32_t extractFloat16Frac(float16 a)
 422 {
 423     return float16_val(a) & 0x3ff;
 424 }
 425
 426 /*----------------------------------------------------------------------------
 427 | Returns the exponent bits of the half-precision floating-point value `a'.
 428 *----------------------------------------------------------------------------*/
 429
 430 static inline int extractFloat16Exp(float16 a)
 431 {
 432     return (float16_val(a) >> 10) & 0x1f;
 433 }
 434
 435 /*----------------------------------------------------------------------------
 436 | Returns the fraction bits of the single-precision floating-point value `a'.
 437 *----------------------------------------------------------------------------*/
 438
 439 static inline uint32_t extractFloat32Frac(float32 a)
 440 {
 441     return float32_val(a) & 0x007FFFFF;
 442 }
 443
 444 /*----------------------------------------------------------------------------
 445 | Returns the exponent bits of the single-precision floating-point value `a'.
 446 *----------------------------------------------------------------------------*/
 447
 448 static inline int extractFloat32Exp(float32 a)
 449 {
 450     return (float32_val(a) >> 23) & 0xFF;
 451 }
 452
 453 /*----------------------------------------------------------------------------
 454 | Returns the sign bit of the single-precision floating-point value `a'.
 455 *----------------------------------------------------------------------------*/
 456
 457 static inline flag extractFloat32Sign(float32 a)
 458 {
 459     return float32_val(a) >> 31;
 460 }
 461
 462 /*----------------------------------------------------------------------------
 463 | Returns the fraction bits of the double-precision floating-point value `a'.
 464 *----------------------------------------------------------------------------*/
 465
 466 static inline uint64_t extractFloat64Frac(float64 a)
 467 {
 468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 469 }
 470
 471 /*----------------------------------------------------------------------------
 472 | Returns the exponent bits of the double-precision floating-point value `a'.
 473 *----------------------------------------------------------------------------*/
 474
 475 static inline int extractFloat64Exp(float64 a)
 476 {
 477     return (float64_val(a) >> 52) & 0x7FF;
 478 }
 479
 480 /*----------------------------------------------------------------------------
 481 | Returns the sign bit of the double-precision floating-point value `a'.
 482 *----------------------------------------------------------------------------*/
 483
 484 static inline flag extractFloat64Sign(float64 a)
 485 {
 486     return float64_val(a) >> 63;
 487 }
 488
 489 /*
 490  * Classify a floating point number. Everything above float_class_qnan
 491  * is a NaN so cls >= float_class_qnan is any NaN.
 492  */
 493
 494 typedef enum __attribute__ ((__packed__)) {
 495     float_class_unclassified,
 496     float_class_zero,
 497     float_class_normal,
 498     float_class_inf,
 499     float_class_qnan,  /* all NaNs from here */
 500     float_class_snan,
 501 } FloatClass;
 502
 503 /* Simple helpers for checking if, or what kind of, NaN we have */
 504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 505 {
 506     return unlikely(c >= float_class_qnan);
 507 }
 508
 509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 510 {
 511     return c == float_class_snan;
 512 }
 513
 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 515 {
 516     return c == float_class_qnan;
 517 }
 518
 519 /*
 520  * Structure holding all of the decomposed parts of a float. The
 521  * exponent is unbiased and the fraction is normalized. All
 522  * calculations are done with a 64 bit fraction and then rounded as
 523  * appropriate for the final format.
 524  *
 525  * Thanks to the packed FloatClass a decent compiler should be able to
 526  * fit the whole structure into registers and avoid using the stack
 527  * for parameter passing.
 528  */
 529
 530 typedef struct {
 531     uint64_t frac;
 532     int32_t  exp;
 533     FloatClass cls;
 534     bool sign;
 535 } FloatParts;
 536
 537 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 538 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 539 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 540
 541 /* Structure holding all of the relevant parameters for a format.
 542  *   exp_size: the size of the exponent field
 543  *   exp_bias: the offset applied to the exponent field
 544  *   exp_max: the maximum normalised exponent
 545  *   frac_size: the size of the fraction field
 546  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 547  * The following are computed based the size of fraction
 548  *   frac_lsb: least significant bit of fraction
 549  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 550  *   round_mask/roundeven_mask: masks used for rounding
 551  * The following optional modifiers are available:
 552  *   arm_althp: handle ARM Alternative Half Precision
 553  */
 554 typedef struct {
 555     int exp_size;
 556     int exp_bias;
 557     int exp_max;
 558     int frac_size;
 559     int frac_shift;
 560     uint64_t frac_lsb;
 561     uint64_t frac_lsbm1;
 562     uint64_t round_mask;
 563     uint64_t roundeven_mask;
 564     bool arm_althp;
 565 } FloatFmt;
 566
 567 /* Expand fields based on the size of exponent and fraction */
 568 #define FLOAT_PARAMS(E, F)                                           \
 569     .exp_size       = E,                                             \
 570     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 571     .exp_max        = (1 << E) - 1,                                  \
 572     .frac_size      = F,                                             \
 573     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 574     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 575     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 576     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 577     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 578
 579 static const FloatFmt float16_params = {
 580     FLOAT_PARAMS(5, 10)
 581 };
 582
 583 static const FloatFmt float16_params_ahp = {
 584     FLOAT_PARAMS(5, 10),
 585     .arm_althp = true
 586 };
 587
 588 static const FloatFmt float32_params = {
 589     FLOAT_PARAMS(8, 23)
 590 };
 591
 592 static const FloatFmt float64_params = {
 593     FLOAT_PARAMS(11, 52)
 594 };
 595
 596 /* Unpack a float to parts, but do not canonicalize.  */
 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 598 {
 599     const int sign_pos = fmt.frac_size + fmt.exp_size;
 600
 601     return (FloatParts) {
 602         .cls = float_class_unclassified,
 603         .sign = extract64(raw, sign_pos, 1),
 604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 605         .frac = extract64(raw, 0, fmt.frac_size),
 606     };
 607 }
 608
 609 static inline FloatParts float16_unpack_raw(float16 f)
 610 {
 611     return unpack_raw(float16_params, f);
 612 }
 613
 614 static inline FloatParts float32_unpack_raw(float32 f)
 615 {
 616     return unpack_raw(float32_params, f);
 617 }
 618
 619 static inline FloatParts float64_unpack_raw(float64 f)
 620 {
 621     return unpack_raw(float64_params, f);
 622 }
 623
 624 /* Pack a float from parts, but do not canonicalize.  */
 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 626 {
 627     const int sign_pos = fmt.frac_size + fmt.exp_size;
 628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 629     return deposit64(ret, sign_pos, 1, p.sign);
 630 }
 631
 632 static inline float16 float16_pack_raw(FloatParts p)
 633 {
 634     return make_float16(pack_raw(float16_params, p));
 635 }
 636
 637 static inline float32 float32_pack_raw(FloatParts p)
 638 {
 639     return make_float32(pack_raw(float32_params, p));
 640 }
 641
 642 static inline float64 float64_pack_raw(FloatParts p)
 643 {
 644     return make_float64(pack_raw(float64_params, p));
 645 }
 646
 647 /*----------------------------------------------------------------------------
 648 | Functions and definitions to determine:  (1) whether tininess for underflow
 649 | is detected before or after rounding by default, (2) what (if anything)
 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 652 | are propagated from function inputs to output.  These details are target-
 653 | specific.
 654 *----------------------------------------------------------------------------*/
 655 #include "softfloat-specialize.h"
 656
 657 /* Canonicalize EXP and FRAC, setting CLS.  */
 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
 659                                   float_status *status)
 660 {
 661     if (part.exp == parm->exp_max && !parm->arm_althp) {
 662         if (part.frac == 0) {
 663             part.cls = float_class_inf;
 664         } else {
 665             part.frac <<= parm->frac_shift;
 666             part.cls = (parts_is_snan_frac(part.frac, status)
 667                         ? float_class_snan : float_class_qnan);
 668         }
 669     } else if (part.exp == 0) {
 670         if (likely(part.frac == 0)) {
 671             part.cls = float_class_zero;
 672         } else if (status->flush_inputs_to_zero) {
 673             float_raise(float_flag_input_denormal, status);
 674             part.cls = float_class_zero;
 675             part.frac = 0;
 676         } else {
 677             int shift = clz64(part.frac) - 1;
 678             part.cls = float_class_normal;
 679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 680             part.frac <<= shift;
 681         }
 682     } else {
 683         part.cls = float_class_normal;
 684         part.exp -= parm->exp_bias;
 685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 686     }
 687     return part;
 688 }
 689
 690 /* Round and uncanonicalize a floating-point number by parts. There
 691  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 692  * fraction; these bits will be removed. The exponent will be biased
 693  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 694  */
 695
 696 static FloatParts round_canonical(FloatParts p, float_status *s,
 697                                   const FloatFmt *parm)
 698 {
 699     const uint64_t frac_lsb = parm->frac_lsb;
 700     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 701     const uint64_t round_mask = parm->round_mask;
 702     const uint64_t roundeven_mask = parm->roundeven_mask;
 703     const int exp_max = parm->exp_max;
 704     const int frac_shift = parm->frac_shift;
 705     uint64_t frac, inc;
 706     int exp, flags = 0;
 707     bool overflow_norm;
 708
 709     frac = p.frac;
 710     exp = p.exp;
 711
 712     switch (p.cls) {
 713     case float_class_normal:
 714         switch (s->float_rounding_mode) {
 715         case float_round_nearest_even:
 716             overflow_norm = false;
 717             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 718             break;
 719         case float_round_ties_away:
 720             overflow_norm = false;
 721             inc = frac_lsbm1;
 722             break;
 723         case float_round_to_zero:
 724             overflow_norm = true;
 725             inc = 0;
 726             break;
 727         case float_round_up:
 728             inc = p.sign ? 0 : round_mask;
 729             overflow_norm = p.sign;
 730             break;
 731         case float_round_down:
 732             inc = p.sign ? round_mask : 0;
 733             overflow_norm = !p.sign;
 734             break;
 735         case float_round_to_odd:
 736             overflow_norm = true;
 737             inc = frac & frac_lsb ? 0 : round_mask;
 738             break;
 739         default:
 740             g_assert_not_reached();
 741         }
 742
 743         exp += parm->exp_bias;
 744         if (likely(exp > 0)) {
 745             if (frac & round_mask) {
 746                 flags |= float_flag_inexact;
 747                 frac += inc;
 748                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 749                     frac >>= 1;
 750                     exp++;
 751                 }
 752             }
 753             frac >>= frac_shift;
 754
 755             if (parm->arm_althp) {
 756                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
 757                 if (unlikely(exp > exp_max)) {
 758                     /* Overflow.  Return the maximum normal.  */
 759                     flags = float_flag_invalid;
 760                     exp = exp_max;
 761                     frac = -1;
 762                 }
 763             } else if (unlikely(exp >= exp_max)) {
 764                 flags |= float_flag_overflow | float_flag_inexact;
 765                 if (overflow_norm) {
 766                     exp = exp_max - 1;
 767                     frac = -1;
 768                 } else {
 769                     p.cls = float_class_inf;
 770                     goto do_inf;
 771                 }
 772             }
 773         } else if (s->flush_to_zero) {
 774             flags |= float_flag_output_denormal;
 775             p.cls = float_class_zero;
 776             goto do_zero;
 777         } else {
 778             bool is_tiny = (s->float_detect_tininess
 779                             == float_tininess_before_rounding)
 780                         || (exp < 0)
 781                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 782
 783             shift64RightJamming(frac, 1 - exp, &frac);
 784             if (frac & round_mask) {
 785                 /* Need to recompute round-to-even.  */
 786                 switch (s->float_rounding_mode) {
 787                 case float_round_nearest_even:
 788                     inc = ((frac & roundeven_mask) != frac_lsbm1
 789                            ? frac_lsbm1 : 0);
 790                     break;
 791                 case float_round_to_odd:
 792                     inc = frac & frac_lsb ? 0 : round_mask;
 793                     break;
 794                 }
 795                 flags |= float_flag_inexact;
 796                 frac += inc;
 797             }
 798
 799             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 800             frac >>= frac_shift;
 801
 802             if (is_tiny && (flags & float_flag_inexact)) {
 803                 flags |= float_flag_underflow;
 804             }
 805             if (exp == 0 && frac == 0) {
 806                 p.cls = float_class_zero;
 807             }
 808         }
 809         break;
 810
 811     case float_class_zero:
 812     do_zero:
 813         exp = 0;
 814         frac = 0;
 815         break;
 816
 817     case float_class_inf:
 818     do_inf:
 819         assert(!parm->arm_althp);
 820         exp = exp_max;
 821         frac = 0;
 822         break;
 823
 824     case float_class_qnan:
 825     case float_class_snan:
 826         assert(!parm->arm_althp);
 827         exp = exp_max;
 828         frac >>= parm->frac_shift;
 829         break;
 830
 831     default:
 832         g_assert_not_reached();
 833     }
 834
 835     float_raise(flags, s);
 836     p.exp = exp;
 837     p.frac = frac;
 838     return p;
 839 }
 840
 841 /* Explicit FloatFmt version */
 842 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
 843                                             const FloatFmt *params)
 844 {
 845     return sf_canonicalize(float16_unpack_raw(f), params, s);
 846 }
 847
 848 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 849 {
 850     return float16a_unpack_canonical(f, s, &float16_params);
 851 }
 852
 853 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
 854                                              const FloatFmt *params)
 855 {
 856     return float16_pack_raw(round_canonical(p, s, params));
 857 }
 858
 859 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 860 {
 861     return float16a_round_pack_canonical(p, s, &float16_params);
 862 }
 863
 864 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 865 {
 866     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
 867 }
 868
 869 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 870 {
 871     return float32_pack_raw(round_canonical(p, s, &float32_params));
 872 }
 873
 874 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 875 {
 876     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
 877 }
 878
 879 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 880 {
 881     return float64_pack_raw(round_canonical(p, s, &float64_params));
 882 }
 883
 884 static FloatParts return_nan(FloatParts a, float_status *s)
 885 {
 886     switch (a.cls) {
 887     case float_class_snan:
 888         s->float_exception_flags |= float_flag_invalid;
 889         a = parts_silence_nan(a, s);
 890         /* fall through */
 891     case float_class_qnan:
 892         if (s->default_nan_mode) {
 893             return parts_default_nan(s);
 894         }
 895         break;
 896
 897     default:
 898         g_assert_not_reached();
 899     }
 900     return a;
 901 }
 902
 903 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 904 {
 905     if (is_snan(a.cls) || is_snan(b.cls)) {
 906         s->float_exception_flags |= float_flag_invalid;
 907     }
 908
 909     if (s->default_nan_mode) {
 910         return parts_default_nan(s);
 911     } else {
 912         if (pickNaN(a.cls, b.cls,
 913                     a.frac > b.frac ||
 914                     (a.frac == b.frac && a.sign < b.sign))) {
 915             a = b;
 916         }
 917         if (is_snan(a.cls)) {
 918             return parts_silence_nan(a, s);
 919         }
 920     }
 921     return a;
 922 }
 923
 924 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 925                                   bool inf_zero, float_status *s)
 926 {
 927     int which;
 928
 929     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 930         s->float_exception_flags |= float_flag_invalid;
 931     }
 932
 933     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
 934
 935     if (s->default_nan_mode) {
 936         /* Note that this check is after pickNaNMulAdd so that function
 937          * has an opportunity to set the Invalid flag.
 938          */
 939         which = 3;
 940     }
 941
 942     switch (which) {
 943     case 0:
 944         break;
 945     case 1:
 946         a = b;
 947         break;
 948     case 2:
 949         a = c;
 950         break;
 951     case 3:
 952         return parts_default_nan(s);
 953     default:
 954         g_assert_not_reached();
 955     }
 956
 957     if (is_snan(a.cls)) {
 958         return parts_silence_nan(a, s);
 959     }
 960     return a;
 961 }
 962
/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b', both supplied in decomposed (FloatParts) form.
 * The operation is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * When `subtract' is true the sign of `b' is inverted before the two
 * signs are compared, so a - b is evaluated as a + (-b).  Operands
 * whose effective signs differ take the magnitude-subtraction path;
 * operands whose effective signs agree take the magnitude-addition
 * path.
 *
 * No rounding is performed here; the callers in this file hand the
 * result to one of the *_round_pack_canonical() helpers.  NaN operands
 * are delegated to pick_nan(), and subtracting infinities of like
 * effective magnitude sign raises float_flag_invalid and returns the
 * default NaN.  Any class combination not handled below trips
 * g_assert_not_reached().
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    /* Effective sign of b once the requested operation is folded in. */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction of magnitudes */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                /* a is at least as large: align b to a and subtract. */
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                /*
                 * b is larger: align a to b, subtract the other way
                 * round, and flip the sign of the result.
                 */
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: the zero is negative only when
                 * rounding down.
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the leading set bit sits one below
                 * the top bit of the fraction word.
                 */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is an invalid operation. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            /* Zeros of opposite effective sign: sign depends on the
             * rounding mode.
             */
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective sign, which on this path
             * is the opposite of a's sign.
             */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition of magnitudes */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                /* Carry out of the fraction: shift back and bump the
                 * exponent.
                 */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            /* a.sign is left untouched: both effective signs agree. */
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            /* Return b carrying its effective (post-`subtract') sign. */
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}
1051
1052 /*
1053  * Returns the result of adding or subtracting the floating-point
1054  * values `a' and `b'. The operation is performed according to the
1055  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1056  */
1057
1058 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1059 {
1060     FloatParts pa = float16_unpack_canonical(a, status);
1061     FloatParts pb = float16_unpack_canonical(b, status);
1062     FloatParts pr = addsub_floats(pa, pb, false, status);
1063
1064     return float16_round_pack_canonical(pr, status);
1065 }
1066
1067 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1068 {
1069     FloatParts pa = float16_unpack_canonical(a, status);
1070     FloatParts pb = float16_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, true, status);
1072
1073     return float16_round_pack_canonical(pr, status);
1074 }
1075
1076 static float32 QEMU_SOFTFLOAT_ATTR
1077 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1078 {
1079     FloatParts pa = float32_unpack_canonical(a, status);
1080     FloatParts pb = float32_unpack_canonical(b, status);
1081     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1082
1083     return float32_round_pack_canonical(pr, status);
1084 }
1085
1086 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1087 {
1088     return soft_f32_addsub(a, b, false, status);
1089 }
1090
1091 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1092 {
1093     return soft_f32_addsub(a, b, true, status);
1094 }
1095
1096 static float64 QEMU_SOFTFLOAT_ATTR
1097 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1098 {
1099     FloatParts pa = float64_unpack_canonical(a, status);
1100     FloatParts pb = float64_unpack_canonical(b, status);
1101     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1102
1103     return float64_round_pack_canonical(pr, status);
1104 }
1105
1106 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1107 {
1108     return soft_f64_addsub(a, b, false, status);
1109 }
1110
1111 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1112 {
1113     return soft_f64_addsub(a, b, true, status);
1114 }
1115
/* Host single-precision addition, used as the hardfloat operation. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1120
/* Host single-precision subtraction, used as the hardfloat operation. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1125
/* Host double-precision addition, used as the hardfloat operation. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1130
/* Host double-precision subtraction, used as the hardfloat operation. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1135
1136 static bool f32_addsub_post(union_float32 a, union_float32 b)
1137 {
1138     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1139         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1140     }
1141     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1142 }
1143
1144 static bool f64_addsub_post(union_float64 a, union_float64 b)
1145 {
1146     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1147         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1148     } else {
1149         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1150     }
1151 }
1152
1153 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1154                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1155 {
1156     return float32_gen2(a, b, s, hard, soft,
1157                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1158 }
1159
1160 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1161                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1162 {
1163     return float64_gen2(a, b, s, hard, soft,
1164                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1165 }
1166
1167 float32 QEMU_FLATTEN
1168 float32_add(float32 a, float32 b, float_status *s)
1169 {
1170     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1171 }
1172
1173 float32 QEMU_FLATTEN
1174 float32_sub(float32 a, float32 b, float_status *s)
1175 {
1176     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1177 }
1178
1179 float64 QEMU_FLATTEN
1180 float64_add(float64 a, float64 b, float_status *s)
1181 {
1182     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1183 }
1184
1185 float64 QEMU_FLATTEN
1186 float64_sub(float64 a, float64 b, float_status *s)
1187 {
1188     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1189 }
1190
1191 /*
1192  * Returns the result of multiplying the floating-point values `a' and
1193  * `b'. The operation is performed according to the IEC/IEEE Standard
1194  * for Binary Floating-Point Arithmetic.
1195  */
1196
1197 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1198 {
1199     bool sign = a.sign ^ b.sign;
1200
1201     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1202         uint64_t hi, lo;
1203         int exp = a.exp + b.exp;
1204
1205         mul64To128(a.frac, b.frac, &hi, &lo);
1206         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1207         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1208             shift64RightJamming(lo, 1, &lo);
1209             exp += 1;
1210         }
1211
1212         /* Re-use a */
1213         a.exp = exp;
1214         a.sign = sign;
1215         a.frac = lo;
1216         return a;
1217     }
1218     /* handle all the NaN cases */
1219     if (is_nan(a.cls) || is_nan(b.cls)) {
1220         return pick_nan(a, b, s);
1221     }
1222     /* Inf * Zero == NaN */
1223     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1224         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1225         s->float_exception_flags |= float_flag_invalid;
1226         return parts_default_nan(s);
1227     }
1228     /* Multiply by 0 or Inf */
1229     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1230         a.sign = sign;
1231         return a;
1232     }
1233     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1234         b.sign = sign;
1235         return b;
1236     }
1237     g_assert_not_reached();
1238 }
1239
1240 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1241 {
1242     FloatParts pa = float16_unpack_canonical(a, status);
1243     FloatParts pb = float16_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245
1246     return float16_round_pack_canonical(pr, status);
1247 }
1248
1249 static float32 QEMU_SOFTFLOAT_ATTR
1250 soft_f32_mul(float32 a, float32 b, float_status *status)
1251 {
1252     FloatParts pa = float32_unpack_canonical(a, status);
1253     FloatParts pb = float32_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255
1256     return float32_round_pack_canonical(pr, status);
1257 }
1258
1259 static float64 QEMU_SOFTFLOAT_ATTR
1260 soft_f64_mul(float64 a, float64 b, float_status *status)
1261 {
1262     FloatParts pa = float64_unpack_canonical(a, status);
1263     FloatParts pb = float64_unpack_canonical(b, status);
1264     FloatParts pr = mul_floats(pa, pb, status);
1265
1266     return float64_round_pack_canonical(pr, status);
1267 }
1268
/* Host single-precision multiplication, used as the hardfloat operation. */
static float hard_f32_mul(float a, float b)
{
    float product = a * b;

    return product;
}
1273
/* Host double-precision multiplication, used as the hardfloat operation. */
static double hard_f64_mul(double a, double b)
{
    double product = a * b;

    return product;
}
1278
1279 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1280 {
1281     return float32_is_zero(a.s) || float32_is_zero(b.s);
1282 }
1283
1284 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1285 {
1286     return float64_is_zero(a.s) || float64_is_zero(b.s);
1287 }
1288
1289 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1290 {
1291     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1292
1293     return float32_set_sign(float32_zero, signbit);
1294 }
1295
1296 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1297 {
1298     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1299
1300     return float64_set_sign(float64_zero, signbit);
1301 }
1302
1303 float32 QEMU_FLATTEN
1304 float32_mul(float32 a, float32 b, float_status *s)
1305 {
1306     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1307                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1308 }
1309
1310 float64 QEMU_FLATTEN
1311 float64_mul(float64 a, float64 b, float_status *s)
1312 {
1313     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1314                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1315 }
1316
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * In addition, float_muladd_halve_result decrements the exponent of a
 * non-zero finite result by one.  The value returned is in decomposed
 * form and has not been rounded.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when the classes of a and b are exactly {Inf, Zero}. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* No NaN operand: Inf * 0 is simply an invalid operation. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b from the operand classes. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + (-Inf) is invalid. */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            /* Otherwise the infinite addend determines the result. */
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        /* Infinite product with a finite addend. */
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0: zeros of differing sign give a zero whose sign
             * depends on the rounding mode.
             */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            /* Result is the non-zero addend; halve it via the exponent. */
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* The addend has the larger (or equal) exponent: bring
                 * the product down to a 64-bit fraction aligned with c
                 * and add in 64 bits.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* A carry out of the sum needs one renormalizing shift. */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Equal exponents and product >= addend. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Addend is subtracted from; the result takes the
                     * opposite sign and the addend's exponent.
                     */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                /* Product has the larger exponent: align c and subtract. */
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: the zero's sign depends on the
                 * rounding mode, then the result negation is applied.
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count leading zeros across the 128-bit difference. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits left behind in lo into a sticky bit. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1519
1520 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1521                                                 int flags, float_status *status)
1522 {
1523     FloatParts pa = float16_unpack_canonical(a, status);
1524     FloatParts pb = float16_unpack_canonical(b, status);
1525     FloatParts pc = float16_unpack_canonical(c, status);
1526     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1527
1528     return float16_round_pack_canonical(pr, status);
1529 }
1530
1531 static float32 QEMU_SOFTFLOAT_ATTR
1532 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1533                 float_status *status)
1534 {
1535     FloatParts pa = float32_unpack_canonical(a, status);
1536     FloatParts pb = float32_unpack_canonical(b, status);
1537     FloatParts pc = float32_unpack_canonical(c, status);
1538     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1539
1540     return float32_round_pack_canonical(pr, status);
1541 }
1542
1543 static float64 QEMU_SOFTFLOAT_ATTR
1544 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1545                 float_status *status)
1546 {
1547     FloatParts pa = float64_unpack_canonical(a, status);
1548     FloatParts pb = float64_unpack_canonical(b, status);
1549     FloatParts pc = float64_unpack_canonical(c, status);
1550     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1551
1552     return float64_round_pack_canonical(pr, status);
1553 }
1554
/*
 * When true, float32_muladd() and float64_muladd() skip the host
 * fmaf()/fma() path and always use the softfloat implementation.
 * NOTE(review): the code that sets this flag is outside the portion of
 * the file reviewed here.
 */
static bool force_soft_fma;
1556
/*
 * float32 fused multiply-add, dispatching between the host fmaf() and
 * the softfloat implementation soft_f32_muladd().
 *
 * The soft path is taken when can_use_fpu() rejects the current status,
 * when float_muladd_halve_result is requested, when the operands fail
 * f32_is_zon3() after input flushing, when force_soft_fma is set, or
 * when the host result has magnitude <= FLT_MIN.  An infinite host
 * result sets float_flag_overflow.  float_muladd_negate_product and
 * float_muladd_negate_c are applied to the host operands before the
 * operation; float_muladd_negate_result is applied to the host result
 * with float32_chs().
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The host path has no handling for halving the result. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    /* May rewrite the operands in place; the soft path below then sees
     * the rewritten values.
     */
    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the signed-zero product by hand, then add on the host. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        /* Saved so the soft path, which applies `flags' itself, gets
         * the operands without the negations done below.
         */
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            s->float_exception_flags |= float_flag_overflow;
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result at or below the smallest normal float: let the
             * soft implementation compute it.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1627
1628 float64 QEMU_FLATTEN
1629 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1630 {
1631     union_float64 ua, ub, uc, ur;
1632
1633     ua.s = xa;
1634     ub.s = xb;
1635     uc.s = xc;
1636
1637     if (unlikely(!can_use_fpu(s))) {
1638         goto soft;
1639     }
1640     if (unlikely(flags & float_muladd_halve_result)) {
1641         goto soft;
1642     }
1643
1644     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1645     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1646         goto soft;
1647     }
1648
1649     if (unlikely(force_soft_fma)) {
1650         goto soft;
1651     }
1652
1653     /*
1654      * When (a || b) == 0, there's no need to check for under/over flow,
1655      * since we know the addend is (normal || 0) and the product is 0.
1656      */
1657     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1658         union_float64 up;
1659         bool prod_sign;
1660
1661         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1662         prod_sign ^= !!(flags & float_muladd_negate_product);
1663         up.s = float64_set_sign(float64_zero, prod_sign);
1664
1665         if (flags & float_muladd_negate_c) {
1666             uc.h = -uc.h;
1667         }
1668         ur.h = up.h + uc.h;
1669     } else {
1670         union_float64 ua_orig = ua;
1671         union_float64 uc_orig = uc;
1672
1673         if (flags & float_muladd_negate_product) {
1674             ua.h = -ua.h;
1675         }
1676         if (flags & float_muladd_negate_c) {
1677             uc.h = -uc.h;
1678         }
1679
1680         ur.h = fma(ua.h, ub.h, uc.h);
1681
1682         if (unlikely(f64_is_inf(ur))) {
1683             s->float_exception_flags |= float_flag_overflow;
1684         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1685             ua = ua_orig;
1686             uc = uc_orig;
1687             goto soft;
1688         }
1689     }
1690     if (flags & float_muladd_negate_result) {
1691         return float64_chs(ur.s);
1692     }
1693     return ur.s;
1694
1695  soft:
1696     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1697 }
1698
1699 /*
1700  * Returns the result of dividing the floating-point value `a' by the
1701  * corresponding value `b'. The operation is performed according to
1702  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1703  */
1704
1705 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1706 {
1707     bool sign = a.sign ^ b.sign;
1708
1709     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1710         uint64_t n0, n1, q, r;
1711         int exp = a.exp - b.exp;
1712
1713         /*
1714          * We want a 2*N / N-bit division to produce exactly an N-bit
1715          * result, so that we do not lose any precision and so that we
1716          * do not have to renormalize afterward.  If A.frac < B.frac,
1717          * then division would produce an (N-1)-bit result; shift A left
1718          * by one to produce an N-bit result, and decrement the
1719          * exponent to match.
1720          *
1721          * The udiv_qrnnd algorithm that we're using requires normalization,
1722          * i.e. the msb of the denominator must be set.  Since we know that
1723          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1724          * by one (more), and the remainder must be shifted right by one.
1725          */
1726         if (a.frac < b.frac) {
1727             exp -= 1;
1728             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1729         } else {
1730             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1731         }
1732         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1733
1734         /*
1735          * Set lsb if there is a remainder, to set inexact.
1736          * As mentioned above, to find the actual value of the remainder we
1737          * would need to shift right, but (1) we are only concerned about
1738          * non-zero-ness, and (2) the remainder will always be even because
1739          * both inputs to the division primitive are even.
1740          */
1741         a.frac = q | (r != 0);
1742         a.sign = sign;
1743         a.exp = exp;
1744         return a;
1745     }
1746     /* handle all the NaN cases */
1747     if (is_nan(a.cls) || is_nan(b.cls)) {
1748         return pick_nan(a, b, s);
1749     }
1750     /* 0/0 or Inf/Inf */
1751     if (a.cls == b.cls
1752         &&
1753         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1754         s->float_exception_flags |= float_flag_invalid;
1755         return parts_default_nan(s);
1756     }
1757     /* Inf / x or 0 / x */
1758     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1759         a.sign = sign;
1760         return a;
1761     }
1762     /* Div 0 => Inf */
1763     if (b.cls == float_class_zero) {
1764         s->float_exception_flags |= float_flag_divbyzero;
1765         a.cls = float_class_inf;
1766         a.sign = sign;
1767         return a;
1768     }
1769     /* Div by Inf */
1770     if (b.cls == float_class_inf) {
1771         a.cls = float_class_zero;
1772         a.sign = sign;
1773         return a;
1774     }
1775     g_assert_not_reached();
1776 }
1777
1778 float16 float16_div(float16 a, float16 b, float_status *status)
1779 {
1780     FloatParts pa = float16_unpack_canonical(a, status);
1781     FloatParts pb = float16_unpack_canonical(b, status);
1782     FloatParts pr = div_floats(pa, pb, status);
1783
1784     return float16_round_pack_canonical(pr, status);
1785 }
1786
1787 static float32 QEMU_SOFTFLOAT_ATTR
1788 soft_f32_div(float32 a, float32 b, float_status *status)
1789 {
1790     FloatParts pa = float32_unpack_canonical(a, status);
1791     FloatParts pb = float32_unpack_canonical(b, status);
1792     FloatParts pr = div_floats(pa, pb, status);
1793
1794     return float32_round_pack_canonical(pr, status);
1795 }
1796
1797 static float64 QEMU_SOFTFLOAT_ATTR
1798 soft_f64_div(float64 a, float64 b, float_status *status)
1799 {
1800     FloatParts pa = float64_unpack_canonical(a, status);
1801     FloatParts pb = float64_unpack_canonical(b, status);
1802     FloatParts pr = div_floats(pa, pb, status);
1803
1804     return float64_round_pack_canonical(pr, status);
1805 }
1806
/* Host-FPU single-precision division: returns a / b. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1811
/* Host-FPU double-precision division: returns a / b. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1816
1817 static bool f32_div_pre(union_float32 a, union_float32 b)
1818 {
1819     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1821                fpclassify(b.h) == FP_NORMAL;
1822     }
1823     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1824 }
1825
1826 static bool f64_div_pre(union_float64 a, union_float64 b)
1827 {
1828     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1829         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1830                fpclassify(b.h) == FP_NORMAL;
1831     }
1832     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1833 }
1834
1835 static bool f32_div_post(union_float32 a, union_float32 b)
1836 {
1837     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1838         return fpclassify(a.h) != FP_ZERO;
1839     }
1840     return !float32_is_zero(a.s);
1841 }
1842
1843 static bool f64_div_post(union_float64 a, union_float64 b)
1844 {
1845     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1846         return fpclassify(a.h) != FP_ZERO;
1847     }
1848     return !float64_is_zero(a.s);
1849 }
1850
1851 float32 QEMU_FLATTEN
1852 float32_div(float32 a, float32 b, float_status *s)
1853 {
1854     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1855                         f32_div_pre, f32_div_post, NULL, NULL);
1856 }
1857
1858 float64 QEMU_FLATTEN
1859 float64_div(float64 a, float64 b, float_status *s)
1860 {
1861     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1862                         f64_div_pre, f64_div_post, NULL, NULL);
1863 }
1864
1865 /*
1866  * Float to Float conversions
1867  *
1868  * Returns the result of converting one float format to another. The
1869  * conversion is performed according to the IEC/IEEE Standard for
1870  * Binary Floating-Point Arithmetic.
1871  *
1872  * The float_to_float helper only needs to take care of raising
1873  * invalid exceptions and handling the conversion on NaNs.
1874  */
1875
1876 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1877                                  float_status *s)
1878 {
1879     if (dstf->arm_althp) {
1880         switch (a.cls) {
1881         case float_class_qnan:
1882         case float_class_snan:
1883             /* There is no NaN in the destination format.  Raise Invalid
1884              * and return a zero with the sign of the input NaN.
1885              */
1886             s->float_exception_flags |= float_flag_invalid;
1887             a.cls = float_class_zero;
1888             a.frac = 0;
1889             a.exp = 0;
1890             break;
1891
1892         case float_class_inf:
1893             /* There is no Inf in the destination format.  Raise Invalid
1894              * and return the maximum normal with the correct sign.
1895              */
1896             s->float_exception_flags |= float_flag_invalid;
1897             a.cls = float_class_normal;
1898             a.exp = dstf->exp_max;
1899             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1900             break;
1901
1902         default:
1903             break;
1904         }
1905     } else if (is_nan(a.cls)) {
1906         if (is_snan(a.cls)) {
1907             s->float_exception_flags |= float_flag_invalid;
1908             a = parts_silence_nan(a, s);
1909         }
1910         if (s->default_nan_mode) {
1911             return parts_default_nan(s);
1912         }
1913     }
1914     return a;
1915 }
1916
1917 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1918 {
1919     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1920     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1921     FloatParts pr = float_to_float(p, &float32_params, s);
1922     return float32_round_pack_canonical(pr, s);
1923 }
1924
1925 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1926 {
1927     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1928     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1929     FloatParts pr = float_to_float(p, &float64_params, s);
1930     return float64_round_pack_canonical(pr, s);
1931 }
1932
1933 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1934 {
1935     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1936     FloatParts p = float32_unpack_canonical(a, s);
1937     FloatParts pr = float_to_float(p, fmt16, s);
1938     return float16a_round_pack_canonical(pr, s, fmt16);
1939 }
1940
1941 float64 float32_to_float64(float32 a, float_status *s)
1942 {
1943     FloatParts p = float32_unpack_canonical(a, s);
1944     FloatParts pr = float_to_float(p, &float64_params, s);
1945     return float64_round_pack_canonical(pr, s);
1946 }
1947
1948 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1949 {
1950     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1951     FloatParts p = float64_unpack_canonical(a, s);
1952     FloatParts pr = float_to_float(p, fmt16, s);
1953     return float16a_round_pack_canonical(pr, s, fmt16);
1954 }
1955
1956 float32 float64_to_float32(float64 a, float_status *s)
1957 {
1958     FloatParts p = float64_unpack_canonical(a, s);
1959     FloatParts pr = float_to_float(p, &float32_params, s);
1960     return float32_round_pack_canonical(pr, s);
1961 }
1962
1963 /*
1964  * Rounds the floating-point value `a' to an integer, and returns the
1965  * result as a floating-point value. The operation is performed
1966  * according to the IEC/IEEE Standard for Binary Floating-Point
1967  * Arithmetic.
1968  */
1969
1970 static FloatParts round_to_int(FloatParts a, int rmode,
1971                                int scale, float_status *s)
1972 {
1973     switch (a.cls) {
1974     case float_class_qnan:
1975     case float_class_snan:
1976         return return_nan(a, s);
1977
1978     case float_class_zero:
1979     case float_class_inf:
1980         /* already "integral" */
1981         break;
1982
1983     case float_class_normal:
1984         scale = MIN(MAX(scale, -0x10000), 0x10000);
1985         a.exp += scale;
1986
1987         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1988             /* already integral */
1989             break;
1990         }
1991         if (a.exp < 0) {
1992             bool one;
1993             /* all fractional */
1994             s->float_exception_flags |= float_flag_inexact;
1995             switch (rmode) {
1996             case float_round_nearest_even:
1997                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1998                 break;
1999             case float_round_ties_away:
2000                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2001                 break;
2002             case float_round_to_zero:
2003                 one = false;
2004                 break;
2005             case float_round_up:
2006                 one = !a.sign;
2007                 break;
2008             case float_round_down:
2009                 one = a.sign;
2010                 break;
2011             case float_round_to_odd:
2012                 one = true;
2013                 break;
2014             default:
2015                 g_assert_not_reached();
2016             }
2017
2018             if (one) {
2019                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2020                 a.exp = 0;
2021             } else {
2022                 a.cls = float_class_zero;
2023             }
2024         } else {
2025             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2026             uint64_t frac_lsbm1 = frac_lsb >> 1;
2027             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2028             uint64_t rnd_mask = rnd_even_mask >> 1;
2029             uint64_t inc;
2030
2031             switch (rmode) {
2032             case float_round_nearest_even:
2033                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2034                 break;
2035             case float_round_ties_away:
2036                 inc = frac_lsbm1;
2037                 break;
2038             case float_round_to_zero:
2039                 inc = 0;
2040                 break;
2041             case float_round_up:
2042                 inc = a.sign ? 0 : rnd_mask;
2043                 break;
2044             case float_round_down:
2045                 inc = a.sign ? rnd_mask : 0;
2046                 break;
2047             case float_round_to_odd:
2048                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2049                 break;
2050             default:
2051                 g_assert_not_reached();
2052             }
2053
2054             if (a.frac & rnd_mask) {
2055                 s->float_exception_flags |= float_flag_inexact;
2056                 a.frac += inc;
2057                 a.frac &= ~rnd_mask;
2058                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2059                     a.frac >>= 1;
2060                     a.exp++;
2061                 }
2062             }
2063         }
2064         break;
2065     default:
2066         g_assert_not_reached();
2067     }
2068     return a;
2069 }
2070
2071 float16 float16_round_to_int(float16 a, float_status *s)
2072 {
2073     FloatParts pa = float16_unpack_canonical(a, s);
2074     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2075     return float16_round_pack_canonical(pr, s);
2076 }
2077
2078 float32 float32_round_to_int(float32 a, float_status *s)
2079 {
2080     FloatParts pa = float32_unpack_canonical(a, s);
2081     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2082     return float32_round_pack_canonical(pr, s);
2083 }
2084
2085 float64 float64_round_to_int(float64 a, float_status *s)
2086 {
2087     FloatParts pa = float64_unpack_canonical(a, s);
2088     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2089     return float64_round_pack_canonical(pr, s);
2090 }
2091
2092 /*
2093  * Returns the result of converting the floating-point value `a' to
2094  * the two's complement integer format. The conversion is performed
2095  * according to the IEC/IEEE Standard for Binary Floating-Point
2096  * Arithmetic---which means in particular that the conversion is
2097  * rounded according to the current rounding mode. If `a' is a NaN,
2098  * the largest positive integer is returned. Otherwise, if the
2099  * conversion overflows, the largest integer with the same sign as `a'
2100  * is returned.
2101 */
2102
2103 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2104                                      int64_t min, int64_t max,
2105                                      float_status *s)
2106 {
2107     uint64_t r;
2108     int orig_flags = get_float_exception_flags(s);
2109     FloatParts p = round_to_int(in, rmode, scale, s);
2110
2111     switch (p.cls) {
2112     case float_class_snan:
2113     case float_class_qnan:
2114         s->float_exception_flags = orig_flags | float_flag_invalid;
2115         return max;
2116     case float_class_inf:
2117         s->float_exception_flags = orig_flags | float_flag_invalid;
2118         return p.sign ? min : max;
2119     case float_class_zero:
2120         return 0;
2121     case float_class_normal:
2122         if (p.exp < DECOMPOSED_BINARY_POINT) {
2123             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2124         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2125             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2126         } else {
2127             r = UINT64_MAX;
2128         }
2129         if (p.sign) {
2130             if (r <= -(uint64_t) min) {
2131                 return -r;
2132             } else {
2133                 s->float_exception_flags = orig_flags | float_flag_invalid;
2134                 return min;
2135             }
2136         } else {
2137             if (r <= max) {
2138                 return r;
2139             } else {
2140                 s->float_exception_flags = orig_flags | float_flag_invalid;
2141                 return max;
2142             }
2143         }
2144     default:
2145         g_assert_not_reached();
2146     }
2147 }
2148
2149 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2150                                 float_status *s)
2151 {
2152     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2153                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2154 }
2155
2156 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2157                                 float_status *s)
2158 {
2159     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2160                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2161 }
2162
2163 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2164                                 float_status *s)
2165 {
2166     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2167                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2168 }
2169
2170 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2171                                 float_status *s)
2172 {
2173     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2174                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2175 }
2176
2177 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2178                                 float_status *s)
2179 {
2180     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2181                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2182 }
2183
2184 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2185                                 float_status *s)
2186 {
2187     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2188                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2189 }
2190
2191 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2192                                 float_status *s)
2193 {
2194     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2195                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2196 }
2197
2198 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2199                                 float_status *s)
2200 {
2201     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2202                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2203 }
2204
2205 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2206                                 float_status *s)
2207 {
2208     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2209                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2210 }
2211
2212 int16_t float16_to_int16(float16 a, float_status *s)
2213 {
2214     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2215 }
2216
2217 int32_t float16_to_int32(float16 a, float_status *s)
2218 {
2219     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2220 }
2221
2222 int64_t float16_to_int64(float16 a, float_status *s)
2223 {
2224     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2225 }
2226
2227 int16_t float32_to_int16(float32 a, float_status *s)
2228 {
2229     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2230 }
2231
2232 int32_t float32_to_int32(float32 a, float_status *s)
2233 {
2234     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2235 }
2236
2237 int64_t float32_to_int64(float32 a, float_status *s)
2238 {
2239     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2240 }
2241
2242 int16_t float64_to_int16(float64 a, float_status *s)
2243 {
2244     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2245 }
2246
2247 int32_t float64_to_int32(float64 a, float_status *s)
2248 {
2249     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2250 }
2251
2252 int64_t float64_to_int64(float64 a, float_status *s)
2253 {
2254     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2255 }
2256
2257 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2258 {
2259     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2260 }
2261
2262 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2263 {
2264     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2265 }
2266
2267 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2268 {
2269     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2270 }
2271
2272 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2273 {
2274     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2275 }
2276
2277 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2278 {
2279     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2280 }
2281
2282 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2283 {
2284     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2285 }
2286
2287 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2288 {
2289     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2290 }
2291
2292 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2293 {
2294     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2295 }
2296
2297 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2298 {
2299     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2300 }
2301
2302 /*
2303  *  Returns the result of converting the floating-point value `a' to
2304  *  the unsigned integer format. The conversion is performed according
2305  *  to the IEC/IEEE Standard for Binary Floating-Point
2306  *  Arithmetic---which means in particular that the conversion is
2307  *  rounded according to the current rounding mode. If `a' is a NaN,
2308  *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  negative values that do not round to zero raise the invalid
 *  exception flag.
2313  */
2314
2315 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2316                                        uint64_t max, float_status *s)
2317 {
2318     int orig_flags = get_float_exception_flags(s);
2319     FloatParts p = round_to_int(in, rmode, scale, s);
2320     uint64_t r;
2321
2322     switch (p.cls) {
2323     case float_class_snan:
2324     case float_class_qnan:
2325         s->float_exception_flags = orig_flags | float_flag_invalid;
2326         return max;
2327     case float_class_inf:
2328         s->float_exception_flags = orig_flags | float_flag_invalid;
2329         return p.sign ? 0 : max;
2330     case float_class_zero:
2331         return 0;
2332     case float_class_normal:
2333         if (p.sign) {
2334             s->float_exception_flags = orig_flags | float_flag_invalid;
2335             return 0;
2336         }
2337
2338         if (p.exp < DECOMPOSED_BINARY_POINT) {
2339             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2340         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2341             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2342         } else {
2343             s->float_exception_flags = orig_flags | float_flag_invalid;
2344             return max;
2345         }
2346
2347         /* For uint64 this will never trip, but if p.exp is too large
2348          * to shift a decomposed fraction we shall have exited via the
2349          * 3rd leg above.
2350          */
2351         if (r > max) {
2352             s->float_exception_flags = orig_flags | float_flag_invalid;
2353             return max;
2354         }
2355         return r;
2356     default:
2357         g_assert_not_reached();
2358     }
2359 }
2360
2361 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2362                                   float_status *s)
2363 {
2364     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2365                                   rmode, scale, UINT16_MAX, s);
2366 }
2367
2368 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2369                                   float_status *s)
2370 {
2371     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2372                                   rmode, scale, UINT32_MAX, s);
2373 }
2374
2375 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2376                                   float_status *s)
2377 {
2378     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2379                                   rmode, scale, UINT64_MAX, s);
2380 }
2381
2382 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2383                                   float_status *s)
2384 {
2385     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2386                                   rmode, scale, UINT16_MAX, s);
2387 }
2388
2389 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2390                                   float_status *s)
2391 {
2392     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2393                                   rmode, scale, UINT32_MAX, s);
2394 }
2395
2396 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2397                                   float_status *s)
2398 {
2399     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2400                                   rmode, scale, UINT64_MAX, s);
2401 }
2402
2403 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2404                                   float_status *s)
2405 {
2406     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2407                                   rmode, scale, UINT16_MAX, s);
2408 }
2409
2410 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2411                                   float_status *s)
2412 {
2413     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2414                                   rmode, scale, UINT32_MAX, s);
2415 }
2416
2417 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2418                                   float_status *s)
2419 {
2420     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2421                                   rmode, scale, UINT64_MAX, s);
2422 }
2423
2424 uint16_t float16_to_uint16(float16 a, float_status *s)
2425 {
2426     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2427 }
2428
2429 uint32_t float16_to_uint32(float16 a, float_status *s)
2430 {
2431     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2432 }
2433
2434 uint64_t float16_to_uint64(float16 a, float_status *s)
2435 {
2436     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2437 }
2438
2439 uint16_t float32_to_uint16(float32 a, float_status *s)
2440 {
2441     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2442 }
2443
2444 uint32_t float32_to_uint32(float32 a, float_status *s)
2445 {
2446     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2447 }
2448
2449 uint64_t float32_to_uint64(float32 a, float_status *s)
2450 {
2451     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2452 }
2453
2454 uint16_t float64_to_uint16(float64 a, float_status *s)
2455 {
2456     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2457 }
2458
2459 uint32_t float64_to_uint32(float64 a, float_status *s)
2460 {
2461     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2462 }
2463
2464 uint64_t float64_to_uint64(float64 a, float_status *s)
2465 {
2466     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2467 }
2468
2469 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2470 {
2471     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2472 }
2473
2474 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2475 {
2476     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2477 }
2478
2479 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2480 {
2481     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2482 }
2483
2484 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2485 {
2486     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2487 }
2488
2489 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2490 {
2491     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2492 }
2493
2494 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2495 {
2496     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2497 }
2498
2499 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2500 {
2501     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2502 }
2503
2504 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2505 {
2506     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2507 }
2508
2509 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2510 {
2511     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2512 }
2513
2514 /*
2515  * Integer to float conversions
2516  *
2517  * Returns the result of converting the two's complement integer `a'
2518  * to the floating-point format. The conversion is performed according
2519  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2520  */
2521
2522 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2523 {
2524     FloatParts r = { .sign = false };
2525
2526     if (a == 0) {
2527         r.cls = float_class_zero;
2528     } else {
2529         uint64_t f = a;
2530         int shift;
2531
2532         r.cls = float_class_normal;
2533         if (a < 0) {
2534             f = -f;
2535             r.sign = true;
2536         }
2537         shift = clz64(f) - 1;
2538         scale = MIN(MAX(scale, -0x10000), 0x10000);
2539
2540         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2541         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2542     }
2543
2544     return r;
2545 }
2546
2547 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2548 {
2549     FloatParts pa = int_to_float(a, scale, status);
2550     return float16_round_pack_canonical(pa, status);
2551 }
2552
2553 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2554 {
2555     return int64_to_float16_scalbn(a, scale, status);
2556 }
2557
2558 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2559 {
2560     return int64_to_float16_scalbn(a, scale, status);
2561 }
2562
2563 float16 int64_to_float16(int64_t a, float_status *status)
2564 {
2565     return int64_to_float16_scalbn(a, 0, status);
2566 }
2567
2568 float16 int32_to_float16(int32_t a, float_status *status)
2569 {
2570     return int64_to_float16_scalbn(a, 0, status);
2571 }
2572
2573 float16 int16_to_float16(int16_t a, float_status *status)
2574 {
2575     return int64_to_float16_scalbn(a, 0, status);
2576 }
2577
2578 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2579 {
2580     FloatParts pa = int_to_float(a, scale, status);
2581     return float32_round_pack_canonical(pa, status);
2582 }
2583
2584 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2585 {
2586     return int64_to_float32_scalbn(a, scale, status);
2587 }
2588
2589 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2590 {
2591     return int64_to_float32_scalbn(a, scale, status);
2592 }
2593
2594 float32 int64_to_float32(int64_t a, float_status *status)
2595 {
2596     return int64_to_float32_scalbn(a, 0, status);
2597 }
2598
2599 float32 int32_to_float32(int32_t a, float_status *status)
2600 {
2601     return int64_to_float32_scalbn(a, 0, status);
2602 }
2603
2604 float32 int16_to_float32(int16_t a, float_status *status)
2605 {
2606     return int64_to_float32_scalbn(a, 0, status);
2607 }
2608
2609 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2610 {
2611     FloatParts pa = int_to_float(a, scale, status);
2612     return float64_round_pack_canonical(pa, status);
2613 }
2614
2615 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2616 {
2617     return int64_to_float64_scalbn(a, scale, status);
2618 }
2619
2620 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2621 {
2622     return int64_to_float64_scalbn(a, scale, status);
2623 }
2624
2625 float64 int64_to_float64(int64_t a, float_status *status)
2626 {
2627     return int64_to_float64_scalbn(a, 0, status);
2628 }
2629
2630 float64 int32_to_float64(int32_t a, float_status *status)
2631 {
2632     return int64_to_float64_scalbn(a, 0, status);
2633 }
2634
2635 float64 int16_to_float64(int16_t a, float_status *status)
2636 {
2637     return int64_to_float64_scalbn(a, 0, status);
2638 }
2639
2640
2641 /*
2642  * Unsigned Integer to float conversions
2643  *
2644  * Returns the result of converting the unsigned integer `a' to the
2645  * floating-point format. The conversion is performed according to the
2646  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2647  */
2648
2649 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2650 {
2651     FloatParts r = { .sign = false };
2652
2653     if (a == 0) {
2654         r.cls = float_class_zero;
2655     } else {
2656         scale = MIN(MAX(scale, -0x10000), 0x10000);
2657         r.cls = float_class_normal;
2658         if ((int64_t)a < 0) {
2659             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2660             shift64RightJamming(a, 1, &a);
2661             r.frac = a;
2662         } else {
2663             int shift = clz64(a) - 1;
2664             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2665             r.frac = a << shift;
2666         }
2667     }
2668
2669     return r;
2670 }
2671
2672 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2673 {
2674     FloatParts pa = uint_to_float(a, scale, status);
2675     return float16_round_pack_canonical(pa, status);
2676 }
2677
2678 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2679 {
2680     return uint64_to_float16_scalbn(a, scale, status);
2681 }
2682
2683 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2684 {
2685     return uint64_to_float16_scalbn(a, scale, status);
2686 }
2687
2688 float16 uint64_to_float16(uint64_t a, float_status *status)
2689 {
2690     return uint64_to_float16_scalbn(a, 0, status);
2691 }
2692
2693 float16 uint32_to_float16(uint32_t a, float_status *status)
2694 {
2695     return uint64_to_float16_scalbn(a, 0, status);
2696 }
2697
2698 float16 uint16_to_float16(uint16_t a, float_status *status)
2699 {
2700     return uint64_to_float16_scalbn(a, 0, status);
2701 }
2702
2703 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2704 {
2705     FloatParts pa = uint_to_float(a, scale, status);
2706     return float32_round_pack_canonical(pa, status);
2707 }
2708
2709 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2710 {
2711     return uint64_to_float32_scalbn(a, scale, status);
2712 }
2713
2714 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2715 {
2716     return uint64_to_float32_scalbn(a, scale, status);
2717 }
2718
2719 float32 uint64_to_float32(uint64_t a, float_status *status)
2720 {
2721     return uint64_to_float32_scalbn(a, 0, status);
2722 }
2723
2724 float32 uint32_to_float32(uint32_t a, float_status *status)
2725 {
2726     return uint64_to_float32_scalbn(a, 0, status);
2727 }
2728
2729 float32 uint16_to_float32(uint16_t a, float_status *status)
2730 {
2731     return uint64_to_float32_scalbn(a, 0, status);
2732 }
2733
2734 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2735 {
2736     FloatParts pa = uint_to_float(a, scale, status);
2737     return float64_round_pack_canonical(pa, status);
2738 }
2739
2740 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2741 {
2742     return uint64_to_float64_scalbn(a, scale, status);
2743 }
2744
2745 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2746 {
2747     return uint64_to_float64_scalbn(a, scale, status);
2748 }
2749
2750 float64 uint64_to_float64(uint64_t a, float_status *status)
2751 {
2752     return uint64_to_float64_scalbn(a, 0, status);
2753 }
2754
2755 float64 uint32_to_float64(uint32_t a, float_status *status)
2756 {
2757     return uint64_to_float64_scalbn(a, 0, status);
2758 }
2759
2760 float64 uint16_to_float64(uint16_t a, float_status *status)
2761 {
2762     return uint64_to_float64_scalbn(a, 0, status);
2763 }
2764
/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
 */
2781 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2782                                 bool ieee, bool ismag, float_status *s)
2783 {
2784     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2785         if (ieee) {
2786             /* Takes two floating-point values `a' and `b', one of
2787              * which is a NaN, and returns the appropriate NaN
2788              * result. If either `a' or `b' is a signaling NaN,
2789              * the invalid exception is raised.
2790              */
2791             if (is_snan(a.cls) || is_snan(b.cls)) {
2792                 return pick_nan(a, b, s);
2793             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2794                 return b;
2795             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2796                 return a;
2797             }
2798         }
2799         return pick_nan(a, b, s);
2800     } else {
2801         int a_exp, b_exp;
2802
2803         switch (a.cls) {
2804         case float_class_normal:
2805             a_exp = a.exp;
2806             break;
2807         case float_class_inf:
2808             a_exp = INT_MAX;
2809             break;
2810         case float_class_zero:
2811             a_exp = INT_MIN;
2812             break;
2813         default:
2814             g_assert_not_reached();
2815             break;
2816         }
2817         switch (b.cls) {
2818         case float_class_normal:
2819             b_exp = b.exp;
2820             break;
2821         case float_class_inf:
2822             b_exp = INT_MAX;
2823             break;
2824         case float_class_zero:
2825             b_exp = INT_MIN;
2826             break;
2827         default:
2828             g_assert_not_reached();
2829             break;
2830         }
2831
2832         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2833             bool a_less = a_exp < b_exp;
2834             if (a_exp == b_exp) {
2835                 a_less = a.frac < b.frac;
2836             }
2837             return a_less ^ ismin ? b : a;
2838         }
2839
2840         if (a.sign == b.sign) {
2841             bool a_less = a_exp < b_exp;
2842             if (a_exp == b_exp) {
2843                 a_less = a.frac < b.frac;
2844             }
2845             return a.sign ^ a_less ^ ismin ? b : a;
2846         } else {
2847             return a.sign ^ ismin ? b : a;
2848         }
2849     }
2850 }
2851
2852 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2853 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2854                                      float_status *s)                   \
2855 {                                                                       \
2856     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2857     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2858     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2859                                                                         \
2860     return float ## sz ## _round_pack_canonical(pr, s);                 \
2861 }
2862
2863 MINMAX(16, min, true, false, false)
2864 MINMAX(16, minnum, true, true, false)
2865 MINMAX(16, minnummag, true, true, true)
2866 MINMAX(16, max, false, false, false)
2867 MINMAX(16, maxnum, false, true, false)
2868 MINMAX(16, maxnummag, false, true, true)
2869
2870 MINMAX(32, min, true, false, false)
2871 MINMAX(32, minnum, true, true, false)
2872 MINMAX(32, minnummag, true, true, true)
2873 MINMAX(32, max, false, false, false)
2874 MINMAX(32, maxnum, false, true, false)
2875 MINMAX(32, maxnummag, false, true, true)
2876
2877 MINMAX(64, min, true, false, false)
2878 MINMAX(64, minnum, true, true, false)
2879 MINMAX(64, minnummag, true, true, true)
2880 MINMAX(64, max, false, false, false)
2881 MINMAX(64, maxnum, false, true, false)
2882 MINMAX(64, maxnummag, false, true, true)
2883
2884 #undef MINMAX
2885
2886 /* Floating point compare */
2887 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2888                           float_status *s)
2889 {
2890     if (is_nan(a.cls) || is_nan(b.cls)) {
2891         if (!is_quiet ||
2892             a.cls == float_class_snan ||
2893             b.cls == float_class_snan) {
2894             s->float_exception_flags |= float_flag_invalid;
2895         }
2896         return float_relation_unordered;
2897     }
2898
2899     if (a.cls == float_class_zero) {
2900         if (b.cls == float_class_zero) {
2901             return float_relation_equal;
2902         }
2903         return b.sign ? float_relation_greater : float_relation_less;
2904     } else if (b.cls == float_class_zero) {
2905         return a.sign ? float_relation_less : float_relation_greater;
2906     }
2907
2908     /* The only really important thing about infinity is its sign. If
2909      * both are infinities the sign marks the smallest of the two.
2910      */
2911     if (a.cls == float_class_inf) {
2912         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2913             return float_relation_equal;
2914         }
2915         return a.sign ? float_relation_less : float_relation_greater;
2916     } else if (b.cls == float_class_inf) {
2917         return b.sign ? float_relation_greater : float_relation_less;
2918     }
2919
2920     if (a.sign != b.sign) {
2921         return a.sign ? float_relation_less : float_relation_greater;
2922     }
2923
2924     if (a.exp == b.exp) {
2925         if (a.frac == b.frac) {
2926             return float_relation_equal;
2927         }
2928         if (a.sign) {
2929             return a.frac > b.frac ?
2930                 float_relation_less : float_relation_greater;
2931         } else {
2932             return a.frac > b.frac ?
2933                 float_relation_greater : float_relation_less;
2934         }
2935     } else {
2936         if (a.sign) {
2937             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2938         } else {
2939             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2940         }
2941     }
2942 }
2943
2944 #define COMPARE(name, attr, sz)                                         \
2945 static int attr                                                         \
2946 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2947 {                                                                       \
2948     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2949     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2950     return compare_floats(pa, pb, is_quiet, s);                         \
2951 }
2952
2953 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2954 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2955 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2956
2957 #undef COMPARE
2958
2959 int float16_compare(float16 a, float16 b, float_status *s)
2960 {
2961     return soft_f16_compare(a, b, false, s);
2962 }
2963
2964 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2965 {
2966     return soft_f16_compare(a, b, true, s);
2967 }
2968
2969 static int QEMU_FLATTEN
2970 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2971 {
2972     union_float32 ua, ub;
2973
2974     ua.s = xa;
2975     ub.s = xb;
2976
2977     if (QEMU_NO_HARDFLOAT) {
2978         goto soft;
2979     }
2980
2981     float32_input_flush2(&ua.s, &ub.s, s);
2982     if (isgreaterequal(ua.h, ub.h)) {
2983         if (isgreater(ua.h, ub.h)) {
2984             return float_relation_greater;
2985         }
2986         return float_relation_equal;
2987     }
2988     if (likely(isless(ua.h, ub.h))) {
2989         return float_relation_less;
2990     }
2991     /* The only condition remaining is unordered.
2992      * Fall through to set flags.
2993      */
2994  soft:
2995     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2996 }
2997
2998 int float32_compare(float32 a, float32 b, float_status *s)
2999 {
3000     return f32_compare(a, b, false, s);
3001 }
3002
3003 int float32_compare_quiet(float32 a, float32 b, float_status *s)
3004 {
3005     return f32_compare(a, b, true, s);
3006 }
3007
3008 static int QEMU_FLATTEN
3009 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3010 {
3011     union_float64 ua, ub;
3012
3013     ua.s = xa;
3014     ub.s = xb;
3015
3016     if (QEMU_NO_HARDFLOAT) {
3017         goto soft;
3018     }
3019
3020     float64_input_flush2(&ua.s, &ub.s, s);
3021     if (isgreaterequal(ua.h, ub.h)) {
3022         if (isgreater(ua.h, ub.h)) {
3023             return float_relation_greater;
3024         }
3025         return float_relation_equal;
3026     }
3027     if (likely(isless(ua.h, ub.h))) {
3028         return float_relation_less;
3029     }
3030     /* The only condition remaining is unordered.
3031      * Fall through to set flags.
3032      */
3033  soft:
3034     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3035 }
3036
3037 int float64_compare(float64 a, float64 b, float_status *s)
3038 {
3039     return f64_compare(a, b, false, s);
3040 }
3041
3042 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3043 {
3044     return f64_compare(a, b, true, s);
3045 }
3046
3047 /* Multiply A by 2 raised to the power N.  */
3048 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3049 {
3050     if (unlikely(is_nan(a.cls))) {
3051         return return_nan(a, s);
3052     }
3053     if (a.cls == float_class_normal) {
3054         /* The largest float type (even though not supported by FloatParts)
3055          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3056          * still allows rounding to infinity, without allowing overflow
3057          * within the int32_t that backs FloatParts.exp.
3058          */
3059         n = MIN(MAX(n, -0x10000), 0x10000);
3060         a.exp += n;
3061     }
3062     return a;
3063 }
3064
3065 float16 float16_scalbn(float16 a, int n, float_status *status)
3066 {
3067     FloatParts pa = float16_unpack_canonical(a, status);
3068     FloatParts pr = scalbn_decomposed(pa, n, status);
3069     return float16_round_pack_canonical(pr, status);
3070 }
3071
3072 float32 float32_scalbn(float32 a, int n, float_status *status)
3073 {
3074     FloatParts pa = float32_unpack_canonical(a, status);
3075     FloatParts pr = scalbn_decomposed(pa, n, status);
3076     return float32_round_pack_canonical(pr, status);
3077 }
3078
3079 float64 float64_scalbn(float64 a, int n, float_status *status)
3080 {
3081     FloatParts pa = float64_unpack_canonical(a, status);
3082     FloatParts pr = scalbn_decomposed(pa, n, status);
3083     return float64_round_pack_canonical(pr, status);
3084 }
3085
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean, however, that the calculation is slower than before,
 * especially for 64 bit floats.
 */
3097
3098 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3099 {
3100     uint64_t a_frac, r_frac, s_frac;
3101     int bit, last_bit;
3102
3103     if (is_nan(a.cls)) {
3104         return return_nan(a, s);
3105     }
3106     if (a.cls == float_class_zero) {
3107         return a;  /* sqrt(+-0) = +-0 */
3108     }
3109     if (a.sign) {
3110         s->float_exception_flags |= float_flag_invalid;
3111         return parts_default_nan(s);
3112     }
3113     if (a.cls == float_class_inf) {
3114         return a;  /* sqrt(+inf) = +inf */
3115     }
3116
3117     assert(a.cls == float_class_normal);
3118
3119     /* We need two overflow bits at the top. Adding room for that is a
3120      * right shift. If the exponent is odd, we can discard the low bit
3121      * by multiplying the fraction by 2; that's a left shift. Combine
3122      * those and we shift right if the exponent is even.
3123      */
3124     a_frac = a.frac;
3125     if (!(a.exp & 1)) {
3126         a_frac >>= 1;
3127     }
3128     a.exp >>= 1;
3129
3130     /* Bit-by-bit computation of sqrt.  */
3131     r_frac = 0;
3132     s_frac = 0;
3133
3134     /* Iterate from implicit bit down to the 3 extra bits to compute a
3135      * properly rounded result. Remember we've inserted one more bit
3136      * at the top, so these positions are one less.
3137      */
3138     bit = DECOMPOSED_BINARY_POINT - 1;
3139     last_bit = MAX(p->frac_shift - 4, 0);
3140     do {
3141         uint64_t q = 1ULL << bit;
3142         uint64_t t_frac = s_frac + q;
3143         if (t_frac <= a_frac) {
3144             s_frac = t_frac + q;
3145             a_frac -= t_frac;
3146             r_frac += q;
3147         }
3148         a_frac <<= 1;
3149     } while (--bit >= last_bit);
3150
3151     /* Undo the right shift done above. If there is any remaining
3152      * fraction, the result is inexact. Set the sticky bit.
3153      */
3154     a.frac = (r_frac << 1) + (a_frac != 0);
3155
3156     return a;
3157 }
3158
3159 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3160 {
3161     FloatParts pa = float16_unpack_canonical(a, status);
3162     FloatParts pr = sqrt_float(pa, status, &float16_params);
3163     return float16_round_pack_canonical(pr, status);
3164 }
3165
3166 static float32 QEMU_SOFTFLOAT_ATTR
3167 soft_f32_sqrt(float32 a, float_status *status)
3168 {
3169     FloatParts pa = float32_unpack_canonical(a, status);
3170     FloatParts pr = sqrt_float(pa, status, &float32_params);
3171     return float32_round_pack_canonical(pr, status);
3172 }
3173
3174 static float64 QEMU_SOFTFLOAT_ATTR
3175 soft_f64_sqrt(float64 a, float_status *status)
3176 {
3177     FloatParts pa = float64_unpack_canonical(a, status);
3178     FloatParts pr = sqrt_float(pa, status, &float64_params);
3179     return float64_round_pack_canonical(pr, status);
3180 }
3181
3182 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3183 {
3184     union_float32 ua, ur;
3185
3186     ua.s = xa;
3187     if (unlikely(!can_use_fpu(s))) {
3188         goto soft;
3189     }
3190
3191     float32_input_flush1(&ua.s, s);
3192     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3193         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3194                        fpclassify(ua.h) == FP_ZERO) ||
3195                      signbit(ua.h))) {
3196             goto soft;
3197         }
3198     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3199                         float32_is_neg(ua.s))) {
3200         goto soft;
3201     }
3202     ur.h = sqrtf(ua.h);
3203     return ur.s;
3204
3205  soft:
3206     return soft_f32_sqrt(ua.s, s);
3207 }
3208
3209 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3210 {
3211     union_float64 ua, ur;
3212
3213     ua.s = xa;
3214     if (unlikely(!can_use_fpu(s))) {
3215         goto soft;
3216     }
3217
3218     float64_input_flush1(&ua.s, s);
3219     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3220         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3221                        fpclassify(ua.h) == FP_ZERO) ||
3222                      signbit(ua.h))) {
3223             goto soft;
3224         }
3225     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3226                         float64_is_neg(ua.s))) {
3227         goto soft;
3228     }
3229     ur.h = sqrt(ua.h);
3230     return ur.s;
3231
3232  soft:
3233     return soft_f64_sqrt(ua.s, s);
3234 }
3235
3236 /*----------------------------------------------------------------------------
3237 | The pattern for a default generated NaN.
3238 *----------------------------------------------------------------------------*/
3239
3240 float16 float16_default_nan(float_status *status)
3241 {
3242     FloatParts p = parts_default_nan(status);
3243     p.frac >>= float16_params.frac_shift;
3244     return float16_pack_raw(p);
3245 }
3246
3247 float32 float32_default_nan(float_status *status)
3248 {
3249     FloatParts p = parts_default_nan(status);
3250     p.frac >>= float32_params.frac_shift;
3251     return float32_pack_raw(p);
3252 }
3253
3254 float64 float64_default_nan(float_status *status)
3255 {
3256     FloatParts p = parts_default_nan(status);
3257     p.frac >>= float64_params.frac_shift;
3258     return float64_pack_raw(p);
3259 }
3260
3261 float128 float128_default_nan(float_status *status)
3262 {
3263     FloatParts p = parts_default_nan(status);
3264     float128 r;
3265
3266     /* Extrapolate from the choices made by parts_default_nan to fill
3267      * in the quad-floating format.  If the low bit is set, assume we
3268      * want to set all non-snan bits.
3269      */
3270     r.low = -(p.frac & 1);
3271     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3272     r.high |= LIT64(0x7FFF000000000000);
3273     r.high |= (uint64_t)p.sign << 63;
3274
3275     return r;
3276 }
3277
3278 /*----------------------------------------------------------------------------
3279 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3280 *----------------------------------------------------------------------------*/
3281
3282 float16 float16_silence_nan(float16 a, float_status *status)
3283 {
3284     FloatParts p = float16_unpack_raw(a);
3285     p.frac <<= float16_params.frac_shift;
3286     p = parts_silence_nan(p, status);
3287     p.frac >>= float16_params.frac_shift;
3288     return float16_pack_raw(p);
3289 }
3290
3291 float32 float32_silence_nan(float32 a, float_status *status)
3292 {
3293     FloatParts p = float32_unpack_raw(a);
3294     p.frac <<= float32_params.frac_shift;
3295     p = parts_silence_nan(p, status);
3296     p.frac >>= float32_params.frac_shift;
3297     return float32_pack_raw(p);
3298 }
3299
3300 float64 float64_silence_nan(float64 a, float_status *status)
3301 {
3302     FloatParts p = float64_unpack_raw(a);
3303     p.frac <<= float64_params.frac_shift;
3304     p = parts_silence_nan(p, status);
3305     p.frac >>= float64_params.frac_shift;
3306     return float64_pack_raw(p);
3307 }
3308
3309 /*----------------------------------------------------------------------------
3310 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3311 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3312 | input.  If `zSign' is 1, the input is negated before being converted to an
3313 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3314 | is simply rounded to an integer, with the inexact exception raised if the
3315 | input cannot be represented exactly as an integer.  However, if the fixed-
3316 | point input is too large, the invalid exception is raised and the largest
3317 | positive or negative integer is returned.
3318 *----------------------------------------------------------------------------*/
3319
3320 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3321 {
3322     int8_t roundingMode;
3323     flag roundNearestEven;
3324     int8_t roundIncrement, roundBits;
3325     int32_t z;
3326
3327     roundingMode = status->float_rounding_mode;
3328     roundNearestEven = ( roundingMode == float_round_nearest_even );
3329     switch (roundingMode) {
3330     case float_round_nearest_even:
3331     case float_round_ties_away:
3332         roundIncrement = 0x40;
3333         break;
3334     case float_round_to_zero:
3335         roundIncrement = 0;
3336         break;
3337     case float_round_up:
3338         roundIncrement = zSign ? 0 : 0x7f;
3339         break;
3340     case float_round_down:
3341         roundIncrement = zSign ? 0x7f : 0;
3342         break;
3343     case float_round_to_odd:
3344         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3345         break;
3346     default:
3347         abort();
3348     }
3349     roundBits = absZ & 0x7F;
3350     absZ = ( absZ + roundIncrement )>>7;
3351     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3352     z = absZ;
3353     if ( zSign ) z = - z;
3354     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3355         float_raise(float_flag_invalid, status);
3356         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3357     }
3358     if (roundBits) {
3359         status->float_exception_flags |= float_flag_inexact;
3360     }
3361     return z;
3362
3363 }
3364
3365 /*----------------------------------------------------------------------------
3366 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3367 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3368 | and returns the properly rounded 64-bit integer corresponding to the input.
3369 | If `zSign' is 1, the input is negated before being converted to an integer.
3370 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3371 | the inexact exception raised if the input cannot be represented exactly as
3372 | an integer.  However, if the fixed-point input is too large, the invalid
3373 | exception is raised and the largest positive or negative integer is
3374 | returned.
3375 *----------------------------------------------------------------------------*/
3376
3377 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3378                                float_status *status)
3379 {
3380     int8_t roundingMode;
3381     flag roundNearestEven, increment;
3382     int64_t z;
3383
3384     roundingMode = status->float_rounding_mode;
3385     roundNearestEven = ( roundingMode == float_round_nearest_even );
3386     switch (roundingMode) {
3387     case float_round_nearest_even:
3388     case float_round_ties_away:
3389         increment = ((int64_t) absZ1 < 0);
3390         break;
3391     case float_round_to_zero:
3392         increment = 0;
3393         break;
3394     case float_round_up:
3395         increment = !zSign && absZ1;
3396         break;
3397     case float_round_down:
3398         increment = zSign && absZ1;
3399         break;
3400     case float_round_to_odd:
3401         increment = !(absZ0 & 1) && absZ1;
3402         break;
3403     default:
3404         abort();
3405     }
3406     if ( increment ) {
3407         ++absZ0;
3408         if ( absZ0 == 0 ) goto overflow;
3409         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3410     }
3411     z = absZ0;
3412     if ( zSign ) z = - z;
3413     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3414  overflow:
3415         float_raise(float_flag_invalid, status);
3416         return
3417               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3418             : LIT64( 0x7FFFFFFFFFFFFFFF );
3419     }
3420     if (absZ1) {
3421         status->float_exception_flags |= float_flag_inexact;
3422     }
3423     return z;
3424
3425 }
3426
3427 /*----------------------------------------------------------------------------
3428 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3429 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3430 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3431 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3432 | with the inexact exception raised if the input cannot be represented exactly
3433 | as an integer.  However, if the fixed-point input is too large, the invalid
3434 | exception is raised and the largest unsigned integer is returned.
3435 *----------------------------------------------------------------------------*/
3436
3437 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3438                                 uint64_t absZ1, float_status *status)
3439 {
3440     int8_t roundingMode;
3441     flag roundNearestEven, increment;
3442
3443     roundingMode = status->float_rounding_mode;
3444     roundNearestEven = (roundingMode == float_round_nearest_even);
3445     switch (roundingMode) {
3446     case float_round_nearest_even:
3447     case float_round_ties_away:
3448         increment = ((int64_t)absZ1 < 0);
3449         break;
3450     case float_round_to_zero:
3451         increment = 0;
3452         break;
3453     case float_round_up:
3454         increment = !zSign && absZ1;
3455         break;
3456     case float_round_down:
3457         increment = zSign && absZ1;
3458         break;
3459     case float_round_to_odd:
3460         increment = !(absZ0 & 1) && absZ1;
3461         break;
3462     default:
3463         abort();
3464     }
3465     if (increment) {
3466         ++absZ0;
3467         if (absZ0 == 0) {
3468             float_raise(float_flag_invalid, status);
3469             return LIT64(0xFFFFFFFFFFFFFFFF);
3470         }
3471         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3472     }
3473
3474     if (zSign && absZ0) {
3475         float_raise(float_flag_invalid, status);
3476         return 0;
3477     }
3478
3479     if (absZ1) {
3480         status->float_exception_flags |= float_flag_inexact;
3481     }
3482     return absZ0;
3483 }
3484
3485 /*----------------------------------------------------------------------------
3486 | If `a' is denormal and we are in flush-to-zero mode then set the
3487 | input-denormal exception and return zero. Otherwise just return the value.
3488 *----------------------------------------------------------------------------*/
3489 float32 float32_squash_input_denormal(float32 a, float_status *status)
3490 {
3491     if (status->flush_inputs_to_zero) {
3492         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3493             float_raise(float_flag_input_denormal, status);
3494             return make_float32(float32_val(a) & 0x80000000);
3495         }
3496     }
3497     return a;
3498 }
3499
3500 /*----------------------------------------------------------------------------
3501 | Normalizes the subnormal single-precision floating-point value represented
3502 | by the denormalized significand `aSig'.  The normalized exponent and
3503 | significand are stored at the locations pointed to by `zExpPtr' and
3504 | `zSigPtr', respectively.
3505 *----------------------------------------------------------------------------*/
3506
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that brings the leading one of aSig up to bit 23. */
    const int8_t lead = clz32(aSig) - 8;

    *zSigPtr = aSig << lead;
    *zExpPtr = 1 - lead;
}
3517
3518 /*----------------------------------------------------------------------------
3519 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3520 | and significand `zSig', and returns the proper single-precision floating-
3521 | point value corresponding to the abstract input.  Ordinarily, the abstract
3522 | value is simply rounded and packed into the single-precision format, with
3523 | the inexact exception raised if the abstract input cannot be represented
3524 | exactly.  However, if the abstract value is too large, the overflow and
3525 | inexact exceptions are raised and an infinity or maximal finite value is
3526 | returned.  If the abstract value is too small, the input value is rounded to
3527 | a subnormal number, and the underflow and inexact exceptions are raised if
3528 | the abstract input cannot be represented exactly as a subnormal single-
3529 | precision floating-point number.
3530 |     The input significand `zSig' has its binary point between bits 30
3531 | and 29, which is 7 bits to the left of the usual location.  This shifted
3532 | significand must be normalized or smaller.  If `zSig' is not normalized,
3533 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3534 | and it must not require rounding.  In the usual case that `zSig' is
3535 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3536 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3537 | Binary Floating-Point Arithmetic.
3538 *----------------------------------------------------------------------------*/
3539
3540 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3541                                    float_status *status)
3542 {
3543     int8_t roundingMode;
3544     flag roundNearestEven;
3545     int8_t roundIncrement, roundBits;
3546     flag isTiny;
3547
3548     roundingMode = status->float_rounding_mode;
3549     roundNearestEven = ( roundingMode == float_round_nearest_even );
3550     switch (roundingMode) {
3551     case float_round_nearest_even:
3552     case float_round_ties_away:
3553         roundIncrement = 0x40;
3554         break;
3555     case float_round_to_zero:
3556         roundIncrement = 0;
3557         break;
3558     case float_round_up:
3559         roundIncrement = zSign ? 0 : 0x7f;
3560         break;
3561     case float_round_down:
3562         roundIncrement = zSign ? 0x7f : 0;
3563         break;
3564     case float_round_to_odd:
3565         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3566         break;
3567     default:
3568         abort();
3569         break;
3570     }
3571     roundBits = zSig & 0x7F;
3572     if ( 0xFD <= (uint16_t) zExp ) {
3573         if (    ( 0xFD < zExp )
3574              || (    ( zExp == 0xFD )
3575                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3576            ) {
3577             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3578                                    roundIncrement != 0;
3579             float_raise(float_flag_overflow | float_flag_inexact, status);
3580             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3581         }
3582         if ( zExp < 0 ) {
3583             if (status->flush_to_zero) {
3584                 float_raise(float_flag_output_denormal, status);
3585                 return packFloat32(zSign, 0, 0);
3586             }
3587             isTiny =
3588                 (status->float_detect_tininess
3589                  == float_tininess_before_rounding)
3590                 || ( zExp < -1 )
3591                 || ( zSig + roundIncrement < 0x80000000 );
3592             shift32RightJamming( zSig, - zExp, &zSig );
3593             zExp = 0;
3594             roundBits = zSig & 0x7F;
3595             if (isTiny && roundBits) {
3596                 float_raise(float_flag_underflow, status);
3597             }
3598             if (roundingMode == float_round_to_odd) {
3599                 /*
3600                  * For round-to-odd case, the roundIncrement depends on
3601                  * zSig which just changed.
3602                  */
3603                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3604             }
3605         }
3606     }
3607     if (roundBits) {
3608         status->float_exception_flags |= float_flag_inexact;
3609     }
3610     zSig = ( zSig + roundIncrement )>>7;
3611     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3612     if ( zSig == 0 ) zExp = 0;
3613     return packFloat32( zSign, zExp, zSig );
3614
3615 }
3616
3617 /*----------------------------------------------------------------------------
3618 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3619 | and significand `zSig', and returns the proper single-precision floating-
3620 | point value corresponding to the abstract input.  This routine is just like
3621 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3622 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3623 | floating-point exponent.
3624 *----------------------------------------------------------------------------*/
3625
3626 static float32
3627  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3628                               float_status *status)
3629 {
3630     int8_t shiftCount;
3631
3632     shiftCount = clz32(zSig) - 1;
3633     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3634                                status);
3635
3636 }
3637
3638 /*----------------------------------------------------------------------------
3639 | If `a' is denormal and we are in flush-to-zero mode then set the
3640 | input-denormal exception and return zero. Otherwise just return the value.
3641 *----------------------------------------------------------------------------*/
3642 float64 float64_squash_input_denormal(float64 a, float_status *status)
3643 {
3644     if (status->flush_inputs_to_zero) {
3645         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3646             float_raise(float_flag_input_denormal, status);
3647             return make_float64(float64_val(a) & (1ULL << 63));
3648         }
3649     }
3650     return a;
3651 }
3652
/*----------------------------------------------------------------------------
| Normalizes the denormalized double-precision significand `aSig'.  The
| significand is shifted left by `clz64(aSig) - 11' bit positions; the
| shifted significand is written through `zSigPtr' and the matching exponent,
| which is one minus that shift distance, is written through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig) - 11;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3670
3671 /*----------------------------------------------------------------------------
3672 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3673 | double-precision floating-point value, returning the result.  After being
3674 | shifted into the proper positions, the three fields are simply added
3675 | together to form the result.  This means that any integer portion of `zSig'
3676 | will be added into the exponent.  Since a properly normalized significand
3677 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3678 | than the desired result exponent whenever `zSig' is a complete, normalized
3679 | significand.
3680 *----------------------------------------------------------------------------*/
3681
3682 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3683 {
3684
3685     return make_float64(
3686         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3687
3688 }
3689
3690 /*----------------------------------------------------------------------------
3691 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3692 | and significand `zSig', and returns the proper double-precision floating-
3693 | point value corresponding to the abstract input.  Ordinarily, the abstract
3694 | value is simply rounded and packed into the double-precision format, with
3695 | the inexact exception raised if the abstract input cannot be represented
3696 | exactly.  However, if the abstract value is too large, the overflow and
3697 | inexact exceptions are raised and an infinity or maximal finite value is
3698 | returned.  If the abstract value is too small, the input value is rounded to
3699 | a subnormal number, and the underflow and inexact exceptions are raised if
3700 | the abstract input cannot be represented exactly as a subnormal double-
3701 | precision floating-point number.
3702 |     The input significand `zSig' has its binary point between bits 62
3703 | and 61, which is 10 bits to the left of the usual location.  This shifted
3704 | significand must be normalized or smaller.  If `zSig' is not normalized,
3705 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3706 | and it must not require rounding.  In the usual case that `zSig' is
3707 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3708 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3709 | Binary Floating-Point Arithmetic.
3710 *----------------------------------------------------------------------------*/
3711
3712 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3713                                    float_status *status)
3714 {
3715     int8_t roundingMode;
3716     flag roundNearestEven;
3717     int roundIncrement, roundBits;
3718     flag isTiny;
3719
3720     roundingMode = status->float_rounding_mode;
3721     roundNearestEven = ( roundingMode == float_round_nearest_even );
3722     switch (roundingMode) {
3723     case float_round_nearest_even:
3724     case float_round_ties_away:
3725         roundIncrement = 0x200;
3726         break;
3727     case float_round_to_zero:
3728         roundIncrement = 0;
3729         break;
3730     case float_round_up:
3731         roundIncrement = zSign ? 0 : 0x3ff;
3732         break;
3733     case float_round_down:
3734         roundIncrement = zSign ? 0x3ff : 0;
3735         break;
3736     case float_round_to_odd:
3737         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3738         break;
3739     default:
3740         abort();
3741     }
3742     roundBits = zSig & 0x3FF;
3743     if ( 0x7FD <= (uint16_t) zExp ) {
3744         if (    ( 0x7FD < zExp )
3745              || (    ( zExp == 0x7FD )
3746                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3747            ) {
3748             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3749                                    roundIncrement != 0;
3750             float_raise(float_flag_overflow | float_flag_inexact, status);
3751             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3752         }
3753         if ( zExp < 0 ) {
3754             if (status->flush_to_zero) {
3755                 float_raise(float_flag_output_denormal, status);
3756                 return packFloat64(zSign, 0, 0);
3757             }
3758             isTiny =
3759                    (status->float_detect_tininess
3760                     == float_tininess_before_rounding)
3761                 || ( zExp < -1 )
3762                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3763             shift64RightJamming( zSig, - zExp, &zSig );
3764             zExp = 0;
3765             roundBits = zSig & 0x3FF;
3766             if (isTiny && roundBits) {
3767                 float_raise(float_flag_underflow, status);
3768             }
3769             if (roundingMode == float_round_to_odd) {
3770                 /*
3771                  * For round-to-odd case, the roundIncrement depends on
3772                  * zSig which just changed.
3773                  */
3774                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3775             }
3776         }
3777     }
3778     if (roundBits) {
3779         status->float_exception_flags |= float_flag_inexact;
3780     }
3781     zSig = ( zSig + roundIncrement )>>10;
3782     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3783     if ( zSig == 0 ) zExp = 0;
3784     return packFloat64( zSign, zExp, zSig );
3785
3786 }
3787
3788 /*----------------------------------------------------------------------------
3789 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3790 | and significand `zSig', and returns the proper double-precision floating-
3791 | point value corresponding to the abstract input.  This routine is just like
3792 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3793 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3794 | floating-point exponent.
3795 *----------------------------------------------------------------------------*/
3796
3797 static float64
3798  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3799                               float_status *status)
3800 {
3801     int8_t shiftCount;
3802
3803     shiftCount = clz64(zSig) - 1;
3804     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3805                                status);
3806
3807 }
3808
/*----------------------------------------------------------------------------
| Normalizes the denormalized extended double-precision significand `aSig'.
| The significand is shifted left by `clz64(aSig)' bit positions; the shifted
| significand is written through `zSigPtr' and the matching exponent, which
| is one minus that shift distance, is written through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t leftShift = clz64(aSig);

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
3825
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|     The rounding mode is read from `status->float_rounding_mode'.  Only
| nearest-even, ties-away, to-zero, up and down have cases in this routine;
| any other mode reaches a `default' label and calls abort().
|     The function has two rounding paths: a reduced-precision path (32 or
| 64), which rounds inside `zSig0' using a mask, and the full-precision path
| at the label `precision80', which rounds `zSig0' based on `zSig1'.  Both
| paths share the overflow handling at the label `overflow'.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    /* Only nearest-even applies the tie-to-even fix-up after incrementing. */
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced precision: roundMask covers the low bits of zSig0 that are
     * discarded, and roundIncrement starts out as half of (roundMask + 1).
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        /* Any other precision value is treated like 80. */
        goto precision80;
    }
    /* Fold a non-zero zSig1 into bit 0 of zSig0 so it still counts. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Keep the half-way increment chosen above. */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * The unsigned cast makes this true both for zExp <= 0 (zExp - 1 wraps
     * to a huge value) and for zExp >= 0x7FFE.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow if the exponent is out of range, or if it is the largest
         * finite exponent and adding the increment carries out of bit 63.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* roundMask is still set; the overflow code uses ~roundMask. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny when detection is before rounding, when zExp < 0, or
             * when adding the increment does not carry out of bit 63.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* The shift changed zSig0, so recompute the discarded bits. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Bit 63 set after rounding: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the lowest kept bit. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* Unsigned wrap-around: the sum carried out of bit 63. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    /* Exact tie under nearest-even: also clear the lowest kept bit. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: decide whether zSig0 is incremented by looking at
     * zSig1.  The nearest modes increment when the top bit of zSig1 is set;
     * the directed modes increment when zSig1 is non-zero and the sign
     * matches the rounding direction.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same range test as above: true for zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With roundMask 0, ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            /*
             * Also reached by goto from the reduced-precision path, which
             * arrives with its own roundMask.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign give max finite. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path, this branch
             * does not test status->flush_to_zero -- confirm that this is
             * intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so re-evaluate the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: clear bit 0 of zSig0. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Bit 63 set after rounding: the exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped zSig0: bump the exponent instead. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie under nearest-even: clear bit 0 of zSig0. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4038
4039 /*----------------------------------------------------------------------------
4040 | Takes an abstract floating-point value having sign `zSign', exponent
4041 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4042 | and returns the proper extended double-precision floating-point value
4043 | corresponding to the abstract input.  This routine is just like
4044 | `roundAndPackFloatx80' except that the input significand does not have to be
4045 | normalized.
4046 *----------------------------------------------------------------------------*/
4047
4048 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4049                                        flag zSign, int32_t zExp,
4050                                        uint64_t zSig0, uint64_t zSig1,
4051                                        float_status *status)
4052 {
4053     int8_t shiftCount;
4054
4055     if ( zSig0 == 0 ) {
4056         zSig0 = zSig1;
4057         zSig1 = 0;
4058         zExp -= 64;
4059     }
4060     shiftCount = clz64(zSig0);
4061     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4062     zExp -= shiftCount;
4063     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4064                                 zSig0, zSig1, status);
4065
4066 }
4067
4068 /*----------------------------------------------------------------------------
4069 | Returns the least-significant 64 fraction bits of the quadruple-precision
4070 | floating-point value `a'.
4071 *----------------------------------------------------------------------------*/
4072
4073 static inline uint64_t extractFloat128Frac1( float128 a )
4074 {
4075
4076     return a.low;
4077
4078 }
4079
4080 /*----------------------------------------------------------------------------
4081 | Returns the most-significant 48 fraction bits of the quadruple-precision
4082 | floating-point value `a'.
4083 *----------------------------------------------------------------------------*/
4084
4085 static inline uint64_t extractFloat128Frac0( float128 a )
4086 {
4087
4088     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4089
4090 }
4091
4092 /*----------------------------------------------------------------------------
4093 | Returns the exponent bits of the quadruple-precision floating-point value
4094 | `a'.
4095 *----------------------------------------------------------------------------*/
4096
4097 static inline int32_t extractFloat128Exp( float128 a )
4098 {
4099
4100     return ( a.high>>48 ) & 0x7FFF;
4101
4102 }
4103
4104 /*----------------------------------------------------------------------------
4105 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4106 *----------------------------------------------------------------------------*/
4107
4108 static inline flag extractFloat128Sign( float128 a )
4109 {
4110
4111     return a.high>>63;
4112
4113 }
4114
/*----------------------------------------------------------------------------
| Normalizes the denormalized quadruple-precision significand formed by the
| concatenation of `aSig0' and `aSig1'.  The normalized exponent is written
| through `zExpPtr'.  The most significant 49 bits of the normalized
| significand are written through `zSig0Ptr' and the least significant 64
| bits through `zSig1Ptr'.
|     When `aSig0' is non-zero the pair is shifted left by `clz64(aSig0) - 15'
| bit positions.  When `aSig0' is zero the work is done on `aSig1' alone,
| which may need to move in either direction to reach the upper word.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t leftShift;

    if (aSig0 != 0) {
        /* Upper word is populated: a plain 128-bit left shift suffices. */
        leftShift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, leftShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - leftShift;
        return;
    }

    /* Upper word is empty: every significant bit lives in aSig1. */
    leftShift = clz64(aSig1) - 15;
    if (leftShift >= 0) {
        /* All of aSig1 fits into the upper word after the shift. */
        *zSig0Ptr = aSig1 << leftShift;
        *zSig1Ptr = 0;
    } else {
        /* aSig1 straddles both words: split it across them. */
        *zSig0Ptr = aSig1 >> (-leftShift);
        *zSig1Ptr = aSig1 << (leftShift & 63);
    }
    *zExpPtr = -leftShift - 63;
}
4155
4156 /*----------------------------------------------------------------------------
4157 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4158 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4159 | floating-point value, returning the result.  After being shifted into the
4160 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4161 | added together to form the most significant 32 bits of the result.  This
4162 | means that any integer portion of `zSig0' will be added into the exponent.
4163 | Since a properly normalized significand will have an integer portion equal
4164 | to 1, the `zExp' input should be 1 less than the desired result exponent
4165 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4166 | significand.
4167 *----------------------------------------------------------------------------*/
4168
4169 static inline float128
4170  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4171 {
4172     float128 z;
4173
4174     z.low = zSig1;
4175     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4176     return z;
4177
4178 }
4179
4180 /*----------------------------------------------------------------------------
4181 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4182 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4183 | and `zSig2', and returns the proper quadruple-precision floating-point value
4184 | corresponding to the abstract input.  Ordinarily, the abstract value is
4185 | simply rounded and packed into the quadruple-precision format, with the
4186 | inexact exception raised if the abstract input cannot be represented
4187 | exactly.  However, if the abstract value is too large, the overflow and
4188 | inexact exceptions are raised and an infinity or maximal finite value is
4189 | returned.  If the abstract value is too small, the input value is rounded to
4190 | a subnormal number, and the underflow and inexact exceptions are raised if
4191 | the abstract input cannot be represented exactly as a subnormal quadruple-
4192 | precision floating-point number.
4193 |     The input significand must be normalized or smaller.  If the input
4194 | significand is not normalized, `zExp' must be 0; in that case, the result
4195 | returned is a subnormal number, and it must not require rounding.  In the
4196 | usual case that the input significand is normalized, `zExp' must be 1 less
4197 | than the ``true'' floating-point exponent.  The handling of underflow and
4198 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 *----------------------------------------------------------------------------*/
4200
4201 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4202                                      uint64_t zSig0, uint64_t zSig1,
4203                                      uint64_t zSig2, float_status *status)
4204 {
4205     int8_t roundingMode;
4206     flag roundNearestEven, increment, isTiny;
4207
4208     roundingMode = status->float_rounding_mode;
4209     roundNearestEven = ( roundingMode == float_round_nearest_even );
4210     switch (roundingMode) {
4211     case float_round_nearest_even:
4212     case float_round_ties_away:
4213         increment = ((int64_t)zSig2 < 0);
4214         break;
4215     case float_round_to_zero:
4216         increment = 0;
4217         break;
4218     case float_round_up:
4219         increment = !zSign && zSig2;
4220         break;
4221     case float_round_down:
4222         increment = zSign && zSig2;
4223         break;
4224     case float_round_to_odd:
4225         increment = !(zSig1 & 0x1) && zSig2;
4226         break;
4227     default:
4228         abort();
4229     }
4230     if ( 0x7FFD <= (uint32_t) zExp ) {
4231         if (    ( 0x7FFD < zExp )
4232              || (    ( zExp == 0x7FFD )
4233                   && eq128(
4234                          LIT64( 0x0001FFFFFFFFFFFF ),
4235                          LIT64( 0xFFFFFFFFFFFFFFFF ),
4236                          zSig0,
4237                          zSig1
4238                      )
4239                   && increment
4240                 )
4241            ) {
4242             float_raise(float_flag_overflow | float_flag_inexact, status);
4243             if (    ( roundingMode == float_round_to_zero )
4244                  || ( zSign && ( roundingMode == float_round_up ) )
4245                  || ( ! zSign && ( roundingMode == float_round_down ) )
4246                  || (roundingMode == float_round_to_odd)
4247                ) {
4248                 return
4249                     packFloat128(
4250                         zSign,
4251                         0x7FFE,
4252                         LIT64( 0x0000FFFFFFFFFFFF ),
4253                         LIT64( 0xFFFFFFFFFFFFFFFF )
4254                     );
4255             }
4256             return packFloat128( zSign, 0x7FFF, 0, 0 );
4257         }
4258         if ( zExp < 0 ) {
4259             if (status->flush_to_zero) {
4260                 float_raise(float_flag_output_denormal, status);
4261                 return packFloat128(zSign, 0, 0, 0);
4262             }
4263             isTiny =
4264                    (status->float_detect_tininess
4265                     == float_tininess_before_rounding)
4266                 || ( zExp < -1 )
4267                 || ! increment
4268                 || lt128(
4269                        zSig0,
4270                        zSig1,
4271                        LIT64( 0x0001FFFFFFFFFFFF ),
4272                        LIT64( 0xFFFFFFFFFFFFFFFF )
4273                    );
4274             shift128ExtraRightJamming(
4275                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4276             zExp = 0;
4277             if (isTiny && zSig2) {
4278                 float_raise(float_flag_underflow, status);
4279             }
4280             switch (roundingMode) {
4281             case float_round_nearest_even:
4282             case float_round_ties_away:
4283                 increment = ((int64_t)zSig2 < 0);
4284                 break;
4285             case float_round_to_zero:
4286                 increment = 0;
4287                 break;
4288             case float_round_up:
4289                 increment = !zSign && zSig2;
4290                 break;
4291             case float_round_down:
4292                 increment = zSign && zSig2;
4293                 break;
4294             case float_round_to_odd:
4295                 increment = !(zSig1 & 0x1) && zSig2;
4296                 break;
4297             default:
4298                 abort();
4299             }
4300         }
4301     }
4302     if (zSig2) {
4303         status->float_exception_flags |= float_flag_inexact;
4304     }
4305     if ( increment ) {
4306         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4307         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4308     }
4309     else {
4310         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4311     }
4312     return packFloat128( zSign, zExp, zSig0, zSig1 );
4313
4314 }
4315
4316 /*----------------------------------------------------------------------------
4317 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4318 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4319 | returns the proper quadruple-precision floating-point value corresponding
4320 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4321 | except that the input significand has fewer bits and does not have to be
4322 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4323 | point exponent.
4324 *----------------------------------------------------------------------------*/
4325
4326 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4327                                               uint64_t zSig0, uint64_t zSig1,
4328                                               float_status *status)
4329 {
4330     int8_t shiftCount;
4331     uint64_t zSig2;
4332
4333     if ( zSig0 == 0 ) {
4334         zSig0 = zSig1;
4335         zSig1 = 0;
4336         zExp -= 64;
4337     }
4338     shiftCount = clz64(zSig0) - 15;
4339     if ( 0 <= shiftCount ) {
4340         zSig2 = 0;
4341         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4342     }
4343     else {
4344         shift128ExtraRightJamming(
4345             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4346     }
4347     zExp -= shiftCount;
4348     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4349
4350 }
4351
4352
4353 /*----------------------------------------------------------------------------
4354 | Returns the result of converting the 32-bit two's complement integer `a'
4355 | to the extended double-precision floating-point format.  The conversion
4356 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4357 | Arithmetic.
4358 *----------------------------------------------------------------------------*/
4359
4360 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4361 {
4362     flag zSign;
4363     uint32_t absA;
4364     int8_t shiftCount;
4365     uint64_t zSig;
4366
4367     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4368     zSign = ( a < 0 );
4369     absA = zSign ? - a : a;
4370     shiftCount = clz32(absA) + 32;
4371     zSig = absA;
4372     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4373
4374 }
4375
4376 /*----------------------------------------------------------------------------
4377 | Returns the result of converting the 32-bit two's complement integer `a' to
4378 | the quadruple-precision floating-point format.  The conversion is performed
4379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4380 *----------------------------------------------------------------------------*/
4381
4382 float128 int32_to_float128(int32_t a, float_status *status)
4383 {
4384     flag zSign;
4385     uint32_t absA;
4386     int8_t shiftCount;
4387     uint64_t zSig0;
4388
4389     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4390     zSign = ( a < 0 );
4391     absA = zSign ? - a : a;
4392     shiftCount = clz32(absA) + 17;
4393     zSig0 = absA;
4394     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4395
4396 }
4397
4398 /*----------------------------------------------------------------------------
4399 | Returns the result of converting the 64-bit two's complement integer `a'
4400 | to the extended double-precision floating-point format.  The conversion
4401 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4402 | Arithmetic.
4403 *----------------------------------------------------------------------------*/
4404
4405 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4406 {
4407     flag zSign;
4408     uint64_t absA;
4409     int8_t shiftCount;
4410
4411     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4412     zSign = ( a < 0 );
4413     absA = zSign ? - a : a;
4414     shiftCount = clz64(absA);
4415     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4416
4417 }
4418
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the 64-bit two's complement integer `a' to
4421 | the quadruple-precision floating-point format.  The conversion is performed
4422 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4423 *----------------------------------------------------------------------------*/
4424
4425 float128 int64_to_float128(int64_t a, float_status *status)
4426 {
4427     flag zSign;
4428     uint64_t absA;
4429     int8_t shiftCount;
4430     int32_t zExp;
4431     uint64_t zSig0, zSig1;
4432
4433     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4434     zSign = ( a < 0 );
4435     absA = zSign ? - a : a;
4436     shiftCount = clz64(absA) + 49;
4437     zExp = 0x406E - shiftCount;
4438     if ( 64 <= shiftCount ) {
4439         zSig1 = 0;
4440         zSig0 = absA;
4441         shiftCount -= 64;
4442     }
4443     else {
4444         zSig1 = absA;
4445         zSig0 = 0;
4446     }
4447     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4448     return packFloat128( zSign, zExp, zSig0, zSig1 );
4449
4450 }
4451
4452 /*----------------------------------------------------------------------------
4453 | Returns the result of converting the 64-bit unsigned integer `a'
4454 | to the quadruple-precision floating-point format.  The conversion is performed
4455 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4456 *----------------------------------------------------------------------------*/
4457
4458 float128 uint64_to_float128(uint64_t a, float_status *status)
4459 {
4460     if (a == 0) {
4461         return float128_zero;
4462     }
4463     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4464 }
4465
4466 /*----------------------------------------------------------------------------
4467 | Returns the result of converting the single-precision floating-point value
4468 | `a' to the extended double-precision floating-point format.  The conversion
4469 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4470 | Arithmetic.
4471 *----------------------------------------------------------------------------*/
4472
4473 floatx80 float32_to_floatx80(float32 a, float_status *status)
4474 {
4475     flag aSign;
4476     int aExp;
4477     uint32_t aSig;
4478
4479     a = float32_squash_input_denormal(a, status);
4480     aSig = extractFloat32Frac( a );
4481     aExp = extractFloat32Exp( a );
4482     aSign = extractFloat32Sign( a );
4483     if ( aExp == 0xFF ) {
4484         if (aSig) {
4485             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4486         }
4487         return packFloatx80(aSign,
4488                             floatx80_infinity_high,
4489                             floatx80_infinity_low);
4490     }
4491     if ( aExp == 0 ) {
4492         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4493         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4494     }
4495     aSig |= 0x00800000;
4496     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4497
4498 }
4499
4500 /*----------------------------------------------------------------------------
4501 | Returns the result of converting the single-precision floating-point value
4502 | `a' to the double-precision floating-point format.  The conversion is
4503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4504 | Arithmetic.
4505 *----------------------------------------------------------------------------*/
4506
4507 float128 float32_to_float128(float32 a, float_status *status)
4508 {
4509     flag aSign;
4510     int aExp;
4511     uint32_t aSig;
4512
4513     a = float32_squash_input_denormal(a, status);
4514     aSig = extractFloat32Frac( a );
4515     aExp = extractFloat32Exp( a );
4516     aSign = extractFloat32Sign( a );
4517     if ( aExp == 0xFF ) {
4518         if (aSig) {
4519             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4520         }
4521         return packFloat128( aSign, 0x7FFF, 0, 0 );
4522     }
4523     if ( aExp == 0 ) {
4524         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4525         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4526         --aExp;
4527     }
4528     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4529
4530 }
4531
4532 /*----------------------------------------------------------------------------
4533 | Returns the remainder of the single-precision floating-point value `a'
4534 | with respect to the corresponding value `b'.  The operation is performed
4535 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4536 *----------------------------------------------------------------------------*/
4537
4538 float32 float32_rem(float32 a, float32 b, float_status *status)
4539 {
4540     flag aSign, zSign;
4541     int aExp, bExp, expDiff;
4542     uint32_t aSig, bSig;
4543     uint32_t q;
4544     uint64_t aSig64, bSig64, q64;
4545     uint32_t alternateASig;
4546     int32_t sigMean;
4547     a = float32_squash_input_denormal(a, status);
4548     b = float32_squash_input_denormal(b, status);
4549
4550     aSig = extractFloat32Frac( a );
4551     aExp = extractFloat32Exp( a );
4552     aSign = extractFloat32Sign( a );
4553     bSig = extractFloat32Frac( b );
4554     bExp = extractFloat32Exp( b );
4555     if ( aExp == 0xFF ) {
4556         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4557             return propagateFloat32NaN(a, b, status);
4558         }
4559         float_raise(float_flag_invalid, status);
4560         return float32_default_nan(status);
4561     }
4562     if ( bExp == 0xFF ) {
4563         if (bSig) {
4564             return propagateFloat32NaN(a, b, status);
4565         }
4566         return a;
4567     }
4568     if ( bExp == 0 ) {
4569         if ( bSig == 0 ) {
4570             float_raise(float_flag_invalid, status);
4571             return float32_default_nan(status);
4572         }
4573         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4574     }
4575     if ( aExp == 0 ) {
4576         if ( aSig == 0 ) return a;
4577         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4578     }
4579     expDiff = aExp - bExp;
4580     aSig |= 0x00800000;
4581     bSig |= 0x00800000;
4582     if ( expDiff < 32 ) {
4583         aSig <<= 8;
4584         bSig <<= 8;
4585         if ( expDiff < 0 ) {
4586             if ( expDiff < -1 ) return a;
4587             aSig >>= 1;
4588         }
4589         q = ( bSig <= aSig );
4590         if ( q ) aSig -= bSig;
4591         if ( 0 < expDiff ) {
4592             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4593             q >>= 32 - expDiff;
4594             bSig >>= 2;
4595             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4596         }
4597         else {
4598             aSig >>= 2;
4599             bSig >>= 2;
4600         }
4601     }
4602     else {
4603         if ( bSig <= aSig ) aSig -= bSig;
4604         aSig64 = ( (uint64_t) aSig )<<40;
4605         bSig64 = ( (uint64_t) bSig )<<40;
4606         expDiff -= 64;
4607         while ( 0 < expDiff ) {
4608             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4609             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4610             aSig64 = - ( ( bSig * q64 )<<38 );
4611             expDiff -= 62;
4612         }
4613         expDiff += 64;
4614         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4615         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4616         q = q64>>( 64 - expDiff );
4617         bSig <<= 6;
4618         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4619     }
4620     do {
4621         alternateASig = aSig;
4622         ++q;
4623         aSig -= bSig;
4624     } while ( 0 <= (int32_t) aSig );
4625     sigMean = aSig + alternateASig;
4626     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4627         aSig = alternateASig;
4628     }
4629     zSign = ( (int32_t) aSig < 0 );
4630     if ( zSign ) aSig = - aSig;
4631     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4632 }
4633
4634
4635
4636 /*----------------------------------------------------------------------------
4637 | Returns the binary exponential of the single-precision floating-point value
4638 | `a'. The operation is performed according to the IEC/IEEE Standard for
4639 | Binary Floating-Point Arithmetic.
4640 |
4641 | Uses the following identities:
4642 |
4643 | 1. -------------------------------------------------------------------------
4644 |      x    x*ln(2)
4645 |     2  = e
4646 |
4647 | 2. -------------------------------------------------------------------------
4648 |                      2     3     4     5           n
4649 |      x        x     x     x     x     x           x
4650 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4651 |               1!    2!    3!    4!    5!          n!
4652 *----------------------------------------------------------------------------*/
4653
4654 static const float64 float32_exp2_coefficients[15] =
4655 {
4656     const_float64( 0x3ff0000000000000ll ), /*  1 */
4657     const_float64( 0x3fe0000000000000ll ), /*  2 */
4658     const_float64( 0x3fc5555555555555ll ), /*  3 */
4659     const_float64( 0x3fa5555555555555ll ), /*  4 */
4660     const_float64( 0x3f81111111111111ll ), /*  5 */
4661     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4662     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4663     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4664     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4665     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4666     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4667     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4668     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4669     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4670     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4671 };
4672
4673 float32 float32_exp2(float32 a, float_status *status)
4674 {
4675     flag aSign;
4676     int aExp;
4677     uint32_t aSig;
4678     float64 r, x, xn;
4679     int i;
4680     a = float32_squash_input_denormal(a, status);
4681
4682     aSig = extractFloat32Frac( a );
4683     aExp = extractFloat32Exp( a );
4684     aSign = extractFloat32Sign( a );
4685
4686     if ( aExp == 0xFF) {
4687         if (aSig) {
4688             return propagateFloat32NaN(a, float32_zero, status);
4689         }
4690         return (aSign) ? float32_zero : a;
4691     }
4692     if (aExp == 0) {
4693         if (aSig == 0) return float32_one;
4694     }
4695
4696     float_raise(float_flag_inexact, status);
4697
4698     /* ******************************* */
4699     /* using float64 for approximation */
4700     /* ******************************* */
4701     x = float32_to_float64(a, status);
4702     x = float64_mul(x, float64_ln2, status);
4703
4704     xn = x;
4705     r = float64_one;
4706     for (i = 0 ; i < 15 ; i++) {
4707         float64 f;
4708
4709         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4710         r = float64_add(r, f, status);
4711
4712         xn = float64_mul(xn, x, status);
4713     }
4714
4715     return float64_to_float32(r, status);
4716 }
4717
4718 /*----------------------------------------------------------------------------
4719 | Returns the binary log of the single-precision floating-point value `a'.
4720 | The operation is performed according to the IEC/IEEE Standard for Binary
4721 | Floating-Point Arithmetic.
4722 *----------------------------------------------------------------------------*/
4723 float32 float32_log2(float32 a, float_status *status)
4724 {
4725     flag aSign, zSign;
4726     int aExp;
4727     uint32_t aSig, zSig, i;
4728
4729     a = float32_squash_input_denormal(a, status);
4730     aSig = extractFloat32Frac( a );
4731     aExp = extractFloat32Exp( a );
4732     aSign = extractFloat32Sign( a );
4733
4734     if ( aExp == 0 ) {
4735         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4736         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4737     }
4738     if ( aSign ) {
4739         float_raise(float_flag_invalid, status);
4740         return float32_default_nan(status);
4741     }
4742     if ( aExp == 0xFF ) {
4743         if (aSig) {
4744             return propagateFloat32NaN(a, float32_zero, status);
4745         }
4746         return a;
4747     }
4748
4749     aExp -= 0x7F;
4750     aSig |= 0x00800000;
4751     zSign = aExp < 0;
4752     zSig = aExp << 23;
4753
4754     for (i = 1 << 22; i > 0; i >>= 1) {
4755         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4756         if ( aSig & 0x01000000 ) {
4757             aSig >>= 1;
4758             zSig |= i;
4759         }
4760     }
4761
4762     if ( zSign )
4763         zSig = -zSig;
4764
4765     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4766 }
4767
4768 /*----------------------------------------------------------------------------
4769 | Returns 1 if the single-precision floating-point value `a' is equal to
4770 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4771 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4772 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4773 *----------------------------------------------------------------------------*/
4774
4775 int float32_eq(float32 a, float32 b, float_status *status)
4776 {
4777     uint32_t av, bv;
4778     a = float32_squash_input_denormal(a, status);
4779     b = float32_squash_input_denormal(b, status);
4780
4781     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4782          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4783        ) {
4784         float_raise(float_flag_invalid, status);
4785         return 0;
4786     }
4787     av = float32_val(a);
4788     bv = float32_val(b);
4789     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4790 }
4791
4792 /*----------------------------------------------------------------------------
4793 | Returns 1 if the single-precision floating-point value `a' is less than
4794 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4795 | exception is raised if either operand is a NaN.  The comparison is performed
4796 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4797 *----------------------------------------------------------------------------*/
4798
4799 int float32_le(float32 a, float32 b, float_status *status)
4800 {
4801     flag aSign, bSign;
4802     uint32_t av, bv;
4803     a = float32_squash_input_denormal(a, status);
4804     b = float32_squash_input_denormal(b, status);
4805
4806     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4807          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4808        ) {
4809         float_raise(float_flag_invalid, status);
4810         return 0;
4811     }
4812     aSign = extractFloat32Sign( a );
4813     bSign = extractFloat32Sign( b );
4814     av = float32_val(a);
4815     bv = float32_val(b);
4816     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4817     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4818
4819 }
4820
4821 /*----------------------------------------------------------------------------
4822 | Returns 1 if the single-precision floating-point value `a' is less than
4823 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4824 | raised if either operand is a NaN.  The comparison is performed according
4825 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4826 *----------------------------------------------------------------------------*/
4827
4828 int float32_lt(float32 a, float32 b, float_status *status)
4829 {
4830     flag aSign, bSign;
4831     uint32_t av, bv;
4832     a = float32_squash_input_denormal(a, status);
4833     b = float32_squash_input_denormal(b, status);
4834
4835     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4836          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4837        ) {
4838         float_raise(float_flag_invalid, status);
4839         return 0;
4840     }
4841     aSign = extractFloat32Sign( a );
4842     bSign = extractFloat32Sign( b );
4843     av = float32_val(a);
4844     bv = float32_val(b);
4845     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4846     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4847
4848 }
4849
4850 /*----------------------------------------------------------------------------
4851 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4852 | be compared, and 0 otherwise.  The invalid exception is raised if either
4853 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4854 | Standard for Binary Floating-Point Arithmetic.
4855 *----------------------------------------------------------------------------*/
4856
4857 int float32_unordered(float32 a, float32 b, float_status *status)
4858 {
4859     a = float32_squash_input_denormal(a, status);
4860     b = float32_squash_input_denormal(b, status);
4861
4862     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4863          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4864        ) {
4865         float_raise(float_flag_invalid, status);
4866         return 1;
4867     }
4868     return 0;
4869 }
4870
4871 /*----------------------------------------------------------------------------
4872 | Returns 1 if the single-precision floating-point value `a' is equal to
4873 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4874 | exception.  The comparison is performed according to the IEC/IEEE Standard
4875 | for Binary Floating-Point Arithmetic.
4876 *----------------------------------------------------------------------------*/
4877
4878 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4879 {
4880     a = float32_squash_input_denormal(a, status);
4881     b = float32_squash_input_denormal(b, status);
4882
4883     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4884          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4885        ) {
4886         if (float32_is_signaling_nan(a, status)
4887          || float32_is_signaling_nan(b, status)) {
4888             float_raise(float_flag_invalid, status);
4889         }
4890         return 0;
4891     }
4892     return ( float32_val(a) == float32_val(b) ) ||
4893             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4894 }
4895
4896 /*----------------------------------------------------------------------------
4897 | Returns 1 if the single-precision floating-point value `a' is less than or
4898 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4899 | cause an exception.  Otherwise, the comparison is performed according to the
4900 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4901 *----------------------------------------------------------------------------*/
4902
4903 int float32_le_quiet(float32 a, float32 b, float_status *status)
4904 {
4905     flag aSign, bSign;
4906     uint32_t av, bv;
4907     a = float32_squash_input_denormal(a, status);
4908     b = float32_squash_input_denormal(b, status);
4909
4910     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4911          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4912        ) {
4913         if (float32_is_signaling_nan(a, status)
4914          || float32_is_signaling_nan(b, status)) {
4915             float_raise(float_flag_invalid, status);
4916         }
4917         return 0;
4918     }
4919     aSign = extractFloat32Sign( a );
4920     bSign = extractFloat32Sign( b );
4921     av = float32_val(a);
4922     bv = float32_val(b);
4923     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4924     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4925
4926 }
4927
4928 /*----------------------------------------------------------------------------
4929 | Returns 1 if the single-precision floating-point value `a' is less than
4930 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4931 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4932 | Standard for Binary Floating-Point Arithmetic.
4933 *----------------------------------------------------------------------------*/
4934
4935 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4936 {
4937     flag aSign, bSign;
4938     uint32_t av, bv;
4939     a = float32_squash_input_denormal(a, status);
4940     b = float32_squash_input_denormal(b, status);
4941
4942     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4943          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4944        ) {
4945         if (float32_is_signaling_nan(a, status)
4946          || float32_is_signaling_nan(b, status)) {
4947             float_raise(float_flag_invalid, status);
4948         }
4949         return 0;
4950     }
4951     aSign = extractFloat32Sign( a );
4952     bSign = extractFloat32Sign( b );
4953     av = float32_val(a);
4954     bv = float32_val(b);
4955     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4956     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4957
4958 }
4959
4960 /*----------------------------------------------------------------------------
4961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4962 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4963 | comparison is performed according to the IEC/IEEE Standard for Binary
4964 | Floating-Point Arithmetic.
4965 *----------------------------------------------------------------------------*/
4966
4967 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4968 {
4969     a = float32_squash_input_denormal(a, status);
4970     b = float32_squash_input_denormal(b, status);
4971
4972     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4973          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4974        ) {
4975         if (float32_is_signaling_nan(a, status)
4976          || float32_is_signaling_nan(b, status)) {
4977             float_raise(float_flag_invalid, status);
4978         }
4979         return 1;
4980     }
4981     return 0;
4982 }
4983
4984 /*----------------------------------------------------------------------------
4985 | If `a' is denormal and we are in flush-to-zero mode then set the
4986 | input-denormal exception and return zero. Otherwise just return the value.
4987 *----------------------------------------------------------------------------*/
4988 float16 float16_squash_input_denormal(float16 a, float_status *status)
4989 {
4990     if (status->flush_inputs_to_zero) {
4991         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4992             float_raise(float_flag_input_denormal, status);
4993             return make_float16(float16_val(a) & 0x8000);
4994         }
4995     }
4996     return a;
4997 }
4998
4999 /*----------------------------------------------------------------------------
5000 | Returns the result of converting the double-precision floating-point value
5001 | `a' to the extended double-precision floating-point format.  The conversion
5002 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5003 | Arithmetic.
5004 *----------------------------------------------------------------------------*/
5005
5006 floatx80 float64_to_floatx80(float64 a, float_status *status)
5007 {
5008     flag aSign;
5009     int aExp;
5010     uint64_t aSig;
5011
5012     a = float64_squash_input_denormal(a, status);
5013     aSig = extractFloat64Frac( a );
5014     aExp = extractFloat64Exp( a );
5015     aSign = extractFloat64Sign( a );
5016     if ( aExp == 0x7FF ) {
5017         if (aSig) {
5018             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5019         }
5020         return packFloatx80(aSign,
5021                             floatx80_infinity_high,
5022                             floatx80_infinity_low);
5023     }
5024     if ( aExp == 0 ) {
5025         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5026         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5027     }
5028     return
5029         packFloatx80(
5030             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
5031
5032 }
5033
5034 /*----------------------------------------------------------------------------
5035 | Returns the result of converting the double-precision floating-point value
5036 | `a' to the quadruple-precision floating-point format.  The conversion is
5037 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5038 | Arithmetic.
5039 *----------------------------------------------------------------------------*/
5040
5041 float128 float64_to_float128(float64 a, float_status *status)
5042 {
5043     flag aSign;
5044     int aExp;
5045     uint64_t aSig, zSig0, zSig1;
5046
5047     a = float64_squash_input_denormal(a, status);
5048     aSig = extractFloat64Frac( a );
5049     aExp = extractFloat64Exp( a );
5050     aSign = extractFloat64Sign( a );
5051     if ( aExp == 0x7FF ) {
5052         if (aSig) {
5053             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5054         }
5055         return packFloat128( aSign, 0x7FFF, 0, 0 );
5056     }
5057     if ( aExp == 0 ) {
5058         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5059         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5060         --aExp;
5061     }
5062     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5063     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5064
5065 }
5066
5067
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases, in the order they are tested below:
|   - either operand is a NaN: the result of propagateFloat64NaN();
|   - `a' is an infinity: invalid exception, default NaN;
|   - `b' is an infinity: `a' is returned;
|   - `b' is a zero: invalid exception, default NaN;
|   - `a' is a zero: `a' is returned.
| Input denormals are first squashed according to `status'.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* `b' is a NaN or an infinity (and `a' is finite). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    /* `b' is a zero or a subnormal. */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    /* `a' is a zero or a subnormal. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Set the implicit bit (bit 52) and left-justify both significands. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2, so `a' is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit. */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /*
     * Reduce the exponent gap 62 bits per iteration.  The estimated quotient
     * is lowered by 2 and clamped at 0; NOTE(review): presumably so that it
     * never exceeds the true quotient -- confirm against
     * estimateDiv128To64().
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    /* Final partial step for the remaining exponent difference, if any. */
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /*
     * Keep subtracting bSig until the remainder becomes negative when viewed
     * as a signed value; alternateASig holds the last non-negative remainder.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /*
     * Pick whichever candidate is smaller in magnitude.  On a tie, keep the
     * non-negative one only if q is odd, i.e. if that leaves an even
     * quotient.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to `a'. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5155
5156 /*----------------------------------------------------------------------------
5157 | Returns the binary log of the double-precision floating-point value `a'.
5158 | The operation is performed according to the IEC/IEEE Standard for Binary
5159 | Floating-Point Arithmetic.
5160 *----------------------------------------------------------------------------*/
5161 float64 float64_log2(float64 a, float_status *status)
5162 {
5163     flag aSign, zSign;
5164     int aExp;
5165     uint64_t aSig, aSig0, aSig1, zSig, i;
5166     a = float64_squash_input_denormal(a, status);
5167
5168     aSig = extractFloat64Frac( a );
5169     aExp = extractFloat64Exp( a );
5170     aSign = extractFloat64Sign( a );
5171
5172     if ( aExp == 0 ) {
5173         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5174         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5175     }
5176     if ( aSign ) {
5177         float_raise(float_flag_invalid, status);
5178         return float64_default_nan(status);
5179     }
5180     if ( aExp == 0x7FF ) {
5181         if (aSig) {
5182             return propagateFloat64NaN(a, float64_zero, status);
5183         }
5184         return a;
5185     }
5186
5187     aExp -= 0x3FF;
5188     aSig |= LIT64( 0x0010000000000000 );
5189     zSign = aExp < 0;
5190     zSig = (uint64_t)aExp << 52;
5191     for (i = 1LL << 51; i > 0; i >>= 1) {
5192         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5193         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5194         if ( aSig & LIT64( 0x0020000000000000 ) ) {
5195             aSig >>= 1;
5196             zSig |= i;
5197         }
5198     }
5199
5200     if ( zSign )
5201         zSig = -zSig;
5202     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5203 }
5204
5205 /*----------------------------------------------------------------------------
5206 | Returns 1 if the double-precision floating-point value `a' is equal to the
5207 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5208 | if either operand is a NaN.  Otherwise, the comparison is performed
5209 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5210 *----------------------------------------------------------------------------*/
5211
5212 int float64_eq(float64 a, float64 b, float_status *status)
5213 {
5214     uint64_t av, bv;
5215     a = float64_squash_input_denormal(a, status);
5216     b = float64_squash_input_denormal(b, status);
5217
5218     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5219          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5220        ) {
5221         float_raise(float_flag_invalid, status);
5222         return 0;
5223     }
5224     av = float64_val(a);
5225     bv = float64_val(b);
5226     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5227
5228 }
5229
5230 /*----------------------------------------------------------------------------
5231 | Returns 1 if the double-precision floating-point value `a' is less than or
5232 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5233 | exception is raised if either operand is a NaN.  The comparison is performed
5234 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5235 *----------------------------------------------------------------------------*/
5236
5237 int float64_le(float64 a, float64 b, float_status *status)
5238 {
5239     flag aSign, bSign;
5240     uint64_t av, bv;
5241     a = float64_squash_input_denormal(a, status);
5242     b = float64_squash_input_denormal(b, status);
5243
5244     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5245          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5246        ) {
5247         float_raise(float_flag_invalid, status);
5248         return 0;
5249     }
5250     aSign = extractFloat64Sign( a );
5251     bSign = extractFloat64Sign( b );
5252     av = float64_val(a);
5253     bv = float64_val(b);
5254     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5255     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5256
5257 }
5258
5259 /*----------------------------------------------------------------------------
5260 | Returns 1 if the double-precision floating-point value `a' is less than
5261 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5262 | raised if either operand is a NaN.  The comparison is performed according
5263 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5264 *----------------------------------------------------------------------------*/
5265
5266 int float64_lt(float64 a, float64 b, float_status *status)
5267 {
5268     flag aSign, bSign;
5269     uint64_t av, bv;
5270
5271     a = float64_squash_input_denormal(a, status);
5272     b = float64_squash_input_denormal(b, status);
5273     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5274          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5275        ) {
5276         float_raise(float_flag_invalid, status);
5277         return 0;
5278     }
5279     aSign = extractFloat64Sign( a );
5280     bSign = extractFloat64Sign( b );
5281     av = float64_val(a);
5282     bv = float64_val(b);
5283     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5284     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5285
5286 }
5287
5288 /*----------------------------------------------------------------------------
5289 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5290 | be compared, and 0 otherwise.  The invalid exception is raised if either
5291 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5292 | Standard for Binary Floating-Point Arithmetic.
5293 *----------------------------------------------------------------------------*/
5294
5295 int float64_unordered(float64 a, float64 b, float_status *status)
5296 {
5297     a = float64_squash_input_denormal(a, status);
5298     b = float64_squash_input_denormal(b, status);
5299
5300     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5301          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5302        ) {
5303         float_raise(float_flag_invalid, status);
5304         return 1;
5305     }
5306     return 0;
5307 }
5308
5309 /*----------------------------------------------------------------------------
5310 | Returns 1 if the double-precision floating-point value `a' is equal to the
5311 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5312 | exception.The comparison is performed according to the IEC/IEEE Standard
5313 | for Binary Floating-Point Arithmetic.
5314 *----------------------------------------------------------------------------*/
5315
5316 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5317 {
5318     uint64_t av, bv;
5319     a = float64_squash_input_denormal(a, status);
5320     b = float64_squash_input_denormal(b, status);
5321
5322     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5323          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5324        ) {
5325         if (float64_is_signaling_nan(a, status)
5326          || float64_is_signaling_nan(b, status)) {
5327             float_raise(float_flag_invalid, status);
5328         }
5329         return 0;
5330     }
5331     av = float64_val(a);
5332     bv = float64_val(b);
5333     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5334
5335 }
5336
5337 /*----------------------------------------------------------------------------
5338 | Returns 1 if the double-precision floating-point value `a' is less than or
5339 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5340 | cause an exception.  Otherwise, the comparison is performed according to the
5341 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5342 *----------------------------------------------------------------------------*/
5343
5344 int float64_le_quiet(float64 a, float64 b, float_status *status)
5345 {
5346     flag aSign, bSign;
5347     uint64_t av, bv;
5348     a = float64_squash_input_denormal(a, status);
5349     b = float64_squash_input_denormal(b, status);
5350
5351     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5352          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5353        ) {
5354         if (float64_is_signaling_nan(a, status)
5355          || float64_is_signaling_nan(b, status)) {
5356             float_raise(float_flag_invalid, status);
5357         }
5358         return 0;
5359     }
5360     aSign = extractFloat64Sign( a );
5361     bSign = extractFloat64Sign( b );
5362     av = float64_val(a);
5363     bv = float64_val(b);
5364     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5365     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5366
5367 }
5368
5369 /*----------------------------------------------------------------------------
5370 | Returns 1 if the double-precision floating-point value `a' is less than
5371 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5372 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5373 | Standard for Binary Floating-Point Arithmetic.
5374 *----------------------------------------------------------------------------*/
5375
5376 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5377 {
5378     flag aSign, bSign;
5379     uint64_t av, bv;
5380     a = float64_squash_input_denormal(a, status);
5381     b = float64_squash_input_denormal(b, status);
5382
5383     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5384          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5385        ) {
5386         if (float64_is_signaling_nan(a, status)
5387          || float64_is_signaling_nan(b, status)) {
5388             float_raise(float_flag_invalid, status);
5389         }
5390         return 0;
5391     }
5392     aSign = extractFloat64Sign( a );
5393     bSign = extractFloat64Sign( b );
5394     av = float64_val(a);
5395     bv = float64_val(b);
5396     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5397     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5398
5399 }
5400
5401 /*----------------------------------------------------------------------------
5402 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5403 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5404 | comparison is performed according to the IEC/IEEE Standard for Binary
5405 | Floating-Point Arithmetic.
5406 *----------------------------------------------------------------------------*/
5407
5408 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5409 {
5410     a = float64_squash_input_denormal(a, status);
5411     b = float64_squash_input_denormal(b, status);
5412
5413     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5414          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5415        ) {
5416         if (float64_is_signaling_nan(a, status)
5417          || float64_is_signaling_nan(b, status)) {
5418             float_raise(float_flag_invalid, status);
5419         }
5420         return 1;
5421     }
5422     return 0;
5423 }
5424
5425 /*----------------------------------------------------------------------------
5426 | Returns the result of converting the extended double-precision floating-
5427 | point value `a' to the 32-bit two's complement integer format.  The
5428 | conversion is performed according to the IEC/IEEE Standard for Binary
5429 | Floating-Point Arithmetic---which means in particular that the conversion
5430 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5431 | largest positive integer is returned.  Otherwise, if the conversion
5432 | overflows, the largest integer with the same sign as `a' is returned.
5433 *----------------------------------------------------------------------------*/
5434
5435 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5436 {
5437     flag aSign;
5438     int32_t aExp, shiftCount;
5439     uint64_t aSig;
5440
5441     if (floatx80_invalid_encoding(a)) {
5442         float_raise(float_flag_invalid, status);
5443         return 1 << 31;
5444     }
5445     aSig = extractFloatx80Frac( a );
5446     aExp = extractFloatx80Exp( a );
5447     aSign = extractFloatx80Sign( a );
5448     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5449     shiftCount = 0x4037 - aExp;
5450     if ( shiftCount <= 0 ) shiftCount = 1;
5451     shift64RightJamming( aSig, shiftCount, &aSig );
5452     return roundAndPackInt32(aSign, aSig, status);
5453
5454 }
5455
5456 /*----------------------------------------------------------------------------
5457 | Returns the result of converting the extended double-precision floating-
5458 | point value `a' to the 32-bit two's complement integer format.  The
5459 | conversion is performed according to the IEC/IEEE Standard for Binary
5460 | Floating-Point Arithmetic, except that the conversion is always rounded
5461 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5462 | Otherwise, if the conversion overflows, the largest integer with the same
5463 | sign as `a' is returned.
5464 *----------------------------------------------------------------------------*/
5465
5466 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5467 {
5468     flag aSign;
5469     int32_t aExp, shiftCount;
5470     uint64_t aSig, savedASig;
5471     int32_t z;
5472
5473     if (floatx80_invalid_encoding(a)) {
5474         float_raise(float_flag_invalid, status);
5475         return 1 << 31;
5476     }
5477     aSig = extractFloatx80Frac( a );
5478     aExp = extractFloatx80Exp( a );
5479     aSign = extractFloatx80Sign( a );
5480     if ( 0x401E < aExp ) {
5481         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5482         goto invalid;
5483     }
5484     else if ( aExp < 0x3FFF ) {
5485         if (aExp || aSig) {
5486             status->float_exception_flags |= float_flag_inexact;
5487         }
5488         return 0;
5489     }
5490     shiftCount = 0x403E - aExp;
5491     savedASig = aSig;
5492     aSig >>= shiftCount;
5493     z = aSig;
5494     if ( aSign ) z = - z;
5495     if ( ( z < 0 ) ^ aSign ) {
5496  invalid:
5497         float_raise(float_flag_invalid, status);
5498         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5499     }
5500     if ( ( aSig<<shiftCount ) != savedASig ) {
5501         status->float_exception_flags |= float_flag_inexact;
5502     }
5503     return z;
5504
5505 }
5506
5507 /*----------------------------------------------------------------------------
5508 | Returns the result of converting the extended double-precision floating-
5509 | point value `a' to the 64-bit two's complement integer format.  The
5510 | conversion is performed according to the IEC/IEEE Standard for Binary
5511 | Floating-Point Arithmetic---which means in particular that the conversion
5512 | is rounded according to the current rounding mode.  If `a' is a NaN,
5513 | the largest positive integer is returned.  Otherwise, if the conversion
5514 | overflows, the largest integer with the same sign as `a' is returned.
5515 *----------------------------------------------------------------------------*/
5516
5517 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5518 {
5519     flag aSign;
5520     int32_t aExp, shiftCount;
5521     uint64_t aSig, aSigExtra;
5522
5523     if (floatx80_invalid_encoding(a)) {
5524         float_raise(float_flag_invalid, status);
5525         return 1ULL << 63;
5526     }
5527     aSig = extractFloatx80Frac( a );
5528     aExp = extractFloatx80Exp( a );
5529     aSign = extractFloatx80Sign( a );
5530     shiftCount = 0x403E - aExp;
5531     if ( shiftCount <= 0 ) {
5532         if ( shiftCount ) {
5533             float_raise(float_flag_invalid, status);
5534             if (!aSign || floatx80_is_any_nan(a)) {
5535                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5536             }
5537             return (int64_t) LIT64( 0x8000000000000000 );
5538         }
5539         aSigExtra = 0;
5540     }
5541     else {
5542         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5543     }
5544     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5545
5546 }
5547
5548 /*----------------------------------------------------------------------------
5549 | Returns the result of converting the extended double-precision floating-
5550 | point value `a' to the 64-bit two's complement integer format.  The
5551 | conversion is performed according to the IEC/IEEE Standard for Binary
5552 | Floating-Point Arithmetic, except that the conversion is always rounded
5553 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5554 | Otherwise, if the conversion overflows, the largest integer with the same
5555 | sign as `a' is returned.
5556 *----------------------------------------------------------------------------*/
5557
5558 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5559 {
5560     flag aSign;
5561     int32_t aExp, shiftCount;
5562     uint64_t aSig;
5563     int64_t z;
5564
5565     if (floatx80_invalid_encoding(a)) {
5566         float_raise(float_flag_invalid, status);
5567         return 1ULL << 63;
5568     }
5569     aSig = extractFloatx80Frac( a );
5570     aExp = extractFloatx80Exp( a );
5571     aSign = extractFloatx80Sign( a );
5572     shiftCount = aExp - 0x403E;
5573     if ( 0 <= shiftCount ) {
5574         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5575         if ( ( a.high != 0xC03E ) || aSig ) {
5576             float_raise(float_flag_invalid, status);
5577             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5578                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5579             }
5580         }
5581         return (int64_t) LIT64( 0x8000000000000000 );
5582     }
5583     else if ( aExp < 0x3FFF ) {
5584         if (aExp | aSig) {
5585             status->float_exception_flags |= float_flag_inexact;
5586         }
5587         return 0;
5588     }
5589     z = aSig>>( - shiftCount );
5590     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5591         status->float_exception_flags |= float_flag_inexact;
5592     }
5593     if ( aSign ) z = - z;
5594     return z;
5595
5596 }
5597
5598 /*----------------------------------------------------------------------------
5599 | Returns the result of converting the extended double-precision floating-
5600 | point value `a' to the single-precision floating-point format.  The
5601 | conversion is performed according to the IEC/IEEE Standard for Binary
5602 | Floating-Point Arithmetic.
5603 *----------------------------------------------------------------------------*/
5604
5605 float32 floatx80_to_float32(floatx80 a, float_status *status)
5606 {
5607     flag aSign;
5608     int32_t aExp;
5609     uint64_t aSig;
5610
5611     if (floatx80_invalid_encoding(a)) {
5612         float_raise(float_flag_invalid, status);
5613         return float32_default_nan(status);
5614     }
5615     aSig = extractFloatx80Frac( a );
5616     aExp = extractFloatx80Exp( a );
5617     aSign = extractFloatx80Sign( a );
5618     if ( aExp == 0x7FFF ) {
5619         if ( (uint64_t) ( aSig<<1 ) ) {
5620             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5621         }
5622         return packFloat32( aSign, 0xFF, 0 );
5623     }
5624     shift64RightJamming( aSig, 33, &aSig );
5625     if ( aExp || aSig ) aExp -= 0x3F81;
5626     return roundAndPackFloat32(aSign, aExp, aSig, status);
5627
5628 }
5629
5630 /*----------------------------------------------------------------------------
5631 | Returns the result of converting the extended double-precision floating-
5632 | point value `a' to the double-precision floating-point format.  The
5633 | conversion is performed according to the IEC/IEEE Standard for Binary
5634 | Floating-Point Arithmetic.
5635 *----------------------------------------------------------------------------*/
5636
5637 float64 floatx80_to_float64(floatx80 a, float_status *status)
5638 {
5639     flag aSign;
5640     int32_t aExp;
5641     uint64_t aSig, zSig;
5642
5643     if (floatx80_invalid_encoding(a)) {
5644         float_raise(float_flag_invalid, status);
5645         return float64_default_nan(status);
5646     }
5647     aSig = extractFloatx80Frac( a );
5648     aExp = extractFloatx80Exp( a );
5649     aSign = extractFloatx80Sign( a );
5650     if ( aExp == 0x7FFF ) {
5651         if ( (uint64_t) ( aSig<<1 ) ) {
5652             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5653         }
5654         return packFloat64( aSign, 0x7FF, 0 );
5655     }
5656     shift64RightJamming( aSig, 1, &zSig );
5657     if ( aExp || aSig ) aExp -= 0x3C01;
5658     return roundAndPackFloat64(aSign, aExp, zSig, status);
5659
5660 }
5661
5662 /*----------------------------------------------------------------------------
5663 | Returns the result of converting the extended double-precision floating-
5664 | point value `a' to the quadruple-precision floating-point format.  The
5665 | conversion is performed according to the IEC/IEEE Standard for Binary
5666 | Floating-Point Arithmetic.
5667 *----------------------------------------------------------------------------*/
5668
5669 float128 floatx80_to_float128(floatx80 a, float_status *status)
5670 {
5671     flag aSign;
5672     int aExp;
5673     uint64_t aSig, zSig0, zSig1;
5674
5675     if (floatx80_invalid_encoding(a)) {
5676         float_raise(float_flag_invalid, status);
5677         return float128_default_nan(status);
5678     }
5679     aSig = extractFloatx80Frac( a );
5680     aExp = extractFloatx80Exp( a );
5681     aSign = extractFloatx80Sign( a );
5682     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5683         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5684     }
5685     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5686     return packFloat128( aSign, aExp, zSig0, zSig1 );
5687
5688 }
5689
5690 /*----------------------------------------------------------------------------
5691 | Rounds the extended double-precision floating-point value `a'
5692 | to the precision provided by floatx80_rounding_precision and returns the
5693 | result as an extended double-precision floating-point value.
5694 | The operation is performed according to the IEC/IEEE Standard for Binary
5695 | Floating-Point Arithmetic.
5696 *----------------------------------------------------------------------------*/
5697
5698 floatx80 floatx80_round(floatx80 a, float_status *status)
5699 {
5700     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5701                                 extractFloatx80Sign(a),
5702                                 extractFloatx80Exp(a),
5703                                 extractFloatx80Frac(a), 0, status);
5704 }
5705
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The rounding direction is taken from `status->float_rounding_mode'.
| An invalid floatx80 encoding raises the invalid exception and yields the
| default NaN.  The inexact flag is set whenever the result differs from `a'.
| The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    /*
     * Exponent large enough that the significand has no fraction bits:
     * the value is already integral, or it is an infinity or a NaN.
     */
    if ( 0x403E <= aExp ) {
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    /* Magnitude below one: the result is a signed zero or a signed one. */
    if ( aExp < 0x3FFF ) {
        /* Zero exponent with no bits below the integer bit: unchanged. */
        if (    ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
            return a;
        }
        status->float_exception_flags |= float_flag_inexact;
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
         case float_round_nearest_even:
            /* Strictly above one half rounds to one; exactly half to zero. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
            }
            break;
        case float_round_ties_away:
            /* One half or more rounds away from zero, to one. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
            }
            break;
         case float_round_down:
            /* Negative values go to minus one, positive values to plus zero. */
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
                : packFloatx80( 0, 0, 0 );
         case float_round_up:
            /* Negative values go to minus zero, positive values to plus one. */
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
        }
        /* Every remaining case yields a zero carrying the sign of `a'. */
        return packFloatx80( aSign, 0, 0 );
    }
    /*
     * General case.  lastBitMask selects the significand bit of weight one;
     * roundBitsMask covers the fraction bits below it.
     */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        /* Add one half; on an exact tie clear the units bit (round to even). */
        z.low += lastBitMask>>1;
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        /* Only positive values grow in magnitude when rounding up. */
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        /* Only negative values grow in magnitude when rounding down. */
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    /* Discard the fraction bits. */
    z.low &= ~ roundBitsMask;
    /*
     * A zero significand means the addition carried out of the top bit:
     * bump the exponent and restore the explicit integer bit.
     */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = LIT64( 0x8000000000000000 );
    }
    if (z.low != a.low) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
5803
5804 /*----------------------------------------------------------------------------
5805 | Returns the result of adding the absolute values of the extended double-
5806 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5807 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5808 | The addition is performed according to the IEC/IEEE Standard for Binary
5809 | Floating-Point Arithmetic.
5810 *----------------------------------------------------------------------------*/
5811
5812 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5813                                 float_status *status)
5814 {
5815     int32_t aExp, bExp, zExp;
5816     uint64_t aSig, bSig, zSig0, zSig1;
5817     int32_t expDiff;
5818
5819     aSig = extractFloatx80Frac( a );
5820     aExp = extractFloatx80Exp( a );
5821     bSig = extractFloatx80Frac( b );
5822     bExp = extractFloatx80Exp( b );
5823     expDiff = aExp - bExp;
5824     if ( 0 < expDiff ) {
5825         if ( aExp == 0x7FFF ) {
5826             if ((uint64_t)(aSig << 1)) {
5827                 return propagateFloatx80NaN(a, b, status);
5828             }
5829             return a;
5830         }
5831         if ( bExp == 0 ) --expDiff;
5832         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5833         zExp = aExp;
5834     }
5835     else if ( expDiff < 0 ) {
5836         if ( bExp == 0x7FFF ) {
5837             if ((uint64_t)(bSig << 1)) {
5838                 return propagateFloatx80NaN(a, b, status);
5839             }
5840             return packFloatx80(zSign,
5841                                 floatx80_infinity_high,
5842                                 floatx80_infinity_low);
5843         }
5844         if ( aExp == 0 ) ++expDiff;
5845         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5846         zExp = bExp;
5847     }
5848     else {
5849         if ( aExp == 0x7FFF ) {
5850             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5851                 return propagateFloatx80NaN(a, b, status);
5852             }
5853             return a;
5854         }
5855         zSig1 = 0;
5856         zSig0 = aSig + bSig;
5857         if ( aExp == 0 ) {
5858             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5859             goto roundAndPack;
5860         }
5861         zExp = aExp;
5862         goto shiftRight1;
5863     }
5864     zSig0 = aSig + bSig;
5865     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5866  shiftRight1:
5867     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5868     zSig0 |= LIT64( 0x8000000000000000 );
5869     ++zExp;
5870  roundAndPack:
5871     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5872                                 zSign, zExp, zSig0, zSig1, status);
5873 }
5874
5875 /*----------------------------------------------------------------------------
5876 | Returns the result of subtracting the absolute values of the extended
5877 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5878 | difference is negated before being returned.  `zSign' is ignored if the
5879 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5880 | Standard for Binary Floating-Point Arithmetic.
5881 *----------------------------------------------------------------------------*/
5882
5883 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5884                                 float_status *status)
5885 {
5886     int32_t aExp, bExp, zExp;
5887     uint64_t aSig, bSig, zSig0, zSig1;
5888     int32_t expDiff;
5889
5890     aSig = extractFloatx80Frac( a );
5891     aExp = extractFloatx80Exp( a );
5892     bSig = extractFloatx80Frac( b );
5893     bExp = extractFloatx80Exp( b );
5894     expDiff = aExp - bExp;
5895     if ( 0 < expDiff ) goto aExpBigger;
5896     if ( expDiff < 0 ) goto bExpBigger;
5897     if ( aExp == 0x7FFF ) {
5898         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5899             return propagateFloatx80NaN(a, b, status);
5900         }
5901         float_raise(float_flag_invalid, status);
5902         return floatx80_default_nan(status);
5903     }
5904     if ( aExp == 0 ) {
5905         aExp = 1;
5906         bExp = 1;
5907     }
5908     zSig1 = 0;
5909     if ( bSig < aSig ) goto aBigger;
5910     if ( aSig < bSig ) goto bBigger;
5911     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5912  bExpBigger:
5913     if ( bExp == 0x7FFF ) {
5914         if ((uint64_t)(bSig << 1)) {
5915             return propagateFloatx80NaN(a, b, status);
5916         }
5917         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5918                             floatx80_infinity_low);
5919     }
5920     if ( aExp == 0 ) ++expDiff;
5921     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5922  bBigger:
5923     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5924     zExp = bExp;
5925     zSign ^= 1;
5926     goto normalizeRoundAndPack;
5927  aExpBigger:
5928     if ( aExp == 0x7FFF ) {
5929         if ((uint64_t)(aSig << 1)) {
5930             return propagateFloatx80NaN(a, b, status);
5931         }
5932         return a;
5933     }
5934     if ( bExp == 0 ) --expDiff;
5935     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5936  aBigger:
5937     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5938     zExp = aExp;
5939  normalizeRoundAndPack:
5940     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5941                                          zSign, zExp, zSig0, zSig1, status);
5942 }
5943
5944 /*----------------------------------------------------------------------------
5945 | Returns the result of adding the extended double-precision floating-point
5946 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5947 | Standard for Binary Floating-Point Arithmetic.
5948 *----------------------------------------------------------------------------*/
5949
5950 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5951 {
5952     flag aSign, bSign;
5953
5954     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5955         float_raise(float_flag_invalid, status);
5956         return floatx80_default_nan(status);
5957     }
5958     aSign = extractFloatx80Sign( a );
5959     bSign = extractFloatx80Sign( b );
5960     if ( aSign == bSign ) {
5961         return addFloatx80Sigs(a, b, aSign, status);
5962     }
5963     else {
5964         return subFloatx80Sigs(a, b, aSign, status);
5965     }
5966
5967 }
5968
5969 /*----------------------------------------------------------------------------
5970 | Returns the result of subtracting the extended double-precision floating-
5971 | point values `a' and `b'.  The operation is performed according to the
5972 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5973 *----------------------------------------------------------------------------*/
5974
5975 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5976 {
5977     flag aSign, bSign;
5978
5979     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5980         float_raise(float_flag_invalid, status);
5981         return floatx80_default_nan(status);
5982     }
5983     aSign = extractFloatx80Sign( a );
5984     bSign = extractFloatx80Sign( b );
5985     if ( aSign == bSign ) {
5986         return subFloatx80Sigs(a, b, aSign, status);
5987     }
5988     else {
5989         return addFloatx80Sigs(a, b, aSign, status);
5990     }
5991
5992 }
5993
5994 /*----------------------------------------------------------------------------
5995 | Returns the result of multiplying the extended double-precision floating-
5996 | point values `a' and `b'.  The operation is performed according to the
5997 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5998 *----------------------------------------------------------------------------*/
5999
6000 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6001 {
6002     flag aSign, bSign, zSign;
6003     int32_t aExp, bExp, zExp;
6004     uint64_t aSig, bSig, zSig0, zSig1;
6005
6006     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6007         float_raise(float_flag_invalid, status);
6008         return floatx80_default_nan(status);
6009     }
6010     aSig = extractFloatx80Frac( a );
6011     aExp = extractFloatx80Exp( a );
6012     aSign = extractFloatx80Sign( a );
6013     bSig = extractFloatx80Frac( b );
6014     bExp = extractFloatx80Exp( b );
6015     bSign = extractFloatx80Sign( b );
6016     zSign = aSign ^ bSign;
6017     if ( aExp == 0x7FFF ) {
6018         if (    (uint64_t) ( aSig<<1 )
6019              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6020             return propagateFloatx80NaN(a, b, status);
6021         }
6022         if ( ( bExp | bSig ) == 0 ) goto invalid;
6023         return packFloatx80(zSign, floatx80_infinity_high,
6024                                    floatx80_infinity_low);
6025     }
6026     if ( bExp == 0x7FFF ) {
6027         if ((uint64_t)(bSig << 1)) {
6028             return propagateFloatx80NaN(a, b, status);
6029         }
6030         if ( ( aExp | aSig ) == 0 ) {
6031  invalid:
6032             float_raise(float_flag_invalid, status);
6033             return floatx80_default_nan(status);
6034         }
6035         return packFloatx80(zSign, floatx80_infinity_high,
6036                                    floatx80_infinity_low);
6037     }
6038     if ( aExp == 0 ) {
6039         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6040         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6041     }
6042     if ( bExp == 0 ) {
6043         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6044         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6045     }
6046     zExp = aExp + bExp - 0x3FFE;
6047     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6048     if ( 0 < (int64_t) zSig0 ) {
6049         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6050         --zExp;
6051     }
6052     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6053                                 zSign, zExp, zSig0, zSig1, status);
6054 }
6055
6056 /*----------------------------------------------------------------------------
6057 | Returns the result of dividing the extended double-precision floating-point
6058 | value `a' by the corresponding value `b'.  The operation is performed
6059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6060 *----------------------------------------------------------------------------*/
6061
6062 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6063 {
6064     flag aSign, bSign, zSign;
6065     int32_t aExp, bExp, zExp;
6066     uint64_t aSig, bSig, zSig0, zSig1;
6067     uint64_t rem0, rem1, rem2, term0, term1, term2;
6068
6069     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6070         float_raise(float_flag_invalid, status);
6071         return floatx80_default_nan(status);
6072     }
6073     aSig = extractFloatx80Frac( a );
6074     aExp = extractFloatx80Exp( a );
6075     aSign = extractFloatx80Sign( a );
6076     bSig = extractFloatx80Frac( b );
6077     bExp = extractFloatx80Exp( b );
6078     bSign = extractFloatx80Sign( b );
6079     zSign = aSign ^ bSign;
6080     if ( aExp == 0x7FFF ) {
6081         if ((uint64_t)(aSig << 1)) {
6082             return propagateFloatx80NaN(a, b, status);
6083         }
6084         if ( bExp == 0x7FFF ) {
6085             if ((uint64_t)(bSig << 1)) {
6086                 return propagateFloatx80NaN(a, b, status);
6087             }
6088             goto invalid;
6089         }
6090         return packFloatx80(zSign, floatx80_infinity_high,
6091                                    floatx80_infinity_low);
6092     }
6093     if ( bExp == 0x7FFF ) {
6094         if ((uint64_t)(bSig << 1)) {
6095             return propagateFloatx80NaN(a, b, status);
6096         }
6097         return packFloatx80( zSign, 0, 0 );
6098     }
6099     if ( bExp == 0 ) {
6100         if ( bSig == 0 ) {
6101             if ( ( aExp | aSig ) == 0 ) {
6102  invalid:
6103                 float_raise(float_flag_invalid, status);
6104                 return floatx80_default_nan(status);
6105             }
6106             float_raise(float_flag_divbyzero, status);
6107             return packFloatx80(zSign, floatx80_infinity_high,
6108                                        floatx80_infinity_low);
6109         }
6110         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6111     }
6112     if ( aExp == 0 ) {
6113         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6114         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6115     }
6116     zExp = aExp - bExp + 0x3FFE;
6117     rem1 = 0;
6118     if ( bSig <= aSig ) {
6119         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6120         ++zExp;
6121     }
6122     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6123     mul64To128( bSig, zSig0, &term0, &term1 );
6124     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6125     while ( (int64_t) rem0 < 0 ) {
6126         --zSig0;
6127         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6128     }
6129     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6130     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6131         mul64To128( bSig, zSig1, &term1, &term2 );
6132         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6133         while ( (int64_t) rem1 < 0 ) {
6134             --zSig1;
6135             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6136         }
6137         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6138     }
6139     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6140                                 zSign, zExp, zSig0, zSig1, status);
6141 }
6142
6143 /*----------------------------------------------------------------------------
6144 | Returns the remainder of the extended double-precision floating-point value
6145 | `a' with respect to the corresponding value `b'.  The operation is performed
6146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6147 *----------------------------------------------------------------------------*/
6148
6149 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6150 {
6151     flag aSign, zSign;
6152     int32_t aExp, bExp, expDiff;
6153     uint64_t aSig0, aSig1, bSig;
6154     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6155
6156     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6157         float_raise(float_flag_invalid, status);
6158         return floatx80_default_nan(status);
6159     }
6160     aSig0 = extractFloatx80Frac( a );
6161     aExp = extractFloatx80Exp( a );
6162     aSign = extractFloatx80Sign( a );
6163     bSig = extractFloatx80Frac( b );
6164     bExp = extractFloatx80Exp( b );
6165     if ( aExp == 0x7FFF ) {
6166         if (    (uint64_t) ( aSig0<<1 )
6167              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6168             return propagateFloatx80NaN(a, b, status);
6169         }
6170         goto invalid;
6171     }
6172     if ( bExp == 0x7FFF ) {
6173         if ((uint64_t)(bSig << 1)) {
6174             return propagateFloatx80NaN(a, b, status);
6175         }
6176         return a;
6177     }
6178     if ( bExp == 0 ) {
6179         if ( bSig == 0 ) {
6180  invalid:
6181             float_raise(float_flag_invalid, status);
6182             return floatx80_default_nan(status);
6183         }
6184         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6185     }
6186     if ( aExp == 0 ) {
6187         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6188         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6189     }
6190     bSig |= LIT64( 0x8000000000000000 );
6191     zSign = aSign;
6192     expDiff = aExp - bExp;
6193     aSig1 = 0;
6194     if ( expDiff < 0 ) {
6195         if ( expDiff < -1 ) return a;
6196         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6197         expDiff = 0;
6198     }
6199     q = ( bSig <= aSig0 );
6200     if ( q ) aSig0 -= bSig;
6201     expDiff -= 64;
6202     while ( 0 < expDiff ) {
6203         q = estimateDiv128To64( aSig0, aSig1, bSig );
6204         q = ( 2 < q ) ? q - 2 : 0;
6205         mul64To128( bSig, q, &term0, &term1 );
6206         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6207         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6208         expDiff -= 62;
6209     }
6210     expDiff += 64;
6211     if ( 0 < expDiff ) {
6212         q = estimateDiv128To64( aSig0, aSig1, bSig );
6213         q = ( 2 < q ) ? q - 2 : 0;
6214         q >>= 64 - expDiff;
6215         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6216         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6217         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6218         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6219             ++q;
6220             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6221         }
6222     }
6223     else {
6224         term1 = 0;
6225         term0 = bSig;
6226     }
6227     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6228     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6229          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6230               && ( q & 1 ) )
6231        ) {
6232         aSig0 = alternateASig0;
6233         aSig1 = alternateASig1;
6234         zSign = ! zSign;
6235     }
6236     return
6237         normalizeRoundAndPackFloatx80(
6238             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6239
6240 }
6241
6242 /*----------------------------------------------------------------------------
6243 | Returns the square root of the extended double-precision floating-point
6244 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6245 | for Binary Floating-Point Arithmetic.
6246 *----------------------------------------------------------------------------*/
6247
6248 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6249 {
6250     flag aSign;
6251     int32_t aExp, zExp;
6252     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6253     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6254
6255     if (floatx80_invalid_encoding(a)) {
6256         float_raise(float_flag_invalid, status);
6257         return floatx80_default_nan(status);
6258     }
6259     aSig0 = extractFloatx80Frac( a );
6260     aExp = extractFloatx80Exp( a );
6261     aSign = extractFloatx80Sign( a );
6262     if ( aExp == 0x7FFF ) {
6263         if ((uint64_t)(aSig0 << 1)) {
6264             return propagateFloatx80NaN(a, a, status);
6265         }
6266         if ( ! aSign ) return a;
6267         goto invalid;
6268     }
6269     if ( aSign ) {
6270         if ( ( aExp | aSig0 ) == 0 ) return a;
6271  invalid:
6272         float_raise(float_flag_invalid, status);
6273         return floatx80_default_nan(status);
6274     }
6275     if ( aExp == 0 ) {
6276         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6277         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6278     }
6279     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6280     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6281     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6282     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6283     doubleZSig0 = zSig0<<1;
6284     mul64To128( zSig0, zSig0, &term0, &term1 );
6285     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6286     while ( (int64_t) rem0 < 0 ) {
6287         --zSig0;
6288         doubleZSig0 -= 2;
6289         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6290     }
6291     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6292     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6293         if ( zSig1 == 0 ) zSig1 = 1;
6294         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6295         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6296         mul64To128( zSig1, zSig1, &term2, &term3 );
6297         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6298         while ( (int64_t) rem1 < 0 ) {
6299             --zSig1;
6300             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6301             term3 |= 1;
6302             term2 |= doubleZSig0;
6303             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6304         }
6305         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6306     }
6307     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6308     zSig0 |= doubleZSig0;
6309     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6310                                 0, zExp, zSig0, zSig1, status);
6311 }
6312
6313 /*----------------------------------------------------------------------------
6314 | Returns 1 if the extended double-precision floating-point value `a' is equal
6315 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6316 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6317 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6318 *----------------------------------------------------------------------------*/
6319
6320 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6321 {
6322
6323     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6324         || (extractFloatx80Exp(a) == 0x7FFF
6325             && (uint64_t) (extractFloatx80Frac(a) << 1))
6326         || (extractFloatx80Exp(b) == 0x7FFF
6327             && (uint64_t) (extractFloatx80Frac(b) << 1))
6328        ) {
6329         float_raise(float_flag_invalid, status);
6330         return 0;
6331     }
6332     return
6333            ( a.low == b.low )
6334         && (    ( a.high == b.high )
6335              || (    ( a.low == 0 )
6336                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6337            );
6338
6339 }
6340
6341 /*----------------------------------------------------------------------------
6342 | Returns 1 if the extended double-precision floating-point value `a' is
6343 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6344 | invalid exception is raised if either operand is a NaN.  The comparison is
6345 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6346 | Arithmetic.
6347 *----------------------------------------------------------------------------*/
6348
6349 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6350 {
6351     flag aSign, bSign;
6352
6353     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6354         || (extractFloatx80Exp(a) == 0x7FFF
6355             && (uint64_t) (extractFloatx80Frac(a) << 1))
6356         || (extractFloatx80Exp(b) == 0x7FFF
6357             && (uint64_t) (extractFloatx80Frac(b) << 1))
6358        ) {
6359         float_raise(float_flag_invalid, status);
6360         return 0;
6361     }
6362     aSign = extractFloatx80Sign( a );
6363     bSign = extractFloatx80Sign( b );
6364     if ( aSign != bSign ) {
6365         return
6366                aSign
6367             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6368                  == 0 );
6369     }
6370     return
6371           aSign ? le128( b.high, b.low, a.high, a.low )
6372         : le128( a.high, a.low, b.high, b.low );
6373
6374 }
6375
6376 /*----------------------------------------------------------------------------
6377 | Returns 1 if the extended double-precision floating-point value `a' is
6378 | less than the corresponding value `b', and 0 otherwise.  The invalid
6379 | exception is raised if either operand is a NaN.  The comparison is performed
6380 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6381 *----------------------------------------------------------------------------*/
6382
6383 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6384 {
6385     flag aSign, bSign;
6386
6387     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6388         || (extractFloatx80Exp(a) == 0x7FFF
6389             && (uint64_t) (extractFloatx80Frac(a) << 1))
6390         || (extractFloatx80Exp(b) == 0x7FFF
6391             && (uint64_t) (extractFloatx80Frac(b) << 1))
6392        ) {
6393         float_raise(float_flag_invalid, status);
6394         return 0;
6395     }
6396     aSign = extractFloatx80Sign( a );
6397     bSign = extractFloatx80Sign( b );
6398     if ( aSign != bSign ) {
6399         return
6400                aSign
6401             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6402                  != 0 );
6403     }
6404     return
6405           aSign ? lt128( b.high, b.low, a.high, a.low )
6406         : lt128( a.high, a.low, b.high, b.low );
6407
6408 }
6409
6410 /*----------------------------------------------------------------------------
6411 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6412 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6413 | either operand is a NaN.   The comparison is performed according to the
6414 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6415 *----------------------------------------------------------------------------*/
6416 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6417 {
6418     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6419         || (extractFloatx80Exp(a) == 0x7FFF
6420             && (uint64_t) (extractFloatx80Frac(a) << 1))
6421         || (extractFloatx80Exp(b) == 0x7FFF
6422             && (uint64_t) (extractFloatx80Frac(b) << 1))
6423        ) {
6424         float_raise(float_flag_invalid, status);
6425         return 1;
6426     }
6427     return 0;
6428 }
6429
6430 /*----------------------------------------------------------------------------
6431 | Returns 1 if the extended double-precision floating-point value `a' is
6432 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6433 | cause an exception.  The comparison is performed according to the IEC/IEEE
6434 | Standard for Binary Floating-Point Arithmetic.
6435 *----------------------------------------------------------------------------*/
6436
6437 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6438 {
6439
6440     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6441         float_raise(float_flag_invalid, status);
6442         return 0;
6443     }
6444     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6445               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6446          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6447               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6448        ) {
6449         if (floatx80_is_signaling_nan(a, status)
6450          || floatx80_is_signaling_nan(b, status)) {
6451             float_raise(float_flag_invalid, status);
6452         }
6453         return 0;
6454     }
6455     return
6456            ( a.low == b.low )
6457         && (    ( a.high == b.high )
6458              || (    ( a.low == 0 )
6459                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6460            );
6461
6462 }
6463
6464 /*----------------------------------------------------------------------------
6465 | Returns 1 if the extended double-precision floating-point value `a' is less
6466 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6467 | do not cause an exception.  Otherwise, the comparison is performed according
6468 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6469 *----------------------------------------------------------------------------*/
6470
6471 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6472 {
6473     flag aSign, bSign;
6474
6475     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6476         float_raise(float_flag_invalid, status);
6477         return 0;
6478     }
6479     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6480               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6481          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6482               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6483        ) {
6484         if (floatx80_is_signaling_nan(a, status)
6485          || floatx80_is_signaling_nan(b, status)) {
6486             float_raise(float_flag_invalid, status);
6487         }
6488         return 0;
6489     }
6490     aSign = extractFloatx80Sign( a );
6491     bSign = extractFloatx80Sign( b );
6492     if ( aSign != bSign ) {
6493         return
6494                aSign
6495             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6496                  == 0 );
6497     }
6498     return
6499           aSign ? le128( b.high, b.low, a.high, a.low )
6500         : le128( a.high, a.low, b.high, b.low );
6501
6502 }
6503
6504 /*----------------------------------------------------------------------------
6505 | Returns 1 if the extended double-precision floating-point value `a' is less
6506 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6507 | an exception.  Otherwise, the comparison is performed according to the
6508 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6509 *----------------------------------------------------------------------------*/
6510
6511 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6512 {
6513     flag aSign, bSign;
6514
6515     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6516         float_raise(float_flag_invalid, status);
6517         return 0;
6518     }
6519     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6520               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6521          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6522               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6523        ) {
6524         if (floatx80_is_signaling_nan(a, status)
6525          || floatx80_is_signaling_nan(b, status)) {
6526             float_raise(float_flag_invalid, status);
6527         }
6528         return 0;
6529     }
6530     aSign = extractFloatx80Sign( a );
6531     bSign = extractFloatx80Sign( b );
6532     if ( aSign != bSign ) {
6533         return
6534                aSign
6535             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6536                  != 0 );
6537     }
6538     return
6539           aSign ? lt128( b.high, b.low, a.high, a.low )
6540         : lt128( a.high, a.low, b.high, b.low );
6541
6542 }
6543
6544 /*----------------------------------------------------------------------------
6545 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6546 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6547 | The comparison is performed according to the IEC/IEEE Standard for Binary
6548 | Floating-Point Arithmetic.
6549 *----------------------------------------------------------------------------*/
6550 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6551 {
6552     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6553         float_raise(float_flag_invalid, status);
6554         return 1;
6555     }
6556     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6557               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6558          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6559               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6560        ) {
6561         if (floatx80_is_signaling_nan(a, status)
6562          || floatx80_is_signaling_nan(b, status)) {
6563             float_raise(float_flag_invalid, status);
6564         }
6565         return 1;
6566     }
6567     return 0;
6568 }
6569
6570 /*----------------------------------------------------------------------------
6571 | Returns the result of converting the quadruple-precision floating-point
6572 | value `a' to the 32-bit two's complement integer format.  The conversion
6573 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6574 | Arithmetic---which means in particular that the conversion is rounded
6575 | according to the current rounding mode.  If `a' is a NaN, the largest
6576 | positive integer is returned.  Otherwise, if the conversion overflows, the
6577 | largest integer with the same sign as `a' is returned.
6578 *----------------------------------------------------------------------------*/
6579
6580 int32_t float128_to_int32(float128 a, float_status *status)
6581 {
6582     flag aSign;
6583     int32_t aExp, shiftCount;
6584     uint64_t aSig0, aSig1;
6585
6586     aSig1 = extractFloat128Frac1( a );
6587     aSig0 = extractFloat128Frac0( a );
6588     aExp = extractFloat128Exp( a );
6589     aSign = extractFloat128Sign( a );
6590     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6591     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6592     aSig0 |= ( aSig1 != 0 );
6593     shiftCount = 0x4028 - aExp;
6594     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6595     return roundAndPackInt32(aSign, aSig0, status);
6596
6597 }
6598
6599 /*----------------------------------------------------------------------------
6600 | Returns the result of converting the quadruple-precision floating-point
6601 | value `a' to the 32-bit two's complement integer format.  The conversion
6602 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6603 | Arithmetic, except that the conversion is always rounded toward zero.  If
6604 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6605 | conversion overflows, the largest integer with the same sign as `a' is
6606 | returned.
6607 *----------------------------------------------------------------------------*/
6608
6609 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6610 {
6611     flag aSign;
6612     int32_t aExp, shiftCount;
6613     uint64_t aSig0, aSig1, savedASig;
6614     int32_t z;
6615
6616     aSig1 = extractFloat128Frac1( a );
6617     aSig0 = extractFloat128Frac0( a );
6618     aExp = extractFloat128Exp( a );
6619     aSign = extractFloat128Sign( a );
6620     aSig0 |= ( aSig1 != 0 );
6621     if ( 0x401E < aExp ) {
6622         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6623         goto invalid;
6624     }
6625     else if ( aExp < 0x3FFF ) {
6626         if (aExp || aSig0) {
6627             status->float_exception_flags |= float_flag_inexact;
6628         }
6629         return 0;
6630     }
6631     aSig0 |= LIT64( 0x0001000000000000 );
6632     shiftCount = 0x402F - aExp;
6633     savedASig = aSig0;
6634     aSig0 >>= shiftCount;
6635     z = aSig0;
6636     if ( aSign ) z = - z;
6637     if ( ( z < 0 ) ^ aSign ) {
6638  invalid:
6639         float_raise(float_flag_invalid, status);
6640         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6641     }
6642     if ( ( aSig0<<shiftCount ) != savedASig ) {
6643         status->float_exception_flags |= float_flag_inexact;
6644     }
6645     return z;
6646
6647 }
6648
6649 /*----------------------------------------------------------------------------
6650 | Returns the result of converting the quadruple-precision floating-point
6651 | value `a' to the 64-bit two's complement integer format.  The conversion
6652 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6653 | Arithmetic---which means in particular that the conversion is rounded
6654 | according to the current rounding mode.  If `a' is a NaN, the largest
6655 | positive integer is returned.  Otherwise, if the conversion overflows, the
6656 | largest integer with the same sign as `a' is returned.
6657 *----------------------------------------------------------------------------*/
6658
6659 int64_t float128_to_int64(float128 a, float_status *status)
6660 {
6661     flag aSign;
6662     int32_t aExp, shiftCount;
6663     uint64_t aSig0, aSig1;
6664
6665     aSig1 = extractFloat128Frac1( a );
6666     aSig0 = extractFloat128Frac0( a );
6667     aExp = extractFloat128Exp( a );
6668     aSign = extractFloat128Sign( a );
6669     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6670     shiftCount = 0x402F - aExp;
6671     if ( shiftCount <= 0 ) {
6672         if ( 0x403E < aExp ) {
6673             float_raise(float_flag_invalid, status);
6674             if (    ! aSign
6675                  || (    ( aExp == 0x7FFF )
6676                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6677                     )
6678                ) {
6679                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6680             }
6681             return (int64_t) LIT64( 0x8000000000000000 );
6682         }
6683         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6684     }
6685     else {
6686         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6687     }
6688     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6689
6690 }
6691
6692 /*----------------------------------------------------------------------------
6693 | Returns the result of converting the quadruple-precision floating-point
6694 | value `a' to the 64-bit two's complement integer format.  The conversion
6695 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6696 | Arithmetic, except that the conversion is always rounded toward zero.
6697 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6698 | the conversion overflows, the largest integer with the same sign as `a' is
6699 | returned.
6700 *----------------------------------------------------------------------------*/
6701
6702 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6703 {
6704     flag aSign;
6705     int32_t aExp, shiftCount;
6706     uint64_t aSig0, aSig1;
6707     int64_t z;
6708
6709     aSig1 = extractFloat128Frac1( a );
6710     aSig0 = extractFloat128Frac0( a );
6711     aExp = extractFloat128Exp( a );
6712     aSign = extractFloat128Sign( a );
6713     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6714     shiftCount = aExp - 0x402F;
6715     if ( 0 < shiftCount ) {
6716         if ( 0x403E <= aExp ) {
6717             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6718             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6719                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6720                 if (aSig1) {
6721                     status->float_exception_flags |= float_flag_inexact;
6722                 }
6723             }
6724             else {
6725                 float_raise(float_flag_invalid, status);
6726                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6727                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6728                 }
6729             }
6730             return (int64_t) LIT64( 0x8000000000000000 );
6731         }
6732         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6733         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6734             status->float_exception_flags |= float_flag_inexact;
6735         }
6736     }
6737     else {
6738         if ( aExp < 0x3FFF ) {
6739             if ( aExp | aSig0 | aSig1 ) {
6740                 status->float_exception_flags |= float_flag_inexact;
6741             }
6742             return 0;
6743         }
6744         z = aSig0>>( - shiftCount );
6745         if (    aSig1
6746              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6747             status->float_exception_flags |= float_flag_inexact;
6748         }
6749     }
6750     if ( aSign ) z = - z;
6751     return z;
6752
6753 }
6754
6755 /*----------------------------------------------------------------------------
6756 | Returns the result of converting the quadruple-precision floating-point value
6757 | `a' to the 64-bit unsigned integer format.  The conversion is
6758 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6759 | Arithmetic---which means in particular that the conversion is rounded
6760 | according to the current rounding mode.  If `a' is a NaN, the largest
6761 | positive integer is returned.  If the conversion overflows, the
6762 | largest unsigned integer is returned.  If 'a' is negative, the value is
6763 | rounded and zero is returned; negative values that do not round to zero
6764 | will raise the inexact exception.
6765 *----------------------------------------------------------------------------*/
6766
6767 uint64_t float128_to_uint64(float128 a, float_status *status)
6768 {
6769     flag aSign;
6770     int aExp;
6771     int shiftCount;
6772     uint64_t aSig0, aSig1;
6773
6774     aSig0 = extractFloat128Frac0(a);
6775     aSig1 = extractFloat128Frac1(a);
6776     aExp = extractFloat128Exp(a);
6777     aSign = extractFloat128Sign(a);
6778     if (aSign && (aExp > 0x3FFE)) {
6779         float_raise(float_flag_invalid, status);
6780         if (float128_is_any_nan(a)) {
6781             return LIT64(0xFFFFFFFFFFFFFFFF);
6782         } else {
6783             return 0;
6784         }
6785     }
6786     if (aExp) {
6787         aSig0 |= LIT64(0x0001000000000000);
6788     }
6789     shiftCount = 0x402F - aExp;
6790     if (shiftCount <= 0) {
6791         if (0x403E < aExp) {
6792             float_raise(float_flag_invalid, status);
6793             return LIT64(0xFFFFFFFFFFFFFFFF);
6794         }
6795         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6796     } else {
6797         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6798     }
6799     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6800 }
6801
6802 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6803 {
6804     uint64_t v;
6805     signed char current_rounding_mode = status->float_rounding_mode;
6806
6807     set_float_rounding_mode(float_round_to_zero, status);
6808     v = float128_to_uint64(a, status);
6809     set_float_rounding_mode(current_rounding_mode, status);
6810
6811     return v;
6812 }
6813
6814 /*----------------------------------------------------------------------------
6815 | Returns the result of converting the quadruple-precision floating-point
6816 | value `a' to the 32-bit unsigned integer format.  The conversion
6817 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6818 | Arithmetic except that the conversion is always rounded toward zero.
6819 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6820 | if the conversion overflows, the largest unsigned integer is returned.
6821 | If 'a' is negative, the value is rounded and zero is returned; negative
6822 | values that do not round to zero will raise the inexact exception.
6823 *----------------------------------------------------------------------------*/
6824
6825 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6826 {
6827     uint64_t v;
6828     uint32_t res;
6829     int old_exc_flags = get_float_exception_flags(status);
6830
6831     v = float128_to_uint64_round_to_zero(a, status);
6832     if (v > 0xffffffff) {
6833         res = 0xffffffff;
6834     } else {
6835         return v;
6836     }
6837     set_float_exception_flags(old_exc_flags, status);
6838     float_raise(float_flag_invalid, status);
6839     return res;
6840 }
6841
6842 /*----------------------------------------------------------------------------
6843 | Returns the result of converting the quadruple-precision floating-point value
6844 | `a' to the 32-bit unsigned integer format.  The conversion is
6845 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6846 | Arithmetic---which means in particular that the conversion is rounded
6847 | according to the current rounding mode.  If `a' is a NaN, the largest
6848 | positive integer is returned.  If the conversion overflows, the
6849 | largest unsigned integer is returned.  If 'a' is negative, the value is
6850 | rounded and zero is returned; negative values that do not round to zero
6851 | will raise the inexact exception.
6852 *----------------------------------------------------------------------------*/
6853
6854 uint32_t float128_to_uint32(float128 a, float_status *status)
6855 {
6856     uint64_t v;
6857     uint32_t res;
6858     int old_exc_flags = get_float_exception_flags(status);
6859
6860     v = float128_to_uint64(a, status);
6861     if (v > 0xffffffff) {
6862         res = 0xffffffff;
6863     } else {
6864         return v;
6865     }
6866     set_float_exception_flags(old_exc_flags, status);
6867     float_raise(float_flag_invalid, status);
6868     return res;
6869 }
6870
6871 /*----------------------------------------------------------------------------
6872 | Returns the result of converting the quadruple-precision floating-point
6873 | value `a' to the single-precision floating-point format.  The conversion
6874 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6875 | Arithmetic.
6876 *----------------------------------------------------------------------------*/
6877
6878 float32 float128_to_float32(float128 a, float_status *status)
6879 {
6880     flag aSign;
6881     int32_t aExp;
6882     uint64_t aSig0, aSig1;
6883     uint32_t zSig;
6884
6885     aSig1 = extractFloat128Frac1( a );
6886     aSig0 = extractFloat128Frac0( a );
6887     aExp = extractFloat128Exp( a );
6888     aSign = extractFloat128Sign( a );
6889     if ( aExp == 0x7FFF ) {
6890         if ( aSig0 | aSig1 ) {
6891             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6892         }
6893         return packFloat32( aSign, 0xFF, 0 );
6894     }
6895     aSig0 |= ( aSig1 != 0 );
6896     shift64RightJamming( aSig0, 18, &aSig0 );
6897     zSig = aSig0;
6898     if ( aExp || zSig ) {
6899         zSig |= 0x40000000;
6900         aExp -= 0x3F81;
6901     }
6902     return roundAndPackFloat32(aSign, aExp, zSig, status);
6903
6904 }
6905
6906 /*----------------------------------------------------------------------------
6907 | Returns the result of converting the quadruple-precision floating-point
6908 | value `a' to the double-precision floating-point format.  The conversion
6909 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6910 | Arithmetic.
6911 *----------------------------------------------------------------------------*/
6912
6913 float64 float128_to_float64(float128 a, float_status *status)
6914 {
6915     flag aSign;
6916     int32_t aExp;
6917     uint64_t aSig0, aSig1;
6918
6919     aSig1 = extractFloat128Frac1( a );
6920     aSig0 = extractFloat128Frac0( a );
6921     aExp = extractFloat128Exp( a );
6922     aSign = extractFloat128Sign( a );
6923     if ( aExp == 0x7FFF ) {
6924         if ( aSig0 | aSig1 ) {
6925             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6926         }
6927         return packFloat64( aSign, 0x7FF, 0 );
6928     }
6929     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6930     aSig0 |= ( aSig1 != 0 );
6931     if ( aExp || aSig0 ) {
6932         aSig0 |= LIT64( 0x4000000000000000 );
6933         aExp -= 0x3C01;
6934     }
6935     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6936
6937 }
6938
6939 /*----------------------------------------------------------------------------
6940 | Returns the result of converting the quadruple-precision floating-point
6941 | value `a' to the extended double-precision floating-point format.  The
6942 | conversion is performed according to the IEC/IEEE Standard for Binary
6943 | Floating-Point Arithmetic.
6944 *----------------------------------------------------------------------------*/
6945
6946 floatx80 float128_to_floatx80(float128 a, float_status *status)
6947 {
6948     flag aSign;
6949     int32_t aExp;
6950     uint64_t aSig0, aSig1;
6951
6952     aSig1 = extractFloat128Frac1( a );
6953     aSig0 = extractFloat128Frac0( a );
6954     aExp = extractFloat128Exp( a );
6955     aSign = extractFloat128Sign( a );
6956     if ( aExp == 0x7FFF ) {
6957         if ( aSig0 | aSig1 ) {
6958             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6959         }
6960         return packFloatx80(aSign, floatx80_infinity_high,
6961                                    floatx80_infinity_low);
6962     }
6963     if ( aExp == 0 ) {
6964         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6965         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6966     }
6967     else {
6968         aSig0 |= LIT64( 0x0001000000000000 );
6969     }
6970     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6971     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6972
6973 }
6974
6975 /*----------------------------------------------------------------------------
6976 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6977 | returns the result as a quadruple-precision floating-point value.  The
6978 | operation is performed according to the IEC/IEEE Standard for Binary
6979 | Floating-Point Arithmetic.
6980 *----------------------------------------------------------------------------*/
6981
6982 float128 float128_round_to_int(float128 a, float_status *status)
6983 {
6984     flag aSign;
6985     int32_t aExp;
6986     uint64_t lastBitMask, roundBitsMask;
6987     float128 z;
6988
6989     aExp = extractFloat128Exp( a );
6990     if ( 0x402F <= aExp ) {
6991         if ( 0x406F <= aExp ) {
6992             if (    ( aExp == 0x7FFF )
6993                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6994                ) {
6995                 return propagateFloat128NaN(a, a, status);
6996             }
6997             return a;
6998         }
6999         lastBitMask = 1;
7000         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7001         roundBitsMask = lastBitMask - 1;
7002         z = a;
7003         switch (status->float_rounding_mode) {
7004         case float_round_nearest_even:
7005             if ( lastBitMask ) {
7006                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7007                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7008             }
7009             else {
7010                 if ( (int64_t) z.low < 0 ) {
7011                     ++z.high;
7012                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7013                 }
7014             }
7015             break;
7016         case float_round_ties_away:
7017             if (lastBitMask) {
7018                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7019             } else {
7020                 if ((int64_t) z.low < 0) {
7021                     ++z.high;
7022                 }
7023             }
7024             break;
7025         case float_round_to_zero:
7026             break;
7027         case float_round_up:
7028             if (!extractFloat128Sign(z)) {
7029                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7030             }
7031             break;
7032         case float_round_down:
7033             if (extractFloat128Sign(z)) {
7034                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7035             }
7036             break;
7037         case float_round_to_odd:
7038             /*
7039              * Note that if lastBitMask == 0, the last bit is the lsb
7040              * of high, and roundBitsMask == -1.
7041              */
7042             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7043                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7044             }
7045             break;
7046         default:
7047             abort();
7048         }
7049         z.low &= ~ roundBitsMask;
7050     }
7051     else {
7052         if ( aExp < 0x3FFF ) {
7053             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7054             status->float_exception_flags |= float_flag_inexact;
7055             aSign = extractFloat128Sign( a );
7056             switch (status->float_rounding_mode) {
7057             case float_round_nearest_even:
7058                 if (    ( aExp == 0x3FFE )
7059                      && (   extractFloat128Frac0( a )
7060                           | extractFloat128Frac1( a ) )
7061                    ) {
7062                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7063                 }
7064                 break;
7065             case float_round_ties_away:
7066                 if (aExp == 0x3FFE) {
7067                     return packFloat128(aSign, 0x3FFF, 0, 0);
7068                 }
7069                 break;
7070             case float_round_down:
7071                 return
7072                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7073                     : packFloat128( 0, 0, 0, 0 );
7074             case float_round_up:
7075                 return
7076                       aSign ? packFloat128( 1, 0, 0, 0 )
7077                     : packFloat128( 0, 0x3FFF, 0, 0 );
7078
7079             case float_round_to_odd:
7080                 return packFloat128(aSign, 0x3FFF, 0, 0);
7081             }
7082             return packFloat128( aSign, 0, 0, 0 );
7083         }
7084         lastBitMask = 1;
7085         lastBitMask <<= 0x402F - aExp;
7086         roundBitsMask = lastBitMask - 1;
7087         z.low = 0;
7088         z.high = a.high;
7089         switch (status->float_rounding_mode) {
7090         case float_round_nearest_even:
7091             z.high += lastBitMask>>1;
7092             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7093                 z.high &= ~ lastBitMask;
7094             }
7095             break;
7096         case float_round_ties_away:
7097             z.high += lastBitMask>>1;
7098             break;
7099         case float_round_to_zero:
7100             break;
7101         case float_round_up:
7102             if (!extractFloat128Sign(z)) {
7103                 z.high |= ( a.low != 0 );
7104                 z.high += roundBitsMask;
7105             }
7106             break;
7107         case float_round_down:
7108             if (extractFloat128Sign(z)) {
7109                 z.high |= (a.low != 0);
7110                 z.high += roundBitsMask;
7111             }
7112             break;
7113         case float_round_to_odd:
7114             if ((z.high & lastBitMask) == 0) {
7115                 z.high |= (a.low != 0);
7116                 z.high += roundBitsMask;
7117             }
7118             break;
7119         default:
7120             abort();
7121         }
7122         z.high &= ~ roundBitsMask;
7123     }
7124     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7125         status->float_exception_flags |= float_flag_inexact;
7126     }
7127     return z;
7128
7129 }
7130
7131 /*----------------------------------------------------------------------------
7132 | Returns the result of adding the absolute values of the quadruple-precision
7133 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7134 | before being returned.  `zSign' is ignored if the result is a NaN.
7135 | The addition is performed according to the IEC/IEEE Standard for Binary
7136 | Floating-Point Arithmetic.
7137 *----------------------------------------------------------------------------*/
7138
7139 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7140                                 float_status *status)
7141 {
7142     int32_t aExp, bExp, zExp;
7143     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7144     int32_t expDiff;
7145
7146     aSig1 = extractFloat128Frac1( a );
7147     aSig0 = extractFloat128Frac0( a );
7148     aExp = extractFloat128Exp( a );
7149     bSig1 = extractFloat128Frac1( b );
7150     bSig0 = extractFloat128Frac0( b );
7151     bExp = extractFloat128Exp( b );
7152     expDiff = aExp - bExp;
7153     if ( 0 < expDiff ) {
7154         if ( aExp == 0x7FFF ) {
7155             if (aSig0 | aSig1) {
7156                 return propagateFloat128NaN(a, b, status);
7157             }
7158             return a;
7159         }
7160         if ( bExp == 0 ) {
7161             --expDiff;
7162         }
7163         else {
7164             bSig0 |= LIT64( 0x0001000000000000 );
7165         }
7166         shift128ExtraRightJamming(
7167             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7168         zExp = aExp;
7169     }
7170     else if ( expDiff < 0 ) {
7171         if ( bExp == 0x7FFF ) {
7172             if (bSig0 | bSig1) {
7173                 return propagateFloat128NaN(a, b, status);
7174             }
7175             return packFloat128( zSign, 0x7FFF, 0, 0 );
7176         }
7177         if ( aExp == 0 ) {
7178             ++expDiff;
7179         }
7180         else {
7181             aSig0 |= LIT64( 0x0001000000000000 );
7182         }
7183         shift128ExtraRightJamming(
7184             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7185         zExp = bExp;
7186     }
7187     else {
7188         if ( aExp == 0x7FFF ) {
7189             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7190                 return propagateFloat128NaN(a, b, status);
7191             }
7192             return a;
7193         }
7194         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7195         if ( aExp == 0 ) {
7196             if (status->flush_to_zero) {
7197                 if (zSig0 | zSig1) {
7198                     float_raise(float_flag_output_denormal, status);
7199                 }
7200                 return packFloat128(zSign, 0, 0, 0);
7201             }
7202             return packFloat128( zSign, 0, zSig0, zSig1 );
7203         }
7204         zSig2 = 0;
7205         zSig0 |= LIT64( 0x0002000000000000 );
7206         zExp = aExp;
7207         goto shiftRight1;
7208     }
7209     aSig0 |= LIT64( 0x0001000000000000 );
7210     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7211     --zExp;
7212     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7213     ++zExp;
7214  shiftRight1:
7215     shift128ExtraRightJamming(
7216         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7217  roundAndPack:
7218     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7219
7220 }
7221
7222 /*----------------------------------------------------------------------------
7223 | Returns the result of subtracting the absolute values of the quadruple-
7224 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7225 | difference is negated before being returned.  `zSign' is ignored if the
7226 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7227 | Standard for Binary Floating-Point Arithmetic.
7228 *----------------------------------------------------------------------------*/
7229
7230 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7231                                 float_status *status)
7232 {
7233     int32_t aExp, bExp, zExp;
7234     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7235     int32_t expDiff;
7236
7237     aSig1 = extractFloat128Frac1( a );
7238     aSig0 = extractFloat128Frac0( a );
7239     aExp = extractFloat128Exp( a );
7240     bSig1 = extractFloat128Frac1( b );
7241     bSig0 = extractFloat128Frac0( b );
7242     bExp = extractFloat128Exp( b );
7243     expDiff = aExp - bExp;
7244     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7245     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7246     if ( 0 < expDiff ) goto aExpBigger;
7247     if ( expDiff < 0 ) goto bExpBigger;
7248     if ( aExp == 0x7FFF ) {
7249         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7250             return propagateFloat128NaN(a, b, status);
7251         }
7252         float_raise(float_flag_invalid, status);
7253         return float128_default_nan(status);
7254     }
7255     if ( aExp == 0 ) {
7256         aExp = 1;
7257         bExp = 1;
7258     }
7259     if ( bSig0 < aSig0 ) goto aBigger;
7260     if ( aSig0 < bSig0 ) goto bBigger;
7261     if ( bSig1 < aSig1 ) goto aBigger;
7262     if ( aSig1 < bSig1 ) goto bBigger;
7263     return packFloat128(status->float_rounding_mode == float_round_down,
7264                         0, 0, 0);
7265  bExpBigger:
7266     if ( bExp == 0x7FFF ) {
7267         if (bSig0 | bSig1) {
7268             return propagateFloat128NaN(a, b, status);
7269         }
7270         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7271     }
7272     if ( aExp == 0 ) {
7273         ++expDiff;
7274     }
7275     else {
7276         aSig0 |= LIT64( 0x4000000000000000 );
7277     }
7278     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7279     bSig0 |= LIT64( 0x4000000000000000 );
7280  bBigger:
7281     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7282     zExp = bExp;
7283     zSign ^= 1;
7284     goto normalizeRoundAndPack;
7285  aExpBigger:
7286     if ( aExp == 0x7FFF ) {
7287         if (aSig0 | aSig1) {
7288             return propagateFloat128NaN(a, b, status);
7289         }
7290         return a;
7291     }
7292     if ( bExp == 0 ) {
7293         --expDiff;
7294     }
7295     else {
7296         bSig0 |= LIT64( 0x4000000000000000 );
7297     }
7298     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7299     aSig0 |= LIT64( 0x4000000000000000 );
7300  aBigger:
7301     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7302     zExp = aExp;
7303  normalizeRoundAndPack:
7304     --zExp;
7305     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7306                                          status);
7307
7308 }
7309
7310 /*----------------------------------------------------------------------------
7311 | Returns the result of adding the quadruple-precision floating-point values
7312 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7313 | for Binary Floating-Point Arithmetic.
7314 *----------------------------------------------------------------------------*/
7315
7316 float128 float128_add(float128 a, float128 b, float_status *status)
7317 {
7318     flag aSign, bSign;
7319
7320     aSign = extractFloat128Sign( a );
7321     bSign = extractFloat128Sign( b );
7322     if ( aSign == bSign ) {
7323         return addFloat128Sigs(a, b, aSign, status);
7324     }
7325     else {
7326         return subFloat128Sigs(a, b, aSign, status);
7327     }
7328
7329 }
7330
7331 /*----------------------------------------------------------------------------
7332 | Returns the result of subtracting the quadruple-precision floating-point
7333 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7334 | Standard for Binary Floating-Point Arithmetic.
7335 *----------------------------------------------------------------------------*/
7336
7337 float128 float128_sub(float128 a, float128 b, float_status *status)
7338 {
7339     flag aSign, bSign;
7340
7341     aSign = extractFloat128Sign( a );
7342     bSign = extractFloat128Sign( b );
7343     if ( aSign == bSign ) {
7344         return subFloat128Sigs(a, b, aSign, status);
7345     }
7346     else {
7347         return addFloat128Sigs(a, b, aSign, status);
7348     }
7349
7350 }
7351
7352 /*----------------------------------------------------------------------------
7353 | Returns the result of multiplying the quadruple-precision floating-point
7354 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7355 | Standard for Binary Floating-Point Arithmetic.
7356 *----------------------------------------------------------------------------*/
7357
7358 float128 float128_mul(float128 a, float128 b, float_status *status)
7359 {
7360     flag aSign, bSign, zSign;
7361     int32_t aExp, bExp, zExp;
7362     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7363
7364     aSig1 = extractFloat128Frac1( a );
7365     aSig0 = extractFloat128Frac0( a );
7366     aExp = extractFloat128Exp( a );
7367     aSign = extractFloat128Sign( a );
7368     bSig1 = extractFloat128Frac1( b );
7369     bSig0 = extractFloat128Frac0( b );
7370     bExp = extractFloat128Exp( b );
7371     bSign = extractFloat128Sign( b );
7372     zSign = aSign ^ bSign;
7373     if ( aExp == 0x7FFF ) {
7374         if (    ( aSig0 | aSig1 )
7375              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7376             return propagateFloat128NaN(a, b, status);
7377         }
7378         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7379         return packFloat128( zSign, 0x7FFF, 0, 0 );
7380     }
7381     if ( bExp == 0x7FFF ) {
7382         if (bSig0 | bSig1) {
7383             return propagateFloat128NaN(a, b, status);
7384         }
7385         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7386  invalid:
7387             float_raise(float_flag_invalid, status);
7388             return float128_default_nan(status);
7389         }
7390         return packFloat128( zSign, 0x7FFF, 0, 0 );
7391     }
7392     if ( aExp == 0 ) {
7393         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7394         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7395     }
7396     if ( bExp == 0 ) {
7397         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7398         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7399     }
7400     zExp = aExp + bExp - 0x4000;
7401     aSig0 |= LIT64( 0x0001000000000000 );
7402     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7403     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7404     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7405     zSig2 |= ( zSig3 != 0 );
7406     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7407         shift128ExtraRightJamming(
7408             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7409         ++zExp;
7410     }
7411     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7412
7413 }
7414
7415 /*----------------------------------------------------------------------------
7416 | Returns the result of dividing the quadruple-precision floating-point value
7417 | `a' by the corresponding value `b'.  The operation is performed according to
7418 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7419 *----------------------------------------------------------------------------*/
7420
7421 float128 float128_div(float128 a, float128 b, float_status *status)
7422 {
7423     flag aSign, bSign, zSign;
7424     int32_t aExp, bExp, zExp;
7425     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7426     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7427
7428     aSig1 = extractFloat128Frac1( a );
7429     aSig0 = extractFloat128Frac0( a );
7430     aExp = extractFloat128Exp( a );
7431     aSign = extractFloat128Sign( a );
7432     bSig1 = extractFloat128Frac1( b );
7433     bSig0 = extractFloat128Frac0( b );
7434     bExp = extractFloat128Exp( b );
7435     bSign = extractFloat128Sign( b );
7436     zSign = aSign ^ bSign;
7437     if ( aExp == 0x7FFF ) {
7438         if (aSig0 | aSig1) {
7439             return propagateFloat128NaN(a, b, status);
7440         }
7441         if ( bExp == 0x7FFF ) {
7442             if (bSig0 | bSig1) {
7443                 return propagateFloat128NaN(a, b, status);
7444             }
7445             goto invalid;
7446         }
7447         return packFloat128( zSign, 0x7FFF, 0, 0 );
7448     }
7449     if ( bExp == 0x7FFF ) {
7450         if (bSig0 | bSig1) {
7451             return propagateFloat128NaN(a, b, status);
7452         }
7453         return packFloat128( zSign, 0, 0, 0 );
7454     }
7455     if ( bExp == 0 ) {
7456         if ( ( bSig0 | bSig1 ) == 0 ) {
7457             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7458  invalid:
7459                 float_raise(float_flag_invalid, status);
7460                 return float128_default_nan(status);
7461             }
7462             float_raise(float_flag_divbyzero, status);
7463             return packFloat128( zSign, 0x7FFF, 0, 0 );
7464         }
7465         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7466     }
7467     if ( aExp == 0 ) {
7468         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7469         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7470     }
7471     zExp = aExp - bExp + 0x3FFD;
7472     shortShift128Left(
7473         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7474     shortShift128Left(
7475         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7476     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7477         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7478         ++zExp;
7479     }
7480     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7481     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7482     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7483     while ( (int64_t) rem0 < 0 ) {
7484         --zSig0;
7485         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7486     }
7487     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7488     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7489         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7490         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7491         while ( (int64_t) rem1 < 0 ) {
7492             --zSig1;
7493             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7494         }
7495         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7496     }
7497     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7498     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7499
7500 }
7501
7502 /*----------------------------------------------------------------------------
7503 | Returns the remainder of the quadruple-precision floating-point value `a'
7504 | with respect to the corresponding value `b'.  The operation is performed
7505 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7506 *----------------------------------------------------------------------------*/
7507
7508 float128 float128_rem(float128 a, float128 b, float_status *status)
7509 {
7510     flag aSign, zSign;
7511     int32_t aExp, bExp, expDiff;
7512     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7513     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7514     int64_t sigMean0;
7515
7516     aSig1 = extractFloat128Frac1( a );
7517     aSig0 = extractFloat128Frac0( a );
7518     aExp = extractFloat128Exp( a );
7519     aSign = extractFloat128Sign( a );
7520     bSig1 = extractFloat128Frac1( b );
7521     bSig0 = extractFloat128Frac0( b );
7522     bExp = extractFloat128Exp( b );
7523     if ( aExp == 0x7FFF ) {
7524         if (    ( aSig0 | aSig1 )
7525              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7526             return propagateFloat128NaN(a, b, status);
7527         }
7528         goto invalid;
7529     }
7530     if ( bExp == 0x7FFF ) {
7531         if (bSig0 | bSig1) {
7532             return propagateFloat128NaN(a, b, status);
7533         }
7534         return a;
7535     }
7536     if ( bExp == 0 ) {
7537         if ( ( bSig0 | bSig1 ) == 0 ) {
7538  invalid:
7539             float_raise(float_flag_invalid, status);
7540             return float128_default_nan(status);
7541         }
7542         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7543     }
7544     if ( aExp == 0 ) {
7545         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7546         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7547     }
7548     expDiff = aExp - bExp;
7549     if ( expDiff < -1 ) return a;
7550     shortShift128Left(
7551         aSig0 | LIT64( 0x0001000000000000 ),
7552         aSig1,
7553         15 - ( expDiff < 0 ),
7554         &aSig0,
7555         &aSig1
7556     );
7557     shortShift128Left(
7558         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7559     q = le128( bSig0, bSig1, aSig0, aSig1 );
7560     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7561     expDiff -= 64;
7562     while ( 0 < expDiff ) {
7563         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7564         q = ( 4 < q ) ? q - 4 : 0;
7565         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7566         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7567         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7568         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7569         expDiff -= 61;
7570     }
7571     if ( -64 < expDiff ) {
7572         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7573         q = ( 4 < q ) ? q - 4 : 0;
7574         q >>= - expDiff;
7575         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7576         expDiff += 52;
7577         if ( expDiff < 0 ) {
7578             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7579         }
7580         else {
7581             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7582         }
7583         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7584         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7585     }
7586     else {
7587         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7588         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7589     }
7590     do {
7591         alternateASig0 = aSig0;
7592         alternateASig1 = aSig1;
7593         ++q;
7594         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7595     } while ( 0 <= (int64_t) aSig0 );
7596     add128(
7597         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7598     if (    ( sigMean0 < 0 )
7599          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7600         aSig0 = alternateASig0;
7601         aSig1 = alternateASig1;
7602     }
7603     zSign = ( (int64_t) aSig0 < 0 );
7604     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7605     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7606                                          status);
7607 }
7608
7609 /*----------------------------------------------------------------------------
7610 | Returns the square root of the quadruple-precision floating-point value `a'.
7611 | The operation is performed according to the IEC/IEEE Standard for Binary
7612 | Floating-Point Arithmetic.
7613 *----------------------------------------------------------------------------*/
7614
7615 float128 float128_sqrt(float128 a, float_status *status)
7616 {
7617     flag aSign;
7618     int32_t aExp, zExp;
7619     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7620     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7621
7622     aSig1 = extractFloat128Frac1( a );
7623     aSig0 = extractFloat128Frac0( a );
7624     aExp = extractFloat128Exp( a );
7625     aSign = extractFloat128Sign( a );
7626     if ( aExp == 0x7FFF ) {
7627         if (aSig0 | aSig1) {
7628             return propagateFloat128NaN(a, a, status);
7629         }
7630         if ( ! aSign ) return a;
7631         goto invalid;
7632     }
7633     if ( aSign ) {
7634         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7635  invalid:
7636         float_raise(float_flag_invalid, status);
7637         return float128_default_nan(status);
7638     }
7639     if ( aExp == 0 ) {
7640         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7641         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7642     }
7643     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7644     aSig0 |= LIT64( 0x0001000000000000 );
7645     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7646     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7647     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7648     doubleZSig0 = zSig0<<1;
7649     mul64To128( zSig0, zSig0, &term0, &term1 );
7650     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7651     while ( (int64_t) rem0 < 0 ) {
7652         --zSig0;
7653         doubleZSig0 -= 2;
7654         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7655     }
7656     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7657     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7658         if ( zSig1 == 0 ) zSig1 = 1;
7659         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7660         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7661         mul64To128( zSig1, zSig1, &term2, &term3 );
7662         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7663         while ( (int64_t) rem1 < 0 ) {
7664             --zSig1;
7665             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7666             term3 |= 1;
7667             term2 |= doubleZSig0;
7668             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7669         }
7670         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7671     }
7672     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7673     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7674
7675 }
7676
7677 /*----------------------------------------------------------------------------
7678 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7679 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7680 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7681 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7682 *----------------------------------------------------------------------------*/
7683
7684 int float128_eq(float128 a, float128 b, float_status *status)
7685 {
7686
7687     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7688               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7689          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7690               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7691        ) {
7692         float_raise(float_flag_invalid, status);
7693         return 0;
7694     }
7695     return
7696            ( a.low == b.low )
7697         && (    ( a.high == b.high )
7698              || (    ( a.low == 0 )
7699                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7700            );
7701
7702 }
7703
7704 /*----------------------------------------------------------------------------
7705 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7706 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7707 | exception is raised if either operand is a NaN.  The comparison is performed
7708 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7709 *----------------------------------------------------------------------------*/
7710
7711 int float128_le(float128 a, float128 b, float_status *status)
7712 {
7713     flag aSign, bSign;
7714
7715     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7716               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7717          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7718               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7719        ) {
7720         float_raise(float_flag_invalid, status);
7721         return 0;
7722     }
7723     aSign = extractFloat128Sign( a );
7724     bSign = extractFloat128Sign( b );
7725     if ( aSign != bSign ) {
7726         return
7727                aSign
7728             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7729                  == 0 );
7730     }
7731     return
7732           aSign ? le128( b.high, b.low, a.high, a.low )
7733         : le128( a.high, a.low, b.high, b.low );
7734
7735 }
7736
7737 /*----------------------------------------------------------------------------
7738 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7739 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7740 | raised if either operand is a NaN.  The comparison is performed according
7741 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7742 *----------------------------------------------------------------------------*/
7743
7744 int float128_lt(float128 a, float128 b, float_status *status)
7745 {
7746     flag aSign, bSign;
7747
7748     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7749               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7750          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7751               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7752        ) {
7753         float_raise(float_flag_invalid, status);
7754         return 0;
7755     }
7756     aSign = extractFloat128Sign( a );
7757     bSign = extractFloat128Sign( b );
7758     if ( aSign != bSign ) {
7759         return
7760                aSign
7761             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7762                  != 0 );
7763     }
7764     return
7765           aSign ? lt128( b.high, b.low, a.high, a.low )
7766         : lt128( a.high, a.low, b.high, b.low );
7767
7768 }
7769
7770 /*----------------------------------------------------------------------------
7771 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7772 | be compared, and 0 otherwise.  The invalid exception is raised if either
7773 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7774 | Standard for Binary Floating-Point Arithmetic.
7775 *----------------------------------------------------------------------------*/
7776
7777 int float128_unordered(float128 a, float128 b, float_status *status)
7778 {
7779     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7780               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7781          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7782               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7783        ) {
7784         float_raise(float_flag_invalid, status);
7785         return 1;
7786     }
7787     return 0;
7788 }
7789
7790 /*----------------------------------------------------------------------------
7791 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7792 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7793 | exception.  The comparison is performed according to the IEC/IEEE Standard
7794 | for Binary Floating-Point Arithmetic.
7795 *----------------------------------------------------------------------------*/
7796
7797 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7798 {
7799
7800     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7801               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7802          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7803               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7804        ) {
7805         if (float128_is_signaling_nan(a, status)
7806          || float128_is_signaling_nan(b, status)) {
7807             float_raise(float_flag_invalid, status);
7808         }
7809         return 0;
7810     }
7811     return
7812            ( a.low == b.low )
7813         && (    ( a.high == b.high )
7814              || (    ( a.low == 0 )
7815                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7816            );
7817
7818 }
7819
7820 /*----------------------------------------------------------------------------
7821 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7822 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7823 | cause an exception.  Otherwise, the comparison is performed according to the
7824 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7825 *----------------------------------------------------------------------------*/
7826
7827 int float128_le_quiet(float128 a, float128 b, float_status *status)
7828 {
7829     flag aSign, bSign;
7830
7831     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7832               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7833          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7834               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7835        ) {
7836         if (float128_is_signaling_nan(a, status)
7837          || float128_is_signaling_nan(b, status)) {
7838             float_raise(float_flag_invalid, status);
7839         }
7840         return 0;
7841     }
7842     aSign = extractFloat128Sign( a );
7843     bSign = extractFloat128Sign( b );
7844     if ( aSign != bSign ) {
7845         return
7846                aSign
7847             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7848                  == 0 );
7849     }
7850     return
7851           aSign ? le128( b.high, b.low, a.high, a.low )
7852         : le128( a.high, a.low, b.high, b.low );
7853
7854 }
7855
7856 /*----------------------------------------------------------------------------
7857 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7858 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7859 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7860 | Standard for Binary Floating-Point Arithmetic.
7861 *----------------------------------------------------------------------------*/
7862
7863 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7864 {
7865     flag aSign, bSign;
7866
7867     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7868               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7869          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7870               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7871        ) {
7872         if (float128_is_signaling_nan(a, status)
7873          || float128_is_signaling_nan(b, status)) {
7874             float_raise(float_flag_invalid, status);
7875         }
7876         return 0;
7877     }
7878     aSign = extractFloat128Sign( a );
7879     bSign = extractFloat128Sign( b );
7880     if ( aSign != bSign ) {
7881         return
7882                aSign
7883             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7884                  != 0 );
7885     }
7886     return
7887           aSign ? lt128( b.high, b.low, a.high, a.low )
7888         : lt128( a.high, a.low, b.high, b.low );
7889
7890 }
7891
7892 /*----------------------------------------------------------------------------
7893 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7894 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7895 | comparison is performed according to the IEC/IEEE Standard for Binary
7896 | Floating-Point Arithmetic.
7897 *----------------------------------------------------------------------------*/
7898
7899 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7900 {
7901     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7902               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7903          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7904               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7905        ) {
7906         if (float128_is_signaling_nan(a, status)
7907          || float128_is_signaling_nan(b, status)) {
7908             float_raise(float_flag_invalid, status);
7909         }
7910         return 1;
7911     }
7912     return 0;
7913 }
7914
7915 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7916                                             int is_quiet, float_status *status)
7917 {
7918     flag aSign, bSign;
7919
7920     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7921         float_raise(float_flag_invalid, status);
7922         return float_relation_unordered;
7923     }
7924     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7925           ( extractFloatx80Frac( a )<<1 ) ) ||
7926         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7927           ( extractFloatx80Frac( b )<<1 ) )) {
7928         if (!is_quiet ||
7929             floatx80_is_signaling_nan(a, status) ||
7930             floatx80_is_signaling_nan(b, status)) {
7931             float_raise(float_flag_invalid, status);
7932         }
7933         return float_relation_unordered;
7934     }
7935     aSign = extractFloatx80Sign( a );
7936     bSign = extractFloatx80Sign( b );
7937     if ( aSign != bSign ) {
7938
7939         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7940              ( ( a.low | b.low ) == 0 ) ) {
7941             /* zero case */
7942             return float_relation_equal;
7943         } else {
7944             return 1 - (2 * aSign);
7945         }
7946     } else {
7947         if (a.low == b.low && a.high == b.high) {
7948             return float_relation_equal;
7949         } else {
7950             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7951         }
7952     }
7953 }
7954
7955 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7956 {
7957     return floatx80_compare_internal(a, b, 0, status);
7958 }
7959
7960 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7961 {
7962     return floatx80_compare_internal(a, b, 1, status);
7963 }
7964
7965 static inline int float128_compare_internal(float128 a, float128 b,
7966                                             int is_quiet, float_status *status)
7967 {
7968     flag aSign, bSign;
7969
7970     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7971           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7972         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7973           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7974         if (!is_quiet ||
7975             float128_is_signaling_nan(a, status) ||
7976             float128_is_signaling_nan(b, status)) {
7977             float_raise(float_flag_invalid, status);
7978         }
7979         return float_relation_unordered;
7980     }
7981     aSign = extractFloat128Sign( a );
7982     bSign = extractFloat128Sign( b );
7983     if ( aSign != bSign ) {
7984         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7985             /* zero case */
7986             return float_relation_equal;
7987         } else {
7988             return 1 - (2 * aSign);
7989         }
7990     } else {
7991         if (a.low == b.low && a.high == b.high) {
7992             return float_relation_equal;
7993         } else {
7994             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7995         }
7996     }
7997 }
7998
7999 int float128_compare(float128 a, float128 b, float_status *status)
8000 {
8001     return float128_compare_internal(a, b, 0, status);
8002 }
8003
8004 int float128_compare_quiet(float128 a, float128 b, float_status *status)
8005 {
8006     return float128_compare_internal(a, b, 1, status);
8007 }
8008
8009 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
8010 {
8011     flag aSign;
8012     int32_t aExp;
8013     uint64_t aSig;
8014
8015     if (floatx80_invalid_encoding(a)) {
8016         float_raise(float_flag_invalid, status);
8017         return floatx80_default_nan(status);
8018     }
8019     aSig = extractFloatx80Frac( a );
8020     aExp = extractFloatx80Exp( a );
8021     aSign = extractFloatx80Sign( a );
8022
8023     if ( aExp == 0x7FFF ) {
8024         if ( aSig<<1 ) {
8025             return propagateFloatx80NaN(a, a, status);
8026         }
8027         return a;
8028     }
8029
8030     if (aExp == 0) {
8031         if (aSig == 0) {
8032             return a;
8033         }
8034         aExp++;
8035     }
8036
8037     if (n > 0x10000) {
8038         n = 0x10000;
8039     } else if (n < -0x10000) {
8040         n = -0x10000;
8041     }
8042
8043     aExp += n;
8044     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8045                                          aSign, aExp, aSig, 0, status);
8046 }
8047
8048 float128 float128_scalbn(float128 a, int n, float_status *status)
8049 {
8050     flag aSign;
8051     int32_t aExp;
8052     uint64_t aSig0, aSig1;
8053
8054     aSig1 = extractFloat128Frac1( a );
8055     aSig0 = extractFloat128Frac0( a );
8056     aExp = extractFloat128Exp( a );
8057     aSign = extractFloat128Sign( a );
8058     if ( aExp == 0x7FFF ) {
8059         if ( aSig0 | aSig1 ) {
8060             return propagateFloat128NaN(a, a, status);
8061         }
8062         return a;
8063     }
8064     if (aExp != 0) {
8065         aSig0 |= LIT64( 0x0001000000000000 );
8066     } else if (aSig0 == 0 && aSig1 == 0) {
8067         return a;
8068     } else {
8069         aExp++;
8070     }
8071
8072     if (n > 0x10000) {
8073         n = 0x10000;
8074     } else if (n < -0x10000) {
8075         n = -0x10000;
8076     }
8077
8078     aExp += n - 1;
8079     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8080                                          , status);
8081
8082 }
8083
8084 static void __attribute__((constructor)) softfloat_init(void)
8085 {
8086     union_float64 ua, ub, uc, ur;
8087
8088     if (QEMU_NO_HARDFLOAT) {
8089         return;
8090     }
8091     /*
8092      * Test that the host's FMA is not obviously broken. For example,
8093      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8094      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8095      */
8096     ua.s = 0x0020000000000001ULL;
8097     ub.s = 0x3ca0000000000000ULL;
8098     uc.s = 0x0020000000000000ULL;
8099     ur.h = fma(ua.h, ub.h, uc.h);
8100     if (ur.s != 0x0020000000000001ULL) {
8101         force_soft_fma = true;
8102     }
8103 }