fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include <math.h>
  87 #include "qemu/bitops.h"
  88 #include "fpu/softfloat.h"
  89
  90 /* We only need stdlib for abort() */
  91
  92 /*----------------------------------------------------------------------------
  93 | Primitive arithmetic functions, including multi-word arithmetic, and
  94 | division and square root approximations.  (Can be specialized to target if
  95 | desired.)
  96 *----------------------------------------------------------------------------*/
  97 #include "fpu/softfloat-macros.h"
  98
  99 /*
 100  * Hardfloat
 101  *
 102  * Fast emulation of guest FP instructions is challenging for two reasons.
 103  * First, FP instruction semantics are similar but not identical, particularly
 104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
 105  * exception flags is not trivial: reading the host's flags register with a
 106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 107  * and trapping on every FP exception is not fast nor pleasant to work with.
 108  *
 109  * We address these challenges by leveraging the host FPU for a subset of the
 110  * operations. To do this we expand on the idea presented in this paper:
 111  *
 112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 114  *
 115  * The idea is thus to leverage the host FPU to (1) compute FP operations
 116  * and (2) identify whether FP exceptions occurred while avoiding
 117  * expensive exception flag register accesses.
 118  *
 119  * An important optimization shown in the paper is that given that exception
 120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 121  * This is particularly useful for the inexact flag, which is very frequently
 122  * raised in floating-point workloads.
 123  *
 124  * We optimize the code further by deferring to soft-fp whenever FP exception
 125  * detection might get hairy. Two examples: (1) when at least one operand is
 126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 127  * and the result is < the minimum normal.
 128  */
 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
 130     static inline void name(soft_t *a, float_status *s)                 \
 131     {                                                                   \
 132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
 133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
 134                                      soft_t ## _is_neg(*a));            \
 135             float_raise(float_flag_input_denormal, s);                  \
 136         }                                                               \
 137     }
 138
 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
 141 #undef GEN_INPUT_FLUSH__NOCHECK
 142
 143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
 144     static inline void name(soft_t *a, float_status *s) \
 145     {                                                   \
 146         if (likely(!s->flush_inputs_to_zero)) {         \
 147             return;                                     \
 148         }                                               \
 149         soft_t ## _input_flush__nocheck(a, s);          \
 150     }
 151
 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
 154 #undef GEN_INPUT_FLUSH1
 155
 156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
 157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
 158     {                                                                   \
 159         if (likely(!s->flush_inputs_to_zero)) {                         \
 160             return;                                                     \
 161         }                                                               \
 162         soft_t ## _input_flush__nocheck(a, s);                          \
 163         soft_t ## _input_flush__nocheck(b, s);                          \
 164     }
 165
 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
 168 #undef GEN_INPUT_FLUSH2
 169
 170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
 171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
 172     {                                                                   \
 173         if (likely(!s->flush_inputs_to_zero)) {                         \
 174             return;                                                     \
 175         }                                                               \
 176         soft_t ## _input_flush__nocheck(a, s);                          \
 177         soft_t ## _input_flush__nocheck(b, s);                          \
 178         soft_t ## _input_flush__nocheck(c, s);                          \
 179     }
 180
 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
 183 #undef GEN_INPUT_FLUSH3
 184
 185 /*
 186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
 187  * hardfloat functions. Each combination of number of inputs and float size
 188  * gets its own value.
 189  */
 190 #if defined(__x86_64__)
 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
 197 #else
 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
 204 #endif
 205
 206 /*
 207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 208  * float{32,64}_is_infinity when !USE_FP.
 209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 211  */
 212 #if defined(__x86_64__) || defined(__aarch64__)
 213 # define QEMU_HARDFLOAT_USE_ISINF   1
 214 #else
 215 # define QEMU_HARDFLOAT_USE_ISINF   0
 216 #endif
 217
 218 /*
 219  * Some targets clear the FP flags before most FP operations. This prevents
 220  * the use of hardfloat, since hardfloat relies on the inexact flag being
 221  * already set.
 222  */
 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
 224 # if defined(__FAST_MATH__)
 225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
 226     IEEE implementation
 227 # endif
 228 # define QEMU_NO_HARDFLOAT 1
 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
 230 #else
 231 # define QEMU_NO_HARDFLOAT 0
 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
 233 #endif
 234
 235 static inline bool can_use_fpu(const float_status *s)
 236 {
 237     if (QEMU_NO_HARDFLOAT) {
 238         return false;
 239     }
 240     return likely(s->float_exception_flags & float_flag_inexact &&
 241                   s->float_rounding_mode == float_round_nearest_even);
 242 }
 243
 244 /*
 245  * Hardfloat generation functions. Each operation can have two flavors:
 246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 247  * most condition checks, or native ones (e.g. fpclassify).
 248  *
 249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
 250  * compiler to propagate constants and inline everything into the callers.
 251  *
 252  * We only generate functions for operations with two inputs, since only
 253  * these are common enough to justify consolidating them into common code.
 254  */
 255
 256 typedef union {
 257     float32 s;
 258     float h;
 259 } union_float32;
 260
 261 typedef union {
 262     float64 s;
 263     double h;
 264 } union_float64;
 265
 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
 268
 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
 271 typedef float   (*hard_f32_op2_fn)(float a, float b);
 272 typedef double  (*hard_f64_op2_fn)(double a, double b);
 273
 274 /* 2-input is-zero-or-normal */
 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
 276 {
 277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
 278         /*
 279          * Not using a temp variable for consecutive fpclassify calls ends up
 280          * generating faster code.
 281          */
 282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 284     }
 285     return float32_is_zero_or_normal(a.s) &&
 286            float32_is_zero_or_normal(b.s);
 287 }
 288
 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
 290 {
 291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
 292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
 294     }
 295     return float64_is_zero_or_normal(a.s) &&
 296            float64_is_zero_or_normal(b.s);
 297 }
 298
 299 /* 3-input is-zero-or-normal */
 300 static inline
 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
 302 {
 303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
 304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 307     }
 308     return float32_is_zero_or_normal(a.s) &&
 309            float32_is_zero_or_normal(b.s) &&
 310            float32_is_zero_or_normal(c.s);
 311 }
 312
 313 static inline
 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
 315 {
 316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
 317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
 318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
 319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
 320     }
 321     return float64_is_zero_or_normal(a.s) &&
 322            float64_is_zero_or_normal(b.s) &&
 323            float64_is_zero_or_normal(c.s);
 324 }
 325
 326 static inline bool f32_is_inf(union_float32 a)
 327 {
 328     if (QEMU_HARDFLOAT_USE_ISINF) {
 329         return isinf(a.h);
 330     }
 331     return float32_is_infinity(a.s);
 332 }
 333
 334 static inline bool f64_is_inf(union_float64 a)
 335 {
 336     if (QEMU_HARDFLOAT_USE_ISINF) {
 337         return isinf(a.h);
 338     }
 339     return float64_is_infinity(a.s);
 340 }
 341
 342 static inline float32
 343 float32_gen2(float32 xa, float32 xb, float_status *s,
 344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
 345              f32_check_fn pre, f32_check_fn post)
 346 {
 347     union_float32 ua, ub, ur;
 348
 349     ua.s = xa;
 350     ub.s = xb;
 351
 352     if (unlikely(!can_use_fpu(s))) {
 353         goto soft;
 354     }
 355
 356     float32_input_flush2(&ua.s, &ub.s, s);
 357     if (unlikely(!pre(ua, ub))) {
 358         goto soft;
 359     }
 360
 361     ur.h = hard(ua.h, ub.h);
 362     if (unlikely(f32_is_inf(ur))) {
 363         float_raise(float_flag_overflow, s);
 364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
 365         goto soft;
 366     }
 367     return ur.s;
 368
 369  soft:
 370     return soft(ua.s, ub.s, s);
 371 }
 372
 373 static inline float64
 374 float64_gen2(float64 xa, float64 xb, float_status *s,
 375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
 376              f64_check_fn pre, f64_check_fn post)
 377 {
 378     union_float64 ua, ub, ur;
 379
 380     ua.s = xa;
 381     ub.s = xb;
 382
 383     if (unlikely(!can_use_fpu(s))) {
 384         goto soft;
 385     }
 386
 387     float64_input_flush2(&ua.s, &ub.s, s);
 388     if (unlikely(!pre(ua, ub))) {
 389         goto soft;
 390     }
 391
 392     ur.h = hard(ua.h, ub.h);
 393     if (unlikely(f64_is_inf(ur))) {
 394         float_raise(float_flag_overflow, s);
 395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
 396         goto soft;
 397     }
 398     return ur.s;
 399
 400  soft:
 401     return soft(ua.s, ub.s, s);
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the fraction bits of the single-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 static inline uint32_t extractFloat32Frac(float32 a)
 409 {
 410     return float32_val(a) & 0x007FFFFF;
 411 }
 412
 413 /*----------------------------------------------------------------------------
 414 | Returns the exponent bits of the single-precision floating-point value `a'.
 415 *----------------------------------------------------------------------------*/
 416
 417 static inline int extractFloat32Exp(float32 a)
 418 {
 419     return (float32_val(a) >> 23) & 0xFF;
 420 }
 421
 422 /*----------------------------------------------------------------------------
 423 | Returns the sign bit of the single-precision floating-point value `a'.
 424 *----------------------------------------------------------------------------*/
 425
 426 static inline bool extractFloat32Sign(float32 a)
 427 {
 428     return float32_val(a) >> 31;
 429 }
 430
 431 /*----------------------------------------------------------------------------
 432 | Returns the fraction bits of the double-precision floating-point value `a'.
 433 *----------------------------------------------------------------------------*/
 434
 435 static inline uint64_t extractFloat64Frac(float64 a)
 436 {
 437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
 438 }
 439
 440 /*----------------------------------------------------------------------------
 441 | Returns the exponent bits of the double-precision floating-point value `a'.
 442 *----------------------------------------------------------------------------*/
 443
 444 static inline int extractFloat64Exp(float64 a)
 445 {
 446     return (float64_val(a) >> 52) & 0x7FF;
 447 }
 448
 449 /*----------------------------------------------------------------------------
 450 | Returns the sign bit of the double-precision floating-point value `a'.
 451 *----------------------------------------------------------------------------*/
 452
 453 static inline bool extractFloat64Sign(float64 a)
 454 {
 455     return float64_val(a) >> 63;
 456 }
 457
 458 /*
 459  * Classify a floating point number. Everything above float_class_qnan
 460  * is a NaN so cls >= float_class_qnan is any NaN.
 461  */
 462
 463 typedef enum __attribute__ ((__packed__)) {
 464     float_class_unclassified,
 465     float_class_zero,
 466     float_class_normal,
 467     float_class_inf,
 468     float_class_qnan,  /* all NaNs from here */
 469     float_class_snan,
 470 } FloatClass;
 471
 472 #define float_cmask(bit)  (1u << (bit))
 473
 474 enum {
 475     float_cmask_zero    = float_cmask(float_class_zero),
 476     float_cmask_normal  = float_cmask(float_class_normal),
 477     float_cmask_inf     = float_cmask(float_class_inf),
 478     float_cmask_qnan    = float_cmask(float_class_qnan),
 479     float_cmask_snan    = float_cmask(float_class_snan),
 480
 481     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
 482     float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
 483 };
 484
 485
 486 /* Simple helpers for checking if, or what kind of, NaN we have */
 487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
 488 {
 489     return unlikely(c >= float_class_qnan);
 490 }
 491
 492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
 493 {
 494     return c == float_class_snan;
 495 }
 496
 497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 498 {
 499     return c == float_class_qnan;
 500 }
 501
 502 /*
 503  * Structure holding all of the decomposed parts of a float.
 504  * The exponent is unbiased and the fraction is normalized.
 505  *
 506  * The fraction words are stored in big-endian word ordering,
 507  * so that truncation from a larger format to a smaller format
 508  * can be done simply by ignoring subsequent elements.
 509  */
 510
 511 typedef struct {
 512     FloatClass cls;
 513     bool sign;
 514     int32_t exp;
 515     union {
 516         /* Routines that know the structure may reference the singular name. */
 517         uint64_t frac;
 518         /*
 519          * Routines expanded with multiple structures reference "hi" and "lo"
 520          * depending on the operation.  In FloatParts64, "hi" and "lo" are
 521          * both the same word and aliased here.
 522          */
 523         uint64_t frac_hi;
 524         uint64_t frac_lo;
 525     };
 526 } FloatParts64;
 527
 528 typedef struct {
 529     FloatClass cls;
 530     bool sign;
 531     int32_t exp;
 532     uint64_t frac_hi;
 533     uint64_t frac_lo;
 534 } FloatParts128;
 535
 536 /* These apply to the most significant word of each FloatPartsN. */
 537 #define DECOMPOSED_BINARY_POINT    63
 538 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 539
 540 /* Structure holding all of the relevant parameters for a format.
 541  *   exp_size: the size of the exponent field
 542  *   exp_bias: the offset applied to the exponent field
 543  *   exp_max: the maximum normalised exponent
 544  *   frac_size: the size of the fraction field
 545  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
  546  * The following are computed based on the size of the fraction
 547  *   frac_lsb: least significant bit of fraction
 548  *   frac_lsbm1: the bit below the least significant bit (for rounding)
 549  *   round_mask/roundeven_mask: masks used for rounding
 550  * The following optional modifiers are available:
 551  *   arm_althp: handle ARM Alternative Half Precision
 552  */
 553 typedef struct {
 554     int exp_size;
 555     int exp_bias;
 556     int exp_max;
 557     int frac_size;
 558     int frac_shift;
 559     uint64_t frac_lsb;
 560     uint64_t frac_lsbm1;
 561     uint64_t round_mask;
 562     uint64_t roundeven_mask;
 563     bool arm_althp;
 564 } FloatFmt;
 565
 566 /* Expand fields based on the size of exponent and fraction */
 567 #define FLOAT_PARAMS(E, F)                                           \
 568     .exp_size       = E,                                             \
 569     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 570     .exp_max        = (1 << E) - 1,                                  \
 571     .frac_size      = F,                                             \
 572     .frac_shift     = (-F - 1) & 63,                                 \
 573     .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
 574     .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
 575     .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
 576     .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
 577
 578 static const FloatFmt float16_params = {
 579     FLOAT_PARAMS(5, 10)
 580 };
 581
 582 static const FloatFmt float16_params_ahp = {
 583     FLOAT_PARAMS(5, 10),
 584     .arm_althp = true
 585 };
 586
 587 static const FloatFmt bfloat16_params = {
 588     FLOAT_PARAMS(8, 7)
 589 };
 590
 591 static const FloatFmt float32_params = {
 592     FLOAT_PARAMS(8, 23)
 593 };
 594
 595 static const FloatFmt float64_params = {
 596     FLOAT_PARAMS(11, 52)
 597 };
 598
 599 static const FloatFmt float128_params = {
 600     FLOAT_PARAMS(15, 112)
 601 };
 602
 603 /* Unpack a float to parts, but do not canonicalize.  */
 604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
 605 {
 606     const int f_size = fmt->frac_size;
 607     const int e_size = fmt->exp_size;
 608
 609     *r = (FloatParts64) {
 610         .cls = float_class_unclassified,
 611         .sign = extract64(raw, f_size + e_size, 1),
 612         .exp = extract64(raw, f_size, e_size),
 613         .frac = extract64(raw, 0, f_size)
 614     };
 615 }
 616
 617 static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
 618 {
 619     unpack_raw64(p, &float16_params, f);
 620 }
 621
 622 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
 623 {
 624     unpack_raw64(p, &bfloat16_params, f);
 625 }
 626
 627 static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
 628 {
 629     unpack_raw64(p, &float32_params, f);
 630 }
 631
 632 static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
 633 {
 634     unpack_raw64(p, &float64_params, f);
 635 }
 636
 637 static void float128_unpack_raw(FloatParts128 *p, float128 f)
 638 {
 639     const int f_size = float128_params.frac_size - 64;
 640     const int e_size = float128_params.exp_size;
 641
 642     *p = (FloatParts128) {
 643         .cls = float_class_unclassified,
 644         .sign = extract64(f.high, f_size + e_size, 1),
 645         .exp = extract64(f.high, f_size, e_size),
 646         .frac_hi = extract64(f.high, 0, f_size),
 647         .frac_lo = f.low,
 648     };
 649 }
 650
 651 /* Pack a float from parts, but do not canonicalize.  */
 652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
 653 {
 654     const int f_size = fmt->frac_size;
 655     const int e_size = fmt->exp_size;
 656     uint64_t ret;
 657
 658     ret = (uint64_t)p->sign << (f_size + e_size);
 659     ret = deposit64(ret, f_size, e_size, p->exp);
 660     ret = deposit64(ret, 0, f_size, p->frac);
 661     return ret;
 662 }
 663
 664 static inline float16 float16_pack_raw(const FloatParts64 *p)
 665 {
 666     return make_float16(pack_raw64(p, &float16_params));
 667 }
 668
 669 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
 670 {
 671     return pack_raw64(p, &bfloat16_params);
 672 }
 673
 674 static inline float32 float32_pack_raw(const FloatParts64 *p)
 675 {
 676     return make_float32(pack_raw64(p, &float32_params));
 677 }
 678
 679 static inline float64 float64_pack_raw(const FloatParts64 *p)
 680 {
 681     return make_float64(pack_raw64(p, &float64_params));
 682 }
 683
 684 static float128 float128_pack_raw(const FloatParts128 *p)
 685 {
 686     const int f_size = float128_params.frac_size - 64;
 687     const int e_size = float128_params.exp_size;
 688     uint64_t hi;
 689
 690     hi = (uint64_t)p->sign << (f_size + e_size);
 691     hi = deposit64(hi, f_size, e_size, p->exp);
 692     hi = deposit64(hi, 0, f_size, p->frac_hi);
 693     return make_float128(hi, p->frac_lo);
 694 }
 695
 696 /*----------------------------------------------------------------------------
 697 | Functions and definitions to determine:  (1) whether tininess for underflow
 698 | is detected before or after rounding by default, (2) what (if anything)
 699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 701 | are propagated from function inputs to output.  These details are target-
 702 | specific.
 703 *----------------------------------------------------------------------------*/
 704 #include "softfloat-specialize.c.inc"
 705
 706 #define PARTS_GENERIC_64_128(NAME, P) \
 707     QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
 708
 709 #define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
 710 #define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)
 711
 712 static void parts64_return_nan(FloatParts64 *a, float_status *s);
 713 static void parts128_return_nan(FloatParts128 *a, float_status *s);
 714
 715 #define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)
 716
 717 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
 718                                       float_status *s);
 719 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
 720                                         float_status *s);
 721
 722 #define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
 723
 724 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
 725                                              FloatParts64 *c, float_status *s,
 726                                              int ab_mask, int abc_mask);
 727 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
 728                                                FloatParts128 *b,
 729                                                FloatParts128 *c,
 730                                                float_status *s,
 731                                                int ab_mask, int abc_mask);
 732
 733 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
 734     PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
 735
 736 static void parts64_canonicalize(FloatParts64 *p, float_status *status,
 737                                  const FloatFmt *fmt);
 738 static void parts128_canonicalize(FloatParts128 *p, float_status *status,
 739                                   const FloatFmt *fmt);
 740
 741 #define parts_canonicalize(A, S, F) \
 742     PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)
 743
 744 /*
 745  * Helper functions for softfloat-parts.c.inc, per-size operations.
 746  */
 747
 748 #define FRAC_GENERIC_64_128(NAME, P) \
 749     QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
 750
 751 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
 752 {
 753     return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
 754 }
 755
 756 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
 757 {
 758     uint64_t ta = a->frac_hi, tb = b->frac_hi;
 759     if (ta == tb) {
 760         ta = a->frac_lo, tb = b->frac_lo;
 761         if (ta == tb) {
 762             return 0;
 763         }
 764     }
 765     return ta < tb ? -1 : 1;
 766 }
 767
 768 #define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)
 769
 770 static void frac64_clear(FloatParts64 *a)
 771 {
 772     a->frac = 0;
 773 }
 774
 775 static void frac128_clear(FloatParts128 *a)
 776 {
 777     a->frac_hi = a->frac_lo = 0;
 778 }
 779
 780 #define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)
 781
 782 static bool frac64_eqz(FloatParts64 *a)
 783 {
 784     return a->frac == 0;
 785 }
 786
 787 static bool frac128_eqz(FloatParts128 *a)
 788 {
 789     return (a->frac_hi | a->frac_lo) == 0;
 790 }
 791
 792 #define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)
 793
 794 static int frac64_normalize(FloatParts64 *a)
 795 {
 796     if (a->frac) {
 797         int shift = clz64(a->frac);
 798         a->frac <<= shift;
 799         return shift;
 800     }
 801     return 64;
 802 }
 803
 804 static int frac128_normalize(FloatParts128 *a)
 805 {
 806     if (a->frac_hi) {
 807         int shl = clz64(a->frac_hi);
 808         if (shl) {
 809             int shr = 64 - shl;
 810             a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
 811             a->frac_lo = (a->frac_lo << shl);
 812         }
 813         return shl;
 814     } else if (a->frac_lo) {
 815         int shl = clz64(a->frac_lo);
 816         a->frac_hi = (a->frac_lo << shl);
 817         a->frac_lo = 0;
 818         return shl + 64;
 819     }
 820     return 128;
 821 }
 822
/* Size-generic normalize: resolves to frac128_normalize or frac64_normalize. */
#define frac_normalize(A)  FRAC_GENERIC_64_128(normalize, A)(A)
 824
 825 static void frac64_shl(FloatParts64 *a, int c)
 826 {
 827     a->frac <<= c;
 828 }
 829
/*
 * Shift the 128-bit fraction of @a left by @c bits, in place, by
 * delegating to shift128Left().  The valid range of @c is whatever
 * shift128Left() accepts (not visible here).
 */
static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}
 834
/* Size-generic left shift: resolves to frac128_shl or frac64_shl from A's type. */
#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)
 836
 837 static void frac64_shr(FloatParts64 *a, int c)
 838 {
 839     a->frac >>= c;
 840 }
 841
/*
 * Shift the 128-bit fraction of @a right by @c bits, in place, by
 * delegating to shift128Right().  The valid range of @c is whatever
 * shift128Right() accepts (not visible here).
 */
static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}
 846
/* Size-generic right shift: resolves to frac128_shr or frac64_shr from A's type. */
#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)
 848
 849
/*
 * Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * @p:    decomposed value to round (taken and returned by value)
 * @s:    supplies float_rounding_mode, flush_to_zero and
 *        tininess_before_rounding; the accumulated exception flags are
 *        raised on it with float_raise() before returning
 * @parm: description of the destination format (bias, exp_max,
 *        frac_shift, rounding masks, arm_althp)
 *
 * Returns @p with exp and frac replaced by the biased exponent and the
 * right-shifted fraction.  cls may be changed to zero (flush or total
 * underflow) or to inf (overflow).
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, choose:
         *   inc           - the value added to frac before the low bits
         *                   are shifted away;
         *   overflow_norm - on overflow, return the largest finite value
         *                   (true) instead of infinity (false).
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* No increment when the bits under roundeven_mask equal frac_lsbm1. */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            /* Only increment when the lsb is currently clear. */
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is not subnormal. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                if (uadd64_overflow(frac, inc, &frac)) {
                    /* Carry out of the top bit: renormalize. */
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Note: plain assignment, so any earlier flags are dropped. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /*
                     * Largest finite value: all-ones fraction (presumably
                     * truncated to the field width when packed - confirm).
                     */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Would be subnormal, but the status requests flushing to zero. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Subnormal (or zero) result. */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                /*
                 * exp == 0 with tininess detected after rounding: the
                 * value is tiny unless adding inc carries out of the
                 * top bit.
                 */
                uint64_t discard;
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            /* Denormalize, keeping a sticky bit for what is shifted out. */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    /* Other modes keep the inc computed above. */
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have carried into the implicit bit. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* The arm_althp format is asserted never to reach this case. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* NaNs keep their fraction, only shifted into position. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
1003
1004
/*
 * Instantiate softfloat-parts.c.inc twice.  For the first inclusion
 * partsN(NAME) expands to parts64_NAME and FloatPartsN is FloatParts64;
 * for the second they become parts128_NAME and FloatParts128.
 */
#define partsN(NAME)   parts64_##NAME
#define FloatPartsN    FloatParts64

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN
#define partsN(NAME)   parts128_##NAME
#define FloatPartsN    FloatParts128

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN
1019
1020 /*
1021  * Pack/unpack routines with a specific FloatFmt.
1022  */
1023
/*
 * Unpack the float16 @f into @p with float16_unpack_raw(), then
 * canonicalize it according to the caller-supplied format @params.
 * @s is passed through to parts_canonicalize().
 */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}
1030
/* Unpack and canonicalize the float16 @f into @p using float16_params. */
static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}
1036
/*
 * Unpack the bfloat16 @f into @p with bfloat16_unpack_raw(), then
 * canonicalize it using bfloat16_params.
 */
static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}
1043
1044 static float16 float16a_round_pack_canonical(FloatParts64 *p,
1045                                              float_status *s,
1046                                              const FloatFmt *params)
1047 {
1048     *p = round_canonical(*p, s, params);
1049     return float16_pack_raw(p);
1050 }
1051
/* Round *@p using float16_params and return the packed float16. */
static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}
1057
1058 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
1059                                               float_status *s)
1060 {
1061     *p = round_canonical(*p, s, &bfloat16_params);
1062     return bfloat16_pack_raw(p);
1063 }
1064
/*
 * Unpack the float32 @f into @p with float32_unpack_raw(), then
 * canonicalize it using float32_params.
 */
static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}
1071
1072 static float32 float32_round_pack_canonical(FloatParts64 *p,
1073                                             float_status *s)
1074 {
1075     *p = round_canonical(*p, s, &float32_params);
1076     return float32_pack_raw(p);
1077 }
1078
/*
 * Unpack the float64 @f into @p with float64_unpack_raw(), then
 * canonicalize it using float64_params.
 */
static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}
1085
1086 static float64 float64_round_pack_canonical(FloatParts64 *p,
1087                                             float_status *s)
1088 {
1089     *p = round_canonical(*p, s, &float64_params);
1090     return float64_pack_raw(p);
1091 }
1092
/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * @a, @b:    canonical (unpacked) operands, taken by value
 * @subtract: when true compute a - b, otherwise a + b
 * @s:        supplies the rounding mode (used for the sign of an exact
 *            zero result) and receives float_flag_invalid for Inf - Inf
 *
 * The result is returned unrounded; callers in this file pass it on to
 * a *_round_pack_canonical() routine.
 */

static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    /* Effective sign of b once the requested operation is folded in. */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Subtract the smaller magnitude from the larger one. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                /* |b| was larger, so the result takes the opposite sign. */
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero is negative only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the leading set bit returns to bit 63. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid. */
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b; in this branch b's effective sign is !a_sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out of bit 63: shift down and restore the top bit. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every class combination returns above. */
    g_assert_not_reached();
}
1182
1183 /*
1184  * Returns the result of adding or subtracting the floating-point
1185  * values `a' and `b'. The operation is performed according to the
1186  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1187  */
1188
1189 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1190 {
1191     FloatParts64 pa, pb, pr;
1192
1193     float16_unpack_canonical(&pa, a, status);
1194     float16_unpack_canonical(&pb, b, status);
1195     pr = addsub_floats(pa, pb, false, status);
1196
1197     return float16_round_pack_canonical(&pr, status);
1198 }
1199
1200 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1201 {
1202     FloatParts64 pa, pb, pr;
1203
1204     float16_unpack_canonical(&pa, a, status);
1205     float16_unpack_canonical(&pb, b, status);
1206     pr = addsub_floats(pa, pb, true, status);
1207
1208     return float16_round_pack_canonical(&pr, status);
1209 }
1210
1211 static float32 QEMU_SOFTFLOAT_ATTR
1212 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1213 {
1214     FloatParts64 pa, pb, pr;
1215
1216     float32_unpack_canonical(&pa, a, status);
1217     float32_unpack_canonical(&pb, b, status);
1218     pr = addsub_floats(pa, pb, subtract, status);
1219
1220     return float32_round_pack_canonical(&pr, status);
1221 }
1222
1223 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1224 {
1225     return soft_f32_addsub(a, b, false, status);
1226 }
1227
1228 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1229 {
1230     return soft_f32_addsub(a, b, true, status);
1231 }
1232
1233 static float64 QEMU_SOFTFLOAT_ATTR
1234 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1235 {
1236     FloatParts64 pa, pb, pr;
1237
1238     float64_unpack_canonical(&pa, a, status);
1239     float64_unpack_canonical(&pb, b, status);
1240     pr = addsub_floats(pa, pb, subtract, status);
1241
1242     return float64_round_pack_canonical(&pr, status);
1243 }
1244
1245 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1246 {
1247     return soft_f64_addsub(a, b, false, status);
1248 }
1249
1250 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1251 {
1252     return soft_f64_addsub(a, b, true, status);
1253 }
1254
/* Host single-precision addition. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1259
/* Host single-precision subtraction (a - b). */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1264
/* Host double-precision addition. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1269
/* Host double-precision subtraction (a - b). */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1274
1275 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1276 {
1277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1278         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1279     }
1280     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1281 }
1282
1283 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1284 {
1285     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1286         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1287     } else {
1288         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1289     }
1290 }
1291
/*
 * Shared front end for float32_add() and float32_sub(): forwards to
 * float32_gen2() with the supplied @hard and @soft implementations,
 * passing f32_is_zon2 and f32_addsubmul_post as its two predicate
 * arguments (their exact roles are defined by float32_gen2()).
 */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}
1298
/*
 * Shared front end for float64_add() and float64_sub(): forwards to
 * float64_gen2() with the supplied @hard and @soft implementations,
 * passing f64_is_zon2 and f64_addsubmul_post as its two predicate
 * arguments (their exact roles are defined by float64_gen2()).
 */
static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
1305
/* float32 addition, offering both hard_f32_add and soft_f32_add to float32_addsub(). */
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}
1311
/* float32 subtraction, offering both hard_f32_sub and soft_f32_sub to float32_addsub(). */
float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}
1317
/* float64 addition, offering both hard_f64_add and soft_f64_add to float64_addsub(). */
float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}
1323
/* float64 subtraction, offering both hard_f64_sub and soft_f64_sub to float64_addsub(). */
float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}
1329
1330 /*
1331  * Returns the result of adding or subtracting the bfloat16
1332  * values `a' and `b'.
1333  */
1334 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1335 {
1336     FloatParts64 pa, pb, pr;
1337
1338     bfloat16_unpack_canonical(&pa, a, status);
1339     bfloat16_unpack_canonical(&pb, b, status);
1340     pr = addsub_floats(pa, pb, false, status);
1341
1342     return bfloat16_round_pack_canonical(&pr, status);
1343 }
1344
1345 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1346 {
1347     FloatParts64 pa, pb, pr;
1348
1349     bfloat16_unpack_canonical(&pa, a, status);
1350     bfloat16_unpack_canonical(&pb, b, status);
1351     pr = addsub_floats(pa, pb, true, status);
1352
1353     return bfloat16_round_pack_canonical(&pr, status);
1354 }
1355
1356 /*
1357  * Returns the result of multiplying the floating-point values `a' and
1358  * `b'. The operation is performed according to the IEC/IEEE Standard
1359  * for Binary Floating-Point Arithmetic.
1360  */
1361
1362 static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1363 {
1364     bool sign = a.sign ^ b.sign;
1365
1366     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1367         uint64_t hi, lo;
1368         int exp = a.exp + b.exp;
1369
1370         mul64To128(a.frac, b.frac, &hi, &lo);
1371         if (hi & DECOMPOSED_IMPLICIT_BIT) {
1372             exp += 1;
1373         } else {
1374             hi <<= 1;
1375         }
1376         hi |= (lo != 0);
1377
1378         /* Re-use a */
1379         a.exp = exp;
1380         a.sign = sign;
1381         a.frac = hi;
1382         return a;
1383     }
1384     /* handle all the NaN cases */
1385     if (is_nan(a.cls) || is_nan(b.cls)) {
1386         return *parts_pick_nan(&a, &b, s);
1387     }
1388     /* Inf * Zero == NaN */
1389     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1390         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1391         float_raise(float_flag_invalid, s);
1392         parts_default_nan(&a, s);
1393         return a;
1394     }
1395     /* Multiply by 0 or Inf */
1396     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1397         a.sign = sign;
1398         return a;
1399     }
1400     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1401         b.sign = sign;
1402         return b;
1403     }
1404     g_assert_not_reached();
1405 }
1406
1407 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1408 {
1409     FloatParts64 pa, pb, pr;
1410
1411     float16_unpack_canonical(&pa, a, status);
1412     float16_unpack_canonical(&pb, b, status);
1413     pr = mul_floats(pa, pb, status);
1414
1415     return float16_round_pack_canonical(&pr, status);
1416 }
1417
1418 static float32 QEMU_SOFTFLOAT_ATTR
1419 soft_f32_mul(float32 a, float32 b, float_status *status)
1420 {
1421     FloatParts64 pa, pb, pr;
1422
1423     float32_unpack_canonical(&pa, a, status);
1424     float32_unpack_canonical(&pb, b, status);
1425     pr = mul_floats(pa, pb, status);
1426
1427     return float32_round_pack_canonical(&pr, status);
1428 }
1429
1430 static float64 QEMU_SOFTFLOAT_ATTR
1431 soft_f64_mul(float64 a, float64 b, float_status *status)
1432 {
1433     FloatParts64 pa, pb, pr;
1434
1435     float64_unpack_canonical(&pa, a, status);
1436     float64_unpack_canonical(&pb, b, status);
1437     pr = mul_floats(pa, pb, status);
1438
1439     return float64_round_pack_canonical(&pr, status);
1440 }
1441
/* Host single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    float prod = a * b;

    return prod;
}
1446
/* Host double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    double prod = a * b;

    return prod;
}
1451
/*
 * float32 multiplication: forwards to float32_gen2() with hard_f32_mul
 * and soft_f32_mul, using the same f32_is_zon2 / f32_addsubmul_post
 * predicates as the add/sub front end.
 */
float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}
1458
/*
 * float64 multiplication: forwards to float64_gen2() with hard_f64_mul
 * and soft_f64_mul, using the same f64_is_zon2 / f64_addsubmul_post
 * predicates as the add/sub front end.
 */
float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
1465
1466 /*
1467  * Returns the result of multiplying the bfloat16
1468  * values `a' and `b'.
1469  */
1470
1471 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1472 {
1473     FloatParts64 pa, pb, pr;
1474
1475     bfloat16_unpack_canonical(&pa, a, status);
1476     bfloat16_unpack_canonical(&pb, b, status);
1477     pr = mul_floats(pa, pb, status);
1478
1479     return bfloat16_round_pack_canonical(&pr, status);
1480 }
1481
1482 /*
1483  * Returns the result of multiplying the floating-point values `a' and
1484  * `b' then adding 'c', with no intermediate rounding step after the
1485  * multiplication. The operation is performed according to the
1486  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1487  * The flags argument allows the caller to select negation of the
1488  * addend, the intermediate product, or the final result. (The
1489  * difference between this and having the caller do a separate
1490  * negation is that negating externally will flip the sign bit on
1491  * NaNs.)
1492  */
1493
/*
 * Fused multiply-add on canonical (unpacked) values: (a * b) + c, with
 * the 128-bit product kept in full until the addend has been applied.
 *
 * @a, @b, @c: operands, taken by value
 * @flags:     float_muladd_negate_c, float_muladd_negate_product,
 *             float_muladd_negate_result and float_muladd_halve_result
 *             are examined
 * @s:         supplies the rounding mode (sign of an exact zero) and
 *             receives float_flag_invalid
 *
 * Returns the unrounded result.
 */
static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Bit masks of the operand classes, for cheap combined tests. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    /* Inf * 0 with no NaN involved: invalid, default NaN. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Class of the product a * b. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + (-Inf): invalid. */
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* Zero product: the result is c, apart from sign/halving fixups. */
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /*
                 * The product is not larger in exponent: align its high
                 * word to c.  lo is left as is and reaches the sticky
                 * bit through the final "hi |= (lo != 0)".
                 */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* The product is larger: align c as a 128-bit value. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Same exponent and product >= c: product - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise compute c - product, so the sign flips. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /*
                 * Shift the 128-bit difference left by its count of
                 * leading zeros, so that the most significant set bit
                 * returns to bit 63 of hi, and lower the exponent by
                 * the same amount.
                 */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Collapse the remaining low word into a sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
1667
1668 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1669                                                 int flags, float_status *status)
1670 {
1671     FloatParts64 pa, pb, pc, pr;
1672
1673     float16_unpack_canonical(&pa, a, status);
1674     float16_unpack_canonical(&pb, b, status);
1675     float16_unpack_canonical(&pc, c, status);
1676     pr = muladd_floats(pa, pb, pc, flags, status);
1677
1678     return float16_round_pack_canonical(&pr, status);
1679 }
1680
1681 static float32 QEMU_SOFTFLOAT_ATTR
1682 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1683                 float_status *status)
1684 {
1685     FloatParts64 pa, pb, pc, pr;
1686
1687     float32_unpack_canonical(&pa, a, status);
1688     float32_unpack_canonical(&pb, b, status);
1689     float32_unpack_canonical(&pc, c, status);
1690     pr = muladd_floats(pa, pb, pc, flags, status);
1691
1692     return float32_round_pack_canonical(&pr, status);
1693 }
1694
1695 static float64 QEMU_SOFTFLOAT_ATTR
1696 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1697                 float_status *status)
1698 {
1699     FloatParts64 pa, pb, pc, pr;
1700
1701     float64_unpack_canonical(&pa, a, status);
1702     float64_unpack_canonical(&pb, b, status);
1703     float64_unpack_canonical(&pc, c, status);
1704     pr = muladd_floats(pa, pb, pc, flags, status);
1705
1706     return float64_round_pack_canonical(&pr, status);
1707 }
1708
/*
 * When true, float32_muladd() and float64_muladd() take their softfloat
 * path even if the host-FPU path would otherwise be eligible.
 * Zero-initialized; where it is set is not visible in this part of the file.
 */
static bool force_soft_fma;
1710
/*
 * float32 fused multiply-add with a host-FPU fast path.
 *
 * @xa, @xb, @xc: operands of (xa * xb) + xc
 * @flags:        float_muladd_* modifiers
 * @s:            floating-point status
 *
 * Uses fmaf() (or a plain host addition when the product is zero) when
 * the checks below allow it, and soft_f32_muladd() otherwise.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* halve_result is only implemented by the softfloat path. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    /* Takes the operands by address, so it may rewrite them in place. */
    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the correctly signed zero product by hand. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        /* Keep the un-negated operands in case we fall back to soft. */
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /*
             * Result magnitude at or below FLT_MIN: redo the operation in
             * softfloat.  soft_f32_muladd() is given flags and applies
             * the negations itself, hence the restored operands.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
1781
1782 float64 QEMU_FLATTEN
1783 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1784 {
1785     union_float64 ua, ub, uc, ur;
1786
1787     ua.s = xa;
1788     ub.s = xb;
1789     uc.s = xc;
1790
1791     if (unlikely(!can_use_fpu(s))) {
1792         goto soft;
1793     }
1794     if (unlikely(flags & float_muladd_halve_result)) {
1795         goto soft;
1796     }
1797
1798     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1799     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1800         goto soft;
1801     }
1802
1803     if (unlikely(force_soft_fma)) {
1804         goto soft;
1805     }
1806
1807     /*
1808      * When (a || b) == 0, there's no need to check for under/over flow,
1809      * since we know the addend is (normal || 0) and the product is 0.
1810      */
1811     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1812         union_float64 up;
1813         bool prod_sign;
1814
1815         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1816         prod_sign ^= !!(flags & float_muladd_negate_product);
1817         up.s = float64_set_sign(float64_zero, prod_sign);
1818
1819         if (flags & float_muladd_negate_c) {
1820             uc.h = -uc.h;
1821         }
1822         ur.h = up.h + uc.h;
1823     } else {
1824         union_float64 ua_orig = ua;
1825         union_float64 uc_orig = uc;
1826
1827         if (flags & float_muladd_negate_product) {
1828             ua.h = -ua.h;
1829         }
1830         if (flags & float_muladd_negate_c) {
1831             uc.h = -uc.h;
1832         }
1833
1834         ur.h = fma(ua.h, ub.h, uc.h);
1835
1836         if (unlikely(f64_is_inf(ur))) {
1837             float_raise(float_flag_overflow, s);
1838         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1839             ua = ua_orig;
1840             uc = uc_orig;
1841             goto soft;
1842         }
1843     }
1844     if (flags & float_muladd_negate_result) {
1845         return float64_chs(ur.s);
1846     }
1847     return ur.s;
1848
1849  soft:
1850     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1851 }
1852
1853 /*
1854  * Returns the result of multiplying the bfloat16 values `a'
1855  * and `b' then adding 'c', with no intermediate rounding step after the
1856  * multiplication.
1857  */
1858
1859 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1860                                       int flags, float_status *status)
1861 {
1862     FloatParts64 pa, pb, pc, pr;
1863
1864     bfloat16_unpack_canonical(&pa, a, status);
1865     bfloat16_unpack_canonical(&pb, b, status);
1866     bfloat16_unpack_canonical(&pc, c, status);
1867     pr = muladd_floats(pa, pb, pc, flags, status);
1868
1869     return bfloat16_round_pack_canonical(&pr, status);
1870 }
1871
1872 /*
1873  * Returns the result of dividing the floating-point value `a' by the
1874  * corresponding value `b'. The operation is performed according to
1875  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1876  */
1877
1878 static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1879 {
1880     bool sign = a.sign ^ b.sign;
1881
1882     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1883         uint64_t n0, n1, q, r;
1884         int exp = a.exp - b.exp;
1885
1886         /*
1887          * We want a 2*N / N-bit division to produce exactly an N-bit
1888          * result, so that we do not lose any precision and so that we
1889          * do not have to renormalize afterward.  If A.frac < B.frac,
1890          * then division would produce an (N-1)-bit result; shift A left
1891          * by one to produce the an N-bit result, and decrement the
1892          * exponent to match.
1893          *
1894          * The udiv_qrnnd algorithm that we're using requires normalization,
1895          * i.e. the msb of the denominator must be set, which is already true.
1896          */
1897         if (a.frac < b.frac) {
1898             exp -= 1;
1899             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1900         } else {
1901             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1902         }
1903         q = udiv_qrnnd(&r, n1, n0, b.frac);
1904
1905         /* Set lsb if there is a remainder, to set inexact. */
1906         a.frac = q | (r != 0);
1907         a.sign = sign;
1908         a.exp = exp;
1909         return a;
1910     }
1911     /* handle all the NaN cases */
1912     if (is_nan(a.cls) || is_nan(b.cls)) {
1913         return *parts_pick_nan(&a, &b, s);
1914     }
1915     /* 0/0 or Inf/Inf */
1916     if (a.cls == b.cls
1917         &&
1918         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1919         float_raise(float_flag_invalid, s);
1920         parts_default_nan(&a, s);
1921         return a;
1922     }
1923     /* Inf / x or 0 / x */
1924     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1925         a.sign = sign;
1926         return a;
1927     }
1928     /* Div 0 => Inf */
1929     if (b.cls == float_class_zero) {
1930         float_raise(float_flag_divbyzero, s);
1931         a.cls = float_class_inf;
1932         a.sign = sign;
1933         return a;
1934     }
1935     /* Div by Inf */
1936     if (b.cls == float_class_inf) {
1937         a.cls = float_class_zero;
1938         a.sign = sign;
1939         return a;
1940     }
1941     g_assert_not_reached();
1942 }
1943
1944 float16 float16_div(float16 a, float16 b, float_status *status)
1945 {
1946     FloatParts64 pa, pb, pr;
1947
1948     float16_unpack_canonical(&pa, a, status);
1949     float16_unpack_canonical(&pb, b, status);
1950     pr = div_floats(pa, pb, status);
1951
1952     return float16_round_pack_canonical(&pr, status);
1953 }
1954
1955 static float32 QEMU_SOFTFLOAT_ATTR
1956 soft_f32_div(float32 a, float32 b, float_status *status)
1957 {
1958     FloatParts64 pa, pb, pr;
1959
1960     float32_unpack_canonical(&pa, a, status);
1961     float32_unpack_canonical(&pb, b, status);
1962     pr = div_floats(pa, pb, status);
1963
1964     return float32_round_pack_canonical(&pr, status);
1965 }
1966
1967 static float64 QEMU_SOFTFLOAT_ATTR
1968 soft_f64_div(float64 a, float64 b, float_status *status)
1969 {
1970     FloatParts64 pa, pb, pr;
1971
1972     float64_unpack_canonical(&pa, a, status);
1973     float64_unpack_canonical(&pb, b, status);
1974     pr = div_floats(pa, pb, status);
1975
1976     return float64_round_pack_canonical(&pr, status);
1977 }
1978
/* Host single-precision division; the hard-float path of float32_div(). */
static float hard_f32_div(float a, float b)
{
    float quotient = a / b;

    return quotient;
}
1983
/* Host double-precision division; the hard-float path of float64_div(). */
static double hard_f64_div(double a, double b)
{
    double quotient = a / b;

    return quotient;
}
1988
1989 static bool f32_div_pre(union_float32 a, union_float32 b)
1990 {
1991     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1992         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1993                fpclassify(b.h) == FP_NORMAL;
1994     }
1995     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1996 }
1997
1998 static bool f64_div_pre(union_float64 a, union_float64 b)
1999 {
2000     if (QEMU_HARDFLOAT_2F64_USE_FP) {
2001         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
2002                fpclassify(b.h) == FP_NORMAL;
2003     }
2004     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
2005 }
2006
2007 static bool f32_div_post(union_float32 a, union_float32 b)
2008 {
2009     if (QEMU_HARDFLOAT_2F32_USE_FP) {
2010         return fpclassify(a.h) != FP_ZERO;
2011     }
2012     return !float32_is_zero(a.s);
2013 }
2014
2015 static bool f64_div_post(union_float64 a, union_float64 b)
2016 {
2017     if (QEMU_HARDFLOAT_2F64_USE_FP) {
2018         return fpclassify(a.h) != FP_ZERO;
2019     }
2020     return !float64_is_zero(a.s);
2021 }
2022
2023 float32 QEMU_FLATTEN
2024 float32_div(float32 a, float32 b, float_status *s)
2025 {
2026     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
2027                         f32_div_pre, f32_div_post);
2028 }
2029
2030 float64 QEMU_FLATTEN
2031 float64_div(float64 a, float64 b, float_status *s)
2032 {
2033     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
2034                         f64_div_pre, f64_div_post);
2035 }
2036
2037 /*
2038  * Returns the result of dividing the bfloat16
2039  * value `a' by the corresponding value `b'.
2040  */
2041
2042 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2043 {
2044     FloatParts64 pa, pb, pr;
2045
2046     bfloat16_unpack_canonical(&pa, a, status);
2047     bfloat16_unpack_canonical(&pb, b, status);
2048     pr = div_floats(pa, pb, status);
2049
2050     return bfloat16_round_pack_canonical(&pr, status);
2051 }
2052
2053 /*
2054  * Float to Float conversions
2055  *
2056  * Returns the result of converting one float format to another. The
2057  * conversion is performed according to the IEC/IEEE Standard for
2058  * Binary Floating-Point Arithmetic.
2059  *
2060  * The float_to_float helper only needs to take care of raising
2061  * invalid exceptions and handling the conversion on NaNs.
2062  */
2063
2064 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
2065                                  float_status *s)
2066 {
2067     if (dstf->arm_althp) {
2068         switch (a.cls) {
2069         case float_class_qnan:
2070         case float_class_snan:
2071             /* There is no NaN in the destination format.  Raise Invalid
2072              * and return a zero with the sign of the input NaN.
2073              */
2074             float_raise(float_flag_invalid, s);
2075             a.cls = float_class_zero;
2076             a.frac = 0;
2077             a.exp = 0;
2078             break;
2079
2080         case float_class_inf:
2081             /* There is no Inf in the destination format.  Raise Invalid
2082              * and return the maximum normal with the correct sign.
2083              */
2084             float_raise(float_flag_invalid, s);
2085             a.cls = float_class_normal;
2086             a.exp = dstf->exp_max;
2087             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
2088             break;
2089
2090         default:
2091             break;
2092         }
2093     } else if (is_nan(a.cls)) {
2094         parts_return_nan(&a, s);
2095     }
2096     return a;
2097 }
2098
2099 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2100 {
2101     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2102     FloatParts64 pa, pr;
2103
2104     float16a_unpack_canonical(&pa, a, s, fmt16);
2105     pr = float_to_float(pa, &float32_params, s);
2106     return float32_round_pack_canonical(&pr, s);
2107 }
2108
2109 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2110 {
2111     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2112     FloatParts64 pa, pr;
2113
2114     float16a_unpack_canonical(&pa, a, s, fmt16);
2115     pr = float_to_float(pa, &float64_params, s);
2116     return float64_round_pack_canonical(&pr, s);
2117 }
2118
2119 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2120 {
2121     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2122     FloatParts64 pa, pr;
2123
2124     float32_unpack_canonical(&pa, a, s);
2125     pr = float_to_float(pa, fmt16, s);
2126     return float16a_round_pack_canonical(&pr, s, fmt16);
2127 }
2128
2129 static float64 QEMU_SOFTFLOAT_ATTR
2130 soft_float32_to_float64(float32 a, float_status *s)
2131 {
2132     FloatParts64 pa, pr;
2133
2134     float32_unpack_canonical(&pa, a, s);
2135     pr = float_to_float(pa, &float64_params, s);
2136     return float64_round_pack_canonical(&pr, s);
2137 }
2138
2139 float64 float32_to_float64(float32 a, float_status *s)
2140 {
2141     if (likely(float32_is_normal(a))) {
2142         /* Widening conversion can never produce inexact results.  */
2143         union_float32 uf;
2144         union_float64 ud;
2145         uf.s = a;
2146         ud.h = uf.h;
2147         return ud.s;
2148     } else if (float32_is_zero(a)) {
2149         return float64_set_sign(float64_zero, float32_is_neg(a));
2150     } else {
2151         return soft_float32_to_float64(a, s);
2152     }
2153 }
2154
2155 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2156 {
2157     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2158     FloatParts64 pa, pr;
2159
2160     float64_unpack_canonical(&pa, a, s);
2161     pr = float_to_float(pa, fmt16, s);
2162     return float16a_round_pack_canonical(&pr, s, fmt16);
2163 }
2164
2165 float32 float64_to_float32(float64 a, float_status *s)
2166 {
2167     FloatParts64 pa, pr;
2168
2169     float64_unpack_canonical(&pa, a, s);
2170     pr = float_to_float(pa, &float32_params, s);
2171     return float32_round_pack_canonical(&pr, s);
2172 }
2173
2174 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2175 {
2176     FloatParts64 pa, pr;
2177
2178     bfloat16_unpack_canonical(&pa, a, s);
2179     pr = float_to_float(pa, &float32_params, s);
2180     return float32_round_pack_canonical(&pr, s);
2181 }
2182
2183 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2184 {
2185     FloatParts64 pa, pr;
2186
2187     bfloat16_unpack_canonical(&pa, a, s);
2188     pr = float_to_float(pa, &float64_params, s);
2189     return float64_round_pack_canonical(&pr, s);
2190 }
2191
2192 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2193 {
2194     FloatParts64 pa, pr;
2195
2196     float32_unpack_canonical(&pa, a, s);
2197     pr = float_to_float(pa, &bfloat16_params, s);
2198     return bfloat16_round_pack_canonical(&pr, s);
2199 }
2200
2201 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2202 {
2203     FloatParts64 pa, pr;
2204
2205     float64_unpack_canonical(&pa, a, s);
2206     pr = float_to_float(pa, &bfloat16_params, s);
2207     return bfloat16_round_pack_canonical(&pr, s);
2208 }
2209
2210 /*
2211  * Rounds the floating-point value `a' to an integer, and returns the
2212  * result as a floating-point value. The operation is performed
2213  * according to the IEC/IEEE Standard for Binary Floating-Point
2214  * Arithmetic.
2215  */
2216
2217 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
2218                                int scale, float_status *s)
2219 {
2220     switch (a.cls) {
2221     case float_class_qnan:
2222     case float_class_snan:
2223         parts_return_nan(&a, s);
2224         break;
2225
2226     case float_class_zero:
2227     case float_class_inf:
2228         /* already "integral" */
2229         break;
2230
2231     case float_class_normal:
2232         scale = MIN(MAX(scale, -0x10000), 0x10000);
2233         a.exp += scale;
2234
2235         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2236             /* already integral */
2237             break;
2238         }
2239         if (a.exp < 0) {
2240             bool one;
2241             /* all fractional */
2242             float_raise(float_flag_inexact, s);
2243             switch (rmode) {
2244             case float_round_nearest_even:
2245                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2246                 break;
2247             case float_round_ties_away:
2248                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2249                 break;
2250             case float_round_to_zero:
2251                 one = false;
2252                 break;
2253             case float_round_up:
2254                 one = !a.sign;
2255                 break;
2256             case float_round_down:
2257                 one = a.sign;
2258                 break;
2259             case float_round_to_odd:
2260                 one = true;
2261                 break;
2262             default:
2263                 g_assert_not_reached();
2264             }
2265
2266             if (one) {
2267                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2268                 a.exp = 0;
2269             } else {
2270                 a.cls = float_class_zero;
2271             }
2272         } else {
2273             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2274             uint64_t frac_lsbm1 = frac_lsb >> 1;
2275             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2276             uint64_t rnd_mask = rnd_even_mask >> 1;
2277             uint64_t inc;
2278
2279             switch (rmode) {
2280             case float_round_nearest_even:
2281                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2282                 break;
2283             case float_round_ties_away:
2284                 inc = frac_lsbm1;
2285                 break;
2286             case float_round_to_zero:
2287                 inc = 0;
2288                 break;
2289             case float_round_up:
2290                 inc = a.sign ? 0 : rnd_mask;
2291                 break;
2292             case float_round_down:
2293                 inc = a.sign ? rnd_mask : 0;
2294                 break;
2295             case float_round_to_odd:
2296                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2297                 break;
2298             default:
2299                 g_assert_not_reached();
2300             }
2301
2302             if (a.frac & rnd_mask) {
2303                 float_raise(float_flag_inexact, s);
2304                 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2305                     a.frac >>= 1;
2306                     a.frac |= DECOMPOSED_IMPLICIT_BIT;
2307                     a.exp++;
2308                 }
2309                 a.frac &= ~rnd_mask;
2310             }
2311         }
2312         break;
2313     default:
2314         g_assert_not_reached();
2315     }
2316     return a;
2317 }
2318
2319 float16 float16_round_to_int(float16 a, float_status *s)
2320 {
2321     FloatParts64 pa, pr;
2322
2323     float16_unpack_canonical(&pa, a, s);
2324     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2325     return float16_round_pack_canonical(&pr, s);
2326 }
2327
2328 float32 float32_round_to_int(float32 a, float_status *s)
2329 {
2330     FloatParts64 pa, pr;
2331
2332     float32_unpack_canonical(&pa, a, s);
2333     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2334     return float32_round_pack_canonical(&pr, s);
2335 }
2336
2337 float64 float64_round_to_int(float64 a, float_status *s)
2338 {
2339     FloatParts64 pa, pr;
2340
2341     float64_unpack_canonical(&pa, a, s);
2342     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2343     return float64_round_pack_canonical(&pr, s);
2344 }
2345
2346 /*
2347  * Rounds the bfloat16 value `a' to an integer, and returns the
2348  * result as a bfloat16 value.
2349  */
2350
2351 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2352 {
2353     FloatParts64 pa, pr;
2354
2355     bfloat16_unpack_canonical(&pa, a, s);
2356     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2357     return bfloat16_round_pack_canonical(&pr, s);
2358 }
2359
2360 /*
2361  * Returns the result of converting the floating-point value `a' to
2362  * the two's complement integer format. The conversion is performed
2363  * according to the IEC/IEEE Standard for Binary Floating-Point
2364  * Arithmetic---which means in particular that the conversion is
2365  * rounded according to the current rounding mode. If `a' is a NaN,
2366  * the largest positive integer is returned. Otherwise, if the
2367  * conversion overflows, the largest integer with the same sign as `a'
2368  * is returned.
2369 */
2370
2371 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
2372                                      int scale, int64_t min, int64_t max,
2373                                      float_status *s)
2374 {
2375     uint64_t r;
2376     int orig_flags = get_float_exception_flags(s);
2377     FloatParts64 p = round_to_int(in, rmode, scale, s);
2378
2379     switch (p.cls) {
2380     case float_class_snan:
2381     case float_class_qnan:
2382         s->float_exception_flags = orig_flags | float_flag_invalid;
2383         return max;
2384     case float_class_inf:
2385         s->float_exception_flags = orig_flags | float_flag_invalid;
2386         return p.sign ? min : max;
2387     case float_class_zero:
2388         return 0;
2389     case float_class_normal:
2390         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2391             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2392         } else {
2393             r = UINT64_MAX;
2394         }
2395         if (p.sign) {
2396             if (r <= -(uint64_t) min) {
2397                 return -r;
2398             } else {
2399                 s->float_exception_flags = orig_flags | float_flag_invalid;
2400                 return min;
2401             }
2402         } else {
2403             if (r <= max) {
2404                 return r;
2405             } else {
2406                 s->float_exception_flags = orig_flags | float_flag_invalid;
2407                 return max;
2408             }
2409         }
2410     default:
2411         g_assert_not_reached();
2412     }
2413 }
2414
2415 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2416                               float_status *s)
2417 {
2418     FloatParts64 p;
2419
2420     float16_unpack_canonical(&p, a, s);
2421     return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2422 }
2423
2424 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2425                                 float_status *s)
2426 {
2427     FloatParts64 p;
2428
2429     float16_unpack_canonical(&p, a, s);
2430     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2431 }
2432
2433 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2434                                 float_status *s)
2435 {
2436     FloatParts64 p;
2437
2438     float16_unpack_canonical(&p, a, s);
2439     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2440 }
2441
2442 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2443                                 float_status *s)
2444 {
2445     FloatParts64 p;
2446
2447     float16_unpack_canonical(&p, a, s);
2448     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2449 }
2450
2451 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2452                                 float_status *s)
2453 {
2454     FloatParts64 p;
2455
2456     float32_unpack_canonical(&p, a, s);
2457     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2458 }
2459
2460 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2461                                 float_status *s)
2462 {
2463     FloatParts64 p;
2464
2465     float32_unpack_canonical(&p, a, s);
2466     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2467 }
2468
2469 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2470                                 float_status *s)
2471 {
2472     FloatParts64 p;
2473
2474     float32_unpack_canonical(&p, a, s);
2475     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2476 }
2477
2478 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2479                                 float_status *s)
2480 {
2481     FloatParts64 p;
2482
2483     float64_unpack_canonical(&p, a, s);
2484     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2485 }
2486
2487 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2488                                 float_status *s)
2489 {
2490     FloatParts64 p;
2491
2492     float64_unpack_canonical(&p, a, s);
2493     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2494 }
2495
2496 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2497                                 float_status *s)
2498 {
2499     FloatParts64 p;
2500
2501     float64_unpack_canonical(&p, a, s);
2502     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2503 }
2504
2505 int8_t float16_to_int8(float16 a, float_status *s)
2506 {
2507     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2508 }
2509
2510 int16_t float16_to_int16(float16 a, float_status *s)
2511 {
2512     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2513 }
2514
2515 int32_t float16_to_int32(float16 a, float_status *s)
2516 {
2517     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2518 }
2519
2520 int64_t float16_to_int64(float16 a, float_status *s)
2521 {
2522     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2523 }
2524
2525 int16_t float32_to_int16(float32 a, float_status *s)
2526 {
2527     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2528 }
2529
2530 int32_t float32_to_int32(float32 a, float_status *s)
2531 {
2532     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2533 }
2534
2535 int64_t float32_to_int64(float32 a, float_status *s)
2536 {
2537     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2538 }
2539
2540 int16_t float64_to_int16(float64 a, float_status *s)
2541 {
2542     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2543 }
2544
2545 int32_t float64_to_int32(float64 a, float_status *s)
2546 {
2547     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2548 }
2549
2550 int64_t float64_to_int64(float64 a, float_status *s)
2551 {
2552     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2553 }
2554
2555 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2556 {
2557     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2558 }
2559
2560 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2561 {
2562     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2563 }
2564
2565 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2566 {
2567     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2568 }
2569
2570 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2571 {
2572     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2573 }
2574
2575 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2576 {
2577     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2578 }
2579
2580 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2581 {
2582     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2583 }
2584
2585 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2586 {
2587     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2588 }
2589
2590 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2591 {
2592     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2593 }
2594
2595 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2596 {
2597     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2598 }
2599
2600 /*
2601  * Returns the result of converting the floating-point value `a' to
2602  * the two's complement integer format.
2603  */
2604
2605 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2606                                  float_status *s)
2607 {
2608     FloatParts64 p;
2609
2610     bfloat16_unpack_canonical(&p, a, s);
2611     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2612 }
2613
2614 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2615                                  float_status *s)
2616 {
2617     FloatParts64 p;
2618
2619     bfloat16_unpack_canonical(&p, a, s);
2620     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2621 }
2622
2623 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2624                                  float_status *s)
2625 {
2626     FloatParts64 p;
2627
2628     bfloat16_unpack_canonical(&p, a, s);
2629     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2630 }
2631
2632 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2633 {
2634     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2635 }
2636
2637 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2638 {
2639     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2640 }
2641
2642 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2643 {
2644     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2645 }
2646
2647 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2648 {
2649     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2650 }
2651
2652 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2653 {
2654     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2655 }
2656
2657 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2658 {
2659     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2660 }
2661
2662 /*
2663  *  Returns the result of converting the floating-point value `a' to
2664  *  the unsigned integer format. The conversion is performed according
2665  *  to the IEC/IEEE Standard for Binary Floating-Point
2666  *  Arithmetic---which means in particular that the conversion is
2667  *  rounded according to the current rounding mode. If `a' is a NaN,
2668  *  the largest unsigned integer is returned. Otherwise, if the
2669  *  conversion overflows, the largest unsigned integer is returned. If
2670  *  the 'a' is negative, the result is rounded and zero is returned;
2671  *  values that do not round to zero will raise the inexact exception
2672  *  flag.
2673  */
2674
2675 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
2676                                        int scale, uint64_t max,
2677                                        float_status *s)
2678 {
2679     int orig_flags = get_float_exception_flags(s);
2680     FloatParts64 p = round_to_int(in, rmode, scale, s);
2681     uint64_t r;
2682
2683     switch (p.cls) {
2684     case float_class_snan:
2685     case float_class_qnan:
2686         s->float_exception_flags = orig_flags | float_flag_invalid;
2687         return max;
2688     case float_class_inf:
2689         s->float_exception_flags = orig_flags | float_flag_invalid;
2690         return p.sign ? 0 : max;
2691     case float_class_zero:
2692         return 0;
2693     case float_class_normal:
2694         if (p.sign) {
2695             s->float_exception_flags = orig_flags | float_flag_invalid;
2696             return 0;
2697         }
2698
2699         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2700             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2701         } else {
2702             s->float_exception_flags = orig_flags | float_flag_invalid;
2703             return max;
2704         }
2705
2706         /* For uint64 this will never trip, but if p.exp is too large
2707          * to shift a decomposed fraction we shall have exited via the
2708          * 3rd leg above.
2709          */
2710         if (r > max) {
2711             s->float_exception_flags = orig_flags | float_flag_invalid;
2712             return max;
2713         }
2714         return r;
2715     default:
2716         g_assert_not_reached();
2717     }
2718 }
2719
2720 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2721                                 float_status *s)
2722 {
2723     FloatParts64 p;
2724
2725     float16_unpack_canonical(&p, a, s);
2726     return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2727 }
2728
2729 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2730                                   float_status *s)
2731 {
2732     FloatParts64 p;
2733
2734     float16_unpack_canonical(&p, a, s);
2735     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2736 }
2737
2738 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2739                                   float_status *s)
2740 {
2741     FloatParts64 p;
2742
2743     float16_unpack_canonical(&p, a, s);
2744     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2745 }
2746
2747 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2748                                   float_status *s)
2749 {
2750     FloatParts64 p;
2751
2752     float16_unpack_canonical(&p, a, s);
2753     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2754 }
2755
2756 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2757                                   float_status *s)
2758 {
2759     FloatParts64 p;
2760
2761     float32_unpack_canonical(&p, a, s);
2762     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2763 }
2764
2765 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2766                                   float_status *s)
2767 {
2768     FloatParts64 p;
2769
2770     float32_unpack_canonical(&p, a, s);
2771     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2772 }
2773
2774 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2775                                   float_status *s)
2776 {
2777     FloatParts64 p;
2778
2779     float32_unpack_canonical(&p, a, s);
2780     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2781 }
2782
2783 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2784                                   float_status *s)
2785 {
2786     FloatParts64 p;
2787
2788     float64_unpack_canonical(&p, a, s);
2789     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2790 }
2791
2792 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2793                                   float_status *s)
2794 {
2795     FloatParts64 p;
2796
2797     float64_unpack_canonical(&p, a, s);
2798     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2799 }
2800
2801 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2802                                   float_status *s)
2803 {
2804     FloatParts64 p;
2805
2806     float64_unpack_canonical(&p, a, s);
2807     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2808 }
2809
2810 uint8_t float16_to_uint8(float16 a, float_status *s)
2811 {
2812     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2813 }
2814
2815 uint16_t float16_to_uint16(float16 a, float_status *s)
2816 {
2817     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2818 }
2819
2820 uint32_t float16_to_uint32(float16 a, float_status *s)
2821 {
2822     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2823 }
2824
2825 uint64_t float16_to_uint64(float16 a, float_status *s)
2826 {
2827     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2828 }
2829
2830 uint16_t float32_to_uint16(float32 a, float_status *s)
2831 {
2832     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2833 }
2834
2835 uint32_t float32_to_uint32(float32 a, float_status *s)
2836 {
2837     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2838 }
2839
2840 uint64_t float32_to_uint64(float32 a, float_status *s)
2841 {
2842     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2843 }
2844
2845 uint16_t float64_to_uint16(float64 a, float_status *s)
2846 {
2847     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2848 }
2849
2850 uint32_t float64_to_uint32(float64 a, float_status *s)
2851 {
2852     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2853 }
2854
2855 uint64_t float64_to_uint64(float64 a, float_status *s)
2856 {
2857     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2858 }
2859
2860 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2861 {
2862     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2863 }
2864
2865 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2866 {
2867     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2868 }
2869
2870 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2871 {
2872     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2873 }
2874
2875 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2876 {
2877     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2878 }
2879
2880 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2881 {
2882     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2883 }
2884
2885 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2886 {
2887     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2888 }
2889
2890 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2891 {
2892     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2893 }
2894
2895 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2896 {
2897     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2898 }
2899
2900 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2901 {
2902     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2903 }
2904
2905 /*
2906  *  Returns the result of converting the bfloat16 value `a' to
2907  *  the unsigned integer format.
2908  */
2909
2910 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2911                                    int scale, float_status *s)
2912 {
2913     FloatParts64 p;
2914
2915     bfloat16_unpack_canonical(&p, a, s);
2916     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2917 }
2918
2919 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2920                                    int scale, float_status *s)
2921 {
2922     FloatParts64 p;
2923
2924     bfloat16_unpack_canonical(&p, a, s);
2925     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2926 }
2927
2928 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2929                                    int scale, float_status *s)
2930 {
2931     FloatParts64 p;
2932
2933     bfloat16_unpack_canonical(&p, a, s);
2934     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2935 }
2936
2937 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2938 {
2939     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2940 }
2941
2942 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2943 {
2944     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2945 }
2946
2947 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2948 {
2949     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2950 }
2951
2952 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2953 {
2954     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2955 }
2956
2957 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2958 {
2959     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2960 }
2961
2962 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2963 {
2964     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2965 }
2966
2967 /*
2968  * Integer to float conversions
2969  *
2970  * Returns the result of converting the two's complement integer `a'
2971  * to the floating-point format. The conversion is performed according
2972  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2973  */
2974
2975 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2976 {
2977     FloatParts64 r = { .sign = false };
2978
2979     if (a == 0) {
2980         r.cls = float_class_zero;
2981     } else {
2982         uint64_t f = a;
2983         int shift;
2984
2985         r.cls = float_class_normal;
2986         if (a < 0) {
2987             f = -f;
2988             r.sign = true;
2989         }
2990         shift = clz64(f);
2991         scale = MIN(MAX(scale, -0x10000), 0x10000);
2992
2993         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2994         r.frac = f << shift;
2995     }
2996
2997     return r;
2998 }
2999
3000 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
3001 {
3002     FloatParts64 pa = int_to_float(a, scale, status);
3003     return float16_round_pack_canonical(&pa, status);
3004 }
3005
3006 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
3007 {
3008     return int64_to_float16_scalbn(a, scale, status);
3009 }
3010
3011 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
3012 {
3013     return int64_to_float16_scalbn(a, scale, status);
3014 }
3015
3016 float16 int64_to_float16(int64_t a, float_status *status)
3017 {
3018     return int64_to_float16_scalbn(a, 0, status);
3019 }
3020
3021 float16 int32_to_float16(int32_t a, float_status *status)
3022 {
3023     return int64_to_float16_scalbn(a, 0, status);
3024 }
3025
3026 float16 int16_to_float16(int16_t a, float_status *status)
3027 {
3028     return int64_to_float16_scalbn(a, 0, status);
3029 }
3030
3031 float16 int8_to_float16(int8_t a, float_status *status)
3032 {
3033     return int64_to_float16_scalbn(a, 0, status);
3034 }
3035
3036 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
3037 {
3038     FloatParts64 pa = int_to_float(a, scale, status);
3039     return float32_round_pack_canonical(&pa, status);
3040 }
3041
3042 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
3043 {
3044     return int64_to_float32_scalbn(a, scale, status);
3045 }
3046
3047 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
3048 {
3049     return int64_to_float32_scalbn(a, scale, status);
3050 }
3051
3052 float32 int64_to_float32(int64_t a, float_status *status)
3053 {
3054     return int64_to_float32_scalbn(a, 0, status);
3055 }
3056
3057 float32 int32_to_float32(int32_t a, float_status *status)
3058 {
3059     return int64_to_float32_scalbn(a, 0, status);
3060 }
3061
3062 float32 int16_to_float32(int16_t a, float_status *status)
3063 {
3064     return int64_to_float32_scalbn(a, 0, status);
3065 }
3066
3067 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
3068 {
3069     FloatParts64 pa = int_to_float(a, scale, status);
3070     return float64_round_pack_canonical(&pa, status);
3071 }
3072
3073 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
3074 {
3075     return int64_to_float64_scalbn(a, scale, status);
3076 }
3077
3078 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
3079 {
3080     return int64_to_float64_scalbn(a, scale, status);
3081 }
3082
3083 float64 int64_to_float64(int64_t a, float_status *status)
3084 {
3085     return int64_to_float64_scalbn(a, 0, status);
3086 }
3087
3088 float64 int32_to_float64(int32_t a, float_status *status)
3089 {
3090     return int64_to_float64_scalbn(a, 0, status);
3091 }
3092
3093 float64 int16_to_float64(int16_t a, float_status *status)
3094 {
3095     return int64_to_float64_scalbn(a, 0, status);
3096 }
3097
3098 /*
3099  * Returns the result of converting the two's complement integer `a'
3100  * to the bfloat16 format.
3101  */
3102
3103 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
3104 {
3105     FloatParts64 pa = int_to_float(a, scale, status);
3106     return bfloat16_round_pack_canonical(&pa, status);
3107 }
3108
3109 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
3110 {
3111     return int64_to_bfloat16_scalbn(a, scale, status);
3112 }
3113
3114 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
3115 {
3116     return int64_to_bfloat16_scalbn(a, scale, status);
3117 }
3118
3119 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
3120 {
3121     return int64_to_bfloat16_scalbn(a, 0, status);
3122 }
3123
3124 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
3125 {
3126     return int64_to_bfloat16_scalbn(a, 0, status);
3127 }
3128
3129 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
3130 {
3131     return int64_to_bfloat16_scalbn(a, 0, status);
3132 }
3133
3134 /*
3135  * Unsigned Integer to float conversions
3136  *
3137  * Returns the result of converting the unsigned integer `a' to the
3138  * floating-point format. The conversion is performed according to the
3139  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140  */
3141
3142 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3143 {
3144     FloatParts64 r = { .sign = false };
3145     int shift;
3146
3147     if (a == 0) {
3148         r.cls = float_class_zero;
3149     } else {
3150         scale = MIN(MAX(scale, -0x10000), 0x10000);
3151         shift = clz64(a);
3152         r.cls = float_class_normal;
3153         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3154         r.frac = a << shift;
3155     }
3156
3157     return r;
3158 }
3159
3160 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
3161 {
3162     FloatParts64 pa = uint_to_float(a, scale, status);
3163     return float16_round_pack_canonical(&pa, status);
3164 }
3165
3166 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
3167 {
3168     return uint64_to_float16_scalbn(a, scale, status);
3169 }
3170
3171 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
3172 {
3173     return uint64_to_float16_scalbn(a, scale, status);
3174 }
3175
3176 float16 uint64_to_float16(uint64_t a, float_status *status)
3177 {
3178     return uint64_to_float16_scalbn(a, 0, status);
3179 }
3180
3181 float16 uint32_to_float16(uint32_t a, float_status *status)
3182 {
3183     return uint64_to_float16_scalbn(a, 0, status);
3184 }
3185
3186 float16 uint16_to_float16(uint16_t a, float_status *status)
3187 {
3188     return uint64_to_float16_scalbn(a, 0, status);
3189 }
3190
3191 float16 uint8_to_float16(uint8_t a, float_status *status)
3192 {
3193     return uint64_to_float16_scalbn(a, 0, status);
3194 }
3195
3196 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
3197 {
3198     FloatParts64 pa = uint_to_float(a, scale, status);
3199     return float32_round_pack_canonical(&pa, status);
3200 }
3201
3202 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
3203 {
3204     return uint64_to_float32_scalbn(a, scale, status);
3205 }
3206
3207 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
3208 {
3209     return uint64_to_float32_scalbn(a, scale, status);
3210 }
3211
3212 float32 uint64_to_float32(uint64_t a, float_status *status)
3213 {
3214     return uint64_to_float32_scalbn(a, 0, status);
3215 }
3216
3217 float32 uint32_to_float32(uint32_t a, float_status *status)
3218 {
3219     return uint64_to_float32_scalbn(a, 0, status);
3220 }
3221
3222 float32 uint16_to_float32(uint16_t a, float_status *status)
3223 {
3224     return uint64_to_float32_scalbn(a, 0, status);
3225 }
3226
3227 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3228 {
3229     FloatParts64 pa = uint_to_float(a, scale, status);
3230     return float64_round_pack_canonical(&pa, status);
3231 }
3232
3233 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3234 {
3235     return uint64_to_float64_scalbn(a, scale, status);
3236 }
3237
3238 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3239 {
3240     return uint64_to_float64_scalbn(a, scale, status);
3241 }
3242
3243 float64 uint64_to_float64(uint64_t a, float_status *status)
3244 {
3245     return uint64_to_float64_scalbn(a, 0, status);
3246 }
3247
3248 float64 uint32_to_float64(uint32_t a, float_status *status)
3249 {
3250     return uint64_to_float64_scalbn(a, 0, status);
3251 }
3252
3253 float64 uint16_to_float64(uint16_t a, float_status *status)
3254 {
3255     return uint64_to_float64_scalbn(a, 0, status);
3256 }
3257
3258 /*
3259  * Returns the result of converting the unsigned integer `a' to the
3260  * bfloat16 format.
3261  */
3262
3263 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3264 {
3265     FloatParts64 pa = uint_to_float(a, scale, status);
3266     return bfloat16_round_pack_canonical(&pa, status);
3267 }
3268
3269 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3270 {
3271     return uint64_to_bfloat16_scalbn(a, scale, status);
3272 }
3273
3274 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3275 {
3276     return uint64_to_bfloat16_scalbn(a, scale, status);
3277 }
3278
3279 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3280 {
3281     return uint64_to_bfloat16_scalbn(a, 0, status);
3282 }
3283
3284 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3285 {
3286     return uint64_to_bfloat16_scalbn(a, 0, status);
3287 }
3288
3289 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3290 {
3291     return uint64_to_bfloat16_scalbn(a, 0, status);
3292 }
3293
3294 /* Float Min/Max */
3295 /* min() and max() functions. These can't be implemented as
3296  * 'compare and pick one input' because that would mishandle
3297  * NaNs and +0 vs -0.
3298  *
3299  * minnum() and maxnum() functions. These are similar to the min()
3300  * and max() functions but if one of the arguments is a QNaN and
3301  * the other is numerical then the numerical argument is returned.
3302  * SNaNs will get quietened before being returned.
3303  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3304  * and maxNum() operations. min() and max() are the typical min/max
3305  * semantics provided by many CPUs which predate that specification.
3306  *
3307  * minnummag() and maxnummag() functions correspond to minNumMag()
3308  * and maxNumMag() from the IEEE-754 2008.
3309  */
3310 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3311                                 bool ieee, bool ismag, float_status *s)
3312 {
3313     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3314         if (ieee) {
3315             /* Takes two floating-point values `a' and `b', one of
3316              * which is a NaN, and returns the appropriate NaN
3317              * result. If either `a' or `b' is a signaling NaN,
3318              * the invalid exception is raised.
3319              */
3320             if (is_snan(a.cls) || is_snan(b.cls)) {
3321                 return *parts_pick_nan(&a, &b, s);
3322             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3323                 return b;
3324             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3325                 return a;
3326             }
3327         }
3328         return *parts_pick_nan(&a, &b, s);
3329     } else {
3330         int a_exp, b_exp;
3331
3332         switch (a.cls) {
3333         case float_class_normal:
3334             a_exp = a.exp;
3335             break;
3336         case float_class_inf:
3337             a_exp = INT_MAX;
3338             break;
3339         case float_class_zero:
3340             a_exp = INT_MIN;
3341             break;
3342         default:
3343             g_assert_not_reached();
3344             break;
3345         }
3346         switch (b.cls) {
3347         case float_class_normal:
3348             b_exp = b.exp;
3349             break;
3350         case float_class_inf:
3351             b_exp = INT_MAX;
3352             break;
3353         case float_class_zero:
3354             b_exp = INT_MIN;
3355             break;
3356         default:
3357             g_assert_not_reached();
3358             break;
3359         }
3360
3361         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3362             bool a_less = a_exp < b_exp;
3363             if (a_exp == b_exp) {
3364                 a_less = a.frac < b.frac;
3365             }
3366             return a_less ^ ismin ? b : a;
3367         }
3368
3369         if (a.sign == b.sign) {
3370             bool a_less = a_exp < b_exp;
3371             if (a_exp == b_exp) {
3372                 a_less = a.frac < b.frac;
3373             }
3374             return a.sign ^ a_less ^ ismin ? b : a;
3375         } else {
3376             return a.sign ^ ismin ? b : a;
3377         }
3378     }
3379 }
3380
3381 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3382 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3383                                      float_status *s)                   \
3384 {                                                                       \
3385     FloatParts64 pa, pb, pr;                                            \
3386     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3387     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3388     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3389     return float ## sz ## _round_pack_canonical(&pr, s);                \
3390 }
3391
3392 MINMAX(16, min, true, false, false)
3393 MINMAX(16, minnum, true, true, false)
3394 MINMAX(16, minnummag, true, true, true)
3395 MINMAX(16, max, false, false, false)
3396 MINMAX(16, maxnum, false, true, false)
3397 MINMAX(16, maxnummag, false, true, true)
3398
3399 MINMAX(32, min, true, false, false)
3400 MINMAX(32, minnum, true, true, false)
3401 MINMAX(32, minnummag, true, true, true)
3402 MINMAX(32, max, false, false, false)
3403 MINMAX(32, maxnum, false, true, false)
3404 MINMAX(32, maxnummag, false, true, true)
3405
3406 MINMAX(64, min, true, false, false)
3407 MINMAX(64, minnum, true, true, false)
3408 MINMAX(64, minnummag, true, true, true)
3409 MINMAX(64, max, false, false, false)
3410 MINMAX(64, maxnum, false, true, false)
3411 MINMAX(64, maxnummag, false, true, true)
3412
3413 #undef MINMAX
3414
3415 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3416 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3417 {                                                                       \
3418     FloatParts64 pa, pb, pr;                                            \
3419     bfloat16_unpack_canonical(&pa, a, s);                               \
3420     bfloat16_unpack_canonical(&pb, b, s);                               \
3421     pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
3422     return bfloat16_round_pack_canonical(&pr, s);                       \
3423 }
3424
3425 BF16_MINMAX(min, true, false, false)
3426 BF16_MINMAX(minnum, true, true, false)
3427 BF16_MINMAX(minnummag, true, true, true)
3428 BF16_MINMAX(max, false, false, false)
3429 BF16_MINMAX(maxnum, false, true, false)
3430 BF16_MINMAX(maxnummag, false, true, true)
3431
3432 #undef BF16_MINMAX
3433
3434 /* Floating point compare */
3435 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3436                                     float_status *s)
3437 {
3438     if (is_nan(a.cls) || is_nan(b.cls)) {
3439         if (!is_quiet ||
3440             a.cls == float_class_snan ||
3441             b.cls == float_class_snan) {
3442             float_raise(float_flag_invalid, s);
3443         }
3444         return float_relation_unordered;
3445     }
3446
3447     if (a.cls == float_class_zero) {
3448         if (b.cls == float_class_zero) {
3449             return float_relation_equal;
3450         }
3451         return b.sign ? float_relation_greater : float_relation_less;
3452     } else if (b.cls == float_class_zero) {
3453         return a.sign ? float_relation_less : float_relation_greater;
3454     }
3455
3456     /* The only really important thing about infinity is its sign. If
3457      * both are infinities the sign marks the smallest of the two.
3458      */
3459     if (a.cls == float_class_inf) {
3460         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3461             return float_relation_equal;
3462         }
3463         return a.sign ? float_relation_less : float_relation_greater;
3464     } else if (b.cls == float_class_inf) {
3465         return b.sign ? float_relation_greater : float_relation_less;
3466     }
3467
3468     if (a.sign != b.sign) {
3469         return a.sign ? float_relation_less : float_relation_greater;
3470     }
3471
3472     if (a.exp == b.exp) {
3473         if (a.frac == b.frac) {
3474             return float_relation_equal;
3475         }
3476         if (a.sign) {
3477             return a.frac > b.frac ?
3478                 float_relation_less : float_relation_greater;
3479         } else {
3480             return a.frac > b.frac ?
3481                 float_relation_greater : float_relation_less;
3482         }
3483     } else {
3484         if (a.sign) {
3485             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3486         } else {
3487             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3488         }
3489     }
3490 }
3491
3492 #define COMPARE(name, attr, sz)                                         \
3493 static int attr                                                         \
3494 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3495 {                                                                       \
3496     FloatParts64 pa, pb;                                                \
3497     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3498     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3499     return compare_floats(pa, pb, is_quiet, s);                         \
3500 }
3501
3502 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3503 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3504 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3505
3506 #undef COMPARE
3507
3508 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3509 {
3510     return soft_f16_compare(a, b, false, s);
3511 }
3512
3513 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3514 {
3515     return soft_f16_compare(a, b, true, s);
3516 }
3517
3518 static FloatRelation QEMU_FLATTEN
3519 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3520 {
3521     union_float32 ua, ub;
3522
3523     ua.s = xa;
3524     ub.s = xb;
3525
3526     if (QEMU_NO_HARDFLOAT) {
3527         goto soft;
3528     }
3529
3530     float32_input_flush2(&ua.s, &ub.s, s);
3531     if (isgreaterequal(ua.h, ub.h)) {
3532         if (isgreater(ua.h, ub.h)) {
3533             return float_relation_greater;
3534         }
3535         return float_relation_equal;
3536     }
3537     if (likely(isless(ua.h, ub.h))) {
3538         return float_relation_less;
3539     }
3540     /* The only condition remaining is unordered.
3541      * Fall through to set flags.
3542      */
3543  soft:
3544     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3545 }
3546
3547 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3548 {
3549     return f32_compare(a, b, false, s);
3550 }
3551
3552 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3553 {
3554     return f32_compare(a, b, true, s);
3555 }
3556
3557 static FloatRelation QEMU_FLATTEN
3558 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3559 {
3560     union_float64 ua, ub;
3561
3562     ua.s = xa;
3563     ub.s = xb;
3564
3565     if (QEMU_NO_HARDFLOAT) {
3566         goto soft;
3567     }
3568
3569     float64_input_flush2(&ua.s, &ub.s, s);
3570     if (isgreaterequal(ua.h, ub.h)) {
3571         if (isgreater(ua.h, ub.h)) {
3572             return float_relation_greater;
3573         }
3574         return float_relation_equal;
3575     }
3576     if (likely(isless(ua.h, ub.h))) {
3577         return float_relation_less;
3578     }
3579     /* The only condition remaining is unordered.
3580      * Fall through to set flags.
3581      */
3582  soft:
3583     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3584 }
3585
3586 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3587 {
3588     return f64_compare(a, b, false, s);
3589 }
3590
3591 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3592 {
3593     return f64_compare(a, b, true, s);
3594 }
3595
3596 static FloatRelation QEMU_FLATTEN
3597 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3598 {
3599     FloatParts64 pa, pb;
3600
3601     bfloat16_unpack_canonical(&pa, a, s);
3602     bfloat16_unpack_canonical(&pb, b, s);
3603     return compare_floats(pa, pb, is_quiet, s);
3604 }
3605
3606 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3607 {
3608     return soft_bf16_compare(a, b, false, s);
3609 }
3610
3611 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3612 {
3613     return soft_bf16_compare(a, b, true, s);
3614 }
3615
3616 /* Multiply A by 2 raised to the power N.  */
3617 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3618 {
3619     if (unlikely(is_nan(a.cls))) {
3620         parts_return_nan(&a, s);
3621     }
3622     if (a.cls == float_class_normal) {
3623         /* The largest float type (even though not supported by FloatParts64)
3624          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3625          * still allows rounding to infinity, without allowing overflow
3626          * within the int32_t that backs FloatParts64.exp.
3627          */
3628         n = MIN(MAX(n, -0x10000), 0x10000);
3629         a.exp += n;
3630     }
3631     return a;
3632 }
3633
3634 float16 float16_scalbn(float16 a, int n, float_status *status)
3635 {
3636     FloatParts64 pa, pr;
3637
3638     float16_unpack_canonical(&pa, a, status);
3639     pr = scalbn_decomposed(pa, n, status);
3640     return float16_round_pack_canonical(&pr, status);
3641 }
3642
3643 float32 float32_scalbn(float32 a, int n, float_status *status)
3644 {
3645     FloatParts64 pa, pr;
3646
3647     float32_unpack_canonical(&pa, a, status);
3648     pr = scalbn_decomposed(pa, n, status);
3649     return float32_round_pack_canonical(&pr, status);
3650 }
3651
3652 float64 float64_scalbn(float64 a, int n, float_status *status)
3653 {
3654     FloatParts64 pa, pr;
3655
3656     float64_unpack_canonical(&pa, a, status);
3657     pr = scalbn_decomposed(pa, n, status);
3658     return float64_round_pack_canonical(&pr, status);
3659 }
3660
3661 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3662 {
3663     FloatParts64 pa, pr;
3664
3665     bfloat16_unpack_canonical(&pa, a, status);
3666     pr = scalbn_decomposed(pa, n, status);
3667     return bfloat16_round_pack_canonical(&pr, status);
3668 }
3669
3670 /*
3671  * Square Root
3672  *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3675  * square root by iterating down from the implicit bit to enough extra
3676  * bits to ensure we get a correctly rounded result.
3677  *
3678  * This does mean however the calculation is slower than before,
3679  * especially for 64 bit floats.
3680  */
3681
3682 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3683 {
3684     uint64_t a_frac, r_frac, s_frac;
3685     int bit, last_bit;
3686
3687     if (is_nan(a.cls)) {
3688         parts_return_nan(&a, s);
3689         return a;
3690     }
3691     if (a.cls == float_class_zero) {
3692         return a;  /* sqrt(+-0) = +-0 */
3693     }
3694     if (a.sign) {
3695         float_raise(float_flag_invalid, s);
3696         parts_default_nan(&a, s);
3697         return a;
3698     }
3699     if (a.cls == float_class_inf) {
3700         return a;  /* sqrt(+inf) = +inf */
3701     }
3702
3703     assert(a.cls == float_class_normal);
3704
3705     /* We need two overflow bits at the top. Adding room for that is a
3706      * right shift. If the exponent is odd, we can discard the low bit
3707      * by multiplying the fraction by 2; that's a left shift. Combine
3708      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3709      */
3710     a_frac = a.frac >> (2 - (a.exp & 1));
3711     a.exp >>= 1;
3712
3713     /* Bit-by-bit computation of sqrt.  */
3714     r_frac = 0;
3715     s_frac = 0;
3716
3717     /* Iterate from implicit bit down to the 3 extra bits to compute a
3718      * properly rounded result. Remember we've inserted two more bits
3719      * at the top, so these positions are two less.
3720      */
3721     bit = DECOMPOSED_BINARY_POINT - 2;
3722     last_bit = MAX(p->frac_shift - 4, 0);
3723     do {
3724         uint64_t q = 1ULL << bit;
3725         uint64_t t_frac = s_frac + q;
3726         if (t_frac <= a_frac) {
3727             s_frac = t_frac + q;
3728             a_frac -= t_frac;
3729             r_frac += q;
3730         }
3731         a_frac <<= 1;
3732     } while (--bit >= last_bit);
3733
3734     /* Undo the right shift done above. If there is any remaining
3735      * fraction, the result is inexact. Set the sticky bit.
3736      */
3737     a.frac = (r_frac << 2) + (a_frac != 0);
3738
3739     return a;
3740 }
3741
3742 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3743 {
3744     FloatParts64 pa, pr;
3745
3746     float16_unpack_canonical(&pa, a, status);
3747     pr = sqrt_float(pa, status, &float16_params);
3748     return float16_round_pack_canonical(&pr, status);
3749 }
3750
3751 static float32 QEMU_SOFTFLOAT_ATTR
3752 soft_f32_sqrt(float32 a, float_status *status)
3753 {
3754     FloatParts64 pa, pr;
3755
3756     float32_unpack_canonical(&pa, a, status);
3757     pr = sqrt_float(pa, status, &float32_params);
3758     return float32_round_pack_canonical(&pr, status);
3759 }
3760
3761 static float64 QEMU_SOFTFLOAT_ATTR
3762 soft_f64_sqrt(float64 a, float_status *status)
3763 {
3764     FloatParts64 pa, pr;
3765
3766     float64_unpack_canonical(&pa, a, status);
3767     pr = sqrt_float(pa, status, &float64_params);
3768     return float64_round_pack_canonical(&pr, status);
3769 }
3770
3771 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3772 {
3773     union_float32 ua, ur;
3774
3775     ua.s = xa;
3776     if (unlikely(!can_use_fpu(s))) {
3777         goto soft;
3778     }
3779
3780     float32_input_flush1(&ua.s, s);
3781     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3782         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3783                        fpclassify(ua.h) == FP_ZERO) ||
3784                      signbit(ua.h))) {
3785             goto soft;
3786         }
3787     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3788                         float32_is_neg(ua.s))) {
3789         goto soft;
3790     }
3791     ur.h = sqrtf(ua.h);
3792     return ur.s;
3793
3794  soft:
3795     return soft_f32_sqrt(ua.s, s);
3796 }
3797
3798 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3799 {
3800     union_float64 ua, ur;
3801
3802     ua.s = xa;
3803     if (unlikely(!can_use_fpu(s))) {
3804         goto soft;
3805     }
3806
3807     float64_input_flush1(&ua.s, s);
3808     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3809         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3810                        fpclassify(ua.h) == FP_ZERO) ||
3811                      signbit(ua.h))) {
3812             goto soft;
3813         }
3814     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3815                         float64_is_neg(ua.s))) {
3816         goto soft;
3817     }
3818     ur.h = sqrt(ua.h);
3819     return ur.s;
3820
3821  soft:
3822     return soft_f64_sqrt(ua.s, s);
3823 }
3824
3825 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3826 {
3827     FloatParts64 pa, pr;
3828
3829     bfloat16_unpack_canonical(&pa, a, status);
3830     pr = sqrt_float(pa, status, &bfloat16_params);
3831     return bfloat16_round_pack_canonical(&pr, status);
3832 }
3833
3834 /*----------------------------------------------------------------------------
3835 | The pattern for a default generated NaN.
3836 *----------------------------------------------------------------------------*/
3837
3838 float16 float16_default_nan(float_status *status)
3839 {
3840     FloatParts64 p;
3841
3842     parts_default_nan(&p, status);
3843     p.frac >>= float16_params.frac_shift;
3844     return float16_pack_raw(&p);
3845 }
3846
3847 float32 float32_default_nan(float_status *status)
3848 {
3849     FloatParts64 p;
3850
3851     parts_default_nan(&p, status);
3852     p.frac >>= float32_params.frac_shift;
3853     return float32_pack_raw(&p);
3854 }
3855
3856 float64 float64_default_nan(float_status *status)
3857 {
3858     FloatParts64 p;
3859
3860     parts_default_nan(&p, status);
3861     p.frac >>= float64_params.frac_shift;
3862     return float64_pack_raw(&p);
3863 }
3864
3865 float128 float128_default_nan(float_status *status)
3866 {
3867     FloatParts128 p;
3868
3869     parts_default_nan(&p, status);
3870     frac_shr(&p, float128_params.frac_shift);
3871     return float128_pack_raw(&p);
3872 }
3873
3874 bfloat16 bfloat16_default_nan(float_status *status)
3875 {
3876     FloatParts64 p;
3877
3878     parts_default_nan(&p, status);
3879     p.frac >>= bfloat16_params.frac_shift;
3880     return bfloat16_pack_raw(&p);
3881 }
3882
3883 /*----------------------------------------------------------------------------
3884 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3885 *----------------------------------------------------------------------------*/
3886
3887 float16 float16_silence_nan(float16 a, float_status *status)
3888 {
3889     FloatParts64 p;
3890
3891     float16_unpack_raw(&p, a);
3892     p.frac <<= float16_params.frac_shift;
3893     parts_silence_nan(&p, status);
3894     p.frac >>= float16_params.frac_shift;
3895     return float16_pack_raw(&p);
3896 }
3897
3898 float32 float32_silence_nan(float32 a, float_status *status)
3899 {
3900     FloatParts64 p;
3901
3902     float32_unpack_raw(&p, a);
3903     p.frac <<= float32_params.frac_shift;
3904     parts_silence_nan(&p, status);
3905     p.frac >>= float32_params.frac_shift;
3906     return float32_pack_raw(&p);
3907 }
3908
3909 float64 float64_silence_nan(float64 a, float_status *status)
3910 {
3911     FloatParts64 p;
3912
3913     float64_unpack_raw(&p, a);
3914     p.frac <<= float64_params.frac_shift;
3915     parts_silence_nan(&p, status);
3916     p.frac >>= float64_params.frac_shift;
3917     return float64_pack_raw(&p);
3918 }
3919
3920 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3921 {
3922     FloatParts64 p;
3923
3924     bfloat16_unpack_raw(&p, a);
3925     p.frac <<= bfloat16_params.frac_shift;
3926     parts_silence_nan(&p, status);
3927     p.frac >>= bfloat16_params.frac_shift;
3928     return bfloat16_pack_raw(&p);
3929 }
3930
3931 float128 float128_silence_nan(float128 a, float_status *status)
3932 {
3933     FloatParts128 p;
3934
3935     float128_unpack_raw(&p, a);
3936     frac_shl(&p, float128_params.frac_shift);
3937     parts_silence_nan(&p, status);
3938     frac_shr(&p, float128_params.frac_shift);
3939     return float128_pack_raw(&p);
3940 }
3941
3942 /*----------------------------------------------------------------------------
3943 | If `a' is denormal and we are in flush-to-zero mode then set the
3944 | input-denormal exception and return zero. Otherwise just return the value.
3945 *----------------------------------------------------------------------------*/
3946
3947 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3948 {
3949     if (p.exp == 0 && p.frac != 0) {
3950         float_raise(float_flag_input_denormal, status);
3951         return true;
3952     }
3953
3954     return false;
3955 }
3956
3957 float16 float16_squash_input_denormal(float16 a, float_status *status)
3958 {
3959     if (status->flush_inputs_to_zero) {
3960         FloatParts64 p;
3961
3962         float16_unpack_raw(&p, a);
3963         if (parts_squash_denormal(p, status)) {
3964             return float16_set_sign(float16_zero, p.sign);
3965         }
3966     }
3967     return a;
3968 }
3969
3970 float32 float32_squash_input_denormal(float32 a, float_status *status)
3971 {
3972     if (status->flush_inputs_to_zero) {
3973         FloatParts64 p;
3974
3975         float32_unpack_raw(&p, a);
3976         if (parts_squash_denormal(p, status)) {
3977             return float32_set_sign(float32_zero, p.sign);
3978         }
3979     }
3980     return a;
3981 }
3982
3983 float64 float64_squash_input_denormal(float64 a, float_status *status)
3984 {
3985     if (status->flush_inputs_to_zero) {
3986         FloatParts64 p;
3987
3988         float64_unpack_raw(&p, a);
3989         if (parts_squash_denormal(p, status)) {
3990             return float64_set_sign(float64_zero, p.sign);
3991         }
3992     }
3993     return a;
3994 }
3995
3996 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3997 {
3998     if (status->flush_inputs_to_zero) {
3999         FloatParts64 p;
4000
4001         bfloat16_unpack_raw(&p, a);
4002         if (parts_squash_denormal(p, status)) {
4003             return bfloat16_set_sign(bfloat16_zero, p.sign);
4004         }
4005     }
4006     return a;
4007 }
4008
4009 /*----------------------------------------------------------------------------
4010 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
4011 | and 7, and returns the properly rounded 32-bit integer corresponding to the
4012 | input.  If `zSign' is 1, the input is negated before being converted to an
4013 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
4014 | is simply rounded to an integer, with the inexact exception raised if the
4015 | input cannot be represented exactly as an integer.  However, if the fixed-
4016 | point input is too large, the invalid exception is raised and the largest
4017 | positive or negative integer is returned.
4018 *----------------------------------------------------------------------------*/
4019
4020 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
4021                                  float_status *status)
4022 {
4023     int8_t roundingMode;
4024     bool roundNearestEven;
4025     int8_t roundIncrement, roundBits;
4026     int32_t z;
4027
4028     roundingMode = status->float_rounding_mode;
4029     roundNearestEven = ( roundingMode == float_round_nearest_even );
4030     switch (roundingMode) {
4031     case float_round_nearest_even:
4032     case float_round_ties_away:
4033         roundIncrement = 0x40;
4034         break;
4035     case float_round_to_zero:
4036         roundIncrement = 0;
4037         break;
4038     case float_round_up:
4039         roundIncrement = zSign ? 0 : 0x7f;
4040         break;
4041     case float_round_down:
4042         roundIncrement = zSign ? 0x7f : 0;
4043         break;
4044     case float_round_to_odd:
4045         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
4046         break;
4047     default:
4048         abort();
4049     }
4050     roundBits = absZ & 0x7F;
4051     absZ = ( absZ + roundIncrement )>>7;
4052     if (!(roundBits ^ 0x40) && roundNearestEven) {
4053         absZ &= ~1;
4054     }
4055     z = absZ;
4056     if ( zSign ) z = - z;
4057     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
4058         float_raise(float_flag_invalid, status);
4059         return zSign ? INT32_MIN : INT32_MAX;
4060     }
4061     if (roundBits) {
4062         float_raise(float_flag_inexact, status);
4063     }
4064     return z;
4065
4066 }
4067
4068 /*----------------------------------------------------------------------------
4069 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4070 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4071 | and returns the properly rounded 64-bit integer corresponding to the input.
4072 | If `zSign' is 1, the input is negated before being converted to an integer.
4073 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4074 | the inexact exception raised if the input cannot be represented exactly as
4075 | an integer.  However, if the fixed-point input is too large, the invalid
4076 | exception is raised and the largest positive or negative integer is
4077 | returned.
4078 *----------------------------------------------------------------------------*/
4079
4080 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
4081                                float_status *status)
4082 {
4083     int8_t roundingMode;
4084     bool roundNearestEven, increment;
4085     int64_t z;
4086
4087     roundingMode = status->float_rounding_mode;
4088     roundNearestEven = ( roundingMode == float_round_nearest_even );
4089     switch (roundingMode) {
4090     case float_round_nearest_even:
4091     case float_round_ties_away:
4092         increment = ((int64_t) absZ1 < 0);
4093         break;
4094     case float_round_to_zero:
4095         increment = 0;
4096         break;
4097     case float_round_up:
4098         increment = !zSign && absZ1;
4099         break;
4100     case float_round_down:
4101         increment = zSign && absZ1;
4102         break;
4103     case float_round_to_odd:
4104         increment = !(absZ0 & 1) && absZ1;
4105         break;
4106     default:
4107         abort();
4108     }
4109     if ( increment ) {
4110         ++absZ0;
4111         if ( absZ0 == 0 ) goto overflow;
4112         if (!(absZ1 << 1) && roundNearestEven) {
4113             absZ0 &= ~1;
4114         }
4115     }
4116     z = absZ0;
4117     if ( zSign ) z = - z;
4118     if ( z && ( ( z < 0 ) ^ zSign ) ) {
4119  overflow:
4120         float_raise(float_flag_invalid, status);
4121         return zSign ? INT64_MIN : INT64_MAX;
4122     }
4123     if (absZ1) {
4124         float_raise(float_flag_inexact, status);
4125     }
4126     return z;
4127
4128 }
4129
4130 /*----------------------------------------------------------------------------
4131 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4132 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4133 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4134 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
4135 | with the inexact exception raised if the input cannot be represented exactly
| as an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.  If
| `zSign' is 1 and the rounded value is nonzero, the invalid exception is
| raised and 0 is returned.
4138 *----------------------------------------------------------------------------*/
4139
4140 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
4141                                 uint64_t absZ1, float_status *status)
4142 {
4143     int8_t roundingMode;
4144     bool roundNearestEven, increment;
4145
4146     roundingMode = status->float_rounding_mode;
4147     roundNearestEven = (roundingMode == float_round_nearest_even);
4148     switch (roundingMode) {
4149     case float_round_nearest_even:
4150     case float_round_ties_away:
4151         increment = ((int64_t)absZ1 < 0);
4152         break;
4153     case float_round_to_zero:
4154         increment = 0;
4155         break;
4156     case float_round_up:
4157         increment = !zSign && absZ1;
4158         break;
4159     case float_round_down:
4160         increment = zSign && absZ1;
4161         break;
4162     case float_round_to_odd:
4163         increment = !(absZ0 & 1) && absZ1;
4164         break;
4165     default:
4166         abort();
4167     }
4168     if (increment) {
4169         ++absZ0;
4170         if (absZ0 == 0) {
4171             float_raise(float_flag_invalid, status);
4172             return UINT64_MAX;
4173         }
4174         if (!(absZ1 << 1) && roundNearestEven) {
4175             absZ0 &= ~1;
4176         }
4177     }
4178
4179     if (zSign && absZ0) {
4180         float_raise(float_flag_invalid, status);
4181         return 0;
4182     }
4183
4184     if (absZ1) {
4185         float_raise(float_flag_inexact, status);
4186     }
4187     return absZ0;
4188 }
4189
4190 /*----------------------------------------------------------------------------
4191 | Normalizes the subnormal single-precision floating-point value represented
4192 | by the denormalized significand `aSig'.  The normalized exponent and
4193 | significand are stored at the locations pointed to by `zExpPtr' and
4194 | `zSigPtr', respectively.
4195 *----------------------------------------------------------------------------*/
4196
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that moves the leading set bit of aSig up to bit 23. */
    const int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4207
4208 /*----------------------------------------------------------------------------
4209 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4210 | and significand `zSig', and returns the proper single-precision floating-
4211 | point value corresponding to the abstract input.  Ordinarily, the abstract
4212 | value is simply rounded and packed into the single-precision format, with
4213 | the inexact exception raised if the abstract input cannot be represented
4214 | exactly.  However, if the abstract value is too large, the overflow and
4215 | inexact exceptions are raised and an infinity or maximal finite value is
4216 | returned.  If the abstract value is too small, the input value is rounded to
4217 | a subnormal number, and the underflow and inexact exceptions are raised if
4218 | the abstract input cannot be represented exactly as a subnormal single-
4219 | precision floating-point number.
4220 |     The input significand `zSig' has its binary point between bits 30
4221 | and 29, which is 7 bits to the left of the usual location.  This shifted
4222 | significand must be normalized or smaller.  If `zSig' is not normalized,
4223 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4224 | and it must not require rounding.  In the usual case that `zSig' is
4225 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4226 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4227 | Binary Floating-Point Arithmetic.
4228 *----------------------------------------------------------------------------*/
4229
4230 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4231                                    float_status *status)
4232 {
4233     int8_t roundingMode;
4234     bool roundNearestEven;
4235     int8_t roundIncrement, roundBits;
4236     bool isTiny;
4237
4238     roundingMode = status->float_rounding_mode;
4239     roundNearestEven = ( roundingMode == float_round_nearest_even );
4240     switch (roundingMode) {
4241     case float_round_nearest_even:
4242     case float_round_ties_away:
4243         roundIncrement = 0x40;
4244         break;
4245     case float_round_to_zero:
4246         roundIncrement = 0;
4247         break;
4248     case float_round_up:
4249         roundIncrement = zSign ? 0 : 0x7f;
4250         break;
4251     case float_round_down:
4252         roundIncrement = zSign ? 0x7f : 0;
4253         break;
4254     case float_round_to_odd:
4255         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4256         break;
4257     default:
4258         abort();
4259         break;
4260     }
4261     roundBits = zSig & 0x7F;
4262     if ( 0xFD <= (uint16_t) zExp ) {
4263         if (    ( 0xFD < zExp )
4264              || (    ( zExp == 0xFD )
4265                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4266            ) {
4267             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4268                                    roundIncrement != 0;
4269             float_raise(float_flag_overflow | float_flag_inexact, status);
4270             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4271         }
4272         if ( zExp < 0 ) {
4273             if (status->flush_to_zero) {
4274                 float_raise(float_flag_output_denormal, status);
4275                 return packFloat32(zSign, 0, 0);
4276             }
4277             isTiny = status->tininess_before_rounding
4278                   || (zExp < -1)
4279                   || (zSig + roundIncrement < 0x80000000);
4280             shift32RightJamming( zSig, - zExp, &zSig );
4281             zExp = 0;
4282             roundBits = zSig & 0x7F;
4283             if (isTiny && roundBits) {
4284                 float_raise(float_flag_underflow, status);
4285             }
4286             if (roundingMode == float_round_to_odd) {
4287                 /*
4288                  * For round-to-odd case, the roundIncrement depends on
4289                  * zSig which just changed.
4290                  */
4291                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4292             }
4293         }
4294     }
4295     if (roundBits) {
4296         float_raise(float_flag_inexact, status);
4297     }
4298     zSig = ( zSig + roundIncrement )>>7;
4299     if (!(roundBits ^ 0x40) && roundNearestEven) {
4300         zSig &= ~1;
4301     }
4302     if ( zSig == 0 ) zExp = 0;
4303     return packFloat32( zSign, zExp, zSig );
4304
4305 }
4306
4307 /*----------------------------------------------------------------------------
4308 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4309 | and significand `zSig', and returns the proper single-precision floating-
4310 | point value corresponding to the abstract input.  This routine is just like
4311 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4312 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4313 | floating-point exponent.
4314 *----------------------------------------------------------------------------*/
4315
4316 static float32
4317  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4318                               float_status *status)
4319 {
4320     int8_t shiftCount;
4321
4322     shiftCount = clz32(zSig) - 1;
4323     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4324                                status);
4325
4326 }
4327
4328 /*----------------------------------------------------------------------------
4329 | Normalizes the subnormal double-precision floating-point value represented
4330 | by the denormalized significand `aSig'.  The normalized exponent and
4331 | significand are stored at the locations pointed to by `zExpPtr' and
4332 | `zSigPtr', respectively.
4333 *----------------------------------------------------------------------------*/
4334
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Left shift that moves the leading set bit of aSig up to bit 52. */
    const int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4345
4346 /*----------------------------------------------------------------------------
4347 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4348 | double-precision floating-point value, returning the result.  After being
4349 | shifted into the proper positions, the three fields are simply added
4350 | together to form the result.  This means that any integer portion of `zSig'
4351 | will be added into the exponent.  Since a properly normalized significand
4352 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4353 | than the desired result exponent whenever `zSig' is a complete, normalized
4354 | significand.
4355 *----------------------------------------------------------------------------*/
4356
4357 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4358 {
4359
4360     return make_float64(
4361         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4362
4363 }
4364
4365 /*----------------------------------------------------------------------------
4366 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4367 | and significand `zSig', and returns the proper double-precision floating-
4368 | point value corresponding to the abstract input.  Ordinarily, the abstract
4369 | value is simply rounded and packed into the double-precision format, with
4370 | the inexact exception raised if the abstract input cannot be represented
4371 | exactly.  However, if the abstract value is too large, the overflow and
4372 | inexact exceptions are raised and an infinity or maximal finite value is
4373 | returned.  If the abstract value is too small, the input value is rounded to
4374 | a subnormal number, and the underflow and inexact exceptions are raised if
4375 | the abstract input cannot be represented exactly as a subnormal double-
4376 | precision floating-point number.
4377 |     The input significand `zSig' has its binary point between bits 62
4378 | and 61, which is 10 bits to the left of the usual location.  This shifted
4379 | significand must be normalized or smaller.  If `zSig' is not normalized,
4380 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4381 | and it must not require rounding.  In the usual case that `zSig' is
4382 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4383 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4384 | Binary Floating-Point Arithmetic.
4385 *----------------------------------------------------------------------------*/
4386
4387 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4388                                    float_status *status)
4389 {
4390     int8_t roundingMode;
4391     bool roundNearestEven;
4392     int roundIncrement, roundBits;
4393     bool isTiny;
4394
4395     roundingMode = status->float_rounding_mode;
4396     roundNearestEven = ( roundingMode == float_round_nearest_even );
4397     switch (roundingMode) {
4398     case float_round_nearest_even:
4399     case float_round_ties_away:
4400         roundIncrement = 0x200;
4401         break;
4402     case float_round_to_zero:
4403         roundIncrement = 0;
4404         break;
4405     case float_round_up:
4406         roundIncrement = zSign ? 0 : 0x3ff;
4407         break;
4408     case float_round_down:
4409         roundIncrement = zSign ? 0x3ff : 0;
4410         break;
4411     case float_round_to_odd:
4412         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4413         break;
4414     default:
4415         abort();
4416     }
4417     roundBits = zSig & 0x3FF;
4418     if ( 0x7FD <= (uint16_t) zExp ) {
4419         if (    ( 0x7FD < zExp )
4420              || (    ( zExp == 0x7FD )
4421                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4422            ) {
4423             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4424                                    roundIncrement != 0;
4425             float_raise(float_flag_overflow | float_flag_inexact, status);
4426             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4427         }
4428         if ( zExp < 0 ) {
4429             if (status->flush_to_zero) {
4430                 float_raise(float_flag_output_denormal, status);
4431                 return packFloat64(zSign, 0, 0);
4432             }
4433             isTiny = status->tininess_before_rounding
4434                   || (zExp < -1)
4435                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4436             shift64RightJamming( zSig, - zExp, &zSig );
4437             zExp = 0;
4438             roundBits = zSig & 0x3FF;
4439             if (isTiny && roundBits) {
4440                 float_raise(float_flag_underflow, status);
4441             }
4442             if (roundingMode == float_round_to_odd) {
4443                 /*
4444                  * For round-to-odd case, the roundIncrement depends on
4445                  * zSig which just changed.
4446                  */
4447                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4448             }
4449         }
4450     }
4451     if (roundBits) {
4452         float_raise(float_flag_inexact, status);
4453     }
4454     zSig = ( zSig + roundIncrement )>>10;
4455     if (!(roundBits ^ 0x200) && roundNearestEven) {
4456         zSig &= ~1;
4457     }
4458     if ( zSig == 0 ) zExp = 0;
4459     return packFloat64( zSign, zExp, zSig );
4460
4461 }
4462
4463 /*----------------------------------------------------------------------------
4464 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4465 | and significand `zSig', and returns the proper double-precision floating-
4466 | point value corresponding to the abstract input.  This routine is just like
4467 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4468 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4469 | floating-point exponent.
4470 *----------------------------------------------------------------------------*/
4471
4472 static float64
4473  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4474                               float_status *status)
4475 {
4476     int8_t shiftCount;
4477
4478     shiftCount = clz64(zSig) - 1;
4479     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4480                                status);
4481
4482 }
4483
4484 /*----------------------------------------------------------------------------
4485 | Normalizes the subnormal extended double-precision floating-point value
4486 | represented by the denormalized significand `aSig'.  The normalized exponent
4487 | and significand are stored at the locations pointed to by `zExpPtr' and
4488 | `zSigPtr', respectively.
4489 *----------------------------------------------------------------------------*/
4490
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Left shift that moves the leading set bit of aSig up to bit 63. */
    const int8_t shift = clz64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
4500
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|     The rounding mode is read from `status->float_rounding_mode'.  Only
| nearest-even, ties-away, to-zero, up and down are handled; any other mode
| reaches abort().  Exception flags are reported through float_raise() on
| `status'.
|     When rounding to 32 or 64 bits with `status->flush_to_zero' set, a
| result with `zExp' <= 0 is replaced by a zero of the same sign and
| float_flag_output_denormal is raised.  The full-precision path contains no
| flush_to_zero check.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* Low 11 bits of zSig0 are rounded away; increment is half of that. */
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        /* Low 40 bits of zSig0 are rounded away; increment is half of that. */
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        /* Any other precision value is handled as full precision. */
        goto precision80;
    }
    /* Fold a non-zero zSig1 into the lowest bit of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    /*
     * The nearest modes keep the half-unit increment chosen above; the
     * directed modes replace it with 0 (truncate) or the whole mask
     * (round away from zero whenever any discarded bit is set).
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare: true when zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow when the exponent is above 0x7FFE, or equals 0x7FFE and
         * adding the increment wraps zSig0.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /*
             * Shared overflow handling lives in the full-precision section;
             * roundMask still holds the reduced-precision mask there.
             */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Not tiny only when tininess is detected after rounding,
             * zExp == 0, and adding the increment wraps zSig0.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* Recompute the discarded bits from the shifted significand. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding carried into bit 63: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the lowest kept bit. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* The addition wrapped: bump the exponent and reset the significand. */
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    /* Exact tie under nearest-even: also clear the lowest kept bit. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: zSig0 is kept whole and zSig1 holds the discarded
     * bits.  The nearest modes round up when the top bit of zSig1 is set;
     * the directed modes round up on any non-zero zSig1 of the matching sign.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Unsigned compare: true when zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* With a zero mask, ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Modes that round toward zero for this sign return the exponent
             * 0x7FFE with the significand ~roundMask; the others return
             * infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * Not tiny only when tininess is detected after rounding,
             * zExp == 0, and the increment carries out of an all-ones zSig0.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Re-derive the increment from the shifted zSig1. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Only the top bit of zSig1 is set: exact tie, force even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* Rounding carried into bit 63: exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: bump the exponent and reset it. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Only the top bit of zSig1 is set: exact tie, force even. */
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4712
4713 /*----------------------------------------------------------------------------
4714 | Takes an abstract floating-point value having sign `zSign', exponent
4715 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4716 | and returns the proper extended double-precision floating-point value
4717 | corresponding to the abstract input.  This routine is just like
4718 | `roundAndPackFloatx80' except that the input significand does not have to be
4719 | normalized.
4720 *----------------------------------------------------------------------------*/
4721
4722 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4723                                        bool zSign, int32_t zExp,
4724                                        uint64_t zSig0, uint64_t zSig1,
4725                                        float_status *status)
4726 {
4727     int8_t shiftCount;
4728
4729     if ( zSig0 == 0 ) {
4730         zSig0 = zSig1;
4731         zSig1 = 0;
4732         zExp -= 64;
4733     }
4734     shiftCount = clz64(zSig0);
4735     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4736     zExp -= shiftCount;
4737     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4738                                 zSig0, zSig1, status);
4739
4740 }
4741
4742 /*----------------------------------------------------------------------------
4743 | Returns the least-significant 64 fraction bits of the quadruple-precision
4744 | floating-point value `a'.
4745 *----------------------------------------------------------------------------*/
4746
4747 static inline uint64_t extractFloat128Frac1( float128 a )
4748 {
4749
4750     return a.low;
4751
4752 }
4753
4754 /*----------------------------------------------------------------------------
4755 | Returns the most-significant 48 fraction bits of the quadruple-precision
4756 | floating-point value `a'.
4757 *----------------------------------------------------------------------------*/
4758
4759 static inline uint64_t extractFloat128Frac0( float128 a )
4760 {
4761
4762     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4763
4764 }
4765
4766 /*----------------------------------------------------------------------------
4767 | Returns the exponent bits of the quadruple-precision floating-point value
4768 | `a'.
4769 *----------------------------------------------------------------------------*/
4770
4771 static inline int32_t extractFloat128Exp( float128 a )
4772 {
4773
4774     return ( a.high>>48 ) & 0x7FFF;
4775
4776 }
4777
4778 /*----------------------------------------------------------------------------
4779 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4780 *----------------------------------------------------------------------------*/
4781
4782 static inline bool extractFloat128Sign(float128 a)
4783 {
4784     return a.high >> 63;
4785 }
4786
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value whose
| denormalized significand is the concatenation of `aSig0' and `aSig1'.
| The normalized exponent is written to `*zExpPtr'.  The most significant
| 49 bits of the normalized significand are written to `*zSig0Ptr' and the
| least significant 64 bits to `*zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading one is in the high word: shift it up to bit 48. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /*
     * High word is empty, so the low word supplies every bit.  `shift' is
     * relative to having already moved the low word up by 64 bits.
     */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: the value straddles both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4827
4828 /*----------------------------------------------------------------------------
4829 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4830 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4831 | floating-point value, returning the result.  After being shifted into the
4832 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4833 | added together to form the most significant 32 bits of the result.  This
4834 | means that any integer portion of `zSig0' will be added into the exponent.
4835 | Since a properly normalized significand will have an integer portion equal
4836 | to 1, the `zExp' input should be 1 less than the desired result exponent
4837 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4838 | significand.
4839 *----------------------------------------------------------------------------*/
4840
4841 static inline float128
4842 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4843 {
4844     float128 z;
4845
4846     z.low = zSig1;
4847     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4848     return z;
4849 }
4850
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|     The rounding mode is read from `status->float_rounding_mode'.  Handled
| modes are nearest-even, ties-away, to-zero, up, down and to-odd; any other
| mode reaches abort().  If `status->flush_to_zero' is set and `zExp' is
| negative, a zero of the same sign is returned and
| float_flag_output_denormal is raised.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 holds the bits below the kept significand.  The nearest modes
     * round up when its top bit is set; the directed modes round up on any
     * non-zero zSig2 of the matching sign.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when inexact and the lowest kept bit is 0. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare: true when zExp < 0 or zExp >= 0x7FFD. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /*
         * Overflow when the exponent is above 0x7FFD, or equals 0x7FFD with
         * an all-ones significand that is about to be incremented.
         */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * To-odd and the modes that round toward zero for this sign
             * return exponent 0x7FFE with an all-ones fraction; the others
             * return exponent 0x7FFF with a zero fraction.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /*
             * Not tiny only when tininess is detected after rounding,
             * zExp == -1, and the increment is applied to an all-ones
             * significand.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /*
             * Re-derive the increment from the shifted significand, then
             * fall through to the common rounding code below.
             */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        /* A carry out of the fraction is absorbed by packFloat128's add. */
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Only the top bit of zSig2 is set: exact tie, force even. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        /* A zero significand is packed with a zero exponent. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
4983
4984 /*----------------------------------------------------------------------------
4985 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4986 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4987 | returns the proper quadruple-precision floating-point value corresponding
4988 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4989 | except that the input significand has fewer bits and does not have to be
4990 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4991 | point exponent.
4992 *----------------------------------------------------------------------------*/
4993
4994 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4995                                               uint64_t zSig0, uint64_t zSig1,
4996                                               float_status *status)
4997 {
4998     int8_t shiftCount;
4999     uint64_t zSig2;
5000
5001     if ( zSig0 == 0 ) {
5002         zSig0 = zSig1;
5003         zSig1 = 0;
5004         zExp -= 64;
5005     }
5006     shiftCount = clz64(zSig0) - 15;
5007     if ( 0 <= shiftCount ) {
5008         zSig2 = 0;
5009         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5010     }
5011     else {
5012         shift128ExtraRightJamming(
5013             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
5014     }
5015     zExp -= shiftCount;
5016     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
5017
5018 }
5019
5020
5021 /*----------------------------------------------------------------------------
5022 | Returns the result of converting the 32-bit two's complement integer `a'
5023 | to the extended double-precision floating-point format.  The conversion
5024 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5025 | Arithmetic.
5026 *----------------------------------------------------------------------------*/
5027
5028 floatx80 int32_to_floatx80(int32_t a, float_status *status)
5029 {
5030     bool zSign;
5031     uint32_t absA;
5032     int8_t shiftCount;
5033     uint64_t zSig;
5034
5035     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5036     zSign = ( a < 0 );
5037     absA = zSign ? - a : a;
5038     shiftCount = clz32(absA) + 32;
5039     zSig = absA;
5040     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
5041
5042 }
5043
5044 /*----------------------------------------------------------------------------
5045 | Returns the result of converting the 32-bit two's complement integer `a' to
5046 | the quadruple-precision floating-point format.  The conversion is performed
5047 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5048 *----------------------------------------------------------------------------*/
5049
5050 float128 int32_to_float128(int32_t a, float_status *status)
5051 {
5052     bool zSign;
5053     uint32_t absA;
5054     int8_t shiftCount;
5055     uint64_t zSig0;
5056
5057     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5058     zSign = ( a < 0 );
5059     absA = zSign ? - a : a;
5060     shiftCount = clz32(absA) + 17;
5061     zSig0 = absA;
5062     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
5063
5064 }
5065
5066 /*----------------------------------------------------------------------------
5067 | Returns the result of converting the 64-bit two's complement integer `a'
5068 | to the extended double-precision floating-point format.  The conversion
5069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5070 | Arithmetic.
5071 *----------------------------------------------------------------------------*/
5072
5073 floatx80 int64_to_floatx80(int64_t a, float_status *status)
5074 {
5075     bool zSign;
5076     uint64_t absA;
5077     int8_t shiftCount;
5078
5079     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5080     zSign = ( a < 0 );
5081     absA = zSign ? - a : a;
5082     shiftCount = clz64(absA);
5083     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
5084
5085 }
5086
5087 /*----------------------------------------------------------------------------
5088 | Returns the result of converting the 64-bit two's complement integer `a' to
5089 | the quadruple-precision floating-point format.  The conversion is performed
5090 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5091 *----------------------------------------------------------------------------*/
5092
5093 float128 int64_to_float128(int64_t a, float_status *status)
5094 {
5095     bool zSign;
5096     uint64_t absA;
5097     int8_t shiftCount;
5098     int32_t zExp;
5099     uint64_t zSig0, zSig1;
5100
5101     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5102     zSign = ( a < 0 );
5103     absA = zSign ? - a : a;
5104     shiftCount = clz64(absA) + 49;
5105     zExp = 0x406E - shiftCount;
5106     if ( 64 <= shiftCount ) {
5107         zSig1 = 0;
5108         zSig0 = absA;
5109         shiftCount -= 64;
5110     }
5111     else {
5112         zSig1 = absA;
5113         zSig0 = 0;
5114     }
5115     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5116     return packFloat128( zSign, zExp, zSig0, zSig1 );
5117
5118 }
5119
5120 /*----------------------------------------------------------------------------
5121 | Returns the result of converting the 64-bit unsigned integer `a'
5122 | to the quadruple-precision floating-point format.  The conversion is performed
5123 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5124 *----------------------------------------------------------------------------*/
5125
5126 float128 uint64_to_float128(uint64_t a, float_status *status)
5127 {
5128     if (a == 0) {
5129         return float128_zero;
5130     }
5131     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5132 }
5133
5134 /*----------------------------------------------------------------------------
5135 | Returns the result of converting the single-precision floating-point value
5136 | `a' to the extended double-precision floating-point format.  The conversion
5137 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5138 | Arithmetic.
5139 *----------------------------------------------------------------------------*/
5140
5141 floatx80 float32_to_floatx80(float32 a, float_status *status)
5142 {
5143     bool aSign;
5144     int aExp;
5145     uint32_t aSig;
5146
5147     a = float32_squash_input_denormal(a, status);
5148     aSig = extractFloat32Frac( a );
5149     aExp = extractFloat32Exp( a );
5150     aSign = extractFloat32Sign( a );
5151     if ( aExp == 0xFF ) {
5152         if (aSig) {
5153             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5154                                                status);
5155             return floatx80_silence_nan(res, status);
5156         }
5157         return packFloatx80(aSign,
5158                             floatx80_infinity_high,
5159                             floatx80_infinity_low);
5160     }
5161     if ( aExp == 0 ) {
5162         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5163         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5164     }
5165     aSig |= 0x00800000;
5166     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5167
5168 }
5169
5170 /*----------------------------------------------------------------------------
5171 | Returns the result of converting the single-precision floating-point value
5172 | `a' to the double-precision floating-point format.  The conversion is
5173 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5174 | Arithmetic.
5175 *----------------------------------------------------------------------------*/
5176
5177 float128 float32_to_float128(float32 a, float_status *status)
5178 {
5179     bool aSign;
5180     int aExp;
5181     uint32_t aSig;
5182
5183     a = float32_squash_input_denormal(a, status);
5184     aSig = extractFloat32Frac( a );
5185     aExp = extractFloat32Exp( a );
5186     aSign = extractFloat32Sign( a );
5187     if ( aExp == 0xFF ) {
5188         if (aSig) {
5189             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5190         }
5191         return packFloat128( aSign, 0x7FFF, 0, 0 );
5192     }
5193     if ( aExp == 0 ) {
5194         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5195         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5196         --aExp;
5197     }
5198     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5199
5200 }
5201
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|     Both inputs first go through `float32_squash_input_denormal'.  Special
| cases, as coded below: a NaN operand is propagated; an infinite `a' or a
| zero `b' raises the invalid exception and returns the default NaN; an
| infinite `b' or a zero `a' returns `a' unchanged.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* `a' is a NaN, or `a' is infinite and `b' is a NaN. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* `a' is infinite and `b' is not a NaN. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* `b' is infinite and `a' is finite: `a' is returned as is. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* `b' is zero. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Set bit 23 (just above the fraction field) in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: one 64-by-32-bit division is enough. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* a's exponent is at least 2 below b's: return `a' unchanged. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: reduce expDiff by 62 per loop pass. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /*
             * NOTE(review): the clamp to q64 - 2 presumably compensates for
             * estimateDiv128To64 (defined elsewhere) overestimating the
             * quotient -- confirm against that helper's documentation.
             */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Subtract bSig until aSig goes negative.  alternateASig keeps the last
     * non-negative value and q counts the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * aSig is now negative and alternateASig non-negative.  Keep whichever
     * has the smaller magnitude; on an exact tie keep alternateASig when q
     * is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the sign of the result relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
5303
5304
5305
5306 /*----------------------------------------------------------------------------
5307 | Returns the binary exponential of the single-precision floating-point value
5308 | `a'. The operation is performed according to the IEC/IEEE Standard for
5309 | Binary Floating-Point Arithmetic.
5310 |
5311 | Uses the following identities:
5312 |
5313 | 1. -------------------------------------------------------------------------
5314 |      x    x*ln(2)
5315 |     2  = e
5316 |
5317 | 2. -------------------------------------------------------------------------
5318 |                      2     3     4     5           n
5319 |      x        x     x     x     x     x           x
5320 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5321 |               1!    2!    3!    4!    5!          n!
5322 *----------------------------------------------------------------------------*/
5323
/*
 * Taylor-series coefficients consumed by float32_exp2(): entry i holds the
 * double-precision encoding of 1/(i+1)!, i.e. the factor that multiplies
 * x^(i+1) in the series for e^x.  The trailing comment on each line is
 * n = i + 1.
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
5342
5343 float32 float32_exp2(float32 a, float_status *status)
5344 {
5345     bool aSign;
5346     int aExp;
5347     uint32_t aSig;
5348     float64 r, x, xn;
5349     int i;
5350     a = float32_squash_input_denormal(a, status);
5351
5352     aSig = extractFloat32Frac( a );
5353     aExp = extractFloat32Exp( a );
5354     aSign = extractFloat32Sign( a );
5355
5356     if ( aExp == 0xFF) {
5357         if (aSig) {
5358             return propagateFloat32NaN(a, float32_zero, status);
5359         }
5360         return (aSign) ? float32_zero : a;
5361     }
5362     if (aExp == 0) {
5363         if (aSig == 0) return float32_one;
5364     }
5365
5366     float_raise(float_flag_inexact, status);
5367
5368     /* ******************************* */
5369     /* using float64 for approximation */
5370     /* ******************************* */
5371     x = float32_to_float64(a, status);
5372     x = float64_mul(x, float64_ln2, status);
5373
5374     xn = x;
5375     r = float64_one;
5376     for (i = 0 ; i < 15 ; i++) {
5377         float64 f;
5378
5379         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5380         r = float64_add(r, f, status);
5381
5382         xn = float64_mul(xn, x, status);
5383     }
5384
5385     return float64_to_float32(r, status);
5386 }
5387
5388 /*----------------------------------------------------------------------------
5389 | Returns the binary log of the single-precision floating-point value `a'.
5390 | The operation is performed according to the IEC/IEEE Standard for Binary
5391 | Floating-Point Arithmetic.
5392 *----------------------------------------------------------------------------*/
5393 float32 float32_log2(float32 a, float_status *status)
5394 {
5395     bool aSign, zSign;
5396     int aExp;
5397     uint32_t aSig, zSig, i;
5398
5399     a = float32_squash_input_denormal(a, status);
5400     aSig = extractFloat32Frac( a );
5401     aExp = extractFloat32Exp( a );
5402     aSign = extractFloat32Sign( a );
5403
5404     if ( aExp == 0 ) {
5405         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5406         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5407     }
5408     if ( aSign ) {
5409         float_raise(float_flag_invalid, status);
5410         return float32_default_nan(status);
5411     }
5412     if ( aExp == 0xFF ) {
5413         if (aSig) {
5414             return propagateFloat32NaN(a, float32_zero, status);
5415         }
5416         return a;
5417     }
5418
5419     aExp -= 0x7F;
5420     aSig |= 0x00800000;
5421     zSign = aExp < 0;
5422     zSig = aExp << 23;
5423
5424     for (i = 1 << 22; i > 0; i >>= 1) {
5425         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5426         if ( aSig & 0x01000000 ) {
5427             aSig >>= 1;
5428             zSig |= i;
5429         }
5430     }
5431
5432     if ( zSign )
5433         zSig = -zSig;
5434
5435     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5436 }
5437
5438 /*----------------------------------------------------------------------------
5439 | Returns the result of converting the double-precision floating-point value
5440 | `a' to the extended double-precision floating-point format.  The conversion
5441 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5442 | Arithmetic.
5443 *----------------------------------------------------------------------------*/
5444
5445 floatx80 float64_to_floatx80(float64 a, float_status *status)
5446 {
5447     bool aSign;
5448     int aExp;
5449     uint64_t aSig;
5450
5451     a = float64_squash_input_denormal(a, status);
5452     aSig = extractFloat64Frac( a );
5453     aExp = extractFloat64Exp( a );
5454     aSign = extractFloat64Sign( a );
5455     if ( aExp == 0x7FF ) {
5456         if (aSig) {
5457             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5458                                                status);
5459             return floatx80_silence_nan(res, status);
5460         }
5461         return packFloatx80(aSign,
5462                             floatx80_infinity_high,
5463                             floatx80_infinity_low);
5464     }
5465     if ( aExp == 0 ) {
5466         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5467         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5468     }
5469     return
5470         packFloatx80(
5471             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5472
5473 }
5474
5475 /*----------------------------------------------------------------------------
5476 | Returns the result of converting the double-precision floating-point value
5477 | `a' to the quadruple-precision floating-point format.  The conversion is
5478 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5479 | Arithmetic.
5480 *----------------------------------------------------------------------------*/
5481
5482 float128 float64_to_float128(float64 a, float_status *status)
5483 {
5484     bool aSign;
5485     int aExp;
5486     uint64_t aSig, zSig0, zSig1;
5487
5488     a = float64_squash_input_denormal(a, status);
5489     aSig = extractFloat64Frac( a );
5490     aExp = extractFloat64Exp( a );
5491     aSign = extractFloat64Sign( a );
5492     if ( aExp == 0x7FF ) {
5493         if (aSig) {
5494             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5495         }
5496         return packFloat128( aSign, 0x7FFF, 0, 0 );
5497     }
5498     if ( aExp == 0 ) {
5499         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5500         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5501         --aExp;
5502     }
5503     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5504     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5505
5506 }
5507
5508
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special operands handled before the division loop:
|   either operand NaN        -> NaN propagated
|   `a' infinite or `b' zero  -> default NaN, invalid exception raised
|   `b' infinite or `a' zero  -> `a' returned unchanged
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* `a' is a NaN or an infinity. */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* infinity rem b is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* finite rem infinity is `a' itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* a rem 0 is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Set the implicit bit and left-justify both significands in 64 bits. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* Exponent at least two below b's: |a| < |b|/2, so `a' is the answer. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit. */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /*
     * Each pass consumes 62 bits of exponent difference.  The quotient
     * estimate is lowered by 2 (clamped at 0) before use.
     * NOTE(review): presumably so that an over-estimate from
     * estimateDiv128To64 can never overshoot -- confirm against that helper.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: keep only the top expDiff quotient bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /*
     * Subtract bSig until the remainder turns negative.  alternateASig keeps
     * the last non-negative remainder; q counts the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /*
     * Pick whichever of the two candidate remainders is smaller in
     * magnitude.  On a tie (sigMean == 0) with q odd the non-negative
     * candidate is kept, which corresponds to the even quotient q - 1.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* Convert to sign and magnitude; a negative remainder flips a's sign. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5596
5597 /*----------------------------------------------------------------------------
5598 | Returns the binary log of the double-precision floating-point value `a'.
5599 | The operation is performed according to the IEC/IEEE Standard for Binary
5600 | Floating-Point Arithmetic.
5601 *----------------------------------------------------------------------------*/
5602 float64 float64_log2(float64 a, float_status *status)
5603 {
5604     bool aSign, zSign;
5605     int aExp;
5606     uint64_t aSig, aSig0, aSig1, zSig, i;
5607     a = float64_squash_input_denormal(a, status);
5608
5609     aSig = extractFloat64Frac( a );
5610     aExp = extractFloat64Exp( a );
5611     aSign = extractFloat64Sign( a );
5612
5613     if ( aExp == 0 ) {
5614         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5615         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5616     }
5617     if ( aSign ) {
5618         float_raise(float_flag_invalid, status);
5619         return float64_default_nan(status);
5620     }
5621     if ( aExp == 0x7FF ) {
5622         if (aSig) {
5623             return propagateFloat64NaN(a, float64_zero, status);
5624         }
5625         return a;
5626     }
5627
5628     aExp -= 0x3FF;
5629     aSig |= UINT64_C(0x0010000000000000);
5630     zSign = aExp < 0;
5631     zSig = (uint64_t)aExp << 52;
5632     for (i = 1LL << 51; i > 0; i >>= 1) {
5633         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5634         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5635         if ( aSig & UINT64_C(0x0020000000000000) ) {
5636             aSig >>= 1;
5637             zSig |= i;
5638         }
5639     }
5640
5641     if ( zSign )
5642         zSig = -zSig;
5643     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5644 }
5645
5646 /*----------------------------------------------------------------------------
5647 | Returns the result of converting the extended double-precision floating-
5648 | point value `a' to the 32-bit two's complement integer format.  The
5649 | conversion is performed according to the IEC/IEEE Standard for Binary
5650 | Floating-Point Arithmetic---which means in particular that the conversion
5651 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5652 | largest positive integer is returned.  Otherwise, if the conversion
5653 | overflows, the largest integer with the same sign as `a' is returned.
5654 *----------------------------------------------------------------------------*/
5655
5656 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5657 {
5658     bool aSign;
5659     int32_t aExp, shiftCount;
5660     uint64_t aSig;
5661
5662     if (floatx80_invalid_encoding(a)) {
5663         float_raise(float_flag_invalid, status);
5664         return 1 << 31;
5665     }
5666     aSig = extractFloatx80Frac( a );
5667     aExp = extractFloatx80Exp( a );
5668     aSign = extractFloatx80Sign( a );
5669     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5670     shiftCount = 0x4037 - aExp;
5671     if ( shiftCount <= 0 ) shiftCount = 1;
5672     shift64RightJamming( aSig, shiftCount, &aSig );
5673     return roundAndPackInt32(aSign, aSig, status);
5674
5675 }
5676
5677 /*----------------------------------------------------------------------------
5678 | Returns the result of converting the extended double-precision floating-
5679 | point value `a' to the 32-bit two's complement integer format.  The
5680 | conversion is performed according to the IEC/IEEE Standard for Binary
5681 | Floating-Point Arithmetic, except that the conversion is always rounded
5682 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5683 | Otherwise, if the conversion overflows, the largest integer with the same
5684 | sign as `a' is returned.
5685 *----------------------------------------------------------------------------*/
5686
5687 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5688 {
5689     bool aSign;
5690     int32_t aExp, shiftCount;
5691     uint64_t aSig, savedASig;
5692     int32_t z;
5693
5694     if (floatx80_invalid_encoding(a)) {
5695         float_raise(float_flag_invalid, status);
5696         return 1 << 31;
5697     }
5698     aSig = extractFloatx80Frac( a );
5699     aExp = extractFloatx80Exp( a );
5700     aSign = extractFloatx80Sign( a );
5701     if ( 0x401E < aExp ) {
5702         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5703         goto invalid;
5704     }
5705     else if ( aExp < 0x3FFF ) {
5706         if (aExp || aSig) {
5707             float_raise(float_flag_inexact, status);
5708         }
5709         return 0;
5710     }
5711     shiftCount = 0x403E - aExp;
5712     savedASig = aSig;
5713     aSig >>= shiftCount;
5714     z = aSig;
5715     if ( aSign ) z = - z;
5716     if ( ( z < 0 ) ^ aSign ) {
5717  invalid:
5718         float_raise(float_flag_invalid, status);
5719         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5720     }
5721     if ( ( aSig<<shiftCount ) != savedASig ) {
5722         float_raise(float_flag_inexact, status);
5723     }
5724     return z;
5725
5726 }
5727
5728 /*----------------------------------------------------------------------------
5729 | Returns the result of converting the extended double-precision floating-
5730 | point value `a' to the 64-bit two's complement integer format.  The
5731 | conversion is performed according to the IEC/IEEE Standard for Binary
5732 | Floating-Point Arithmetic---which means in particular that the conversion
5733 | is rounded according to the current rounding mode.  If `a' is a NaN,
5734 | the largest positive integer is returned.  Otherwise, if the conversion
5735 | overflows, the largest integer with the same sign as `a' is returned.
5736 *----------------------------------------------------------------------------*/
5737
5738 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5739 {
5740     bool aSign;
5741     int32_t aExp, shiftCount;
5742     uint64_t aSig, aSigExtra;
5743
5744     if (floatx80_invalid_encoding(a)) {
5745         float_raise(float_flag_invalid, status);
5746         return 1ULL << 63;
5747     }
5748     aSig = extractFloatx80Frac( a );
5749     aExp = extractFloatx80Exp( a );
5750     aSign = extractFloatx80Sign( a );
5751     shiftCount = 0x403E - aExp;
5752     if ( shiftCount <= 0 ) {
5753         if ( shiftCount ) {
5754             float_raise(float_flag_invalid, status);
5755             if (!aSign || floatx80_is_any_nan(a)) {
5756                 return INT64_MAX;
5757             }
5758             return INT64_MIN;
5759         }
5760         aSigExtra = 0;
5761     }
5762     else {
5763         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5764     }
5765     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5766
5767 }
5768
5769 /*----------------------------------------------------------------------------
5770 | Returns the result of converting the extended double-precision floating-
5771 | point value `a' to the 64-bit two's complement integer format.  The
5772 | conversion is performed according to the IEC/IEEE Standard for Binary
5773 | Floating-Point Arithmetic, except that the conversion is always rounded
5774 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5775 | Otherwise, if the conversion overflows, the largest integer with the same
5776 | sign as `a' is returned.
5777 *----------------------------------------------------------------------------*/
5778
5779 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5780 {
5781     bool aSign;
5782     int32_t aExp, shiftCount;
5783     uint64_t aSig;
5784     int64_t z;
5785
5786     if (floatx80_invalid_encoding(a)) {
5787         float_raise(float_flag_invalid, status);
5788         return 1ULL << 63;
5789     }
5790     aSig = extractFloatx80Frac( a );
5791     aExp = extractFloatx80Exp( a );
5792     aSign = extractFloatx80Sign( a );
5793     shiftCount = aExp - 0x403E;
5794     if ( 0 <= shiftCount ) {
5795         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5796         if ( ( a.high != 0xC03E ) || aSig ) {
5797             float_raise(float_flag_invalid, status);
5798             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5799                 return INT64_MAX;
5800             }
5801         }
5802         return INT64_MIN;
5803     }
5804     else if ( aExp < 0x3FFF ) {
5805         if (aExp | aSig) {
5806             float_raise(float_flag_inexact, status);
5807         }
5808         return 0;
5809     }
5810     z = aSig>>( - shiftCount );
5811     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5812         float_raise(float_flag_inexact, status);
5813     }
5814     if ( aSign ) z = - z;
5815     return z;
5816
5817 }
5818
5819 /*----------------------------------------------------------------------------
5820 | Returns the result of converting the extended double-precision floating-
5821 | point value `a' to the single-precision floating-point format.  The
5822 | conversion is performed according to the IEC/IEEE Standard for Binary
5823 | Floating-Point Arithmetic.
5824 *----------------------------------------------------------------------------*/
5825
5826 float32 floatx80_to_float32(floatx80 a, float_status *status)
5827 {
5828     bool aSign;
5829     int32_t aExp;
5830     uint64_t aSig;
5831
5832     if (floatx80_invalid_encoding(a)) {
5833         float_raise(float_flag_invalid, status);
5834         return float32_default_nan(status);
5835     }
5836     aSig = extractFloatx80Frac( a );
5837     aExp = extractFloatx80Exp( a );
5838     aSign = extractFloatx80Sign( a );
5839     if ( aExp == 0x7FFF ) {
5840         if ( (uint64_t) ( aSig<<1 ) ) {
5841             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5842                                              status);
5843             return float32_silence_nan(res, status);
5844         }
5845         return packFloat32( aSign, 0xFF, 0 );
5846     }
5847     shift64RightJamming( aSig, 33, &aSig );
5848     if ( aExp || aSig ) aExp -= 0x3F81;
5849     return roundAndPackFloat32(aSign, aExp, aSig, status);
5850
5851 }
5852
5853 /*----------------------------------------------------------------------------
5854 | Returns the result of converting the extended double-precision floating-
5855 | point value `a' to the double-precision floating-point format.  The
5856 | conversion is performed according to the IEC/IEEE Standard for Binary
5857 | Floating-Point Arithmetic.
5858 *----------------------------------------------------------------------------*/
5859
5860 float64 floatx80_to_float64(floatx80 a, float_status *status)
5861 {
5862     bool aSign;
5863     int32_t aExp;
5864     uint64_t aSig, zSig;
5865
5866     if (floatx80_invalid_encoding(a)) {
5867         float_raise(float_flag_invalid, status);
5868         return float64_default_nan(status);
5869     }
5870     aSig = extractFloatx80Frac( a );
5871     aExp = extractFloatx80Exp( a );
5872     aSign = extractFloatx80Sign( a );
5873     if ( aExp == 0x7FFF ) {
5874         if ( (uint64_t) ( aSig<<1 ) ) {
5875             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5876                                              status);
5877             return float64_silence_nan(res, status);
5878         }
5879         return packFloat64( aSign, 0x7FF, 0 );
5880     }
5881     shift64RightJamming( aSig, 1, &zSig );
5882     if ( aExp || aSig ) aExp -= 0x3C01;
5883     return roundAndPackFloat64(aSign, aExp, zSig, status);
5884
5885 }
5886
5887 /*----------------------------------------------------------------------------
5888 | Returns the result of converting the extended double-precision floating-
5889 | point value `a' to the quadruple-precision floating-point format.  The
5890 | conversion is performed according to the IEC/IEEE Standard for Binary
5891 | Floating-Point Arithmetic.
5892 *----------------------------------------------------------------------------*/
5893
5894 float128 floatx80_to_float128(floatx80 a, float_status *status)
5895 {
5896     bool aSign;
5897     int aExp;
5898     uint64_t aSig, zSig0, zSig1;
5899
5900     if (floatx80_invalid_encoding(a)) {
5901         float_raise(float_flag_invalid, status);
5902         return float128_default_nan(status);
5903     }
5904     aSig = extractFloatx80Frac( a );
5905     aExp = extractFloatx80Exp( a );
5906     aSign = extractFloatx80Sign( a );
5907     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5908         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5909                                            status);
5910         return float128_silence_nan(res, status);
5911     }
5912     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5913     return packFloat128( aSign, aExp, zSig0, zSig1 );
5914
5915 }
5916
5917 /*----------------------------------------------------------------------------
5918 | Rounds the extended double-precision floating-point value `a'
5919 | to the precision provided by floatx80_rounding_precision and returns the
5920 | result as an extended double-precision floating-point value.
5921 | The operation is performed according to the IEC/IEEE Standard for Binary
5922 | Floating-Point Arithmetic.
5923 *----------------------------------------------------------------------------*/
5924
5925 floatx80 floatx80_round(floatx80 a, float_status *status)
5926 {
5927     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5928                                 extractFloatx80Sign(a),
5929                                 extractFloatx80Exp(a),
5930                                 extractFloatx80Frac(a), 0, status);
5931 }
5932
5933 /*----------------------------------------------------------------------------
5934 | Rounds the extended double-precision floating-point value `a' to an integer,
5935 | and returns the result as an extended quadruple-precision floating-point
5936 | value.  The operation is performed according to the IEC/IEEE Standard for
5937 | Binary Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5939
5940 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5941 {
5942     bool aSign;
5943     int32_t aExp;
5944     uint64_t lastBitMask, roundBitsMask;
5945     floatx80 z;
5946
5947     if (floatx80_invalid_encoding(a)) {
5948         float_raise(float_flag_invalid, status);
5949         return floatx80_default_nan(status);
5950     }
5951     aExp = extractFloatx80Exp( a );
5952     if ( 0x403E <= aExp ) {
5953         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5954             return propagateFloatx80NaN(a, a, status);
5955         }
5956         return a;
5957     }
5958     if ( aExp < 0x3FFF ) {
5959         if (    ( aExp == 0 )
5960              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5961             return a;
5962         }
5963         float_raise(float_flag_inexact, status);
5964         aSign = extractFloatx80Sign( a );
5965         switch (status->float_rounding_mode) {
5966          case float_round_nearest_even:
5967             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5968                ) {
5969                 return
5970                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5971             }
5972             break;
5973         case float_round_ties_away:
5974             if (aExp == 0x3FFE) {
5975                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5976             }
5977             break;
5978          case float_round_down:
5979             return
5980                   aSign ?
5981                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5982                 : packFloatx80( 0, 0, 0 );
5983          case float_round_up:
5984             return
5985                   aSign ? packFloatx80( 1, 0, 0 )
5986                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5987
5988         case float_round_to_zero:
5989             break;
5990         default:
5991             g_assert_not_reached();
5992         }
5993         return packFloatx80( aSign, 0, 0 );
5994     }
5995     lastBitMask = 1;
5996     lastBitMask <<= 0x403E - aExp;
5997     roundBitsMask = lastBitMask - 1;
5998     z = a;
5999     switch (status->float_rounding_mode) {
6000     case float_round_nearest_even:
6001         z.low += lastBitMask>>1;
6002         if ((z.low & roundBitsMask) == 0) {
6003             z.low &= ~lastBitMask;
6004         }
6005         break;
6006     case float_round_ties_away:
6007         z.low += lastBitMask >> 1;
6008         break;
6009     case float_round_to_zero:
6010         break;
6011     case float_round_up:
6012         if (!extractFloatx80Sign(z)) {
6013             z.low += roundBitsMask;
6014         }
6015         break;
6016     case float_round_down:
6017         if (extractFloatx80Sign(z)) {
6018             z.low += roundBitsMask;
6019         }
6020         break;
6021     default:
6022         abort();
6023     }
6024     z.low &= ~ roundBitsMask;
6025     if ( z.low == 0 ) {
6026         ++z.high;
6027         z.low = UINT64_C(0x8000000000000000);
6028     }
6029     if (z.low != a.low) {
6030         float_raise(float_flag_inexact, status);
6031     }
6032     return z;
6033
6034 }
6035
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent is shifted right to align it with
| the other one; the bits shifted out are collected in `zSig1' and handed to
| the final rounding step.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity + finite is the infinity operand itself. */
            return a;
        }
        /* An exponent field of 0 is treated as 1 when aligning. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* finite + infinity: an infinity carrying the requested sign. */
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        /* An exponent field of 0 is treated as 1 when aligning. */
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        /*
         * NOTE(review): the unconditional shift presumably relies on both
         * operands having the integer bit set, so the 64-bit sum always
         * carries out -- confirm for unnormal inputs.
         */
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit still set: the sum is taken as normalized, no shift needed. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Shift right by one, set the top bit (the carry), bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
6115
6116 /*----------------------------------------------------------------------------
6117 | Returns the result of subtracting the absolute values of the extended
6118 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
6119 | difference is negated before being returned.  `zSign' is ignored if the
6120 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6121 | Standard for Binary Floating-Point Arithmetic.
6122 *----------------------------------------------------------------------------*/
6123
6124 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6125                                 float_status *status)
6126 {
6127     int32_t aExp, bExp, zExp;
6128     uint64_t aSig, bSig, zSig0, zSig1;
6129     int32_t expDiff;
6130
6131     aSig = extractFloatx80Frac( a );
6132     aExp = extractFloatx80Exp( a );
6133     bSig = extractFloatx80Frac( b );
6134     bExp = extractFloatx80Exp( b );
6135     expDiff = aExp - bExp;
6136     if ( 0 < expDiff ) goto aExpBigger;
6137     if ( expDiff < 0 ) goto bExpBigger;
6138     if ( aExp == 0x7FFF ) {
6139         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6140             return propagateFloatx80NaN(a, b, status);
6141         }
6142         float_raise(float_flag_invalid, status);
6143         return floatx80_default_nan(status);
6144     }
6145     if ( aExp == 0 ) {
6146         aExp = 1;
6147         bExp = 1;
6148     }
6149     zSig1 = 0;
6150     if ( bSig < aSig ) goto aBigger;
6151     if ( aSig < bSig ) goto bBigger;
6152     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
6153  bExpBigger:
6154     if ( bExp == 0x7FFF ) {
6155         if ((uint64_t)(bSig << 1)) {
6156             return propagateFloatx80NaN(a, b, status);
6157         }
6158         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
6159                             floatx80_infinity_low);
6160     }
6161     if ( aExp == 0 ) ++expDiff;
6162     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6163  bBigger:
6164     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
6165     zExp = bExp;
6166     zSign ^= 1;
6167     goto normalizeRoundAndPack;
6168  aExpBigger:
6169     if ( aExp == 0x7FFF ) {
6170         if ((uint64_t)(aSig << 1)) {
6171             return propagateFloatx80NaN(a, b, status);
6172         }
6173         return a;
6174     }
6175     if ( bExp == 0 ) --expDiff;
6176     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6177  aBigger:
6178     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
6179     zExp = aExp;
6180  normalizeRoundAndPack:
6181     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6182                                          zSign, zExp, zSig0, zSig1, status);
6183 }
6184
6185 /*----------------------------------------------------------------------------
6186 | Returns the result of adding the extended double-precision floating-point
6187 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6188 | Standard for Binary Floating-Point Arithmetic.
6189 *----------------------------------------------------------------------------*/
6190
6191 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6192 {
6193     bool aSign, bSign;
6194
6195     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6196         float_raise(float_flag_invalid, status);
6197         return floatx80_default_nan(status);
6198     }
6199     aSign = extractFloatx80Sign( a );
6200     bSign = extractFloatx80Sign( b );
6201     if ( aSign == bSign ) {
6202         return addFloatx80Sigs(a, b, aSign, status);
6203     }
6204     else {
6205         return subFloatx80Sigs(a, b, aSign, status);
6206     }
6207
6208 }
6209
6210 /*----------------------------------------------------------------------------
6211 | Returns the result of subtracting the extended double-precision floating-
6212 | point values `a' and `b'.  The operation is performed according to the
6213 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6214 *----------------------------------------------------------------------------*/
6215
6216 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6217 {
6218     bool aSign, bSign;
6219
6220     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6221         float_raise(float_flag_invalid, status);
6222         return floatx80_default_nan(status);
6223     }
6224     aSign = extractFloatx80Sign( a );
6225     bSign = extractFloatx80Sign( b );
6226     if ( aSign == bSign ) {
6227         return subFloatx80Sigs(a, b, aSign, status);
6228     }
6229     else {
6230         return addFloatx80Sigs(a, b, aSign, status);
6231     }
6232
6233 }
6234
6235 /*----------------------------------------------------------------------------
6236 | Returns the result of multiplying the extended double-precision floating-
6237 | point values `a' and `b'.  The operation is performed according to the
6238 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6239 *----------------------------------------------------------------------------*/
6240
6241 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6242 {
6243     bool aSign, bSign, zSign;
6244     int32_t aExp, bExp, zExp;
6245     uint64_t aSig, bSig, zSig0, zSig1;
6246
6247     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6248         float_raise(float_flag_invalid, status);
6249         return floatx80_default_nan(status);
6250     }
6251     aSig = extractFloatx80Frac( a );
6252     aExp = extractFloatx80Exp( a );
6253     aSign = extractFloatx80Sign( a );
6254     bSig = extractFloatx80Frac( b );
6255     bExp = extractFloatx80Exp( b );
6256     bSign = extractFloatx80Sign( b );
6257     zSign = aSign ^ bSign;
6258     if ( aExp == 0x7FFF ) {
6259         if (    (uint64_t) ( aSig<<1 )
6260              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6261             return propagateFloatx80NaN(a, b, status);
6262         }
6263         if ( ( bExp | bSig ) == 0 ) goto invalid;
6264         return packFloatx80(zSign, floatx80_infinity_high,
6265                                    floatx80_infinity_low);
6266     }
6267     if ( bExp == 0x7FFF ) {
6268         if ((uint64_t)(bSig << 1)) {
6269             return propagateFloatx80NaN(a, b, status);
6270         }
6271         if ( ( aExp | aSig ) == 0 ) {
6272  invalid:
6273             float_raise(float_flag_invalid, status);
6274             return floatx80_default_nan(status);
6275         }
6276         return packFloatx80(zSign, floatx80_infinity_high,
6277                                    floatx80_infinity_low);
6278     }
6279     if ( aExp == 0 ) {
6280         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6281         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6282     }
6283     if ( bExp == 0 ) {
6284         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6285         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6286     }
6287     zExp = aExp + bExp - 0x3FFE;
6288     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6289     if ( 0 < (int64_t) zSig0 ) {
6290         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6291         --zExp;
6292     }
6293     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6294                                 zSign, zExp, zSig0, zSig1, status);
6295 }
6296
6297 /*----------------------------------------------------------------------------
6298 | Returns the result of dividing the extended double-precision floating-point
6299 | value `a' by the corresponding value `b'.  The operation is performed
6300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6301 *----------------------------------------------------------------------------*/
6302
6303 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6304 {
6305     bool aSign, bSign, zSign;
6306     int32_t aExp, bExp, zExp;
6307     uint64_t aSig, bSig, zSig0, zSig1;
6308     uint64_t rem0, rem1, rem2, term0, term1, term2;
6309
6310     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6311         float_raise(float_flag_invalid, status);
6312         return floatx80_default_nan(status);
6313     }
6314     aSig = extractFloatx80Frac( a );
6315     aExp = extractFloatx80Exp( a );
6316     aSign = extractFloatx80Sign( a );
6317     bSig = extractFloatx80Frac( b );
6318     bExp = extractFloatx80Exp( b );
6319     bSign = extractFloatx80Sign( b );
6320     zSign = aSign ^ bSign;
6321     if ( aExp == 0x7FFF ) {
6322         if ((uint64_t)(aSig << 1)) {
6323             return propagateFloatx80NaN(a, b, status);
6324         }
6325         if ( bExp == 0x7FFF ) {
6326             if ((uint64_t)(bSig << 1)) {
6327                 return propagateFloatx80NaN(a, b, status);
6328             }
6329             goto invalid;
6330         }
6331         return packFloatx80(zSign, floatx80_infinity_high,
6332                                    floatx80_infinity_low);
6333     }
6334     if ( bExp == 0x7FFF ) {
6335         if ((uint64_t)(bSig << 1)) {
6336             return propagateFloatx80NaN(a, b, status);
6337         }
6338         return packFloatx80( zSign, 0, 0 );
6339     }
6340     if ( bExp == 0 ) {
6341         if ( bSig == 0 ) {
6342             if ( ( aExp | aSig ) == 0 ) {
6343  invalid:
6344                 float_raise(float_flag_invalid, status);
6345                 return floatx80_default_nan(status);
6346             }
6347             float_raise(float_flag_divbyzero, status);
6348             return packFloatx80(zSign, floatx80_infinity_high,
6349                                        floatx80_infinity_low);
6350         }
6351         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6352     }
6353     if ( aExp == 0 ) {
6354         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6355         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6356     }
6357     zExp = aExp - bExp + 0x3FFE;
6358     rem1 = 0;
6359     if ( bSig <= aSig ) {
6360         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6361         ++zExp;
6362     }
6363     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6364     mul64To128( bSig, zSig0, &term0, &term1 );
6365     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6366     while ( (int64_t) rem0 < 0 ) {
6367         --zSig0;
6368         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6369     }
6370     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6371     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6372         mul64To128( bSig, zSig1, &term1, &term2 );
6373         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6374         while ( (int64_t) rem1 < 0 ) {
6375             --zSig1;
6376             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6377         }
6378         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6379     }
6380     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6381                                 zSign, zExp, zSig0, zSig1, status);
6382 }
6383
6384 /*----------------------------------------------------------------------------
6385 | Returns the remainder of the extended double-precision floating-point value
6386 | `a' with respect to the corresponding value `b'.  The operation is performed
6387 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6388 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6389 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6390 | the absolute value of the integer quotient.
6391 *----------------------------------------------------------------------------*/
6392
6393 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6394                          float_status *status)
6395 {
6396     bool aSign, zSign;
6397     int32_t aExp, bExp, expDiff, aExpOrig;
6398     uint64_t aSig0, aSig1, bSig;
6399     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6400
6401     *quotient = 0;
6402     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6403         float_raise(float_flag_invalid, status);
6404         return floatx80_default_nan(status);
6405     }
6406     aSig0 = extractFloatx80Frac( a );
6407     aExpOrig = aExp = extractFloatx80Exp( a );
6408     aSign = extractFloatx80Sign( a );
6409     bSig = extractFloatx80Frac( b );
6410     bExp = extractFloatx80Exp( b );
6411     if ( aExp == 0x7FFF ) {
6412         if (    (uint64_t) ( aSig0<<1 )
6413              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6414             return propagateFloatx80NaN(a, b, status);
6415         }
6416         goto invalid;
6417     }
6418     if ( bExp == 0x7FFF ) {
6419         if ((uint64_t)(bSig << 1)) {
6420             return propagateFloatx80NaN(a, b, status);
6421         }
6422         if (aExp == 0 && aSig0 >> 63) {
6423             /*
6424              * Pseudo-denormal argument must be returned in normalized
6425              * form.
6426              */
6427             return packFloatx80(aSign, 1, aSig0);
6428         }
6429         return a;
6430     }
6431     if ( bExp == 0 ) {
6432         if ( bSig == 0 ) {
6433  invalid:
6434             float_raise(float_flag_invalid, status);
6435             return floatx80_default_nan(status);
6436         }
6437         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6438     }
6439     if ( aExp == 0 ) {
6440         if ( aSig0 == 0 ) return a;
6441         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6442     }
6443     zSign = aSign;
6444     expDiff = aExp - bExp;
6445     aSig1 = 0;
6446     if ( expDiff < 0 ) {
6447         if ( mod || expDiff < -1 ) {
6448             if (aExp == 1 && aExpOrig == 0) {
6449                 /*
6450                  * Pseudo-denormal argument must be returned in
6451                  * normalized form.
6452                  */
6453                 return packFloatx80(aSign, aExp, aSig0);
6454             }
6455             return a;
6456         }
6457         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6458         expDiff = 0;
6459     }
6460     *quotient = q = ( bSig <= aSig0 );
6461     if ( q ) aSig0 -= bSig;
6462     expDiff -= 64;
6463     while ( 0 < expDiff ) {
6464         q = estimateDiv128To64( aSig0, aSig1, bSig );
6465         q = ( 2 < q ) ? q - 2 : 0;
6466         mul64To128( bSig, q, &term0, &term1 );
6467         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6468         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6469         expDiff -= 62;
6470         *quotient <<= 62;
6471         *quotient += q;
6472     }
6473     expDiff += 64;
6474     if ( 0 < expDiff ) {
6475         q = estimateDiv128To64( aSig0, aSig1, bSig );
6476         q = ( 2 < q ) ? q - 2 : 0;
6477         q >>= 64 - expDiff;
6478         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6479         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6480         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6481         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6482             ++q;
6483             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6484         }
6485         if (expDiff < 64) {
6486             *quotient <<= expDiff;
6487         } else {
6488             *quotient = 0;
6489         }
6490         *quotient += q;
6491     }
6492     else {
6493         term1 = 0;
6494         term0 = bSig;
6495     }
6496     if (!mod) {
6497         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6498         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6499                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6500                         && ( q & 1 ) )
6501             ) {
6502             aSig0 = alternateASig0;
6503             aSig1 = alternateASig1;
6504             zSign = ! zSign;
6505             ++*quotient;
6506         }
6507     }
6508     return
6509         normalizeRoundAndPackFloatx80(
6510             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6511
6512 }
6513
6514 /*----------------------------------------------------------------------------
6515 | Returns the remainder of the extended double-precision floating-point value
6516 | `a' with respect to the corresponding value `b'.  The operation is performed
6517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6518 *----------------------------------------------------------------------------*/
6519
6520 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6521 {
6522     uint64_t quotient;
6523     return floatx80_modrem(a, b, false, &quotient, status);
6524 }
6525
6526 /*----------------------------------------------------------------------------
6527 | Returns the remainder of the extended double-precision floating-point value
6528 | `a' with respect to the corresponding value `b', with the quotient truncated
6529 | toward zero.
6530 *----------------------------------------------------------------------------*/
6531
6532 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6533 {
6534     uint64_t quotient;
6535     return floatx80_modrem(a, b, true, &quotient, status);
6536 }
6537
6538 /*----------------------------------------------------------------------------
6539 | Returns the square root of the extended double-precision floating-point
6540 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6541 | for Binary Floating-Point Arithmetic.
6542 *----------------------------------------------------------------------------*/
6543
6544 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6545 {
6546     bool aSign;
6547     int32_t aExp, zExp;
6548     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6549     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6550
6551     if (floatx80_invalid_encoding(a)) {
6552         float_raise(float_flag_invalid, status);
6553         return floatx80_default_nan(status);
6554     }
6555     aSig0 = extractFloatx80Frac( a );
6556     aExp = extractFloatx80Exp( a );
6557     aSign = extractFloatx80Sign( a );
6558     if ( aExp == 0x7FFF ) {
6559         if ((uint64_t)(aSig0 << 1)) {
6560             return propagateFloatx80NaN(a, a, status);
6561         }
6562         if ( ! aSign ) return a;
6563         goto invalid;
6564     }
6565     if ( aSign ) {
6566         if ( ( aExp | aSig0 ) == 0 ) return a;
6567  invalid:
6568         float_raise(float_flag_invalid, status);
6569         return floatx80_default_nan(status);
6570     }
6571     if ( aExp == 0 ) {
6572         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6573         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6574     }
6575     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6576     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6577     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6578     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6579     doubleZSig0 = zSig0<<1;
6580     mul64To128( zSig0, zSig0, &term0, &term1 );
6581     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6582     while ( (int64_t) rem0 < 0 ) {
6583         --zSig0;
6584         doubleZSig0 -= 2;
6585         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6586     }
6587     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6588     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6589         if ( zSig1 == 0 ) zSig1 = 1;
6590         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6591         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6592         mul64To128( zSig1, zSig1, &term2, &term3 );
6593         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6594         while ( (int64_t) rem1 < 0 ) {
6595             --zSig1;
6596             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6597             term3 |= 1;
6598             term2 |= doubleZSig0;
6599             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6600         }
6601         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6602     }
6603     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6604     zSig0 |= doubleZSig0;
6605     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6606                                 0, zExp, zSig0, zSig1, status);
6607 }
6608
6609 /*----------------------------------------------------------------------------
6610 | Returns the result of converting the quadruple-precision floating-point
6611 | value `a' to the 32-bit two's complement integer format.  The conversion
6612 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6613 | Arithmetic---which means in particular that the conversion is rounded
6614 | according to the current rounding mode.  If `a' is a NaN, the largest
6615 | positive integer is returned.  Otherwise, if the conversion overflows, the
6616 | largest integer with the same sign as `a' is returned.
6617 *----------------------------------------------------------------------------*/
6618
6619 int32_t float128_to_int32(float128 a, float_status *status)
6620 {
6621     bool aSign;
6622     int32_t aExp, shiftCount;
6623     uint64_t aSig0, aSig1;
6624
6625     aSig1 = extractFloat128Frac1( a );
6626     aSig0 = extractFloat128Frac0( a );
6627     aExp = extractFloat128Exp( a );
6628     aSign = extractFloat128Sign( a );
6629     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6630     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6631     aSig0 |= ( aSig1 != 0 );
6632     shiftCount = 0x4028 - aExp;
6633     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6634     return roundAndPackInt32(aSign, aSig0, status);
6635
6636 }
6637
6638 /*----------------------------------------------------------------------------
6639 | Returns the result of converting the quadruple-precision floating-point
6640 | value `a' to the 32-bit two's complement integer format.  The conversion
6641 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6642 | Arithmetic, except that the conversion is always rounded toward zero.  If
6643 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6644 | conversion overflows, the largest integer with the same sign as `a' is
6645 | returned.
6646 *----------------------------------------------------------------------------*/
6647
6648 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6649 {
6650     bool aSign;
6651     int32_t aExp, shiftCount;
6652     uint64_t aSig0, aSig1, savedASig;
6653     int32_t z;
6654
6655     aSig1 = extractFloat128Frac1( a );
6656     aSig0 = extractFloat128Frac0( a );
6657     aExp = extractFloat128Exp( a );
6658     aSign = extractFloat128Sign( a );
6659     aSig0 |= ( aSig1 != 0 );
6660     if ( 0x401E < aExp ) {
6661         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6662         goto invalid;
6663     }
6664     else if ( aExp < 0x3FFF ) {
6665         if (aExp || aSig0) {
6666             float_raise(float_flag_inexact, status);
6667         }
6668         return 0;
6669     }
6670     aSig0 |= UINT64_C(0x0001000000000000);
6671     shiftCount = 0x402F - aExp;
6672     savedASig = aSig0;
6673     aSig0 >>= shiftCount;
6674     z = aSig0;
6675     if ( aSign ) z = - z;
6676     if ( ( z < 0 ) ^ aSign ) {
6677  invalid:
6678         float_raise(float_flag_invalid, status);
6679         return aSign ? INT32_MIN : INT32_MAX;
6680     }
6681     if ( ( aSig0<<shiftCount ) != savedASig ) {
6682         float_raise(float_flag_inexact, status);
6683     }
6684     return z;
6685
6686 }
6687
6688 /*----------------------------------------------------------------------------
6689 | Returns the result of converting the quadruple-precision floating-point
6690 | value `a' to the 64-bit two's complement integer format.  The conversion
6691 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6692 | Arithmetic---which means in particular that the conversion is rounded
6693 | according to the current rounding mode.  If `a' is a NaN, the largest
6694 | positive integer is returned.  Otherwise, if the conversion overflows, the
6695 | largest integer with the same sign as `a' is returned.
6696 *----------------------------------------------------------------------------*/
6697
6698 int64_t float128_to_int64(float128 a, float_status *status)
6699 {
6700     bool aSign;
6701     int32_t aExp, shiftCount;
6702     uint64_t aSig0, aSig1;
6703
6704     aSig1 = extractFloat128Frac1( a );
6705     aSig0 = extractFloat128Frac0( a );
6706     aExp = extractFloat128Exp( a );
6707     aSign = extractFloat128Sign( a );
6708     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6709     shiftCount = 0x402F - aExp;
6710     if ( shiftCount <= 0 ) {
6711         if ( 0x403E < aExp ) {
6712             float_raise(float_flag_invalid, status);
6713             if (    ! aSign
6714                  || (    ( aExp == 0x7FFF )
6715                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6716                     )
6717                ) {
6718                 return INT64_MAX;
6719             }
6720             return INT64_MIN;
6721         }
6722         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6723     }
6724     else {
6725         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6726     }
6727     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6728
6729 }
6730
6731 /*----------------------------------------------------------------------------
6732 | Returns the result of converting the quadruple-precision floating-point
6733 | value `a' to the 64-bit two's complement integer format.  The conversion
6734 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6735 | Arithmetic, except that the conversion is always rounded toward zero.
6736 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6737 | the conversion overflows, the largest integer with the same sign as `a' is
6738 | returned.
6739 *----------------------------------------------------------------------------*/
6740
6741 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6742 {
6743     bool aSign;
6744     int32_t aExp, shiftCount;
6745     uint64_t aSig0, aSig1;
6746     int64_t z;
6747
6748     aSig1 = extractFloat128Frac1( a );
6749     aSig0 = extractFloat128Frac0( a );
6750     aExp = extractFloat128Exp( a );
6751     aSign = extractFloat128Sign( a );
6752     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6753     shiftCount = aExp - 0x402F;
6754     if ( 0 < shiftCount ) {
6755         if ( 0x403E <= aExp ) {
6756             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6757             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6758                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6759                 if (aSig1) {
6760                     float_raise(float_flag_inexact, status);
6761                 }
6762             }
6763             else {
6764                 float_raise(float_flag_invalid, status);
6765                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6766                     return INT64_MAX;
6767                 }
6768             }
6769             return INT64_MIN;
6770         }
6771         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6772         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6773             float_raise(float_flag_inexact, status);
6774         }
6775     }
6776     else {
6777         if ( aExp < 0x3FFF ) {
6778             if ( aExp | aSig0 | aSig1 ) {
6779                 float_raise(float_flag_inexact, status);
6780             }
6781             return 0;
6782         }
6783         z = aSig0>>( - shiftCount );
6784         if (    aSig1
6785              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6786             float_raise(float_flag_inexact, status);
6787         }
6788     }
6789     if ( aSign ) z = - z;
6790     return z;
6791
6792 }
6793
6794 /*----------------------------------------------------------------------------
6795 | Returns the result of converting the quadruple-precision floating-point value
6796 | `a' to the 64-bit unsigned integer format.  The conversion is
6797 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6798 | Arithmetic---which means in particular that the conversion is rounded
6799 | according to the current rounding mode.  If `a' is a NaN, the largest
6800 | positive integer is returned.  If the conversion overflows, the
6801 | largest unsigned integer is returned.  If 'a' is negative, the value is
6802 | rounded and zero is returned; negative values that do not round to zero
6803 | will raise the inexact exception.
6804 *----------------------------------------------------------------------------*/
6805
6806 uint64_t float128_to_uint64(float128 a, float_status *status)
6807 {
6808     bool aSign;
6809     int aExp;
6810     int shiftCount;
6811     uint64_t aSig0, aSig1;
6812
6813     aSig0 = extractFloat128Frac0(a);
6814     aSig1 = extractFloat128Frac1(a);
6815     aExp = extractFloat128Exp(a);
6816     aSign = extractFloat128Sign(a);
6817     if (aSign && (aExp > 0x3FFE)) {
6818         float_raise(float_flag_invalid, status);
6819         if (float128_is_any_nan(a)) {
6820             return UINT64_MAX;
6821         } else {
6822             return 0;
6823         }
6824     }
6825     if (aExp) {
6826         aSig0 |= UINT64_C(0x0001000000000000);
6827     }
6828     shiftCount = 0x402F - aExp;
6829     if (shiftCount <= 0) {
6830         if (0x403E < aExp) {
6831             float_raise(float_flag_invalid, status);
6832             return UINT64_MAX;
6833         }
6834         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6835     } else {
6836         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6837     }
6838     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6839 }
6840
6841 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6842 {
6843     uint64_t v;
6844     signed char current_rounding_mode = status->float_rounding_mode;
6845
6846     set_float_rounding_mode(float_round_to_zero, status);
6847     v = float128_to_uint64(a, status);
6848     set_float_rounding_mode(current_rounding_mode, status);
6849
6850     return v;
6851 }
6852
6853 /*----------------------------------------------------------------------------
6854 | Returns the result of converting the quadruple-precision floating-point
6855 | value `a' to the 32-bit unsigned integer format.  The conversion
6856 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6857 | Arithmetic except that the conversion is always rounded toward zero.
6858 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6859 | if the conversion overflows, the largest unsigned integer is returned.
6860 | If 'a' is negative, the value is rounded and zero is returned; negative
6861 | values that do not round to zero will raise the inexact exception.
6862 *----------------------------------------------------------------------------*/
6863
6864 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6865 {
6866     uint64_t v;
6867     uint32_t res;
6868     int old_exc_flags = get_float_exception_flags(status);
6869
6870     v = float128_to_uint64_round_to_zero(a, status);
6871     if (v > 0xffffffff) {
6872         res = 0xffffffff;
6873     } else {
6874         return v;
6875     }
6876     set_float_exception_flags(old_exc_flags, status);
6877     float_raise(float_flag_invalid, status);
6878     return res;
6879 }
6880
6881 /*----------------------------------------------------------------------------
6882 | Returns the result of converting the quadruple-precision floating-point value
6883 | `a' to the 32-bit unsigned integer format.  The conversion is
6884 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6885 | Arithmetic---which means in particular that the conversion is rounded
6886 | according to the current rounding mode.  If `a' is a NaN, the largest
6887 | positive integer is returned.  If the conversion overflows, the
6888 | largest unsigned integer is returned.  If 'a' is negative, the value is
6889 | rounded and zero is returned; negative values that do not round to zero
6890 | will raise the inexact exception.
6891 *----------------------------------------------------------------------------*/
6892
6893 uint32_t float128_to_uint32(float128 a, float_status *status)
6894 {
6895     uint64_t v;
6896     uint32_t res;
6897     int old_exc_flags = get_float_exception_flags(status);
6898
6899     v = float128_to_uint64(a, status);
6900     if (v > 0xffffffff) {
6901         res = 0xffffffff;
6902     } else {
6903         return v;
6904     }
6905     set_float_exception_flags(old_exc_flags, status);
6906     float_raise(float_flag_invalid, status);
6907     return res;
6908 }
6909
6910 /*----------------------------------------------------------------------------
6911 | Returns the result of converting the quadruple-precision floating-point
6912 | value `a' to the single-precision floating-point format.  The conversion
6913 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6914 | Arithmetic.
6915 *----------------------------------------------------------------------------*/
6916
6917 float32 float128_to_float32(float128 a, float_status *status)
6918 {
6919     bool aSign;
6920     int32_t aExp;
6921     uint64_t aSig0, aSig1;
6922     uint32_t zSig;
6923
6924     aSig1 = extractFloat128Frac1( a );
6925     aSig0 = extractFloat128Frac0( a );
6926     aExp = extractFloat128Exp( a );
6927     aSign = extractFloat128Sign( a );
6928     if ( aExp == 0x7FFF ) {
6929         if ( aSig0 | aSig1 ) {
6930             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6931         }
6932         return packFloat32( aSign, 0xFF, 0 );
6933     }
6934     aSig0 |= ( aSig1 != 0 );
6935     shift64RightJamming( aSig0, 18, &aSig0 );
6936     zSig = aSig0;
6937     if ( aExp || zSig ) {
6938         zSig |= 0x40000000;
6939         aExp -= 0x3F81;
6940     }
6941     return roundAndPackFloat32(aSign, aExp, zSig, status);
6942
6943 }
6944
6945 /*----------------------------------------------------------------------------
6946 | Returns the result of converting the quadruple-precision floating-point
6947 | value `a' to the double-precision floating-point format.  The conversion
6948 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6949 | Arithmetic.
6950 *----------------------------------------------------------------------------*/
6951
6952 float64 float128_to_float64(float128 a, float_status *status)
6953 {
6954     bool aSign;
6955     int32_t aExp;
6956     uint64_t aSig0, aSig1;
6957
6958     aSig1 = extractFloat128Frac1( a );
6959     aSig0 = extractFloat128Frac0( a );
6960     aExp = extractFloat128Exp( a );
6961     aSign = extractFloat128Sign( a );
6962     if ( aExp == 0x7FFF ) {
6963         if ( aSig0 | aSig1 ) {
6964             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6965         }
6966         return packFloat64( aSign, 0x7FF, 0 );
6967     }
6968     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6969     aSig0 |= ( aSig1 != 0 );
6970     if ( aExp || aSig0 ) {
6971         aSig0 |= UINT64_C(0x4000000000000000);
6972         aExp -= 0x3C01;
6973     }
6974     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6975
6976 }
6977
6978 /*----------------------------------------------------------------------------
6979 | Returns the result of converting the quadruple-precision floating-point
6980 | value `a' to the extended double-precision floating-point format.  The
6981 | conversion is performed according to the IEC/IEEE Standard for Binary
6982 | Floating-Point Arithmetic.
6983 *----------------------------------------------------------------------------*/
6984
6985 floatx80 float128_to_floatx80(float128 a, float_status *status)
6986 {
6987     bool aSign;
6988     int32_t aExp;
6989     uint64_t aSig0, aSig1;
6990
6991     aSig1 = extractFloat128Frac1( a );
6992     aSig0 = extractFloat128Frac0( a );
6993     aExp = extractFloat128Exp( a );
6994     aSign = extractFloat128Sign( a );
6995     if ( aExp == 0x7FFF ) {
6996         if ( aSig0 | aSig1 ) {
6997             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6998                                                status);
6999             return floatx80_silence_nan(res, status);
7000         }
7001         return packFloatx80(aSign, floatx80_infinity_high,
7002                                    floatx80_infinity_low);
7003     }
7004     if ( aExp == 0 ) {
7005         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
7006         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7007     }
7008     else {
7009         aSig0 |= UINT64_C(0x0001000000000000);
7010     }
7011     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
7012     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
7013
7014 }
7015
7016 /*----------------------------------------------------------------------------
7017 | Rounds the quadruple-precision floating-point value `a' to an integer, and
7018 | returns the result as a quadruple-precision floating-point value.  The
7019 | operation is performed according to the IEC/IEEE Standard for Binary
7020 | Floating-Point Arithmetic.
7021 *----------------------------------------------------------------------------*/
7022
7023 float128 float128_round_to_int(float128 a, float_status *status)
7024 {
7025     bool aSign;
7026     int32_t aExp;
7027     uint64_t lastBitMask, roundBitsMask;
7028     float128 z;
7029
7030     aExp = extractFloat128Exp( a );
7031     if ( 0x402F <= aExp ) {
7032         if ( 0x406F <= aExp ) {
7033             if (    ( aExp == 0x7FFF )
7034                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
7035                ) {
7036                 return propagateFloat128NaN(a, a, status);
7037             }
7038             return a;
7039         }
7040         lastBitMask = 1;
7041         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7042         roundBitsMask = lastBitMask - 1;
7043         z = a;
7044         switch (status->float_rounding_mode) {
7045         case float_round_nearest_even:
7046             if ( lastBitMask ) {
7047                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7048                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7049             }
7050             else {
7051                 if ( (int64_t) z.low < 0 ) {
7052                     ++z.high;
7053                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7054                 }
7055             }
7056             break;
7057         case float_round_ties_away:
7058             if (lastBitMask) {
7059                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7060             } else {
7061                 if ((int64_t) z.low < 0) {
7062                     ++z.high;
7063                 }
7064             }
7065             break;
7066         case float_round_to_zero:
7067             break;
7068         case float_round_up:
7069             if (!extractFloat128Sign(z)) {
7070                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7071             }
7072             break;
7073         case float_round_down:
7074             if (extractFloat128Sign(z)) {
7075                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7076             }
7077             break;
7078         case float_round_to_odd:
7079             /*
7080              * Note that if lastBitMask == 0, the last bit is the lsb
7081              * of high, and roundBitsMask == -1.
7082              */
7083             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7084                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7085             }
7086             break;
7087         default:
7088             abort();
7089         }
7090         z.low &= ~ roundBitsMask;
7091     }
7092     else {
7093         if ( aExp < 0x3FFF ) {
7094             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7095             float_raise(float_flag_inexact, status);
7096             aSign = extractFloat128Sign( a );
7097             switch (status->float_rounding_mode) {
7098             case float_round_nearest_even:
7099                 if (    ( aExp == 0x3FFE )
7100                      && (   extractFloat128Frac0( a )
7101                           | extractFloat128Frac1( a ) )
7102                    ) {
7103                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7104                 }
7105                 break;
7106             case float_round_ties_away:
7107                 if (aExp == 0x3FFE) {
7108                     return packFloat128(aSign, 0x3FFF, 0, 0);
7109                 }
7110                 break;
7111             case float_round_down:
7112                 return
7113                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7114                     : packFloat128( 0, 0, 0, 0 );
7115             case float_round_up:
7116                 return
7117                       aSign ? packFloat128( 1, 0, 0, 0 )
7118                     : packFloat128( 0, 0x3FFF, 0, 0 );
7119
7120             case float_round_to_odd:
7121                 return packFloat128(aSign, 0x3FFF, 0, 0);
7122
7123             case float_round_to_zero:
7124                 break;
7125             }
7126             return packFloat128( aSign, 0, 0, 0 );
7127         }
7128         lastBitMask = 1;
7129         lastBitMask <<= 0x402F - aExp;
7130         roundBitsMask = lastBitMask - 1;
7131         z.low = 0;
7132         z.high = a.high;
7133         switch (status->float_rounding_mode) {
7134         case float_round_nearest_even:
7135             z.high += lastBitMask>>1;
7136             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7137                 z.high &= ~ lastBitMask;
7138             }
7139             break;
7140         case float_round_ties_away:
7141             z.high += lastBitMask>>1;
7142             break;
7143         case float_round_to_zero:
7144             break;
7145         case float_round_up:
7146             if (!extractFloat128Sign(z)) {
7147                 z.high |= ( a.low != 0 );
7148                 z.high += roundBitsMask;
7149             }
7150             break;
7151         case float_round_down:
7152             if (extractFloat128Sign(z)) {
7153                 z.high |= (a.low != 0);
7154                 z.high += roundBitsMask;
7155             }
7156             break;
7157         case float_round_to_odd:
7158             if ((z.high & lastBitMask) == 0) {
7159                 z.high |= (a.low != 0);
7160                 z.high += roundBitsMask;
7161             }
7162             break;
7163         default:
7164             abort();
7165         }
7166         z.high &= ~ roundBitsMask;
7167     }
7168     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7169         float_raise(float_flag_inexact, status);
7170     }
7171     return z;
7172
7173 }
7174
7175 /*----------------------------------------------------------------------------
7176 | Returns the result of adding the absolute values of the quadruple-precision
7177 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7178 | before being returned.  `zSign' is ignored if the result is a NaN.
7179 | The addition is performed according to the IEC/IEEE Standard for Binary
7180 | Floating-Point Arithmetic.
7181 *----------------------------------------------------------------------------*/
7182
7183 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
7184                                 float_status *status)
7185 {
7186     int32_t aExp, bExp, zExp;
7187     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7188     int32_t expDiff;
7189
7190     aSig1 = extractFloat128Frac1( a );
7191     aSig0 = extractFloat128Frac0( a );
7192     aExp = extractFloat128Exp( a );
7193     bSig1 = extractFloat128Frac1( b );
7194     bSig0 = extractFloat128Frac0( b );
7195     bExp = extractFloat128Exp( b );
7196     expDiff = aExp - bExp;
7197     if ( 0 < expDiff ) {
7198         if ( aExp == 0x7FFF ) {
7199             if (aSig0 | aSig1) {
7200                 return propagateFloat128NaN(a, b, status);
7201             }
7202             return a;
7203         }
7204         if ( bExp == 0 ) {
7205             --expDiff;
7206         }
7207         else {
7208             bSig0 |= UINT64_C(0x0001000000000000);
7209         }
7210         shift128ExtraRightJamming(
7211             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7212         zExp = aExp;
7213     }
7214     else if ( expDiff < 0 ) {
7215         if ( bExp == 0x7FFF ) {
7216             if (bSig0 | bSig1) {
7217                 return propagateFloat128NaN(a, b, status);
7218             }
7219             return packFloat128( zSign, 0x7FFF, 0, 0 );
7220         }
7221         if ( aExp == 0 ) {
7222             ++expDiff;
7223         }
7224         else {
7225             aSig0 |= UINT64_C(0x0001000000000000);
7226         }
7227         shift128ExtraRightJamming(
7228             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7229         zExp = bExp;
7230     }
7231     else {
7232         if ( aExp == 0x7FFF ) {
7233             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7234                 return propagateFloat128NaN(a, b, status);
7235             }
7236             return a;
7237         }
7238         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7239         if ( aExp == 0 ) {
7240             if (status->flush_to_zero) {
7241                 if (zSig0 | zSig1) {
7242                     float_raise(float_flag_output_denormal, status);
7243                 }
7244                 return packFloat128(zSign, 0, 0, 0);
7245             }
7246             return packFloat128( zSign, 0, zSig0, zSig1 );
7247         }
7248         zSig2 = 0;
7249         zSig0 |= UINT64_C(0x0002000000000000);
7250         zExp = aExp;
7251         goto shiftRight1;
7252     }
7253     aSig0 |= UINT64_C(0x0001000000000000);
7254     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7255     --zExp;
7256     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7257     ++zExp;
7258  shiftRight1:
7259     shift128ExtraRightJamming(
7260         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7261  roundAndPack:
7262     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7263
7264 }
7265
7266 /*----------------------------------------------------------------------------
7267 | Returns the result of subtracting the absolute values of the quadruple-
7268 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7269 | difference is negated before being returned.  `zSign' is ignored if the
7270 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7271 | Standard for Binary Floating-Point Arithmetic.
7272 *----------------------------------------------------------------------------*/
7273
7274 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
7275                                 float_status *status)
7276 {
7277     int32_t aExp, bExp, zExp;
7278     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7279     int32_t expDiff;
7280
7281     aSig1 = extractFloat128Frac1( a );
7282     aSig0 = extractFloat128Frac0( a );
7283     aExp = extractFloat128Exp( a );
7284     bSig1 = extractFloat128Frac1( b );
7285     bSig0 = extractFloat128Frac0( b );
7286     bExp = extractFloat128Exp( b );
7287     expDiff = aExp - bExp;
7288     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7289     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7290     if ( 0 < expDiff ) goto aExpBigger;
7291     if ( expDiff < 0 ) goto bExpBigger;
7292     if ( aExp == 0x7FFF ) {
7293         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7294             return propagateFloat128NaN(a, b, status);
7295         }
7296         float_raise(float_flag_invalid, status);
7297         return float128_default_nan(status);
7298     }
7299     if ( aExp == 0 ) {
7300         aExp = 1;
7301         bExp = 1;
7302     }
7303     if ( bSig0 < aSig0 ) goto aBigger;
7304     if ( aSig0 < bSig0 ) goto bBigger;
7305     if ( bSig1 < aSig1 ) goto aBigger;
7306     if ( aSig1 < bSig1 ) goto bBigger;
7307     return packFloat128(status->float_rounding_mode == float_round_down,
7308                         0, 0, 0);
7309  bExpBigger:
7310     if ( bExp == 0x7FFF ) {
7311         if (bSig0 | bSig1) {
7312             return propagateFloat128NaN(a, b, status);
7313         }
7314         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7315     }
7316     if ( aExp == 0 ) {
7317         ++expDiff;
7318     }
7319     else {
7320         aSig0 |= UINT64_C(0x4000000000000000);
7321     }
7322     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7323     bSig0 |= UINT64_C(0x4000000000000000);
7324  bBigger:
7325     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7326     zExp = bExp;
7327     zSign ^= 1;
7328     goto normalizeRoundAndPack;
7329  aExpBigger:
7330     if ( aExp == 0x7FFF ) {
7331         if (aSig0 | aSig1) {
7332             return propagateFloat128NaN(a, b, status);
7333         }
7334         return a;
7335     }
7336     if ( bExp == 0 ) {
7337         --expDiff;
7338     }
7339     else {
7340         bSig0 |= UINT64_C(0x4000000000000000);
7341     }
7342     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7343     aSig0 |= UINT64_C(0x4000000000000000);
7344  aBigger:
7345     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7346     zExp = aExp;
7347  normalizeRoundAndPack:
7348     --zExp;
7349     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7350                                          status);
7351
7352 }
7353
7354 /*----------------------------------------------------------------------------
7355 | Returns the result of adding the quadruple-precision floating-point values
7356 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7357 | for Binary Floating-Point Arithmetic.
7358 *----------------------------------------------------------------------------*/
7359
7360 float128 float128_add(float128 a, float128 b, float_status *status)
7361 {
7362     bool aSign, bSign;
7363
7364     aSign = extractFloat128Sign( a );
7365     bSign = extractFloat128Sign( b );
7366     if ( aSign == bSign ) {
7367         return addFloat128Sigs(a, b, aSign, status);
7368     }
7369     else {
7370         return subFloat128Sigs(a, b, aSign, status);
7371     }
7372
7373 }
7374
7375 /*----------------------------------------------------------------------------
7376 | Returns the result of subtracting the quadruple-precision floating-point
7377 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7378 | Standard for Binary Floating-Point Arithmetic.
7379 *----------------------------------------------------------------------------*/
7380
7381 float128 float128_sub(float128 a, float128 b, float_status *status)
7382 {
7383     bool aSign, bSign;
7384
7385     aSign = extractFloat128Sign( a );
7386     bSign = extractFloat128Sign( b );
7387     if ( aSign == bSign ) {
7388         return subFloat128Sigs(a, b, aSign, status);
7389     }
7390     else {
7391         return addFloat128Sigs(a, b, aSign, status);
7392     }
7393
7394 }
7395
7396 /*----------------------------------------------------------------------------
7397 | Returns the result of multiplying the quadruple-precision floating-point
7398 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7399 | Standard for Binary Floating-Point Arithmetic.
7400 *----------------------------------------------------------------------------*/
7401
7402 float128 float128_mul(float128 a, float128 b, float_status *status)
7403 {
7404     bool aSign, bSign, zSign;
7405     int32_t aExp, bExp, zExp;
7406     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7407
7408     aSig1 = extractFloat128Frac1( a );
7409     aSig0 = extractFloat128Frac0( a );
7410     aExp = extractFloat128Exp( a );
7411     aSign = extractFloat128Sign( a );
7412     bSig1 = extractFloat128Frac1( b );
7413     bSig0 = extractFloat128Frac0( b );
7414     bExp = extractFloat128Exp( b );
7415     bSign = extractFloat128Sign( b );
7416     zSign = aSign ^ bSign;
7417     if ( aExp == 0x7FFF ) {
7418         if (    ( aSig0 | aSig1 )
7419              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7420             return propagateFloat128NaN(a, b, status);
7421         }
7422         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7423         return packFloat128( zSign, 0x7FFF, 0, 0 );
7424     }
7425     if ( bExp == 0x7FFF ) {
7426         if (bSig0 | bSig1) {
7427             return propagateFloat128NaN(a, b, status);
7428         }
7429         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7430  invalid:
7431             float_raise(float_flag_invalid, status);
7432             return float128_default_nan(status);
7433         }
7434         return packFloat128( zSign, 0x7FFF, 0, 0 );
7435     }
7436     if ( aExp == 0 ) {
7437         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7438         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7439     }
7440     if ( bExp == 0 ) {
7441         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7442         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7443     }
7444     zExp = aExp + bExp - 0x4000;
7445     aSig0 |= UINT64_C(0x0001000000000000);
7446     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7447     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7448     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7449     zSig2 |= ( zSig3 != 0 );
7450     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7451         shift128ExtraRightJamming(
7452             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7453         ++zExp;
7454     }
7455     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7456
7457 }
7458
7459 /*----------------------------------------------------------------------------
7460 | Returns the result of dividing the quadruple-precision floating-point value
7461 | `a' by the corresponding value `b'.  The operation is performed according to
7462 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7463 *----------------------------------------------------------------------------*/
7464
7465 float128 float128_div(float128 a, float128 b, float_status *status)
7466 {
7467     bool aSign, bSign, zSign;
7468     int32_t aExp, bExp, zExp;
7469     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7470     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7471
7472     aSig1 = extractFloat128Frac1( a );
7473     aSig0 = extractFloat128Frac0( a );
7474     aExp = extractFloat128Exp( a );
7475     aSign = extractFloat128Sign( a );
7476     bSig1 = extractFloat128Frac1( b );
7477     bSig0 = extractFloat128Frac0( b );
7478     bExp = extractFloat128Exp( b );
7479     bSign = extractFloat128Sign( b );
7480     zSign = aSign ^ bSign;
7481     if ( aExp == 0x7FFF ) {
7482         if (aSig0 | aSig1) {
7483             return propagateFloat128NaN(a, b, status);
7484         }
7485         if ( bExp == 0x7FFF ) {
7486             if (bSig0 | bSig1) {
7487                 return propagateFloat128NaN(a, b, status);
7488             }
7489             goto invalid;
7490         }
7491         return packFloat128( zSign, 0x7FFF, 0, 0 );
7492     }
7493     if ( bExp == 0x7FFF ) {
7494         if (bSig0 | bSig1) {
7495             return propagateFloat128NaN(a, b, status);
7496         }
7497         return packFloat128( zSign, 0, 0, 0 );
7498     }
7499     if ( bExp == 0 ) {
7500         if ( ( bSig0 | bSig1 ) == 0 ) {
7501             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7502  invalid:
7503                 float_raise(float_flag_invalid, status);
7504                 return float128_default_nan(status);
7505             }
7506             float_raise(float_flag_divbyzero, status);
7507             return packFloat128( zSign, 0x7FFF, 0, 0 );
7508         }
7509         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7510     }
7511     if ( aExp == 0 ) {
7512         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7513         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7514     }
7515     zExp = aExp - bExp + 0x3FFD;
7516     shortShift128Left(
7517         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7518     shortShift128Left(
7519         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7520     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7521         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7522         ++zExp;
7523     }
7524     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7525     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7526     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7527     while ( (int64_t) rem0 < 0 ) {
7528         --zSig0;
7529         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7530     }
7531     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7532     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7533         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7534         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7535         while ( (int64_t) rem1 < 0 ) {
7536             --zSig1;
7537             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7538         }
7539         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7540     }
7541     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7542     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7543
7544 }
7545
7546 /*----------------------------------------------------------------------------
7547 | Returns the remainder of the quadruple-precision floating-point value `a'
7548 | with respect to the corresponding value `b'.  The operation is performed
7549 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7550 *----------------------------------------------------------------------------*/
7551
7552 float128 float128_rem(float128 a, float128 b, float_status *status)
7553 {
7554     bool aSign, zSign;
7555     int32_t aExp, bExp, expDiff;
7556     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7557     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7558     int64_t sigMean0;
7559
7560     aSig1 = extractFloat128Frac1( a );
7561     aSig0 = extractFloat128Frac0( a );
7562     aExp = extractFloat128Exp( a );
7563     aSign = extractFloat128Sign( a );
7564     bSig1 = extractFloat128Frac1( b );
7565     bSig0 = extractFloat128Frac0( b );
7566     bExp = extractFloat128Exp( b );
7567     if ( aExp == 0x7FFF ) {
7568         if (    ( aSig0 | aSig1 )
7569              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7570             return propagateFloat128NaN(a, b, status);
7571         }
7572         goto invalid;
7573     }
7574     if ( bExp == 0x7FFF ) {
7575         if (bSig0 | bSig1) {
7576             return propagateFloat128NaN(a, b, status);
7577         }
7578         return a;
7579     }
7580     if ( bExp == 0 ) {
7581         if ( ( bSig0 | bSig1 ) == 0 ) {
7582  invalid:
7583             float_raise(float_flag_invalid, status);
7584             return float128_default_nan(status);
7585         }
7586         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7587     }
7588     if ( aExp == 0 ) {
7589         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7590         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7591     }
7592     expDiff = aExp - bExp;
7593     if ( expDiff < -1 ) return a;
7594     shortShift128Left(
7595         aSig0 | UINT64_C(0x0001000000000000),
7596         aSig1,
7597         15 - ( expDiff < 0 ),
7598         &aSig0,
7599         &aSig1
7600     );
7601     shortShift128Left(
7602         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7603     q = le128( bSig0, bSig1, aSig0, aSig1 );
7604     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7605     expDiff -= 64;
7606     while ( 0 < expDiff ) {
7607         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7608         q = ( 4 < q ) ? q - 4 : 0;
7609         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7610         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7611         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7612         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7613         expDiff -= 61;
7614     }
7615     if ( -64 < expDiff ) {
7616         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7617         q = ( 4 < q ) ? q - 4 : 0;
7618         q >>= - expDiff;
7619         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7620         expDiff += 52;
7621         if ( expDiff < 0 ) {
7622             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7623         }
7624         else {
7625             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7626         }
7627         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7628         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7629     }
7630     else {
7631         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7632         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7633     }
7634     do {
7635         alternateASig0 = aSig0;
7636         alternateASig1 = aSig1;
7637         ++q;
7638         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7639     } while ( 0 <= (int64_t) aSig0 );
7640     add128(
7641         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7642     if (    ( sigMean0 < 0 )
7643          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7644         aSig0 = alternateASig0;
7645         aSig1 = alternateASig1;
7646     }
7647     zSign = ( (int64_t) aSig0 < 0 );
7648     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7649     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7650                                          status);
7651 }
7652
7653 /*----------------------------------------------------------------------------
7654 | Returns the square root of the quadruple-precision floating-point value `a'.
7655 | The operation is performed according to the IEC/IEEE Standard for Binary
7656 | Floating-Point Arithmetic.
7657 *----------------------------------------------------------------------------*/
7658
7659 float128 float128_sqrt(float128 a, float_status *status)
7660 {
7661     bool aSign;
7662     int32_t aExp, zExp;
7663     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7664     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7665
7666     aSig1 = extractFloat128Frac1( a );
7667     aSig0 = extractFloat128Frac0( a );
7668     aExp = extractFloat128Exp( a );
7669     aSign = extractFloat128Sign( a );
7670     if ( aExp == 0x7FFF ) {
7671         if (aSig0 | aSig1) {
7672             return propagateFloat128NaN(a, a, status);
7673         }
7674         if ( ! aSign ) return a;
7675         goto invalid;
7676     }
7677     if ( aSign ) {
7678         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7679  invalid:
7680         float_raise(float_flag_invalid, status);
7681         return float128_default_nan(status);
7682     }
7683     if ( aExp == 0 ) {
7684         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7685         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7686     }
7687     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7688     aSig0 |= UINT64_C(0x0001000000000000);
7689     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7690     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7691     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7692     doubleZSig0 = zSig0<<1;
7693     mul64To128( zSig0, zSig0, &term0, &term1 );
7694     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7695     while ( (int64_t) rem0 < 0 ) {
7696         --zSig0;
7697         doubleZSig0 -= 2;
7698         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7699     }
7700     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7701     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7702         if ( zSig1 == 0 ) zSig1 = 1;
7703         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7704         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7705         mul64To128( zSig1, zSig1, &term2, &term3 );
7706         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7707         while ( (int64_t) rem1 < 0 ) {
7708             --zSig1;
7709             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7710             term3 |= 1;
7711             term2 |= doubleZSig0;
7712             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7713         }
7714         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7715     }
7716     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7717     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7718
7719 }
7720
7721 static inline FloatRelation
7722 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7723                           float_status *status)
7724 {
7725     bool aSign, bSign;
7726
7727     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7728         float_raise(float_flag_invalid, status);
7729         return float_relation_unordered;
7730     }
7731     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7732           ( extractFloatx80Frac( a )<<1 ) ) ||
7733         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7734           ( extractFloatx80Frac( b )<<1 ) )) {
7735         if (!is_quiet ||
7736             floatx80_is_signaling_nan(a, status) ||
7737             floatx80_is_signaling_nan(b, status)) {
7738             float_raise(float_flag_invalid, status);
7739         }
7740         return float_relation_unordered;
7741     }
7742     aSign = extractFloatx80Sign( a );
7743     bSign = extractFloatx80Sign( b );
7744     if ( aSign != bSign ) {
7745
7746         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7747              ( ( a.low | b.low ) == 0 ) ) {
7748             /* zero case */
7749             return float_relation_equal;
7750         } else {
7751             return 1 - (2 * aSign);
7752         }
7753     } else {
7754         /* Normalize pseudo-denormals before comparison.  */
7755         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7756             ++a.high;
7757         }
7758         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7759             ++b.high;
7760         }
7761         if (a.low == b.low && a.high == b.high) {
7762             return float_relation_equal;
7763         } else {
7764             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7765         }
7766     }
7767 }
7768
7769 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7770 {
7771     return floatx80_compare_internal(a, b, 0, status);
7772 }
7773
7774 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7775                                      float_status *status)
7776 {
7777     return floatx80_compare_internal(a, b, 1, status);
7778 }
7779
7780 static inline FloatRelation
7781 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7782                           float_status *status)
7783 {
7784     bool aSign, bSign;
7785
7786     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7787           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7788         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7789           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7790         if (!is_quiet ||
7791             float128_is_signaling_nan(a, status) ||
7792             float128_is_signaling_nan(b, status)) {
7793             float_raise(float_flag_invalid, status);
7794         }
7795         return float_relation_unordered;
7796     }
7797     aSign = extractFloat128Sign( a );
7798     bSign = extractFloat128Sign( b );
7799     if ( aSign != bSign ) {
7800         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7801             /* zero case */
7802             return float_relation_equal;
7803         } else {
7804             return 1 - (2 * aSign);
7805         }
7806     } else {
7807         if (a.low == b.low && a.high == b.high) {
7808             return float_relation_equal;
7809         } else {
7810             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7811         }
7812     }
7813 }
7814
7815 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7816 {
7817     return float128_compare_internal(a, b, 0, status);
7818 }
7819
7820 FloatRelation float128_compare_quiet(float128 a, float128 b,
7821                                      float_status *status)
7822 {
7823     return float128_compare_internal(a, b, 1, status);
7824 }
7825
7826 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7827 {
7828     bool aSign;
7829     int32_t aExp;
7830     uint64_t aSig;
7831
7832     if (floatx80_invalid_encoding(a)) {
7833         float_raise(float_flag_invalid, status);
7834         return floatx80_default_nan(status);
7835     }
7836     aSig = extractFloatx80Frac( a );
7837     aExp = extractFloatx80Exp( a );
7838     aSign = extractFloatx80Sign( a );
7839
7840     if ( aExp == 0x7FFF ) {
7841         if ( aSig<<1 ) {
7842             return propagateFloatx80NaN(a, a, status);
7843         }
7844         return a;
7845     }
7846
7847     if (aExp == 0) {
7848         if (aSig == 0) {
7849             return a;
7850         }
7851         aExp++;
7852     }
7853
7854     if (n > 0x10000) {
7855         n = 0x10000;
7856     } else if (n < -0x10000) {
7857         n = -0x10000;
7858     }
7859
7860     aExp += n;
7861     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7862                                          aSign, aExp, aSig, 0, status);
7863 }
7864
7865 float128 float128_scalbn(float128 a, int n, float_status *status)
7866 {
7867     bool aSign;
7868     int32_t aExp;
7869     uint64_t aSig0, aSig1;
7870
7871     aSig1 = extractFloat128Frac1( a );
7872     aSig0 = extractFloat128Frac0( a );
7873     aExp = extractFloat128Exp( a );
7874     aSign = extractFloat128Sign( a );
7875     if ( aExp == 0x7FFF ) {
7876         if ( aSig0 | aSig1 ) {
7877             return propagateFloat128NaN(a, a, status);
7878         }
7879         return a;
7880     }
7881     if (aExp != 0) {
7882         aSig0 |= UINT64_C(0x0001000000000000);
7883     } else if (aSig0 == 0 && aSig1 == 0) {
7884         return a;
7885     } else {
7886         aExp++;
7887     }
7888
7889     if (n > 0x10000) {
7890         n = 0x10000;
7891     } else if (n < -0x10000) {
7892         n = -0x10000;
7893     }
7894
7895     aExp += n - 1;
7896     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7897                                          , status);
7898
7899 }
7900
7901 static void __attribute__((constructor)) softfloat_init(void)
7902 {
7903     union_float64 ua, ub, uc, ur;
7904
7905     if (QEMU_NO_HARDFLOAT) {
7906         return;
7907     }
7908     /*
7909      * Test that the host's FMA is not obviously broken. For example,
7910      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7911      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7912      */
7913     ua.s = 0x0020000000000001ULL;
7914     ub.s = 0x3ca0000000000000ULL;
7915     uc.s = 0x0020000000000000ULL;
7916     ur.h = fma(ua.h, ub.h, uc.h);
7917     if (ur.s != 0x0020000000000001ULL) {
7918         force_soft_fma = true;
7919     }
7920 }