target/i386/fpu_helper.c

   1 /*
   2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include <math.h>
  22 #include "cpu.h"
  23 #include "exec/helper-proto.h"
  24 #include "qemu/host-utils.h"
  25 #include "exec/exec-all.h"
  26 #include "exec/cpu_ldst.h"
  27 #include "fpu/softfloat.h"
  28 #include "fpu/softfloat-macros.h"
  29
  30 #ifdef CONFIG_SOFTMMU
  31 #include "hw/irq.h"
  32 #endif
  33
/*
 * Rounding-control field of the FPU control word (env->fpuc) and its four
 * encodings, as decoded by update_fp_status().
 */
#define FPU_RC_MASK         0xc00
#define FPU_RC_NEAR         0x000 /* mapped to float_round_nearest_even */
#define FPU_RC_DOWN         0x400 /* mapped to float_round_down */
#define FPU_RC_UP           0x800 /* mapped to float_round_up */
#define FPU_RC_CHOP         0xc00 /* mapped to float_round_to_zero */
  39
/*
 * 2^63 as a double constant.
 * NOTE(review): not referenced in this part of the file; presumably the
 * operand-magnitude limit used by the trigonometric helpers - confirm.
 */
#define MAXTAN 9223372036854775808.0
  41
/*
 * The following deal with x86 long double-precision numbers.  The "fp"
 * argument is a CPU_LDoubleU lvalue: l.upper holds the 16-bit sign and
 * exponent word, l.lower the 64-bit significand (see helper_fldt()).
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* exponent field: low 15 bits of the upper word */
#define EXPD(fp)        (fp.l.upper & 0x7fff)
/* sign bit: non-zero (0x8000) when set */
#define SIGND(fp)       ((fp.l.upper) & 0x8000)
/* the 64-bit significand */
#define MANTD(fp)       (fp.l.lower)
/* overwrite the exponent field with EXPBIAS, keeping the other upper bits */
#define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
  49
/*
 * Bits of the FPU status word (env->fpus).  The first six are the
 * exception flags that merge_exception_flags() sets from the
 * corresponding softfloat flags.
 */
#define FPUS_IE (1 << 0) /* from float_flag_invalid */
#define FPUS_DE (1 << 1) /* from float_flag_input_denormal */
#define FPUS_ZE (1 << 2) /* from float_flag_divbyzero */
#define FPUS_OE (1 << 3) /* from float_flag_overflow */
#define FPUS_UE (1 << 4) /* from float_flag_underflow */
#define FPUS_PE (1 << 5) /* from float_flag_inexact */
#define FPUS_SF (1 << 6) /* NOTE(review): presumably stack fault - unused here */
/* SE and B are set together by fpu_set_exception() on an unmasked exception */
#define FPUS_SE (1 << 7)
#define FPUS_B  (1 << 15)
  59
/*
 * Low six bits of the control word (env->fpuc).  fpu_set_exception()
 * treats a status flag as unmasked when its bit here is clear in fpuc.
 */
#define FPUC_EM 0x3f
  61
/*
 * floatx80 constants loaded by the helper_fld*_ST0() constant loaders in
 * this file.  A "_d" variant is selected by those helpers when the
 * rounding control is FPU_RC_DOWN or FPU_RC_CHOP, and the "_u" variant
 * when it is FPU_RC_UP; where both forms are defined here they differ
 * from the unsuffixed value by one in the last significand bit.
 * NOTE(review): the unsuffixed floatx80_ln2 and floatx80_pi are not
 * defined in this part of the file - presumably they come from the
 * softfloat headers.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
  70
  71 #if !defined(CONFIG_USER_ONLY)
/*
 * Interrupt line raised by fpu_raise_exception() and lowered by
 * cpu_set_ignne(); unset until x86_register_ferr_irq() is called.
 */
static qemu_irq ferr_irq;

/* Record the interrupt line to use for FPU error reporting. */
void x86_register_ferr_irq(qemu_irq irq)
{
    ferr_irq = irq;
}
  78
  79 static void cpu_clear_ignne(void)
  80 {
  81     CPUX86State *env = &X86_CPU(first_cpu)->env;
  82     env->hflags2 &= ~HF2_IGNNE_MASK;
  83 }
  84
  85 void cpu_set_ignne(void)
  86 {
  87     CPUX86State *env = &X86_CPU(first_cpu)->env;
  88     env->hflags2 |= HF2_IGNNE_MASK;
  89     /*
  90      * We get here in response to a write to port F0h.  The chipset should
  91      * deassert FP_IRQ and FERR# instead should stay signaled until FPSW_SE is
  92      * cleared, because FERR# and FP_IRQ are two separate pins on real
  93      * hardware.  However, we don't model FERR# as a qemu_irq, so we just
  94      * do directly what the chipset would do, i.e. deassert FP_IRQ.
  95      */
  96     qemu_irq_lower(ferr_irq);
  97 }
  98 #endif
  99
 100
 101 static inline void fpush(CPUX86State *env)
 102 {
 103     env->fpstt = (env->fpstt - 1) & 7;
 104     env->fptags[env->fpstt] = 0; /* validate stack entry */
 105 }
 106
 107 static inline void fpop(CPUX86State *env)
 108 {
 109     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
 110     env->fpstt = (env->fpstt + 1) & 7;
 111 }
 112
 113 static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr,
 114                                    uintptr_t retaddr)
 115 {
 116     CPU_LDoubleU temp;
 117
 118     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
 119     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
 120     return temp.d;
 121 }
 122
 123 static inline void helper_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
 124                                uintptr_t retaddr)
 125 {
 126     CPU_LDoubleU temp;
 127
 128     temp.d = f;
 129     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
 130     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
 131 }
 132
 133 /* x87 FPU helpers */
 134
 135 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
 136 {
 137     union {
 138         float64 f64;
 139         double d;
 140     } u;
 141
 142     u.f64 = floatx80_to_float64(a, &env->fp_status);
 143     return u.d;
 144 }
 145
 146 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
 147 {
 148     union {
 149         float64 f64;
 150         double d;
 151     } u;
 152
 153     u.d = a;
 154     return float64_to_floatx80(u.f64, &env->fp_status);
 155 }
 156
 157 static void fpu_set_exception(CPUX86State *env, int mask)
 158 {
 159     env->fpus |= mask;
 160     if (env->fpus & (~env->fpuc & FPUC_EM)) {
 161         env->fpus |= FPUS_SE | FPUS_B;
 162     }
 163 }
 164
 165 static inline uint8_t save_exception_flags(CPUX86State *env)
 166 {
 167     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
 168     set_float_exception_flags(0, &env->fp_status);
 169     return old_flags;
 170 }
 171
 172 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
 173 {
 174     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
 175     float_raise(old_flags, &env->fp_status);
 176     fpu_set_exception(env,
 177                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
 178                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
 179                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
 180                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
 181                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
 182                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
 183 }
 184
 185 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 186 {
 187     uint8_t old_flags = save_exception_flags(env);
 188     floatx80 ret = floatx80_div(a, b, &env->fp_status);
 189     merge_exception_flags(env, old_flags);
 190     return ret;
 191 }
 192
 193 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
 194 {
 195     if (env->cr[0] & CR0_NE_MASK) {
 196         raise_exception_ra(env, EXCP10_COPR, retaddr);
 197     }
 198 #if !defined(CONFIG_USER_ONLY)
 199     else if (ferr_irq && !(env->hflags2 & HF2_IGNNE_MASK)) {
 200         qemu_irq_raise(ferr_irq);
 201     }
 202 #endif
 203 }
 204
 205 void helper_flds_FT0(CPUX86State *env, uint32_t val)
 206 {
 207     uint8_t old_flags = save_exception_flags(env);
 208     union {
 209         float32 f;
 210         uint32_t i;
 211     } u;
 212
 213     u.i = val;
 214     FT0 = float32_to_floatx80(u.f, &env->fp_status);
 215     merge_exception_flags(env, old_flags);
 216 }
 217
 218 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
 219 {
 220     uint8_t old_flags = save_exception_flags(env);
 221     union {
 222         float64 f;
 223         uint64_t i;
 224     } u;
 225
 226     u.i = val;
 227     FT0 = float64_to_floatx80(u.f, &env->fp_status);
 228     merge_exception_flags(env, old_flags);
 229 }
 230
 231 void helper_fildl_FT0(CPUX86State *env, int32_t val)
 232 {
 233     FT0 = int32_to_floatx80(val, &env->fp_status);
 234 }
 235
 236 void helper_flds_ST0(CPUX86State *env, uint32_t val)
 237 {
 238     uint8_t old_flags = save_exception_flags(env);
 239     int new_fpstt;
 240     union {
 241         float32 f;
 242         uint32_t i;
 243     } u;
 244
 245     new_fpstt = (env->fpstt - 1) & 7;
 246     u.i = val;
 247     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
 248     env->fpstt = new_fpstt;
 249     env->fptags[new_fpstt] = 0; /* validate stack entry */
 250     merge_exception_flags(env, old_flags);
 251 }
 252
 253 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
 254 {
 255     uint8_t old_flags = save_exception_flags(env);
 256     int new_fpstt;
 257     union {
 258         float64 f;
 259         uint64_t i;
 260     } u;
 261
 262     new_fpstt = (env->fpstt - 1) & 7;
 263     u.i = val;
 264     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
 265     env->fpstt = new_fpstt;
 266     env->fptags[new_fpstt] = 0; /* validate stack entry */
 267     merge_exception_flags(env, old_flags);
 268 }
 269
 270 void helper_fildl_ST0(CPUX86State *env, int32_t val)
 271 {
 272     int new_fpstt;
 273
 274     new_fpstt = (env->fpstt - 1) & 7;
 275     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
 276     env->fpstt = new_fpstt;
 277     env->fptags[new_fpstt] = 0; /* validate stack entry */
 278 }
 279
 280 void helper_fildll_ST0(CPUX86State *env, int64_t val)
 281 {
 282     int new_fpstt;
 283
 284     new_fpstt = (env->fpstt - 1) & 7;
 285     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
 286     env->fpstt = new_fpstt;
 287     env->fptags[new_fpstt] = 0; /* validate stack entry */
 288 }
 289
 290 uint32_t helper_fsts_ST0(CPUX86State *env)
 291 {
 292     uint8_t old_flags = save_exception_flags(env);
 293     union {
 294         float32 f;
 295         uint32_t i;
 296     } u;
 297
 298     u.f = floatx80_to_float32(ST0, &env->fp_status);
 299     merge_exception_flags(env, old_flags);
 300     return u.i;
 301 }
 302
 303 uint64_t helper_fstl_ST0(CPUX86State *env)
 304 {
 305     uint8_t old_flags = save_exception_flags(env);
 306     union {
 307         float64 f;
 308         uint64_t i;
 309     } u;
 310
 311     u.f = floatx80_to_float64(ST0, &env->fp_status);
 312     merge_exception_flags(env, old_flags);
 313     return u.i;
 314 }
 315
 316 int32_t helper_fist_ST0(CPUX86State *env)
 317 {
 318     uint8_t old_flags = save_exception_flags(env);
 319     int32_t val;
 320
 321     val = floatx80_to_int32(ST0, &env->fp_status);
 322     if (val != (int16_t)val) {
 323         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 324         val = -32768;
 325     }
 326     merge_exception_flags(env, old_flags);
 327     return val;
 328 }
 329
 330 int32_t helper_fistl_ST0(CPUX86State *env)
 331 {
 332     uint8_t old_flags = save_exception_flags(env);
 333     int32_t val;
 334
 335     val = floatx80_to_int32(ST0, &env->fp_status);
 336     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 337         val = 0x80000000;
 338     }
 339     merge_exception_flags(env, old_flags);
 340     return val;
 341 }
 342
 343 int64_t helper_fistll_ST0(CPUX86State *env)
 344 {
 345     uint8_t old_flags = save_exception_flags(env);
 346     int64_t val;
 347
 348     val = floatx80_to_int64(ST0, &env->fp_status);
 349     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 350         val = 0x8000000000000000ULL;
 351     }
 352     merge_exception_flags(env, old_flags);
 353     return val;
 354 }
 355
 356 int32_t helper_fistt_ST0(CPUX86State *env)
 357 {
 358     uint8_t old_flags = save_exception_flags(env);
 359     int32_t val;
 360
 361     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 362     if (val != (int16_t)val) {
 363         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 364         val = -32768;
 365     }
 366     merge_exception_flags(env, old_flags);
 367     return val;
 368 }
 369
 370 int32_t helper_fisttl_ST0(CPUX86State *env)
 371 {
 372     uint8_t old_flags = save_exception_flags(env);
 373     int32_t val;
 374
 375     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 376     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 377         val = 0x80000000;
 378     }
 379     merge_exception_flags(env, old_flags);
 380     return val;
 381 }
 382
 383 int64_t helper_fisttll_ST0(CPUX86State *env)
 384 {
 385     uint8_t old_flags = save_exception_flags(env);
 386     int64_t val;
 387
 388     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
 389     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 390         val = 0x8000000000000000ULL;
 391     }
 392     merge_exception_flags(env, old_flags);
 393     return val;
 394 }
 395
 396 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
 397 {
 398     int new_fpstt;
 399
 400     new_fpstt = (env->fpstt - 1) & 7;
 401     env->fpregs[new_fpstt].d = helper_fldt(env, ptr, GETPC());
 402     env->fpstt = new_fpstt;
 403     env->fptags[new_fpstt] = 0; /* validate stack entry */
 404 }
 405
 406 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
 407 {
 408     helper_fstt(env, ST0, ptr, GETPC());
 409 }
 410
/*
 * Helper entry point for fpush(): decrement the top-of-stack index and
 * tag the new top entry as valid.
 */
void helper_fpush(CPUX86State *env)
{
    fpush(env);
}
 415
/*
 * Helper entry point for fpop(): tag the current top entry as empty and
 * increment the top-of-stack index.
 */
void helper_fpop(CPUX86State *env)
{
    fpop(env);
}
 420
 421 void helper_fdecstp(CPUX86State *env)
 422 {
 423     env->fpstt = (env->fpstt - 1) & 7;
 424     env->fpus &= ~0x4700;
 425 }
 426
 427 void helper_fincstp(CPUX86State *env)
 428 {
 429     env->fpstt = (env->fpstt + 1) & 7;
 430     env->fpus &= ~0x4700;
 431 }
 432
 433 /* FPU move */
 434
 435 void helper_ffree_STN(CPUX86State *env, int st_index)
 436 {
 437     env->fptags[(env->fpstt + st_index) & 7] = 1;
 438 }
 439
 440 void helper_fmov_ST0_FT0(CPUX86State *env)
 441 {
 442     ST0 = FT0;
 443 }
 444
 445 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
 446 {
 447     FT0 = ST(st_index);
 448 }
 449
 450 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
 451 {
 452     ST0 = ST(st_index);
 453 }
 454
 455 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
 456 {
 457     ST(st_index) = ST0;
 458 }
 459
 460 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 461 {
 462     floatx80 tmp;
 463
 464     tmp = ST(st_index);
 465     ST(st_index) = ST0;
 466     ST0 = tmp;
 467 }
 468
 469 /* FPU operations */
 470
/*
 * Status-word condition bits (within mask 0x4500) for each comparison
 * outcome, indexed by FloatRelation value + 1.
 * NOTE(review): the order is presumably less, equal, greater, unordered -
 * confirm against the FloatRelation definition.
 */
static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 472
 473 void helper_fcom_ST0_FT0(CPUX86State *env)
 474 {
 475     uint8_t old_flags = save_exception_flags(env);
 476     FloatRelation ret;
 477
 478     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 479     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 480     merge_exception_flags(env, old_flags);
 481 }
 482
 483 void helper_fucom_ST0_FT0(CPUX86State *env)
 484 {
 485     uint8_t old_flags = save_exception_flags(env);
 486     FloatRelation ret;
 487
 488     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 489     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 490     merge_exception_flags(env, old_flags);
 491 }
 492
/*
 * EFLAGS bits (a combination of CC_Z, CC_P and CC_C) for each comparison
 * outcome, indexed by FloatRelation value + 1.
 * NOTE(review): the order is presumably less, equal, greater, unordered -
 * confirm against the FloatRelation definition.
 */
static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 494
 495 void helper_fcomi_ST0_FT0(CPUX86State *env)
 496 {
 497     uint8_t old_flags = save_exception_flags(env);
 498     int eflags;
 499     FloatRelation ret;
 500
 501     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 502     eflags = cpu_cc_compute_all(env, CC_OP);
 503     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 504     CC_SRC = eflags;
 505     merge_exception_flags(env, old_flags);
 506 }
 507
 508 void helper_fucomi_ST0_FT0(CPUX86State *env)
 509 {
 510     uint8_t old_flags = save_exception_flags(env);
 511     int eflags;
 512     FloatRelation ret;
 513
 514     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 515     eflags = cpu_cc_compute_all(env, CC_OP);
 516     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 517     CC_SRC = eflags;
 518     merge_exception_flags(env, old_flags);
 519 }
 520
 521 void helper_fadd_ST0_FT0(CPUX86State *env)
 522 {
 523     uint8_t old_flags = save_exception_flags(env);
 524     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
 525     merge_exception_flags(env, old_flags);
 526 }
 527
 528 void helper_fmul_ST0_FT0(CPUX86State *env)
 529 {
 530     uint8_t old_flags = save_exception_flags(env);
 531     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
 532     merge_exception_flags(env, old_flags);
 533 }
 534
 535 void helper_fsub_ST0_FT0(CPUX86State *env)
 536 {
 537     uint8_t old_flags = save_exception_flags(env);
 538     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
 539     merge_exception_flags(env, old_flags);
 540 }
 541
 542 void helper_fsubr_ST0_FT0(CPUX86State *env)
 543 {
 544     uint8_t old_flags = save_exception_flags(env);
 545     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
 546     merge_exception_flags(env, old_flags);
 547 }
 548
 549 void helper_fdiv_ST0_FT0(CPUX86State *env)
 550 {
 551     ST0 = helper_fdiv(env, ST0, FT0);
 552 }
 553
 554 void helper_fdivr_ST0_FT0(CPUX86State *env)
 555 {
 556     ST0 = helper_fdiv(env, FT0, ST0);
 557 }
 558
 559 /* fp operations between STN and ST0 */
 560
 561 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
 562 {
 563     uint8_t old_flags = save_exception_flags(env);
 564     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
 565     merge_exception_flags(env, old_flags);
 566 }
 567
 568 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
 569 {
 570     uint8_t old_flags = save_exception_flags(env);
 571     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
 572     merge_exception_flags(env, old_flags);
 573 }
 574
 575 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
 576 {
 577     uint8_t old_flags = save_exception_flags(env);
 578     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
 579     merge_exception_flags(env, old_flags);
 580 }
 581
 582 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
 583 {
 584     uint8_t old_flags = save_exception_flags(env);
 585     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
 586     merge_exception_flags(env, old_flags);
 587 }
 588
 589 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
 590 {
 591     floatx80 *p;
 592
 593     p = &ST(st_index);
 594     *p = helper_fdiv(env, *p, ST0);
 595 }
 596
 597 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
 598 {
 599     floatx80 *p;
 600
 601     p = &ST(st_index);
 602     *p = helper_fdiv(env, ST0, *p);
 603 }
 604
 605 /* misc FPU operations */
 606 void helper_fchs_ST0(CPUX86State *env)
 607 {
 608     ST0 = floatx80_chs(ST0);
 609 }
 610
 611 void helper_fabs_ST0(CPUX86State *env)
 612 {
 613     ST0 = floatx80_abs(ST0);
 614 }
 615
 616 void helper_fld1_ST0(CPUX86State *env)
 617 {
 618     ST0 = floatx80_one;
 619 }
 620
 621 void helper_fldl2t_ST0(CPUX86State *env)
 622 {
 623     switch (env->fpuc & FPU_RC_MASK) {
 624     case FPU_RC_UP:
 625         ST0 = floatx80_l2t_u;
 626         break;
 627     default:
 628         ST0 = floatx80_l2t;
 629         break;
 630     }
 631 }
 632
 633 void helper_fldl2e_ST0(CPUX86State *env)
 634 {
 635     switch (env->fpuc & FPU_RC_MASK) {
 636     case FPU_RC_DOWN:
 637     case FPU_RC_CHOP:
 638         ST0 = floatx80_l2e_d;
 639         break;
 640     default:
 641         ST0 = floatx80_l2e;
 642         break;
 643     }
 644 }
 645
 646 void helper_fldpi_ST0(CPUX86State *env)
 647 {
 648     switch (env->fpuc & FPU_RC_MASK) {
 649     case FPU_RC_DOWN:
 650     case FPU_RC_CHOP:
 651         ST0 = floatx80_pi_d;
 652         break;
 653     default:
 654         ST0 = floatx80_pi;
 655         break;
 656     }
 657 }
 658
 659 void helper_fldlg2_ST0(CPUX86State *env)
 660 {
 661     switch (env->fpuc & FPU_RC_MASK) {
 662     case FPU_RC_DOWN:
 663     case FPU_RC_CHOP:
 664         ST0 = floatx80_lg2_d;
 665         break;
 666     default:
 667         ST0 = floatx80_lg2;
 668         break;
 669     }
 670 }
 671
 672 void helper_fldln2_ST0(CPUX86State *env)
 673 {
 674     switch (env->fpuc & FPU_RC_MASK) {
 675     case FPU_RC_DOWN:
 676     case FPU_RC_CHOP:
 677         ST0 = floatx80_ln2_d;
 678         break;
 679     default:
 680         ST0 = floatx80_ln2;
 681         break;
 682     }
 683 }
 684
 685 void helper_fldz_ST0(CPUX86State *env)
 686 {
 687     ST0 = floatx80_zero;
 688 }
 689
 690 void helper_fldz_FT0(CPUX86State *env)
 691 {
 692     FT0 = floatx80_zero;
 693 }
 694
 695 uint32_t helper_fnstsw(CPUX86State *env)
 696 {
 697     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
 698 }
 699
 700 uint32_t helper_fnstcw(CPUX86State *env)
 701 {
 702     return env->fpuc;
 703 }
 704
 705 void update_fp_status(CPUX86State *env)
 706 {
 707     int rnd_type;
 708
 709     /* set rounding mode */
 710     switch (env->fpuc & FPU_RC_MASK) {
 711     default:
 712     case FPU_RC_NEAR:
 713         rnd_type = float_round_nearest_even;
 714         break;
 715     case FPU_RC_DOWN:
 716         rnd_type = float_round_down;
 717         break;
 718     case FPU_RC_UP:
 719         rnd_type = float_round_up;
 720         break;
 721     case FPU_RC_CHOP:
 722         rnd_type = float_round_to_zero;
 723         break;
 724     }
 725     set_float_rounding_mode(rnd_type, &env->fp_status);
 726     switch ((env->fpuc >> 8) & 3) {
 727     case 0:
 728         rnd_type = 32;
 729         break;
 730     case 2:
 731         rnd_type = 64;
 732         break;
 733     case 3:
 734     default:
 735         rnd_type = 80;
 736         break;
 737     }
 738     set_floatx80_rounding_precision(rnd_type, &env->fp_status);
 739 }
 740
/* Load val as the new FPU control word by delegating to cpu_set_fpuc(). */
void helper_fldcw(CPUX86State *env, uint32_t val)
{
    cpu_set_fpuc(env, val);
}
 745
 746 void helper_fclex(CPUX86State *env)
 747 {
 748     env->fpus &= 0x7f00;
 749 }
 750
 751 void helper_fwait(CPUX86State *env)
 752 {
 753     if (env->fpus & FPUS_SE) {
 754         fpu_raise_exception(env, GETPC());
 755     }
 756 }
 757
 758 void helper_fninit(CPUX86State *env)
 759 {
 760     env->fpus = 0;
 761     env->fpstt = 0;
 762     cpu_set_fpuc(env, 0x37f);
 763     env->fptags[0] = 1;
 764     env->fptags[1] = 1;
 765     env->fptags[2] = 1;
 766     env->fptags[3] = 1;
 767     env->fptags[4] = 1;
 768     env->fptags[5] = 1;
 769     env->fptags[6] = 1;
 770     env->fptags[7] = 1;
 771 }
 772
 773 /* BCD ops */
 774
 775 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
 776 {
 777     floatx80 tmp;
 778     uint64_t val;
 779     unsigned int v;
 780     int i;
 781
 782     val = 0;
 783     for (i = 8; i >= 0; i--) {
 784         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
 785         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
 786     }
 787     tmp = int64_to_floatx80(val, &env->fp_status);
 788     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
 789         tmp = floatx80_chs(tmp);
 790     }
 791     fpush(env);
 792     ST0 = tmp;
 793 }
 794
 795 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
 796 {
 797     uint8_t old_flags = save_exception_flags(env);
 798     int v;
 799     target_ulong mem_ref, mem_end;
 800     int64_t val;
 801     CPU_LDoubleU temp;
 802
 803     temp.d = ST0;
 804
 805     val = floatx80_to_int64(ST0, &env->fp_status);
 806     mem_ref = ptr;
 807     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
 808         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 809         while (mem_ref < ptr + 7) {
 810             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 811         }
 812         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
 813         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 814         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 815         merge_exception_flags(env, old_flags);
 816         return;
 817     }
 818     mem_end = mem_ref + 9;
 819     if (SIGND(temp)) {
 820         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
 821         val = -val;
 822     } else {
 823         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
 824     }
 825     while (mem_ref < mem_end) {
 826         if (val == 0) {
 827             break;
 828         }
 829         v = val % 100;
 830         val = val / 100;
 831         v = ((v / 10) << 4) | (v % 10);
 832         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
 833     }
 834     while (mem_ref < mem_end) {
 835         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 836     }
 837     merge_exception_flags(env, old_flags);
 838 }
 839
/*
 * 128-bit significand of log(2), split into two 64-bit halves:
 * ln2_sig_high holds the most-significant 64 bits and ln2_sig_low the
 * least-significant 64 bits.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL
 843
/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * NOTE(review): f2xm1_coeff_0_low is a separate, much smaller negative
 * term - presumably a low-order correction added to f2xm1_coeff_0;
 * confirm against the code that evaluates the polynomial, which is not
 * in this part of the file.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
 857
/*
 * One entry of f2xm1_table: a reference point t together with
 * precomputed values of 2^t and 2^t - 1.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
 869
 870 static const struct f2xm1_data f2xm1_table[65] = {
 871     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
 872       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
 873       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
 874     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
 875       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
 876       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
 877     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
 878       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
 879       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
 880     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
 881       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
 882       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
 883     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
 884       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
 885       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
 886     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
 887       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
 888       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
 889     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
 890       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
 891       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
 892     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
 893       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
 894       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
 895     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
 896       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
 897       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
 898     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
 899       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
 900       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
 901     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
 902       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
 903       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
 904     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
 905       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
 906       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
 907     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
 908       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
 909       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
 910     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
 911       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
 912       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
 913     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
 914       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
 915       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
 916     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
 917       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
 918       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
 919     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
 920       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
 921       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
 922     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
 923       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
 924       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
 925     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
 926       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
 927       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
 928     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
 929       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
 930       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
 931     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
 932       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
 933       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
 934     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
 935       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
 936       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
 937     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
 938       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
 939       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
 940     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
 941       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
 942       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
 943     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
 944       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
 945       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
 946     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
 947       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
 948       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
 949     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
 950       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
 951       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
 952     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
 953       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
 954       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
 955     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
 956       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
 957       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
 958     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
 959       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
 960       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
 961     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
 962       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
 963       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
 964     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
 965       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
 966       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
 967     { floatx80_zero_init,
 968       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
 969       floatx80_zero_init },
 970     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
 971       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
 972       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
 973     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
 974       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
 975       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
 976     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
 977       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
 978       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
 979     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
 980       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
 981       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
 982     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
 983       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
 984       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
 985     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
 986       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
 987       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
 988     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
 989       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
 990       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
 991     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
 992       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
 993       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
 994     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
 995       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
 996       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
 997     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
 998       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
 999       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1000     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1001       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1002       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1003     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1004       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1005       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1006     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1007       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1008       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1009     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1010       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1011       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1012     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1013       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1014       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1015     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1016       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1017       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1018     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1019       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1020       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1021     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1022       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1023       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1024     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1025       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1026       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1027     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1028       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1029       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1030     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1031       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1032       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1033     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1034       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1035       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1036     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1037       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1038       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1039     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1040       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1041       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1042     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1043       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1044       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1045     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1046       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1047       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1048     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1049       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1050       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1051     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1052       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1053       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1054     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1055       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1056       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1057     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1058       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1059       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1060     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1061       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1062       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1063     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1064       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1065       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1066 };
1067
1068 void helper_f2xm1(CPUX86State *env)
1069 {
1070     uint8_t old_flags = save_exception_flags(env);
1071     uint64_t sig = extractFloatx80Frac(ST0);
1072     int32_t exp = extractFloatx80Exp(ST0);
1073     bool sign = extractFloatx80Sign(ST0);
1074
1075     if (floatx80_invalid_encoding(ST0)) {
1076         float_raise(float_flag_invalid, &env->fp_status);
1077         ST0 = floatx80_default_nan(&env->fp_status);
1078     } else if (floatx80_is_any_nan(ST0)) {
1079         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1080             float_raise(float_flag_invalid, &env->fp_status);
1081             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1082         }
1083     } else if (exp > 0x3fff ||
1084                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1085         /* Out of range for the instruction, treat as invalid.  */
1086         float_raise(float_flag_invalid, &env->fp_status);
1087         ST0 = floatx80_default_nan(&env->fp_status);
1088     } else if (exp == 0x3fff) {
1089         /* Argument 1 or -1, exact result 1 or -0.5.  */
1090         if (sign) {
1091             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1092         }
1093     } else if (exp < 0x3fb0) {
1094         if (!floatx80_is_zero(ST0)) {
1095             /*
1096              * Multiplying the argument by an extra-precision version
1097              * of log(2) is sufficiently precise.  Zero arguments are
1098              * returned unchanged.
1099              */
1100             uint64_t sig0, sig1, sig2;
1101             if (exp == 0) {
1102                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1103             }
1104             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1105                             &sig2);
1106             /* This result is inexact.  */
1107             sig1 |= 1;
1108             ST0 = normalizeRoundAndPackFloatx80(80, sign, exp, sig0, sig1,
1109                                                 &env->fp_status);
1110         }
1111     } else {
1112         floatx80 tmp, y, accum;
1113         bool asign, bsign;
1114         int32_t n, aexp, bexp;
1115         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1116         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1117         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1118         env->fp_status.float_rounding_mode = float_round_nearest_even;
1119         env->fp_status.floatx80_rounding_precision = 80;
1120
1121         /* Find the nearest multiple of 1/32 to the argument.  */
1122         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1123         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1124         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1125
1126         if (floatx80_is_zero(y)) {
1127             /*
1128              * Use the value of 2^t - 1 from the table, to avoid
1129              * needing to special-case zero as a result of
1130              * multiplication below.
1131              */
1132             ST0 = f2xm1_table[n].t;
1133             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1134             env->fp_status.float_rounding_mode = save_mode;
1135         } else {
1136             /*
1137              * Compute the lower parts of a polynomial expansion for
1138              * (2^y - 1) / y.
1139              */
1140             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1141             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1142             accum = floatx80_mul(accum, y, &env->fp_status);
1143             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1144             accum = floatx80_mul(accum, y, &env->fp_status);
1145             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1146             accum = floatx80_mul(accum, y, &env->fp_status);
1147             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1148             accum = floatx80_mul(accum, y, &env->fp_status);
1149             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1150             accum = floatx80_mul(accum, y, &env->fp_status);
1151             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1152             accum = floatx80_mul(accum, y, &env->fp_status);
1153             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1154
1155             /*
1156              * The full polynomial expansion is f2xm1_coeff_0 + accum
1157              * (where accum has much lower magnitude, and so, in
1158              * particular, carry out of the addition is not possible).
1159              * (This expansion is only accurate to about 70 bits, not
1160              * 128 bits.)
1161              */
1162             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1163             asign = extractFloatx80Sign(f2xm1_coeff_0);
1164             shift128RightJamming(extractFloatx80Frac(accum), 0,
1165                                  aexp - extractFloatx80Exp(accum),
1166                                  &asig0, &asig1);
1167             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1168             bsig1 = 0;
1169             if (asign == extractFloatx80Sign(accum)) {
1170                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1171             } else {
1172                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1173             }
1174             /* And thus compute an approximation to 2^y - 1.  */
1175             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1176                             &asig0, &asig1, &asig2);
1177             aexp += extractFloatx80Exp(y) - 0x3ffe;
1178             asign ^= extractFloatx80Sign(y);
1179             if (n != 32) {
1180                 /*
1181                  * Multiply this by the precomputed value of 2^t and
1182                  * add that of 2^t - 1.
1183                  */
1184                 mul128By64To192(asig0, asig1,
1185                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1186                                 &asig0, &asig1, &asig2);
1187                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1188                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1189                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1190                 bsig1 = 0;
1191                 if (bexp < aexp) {
1192                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1193                                          &bsig0, &bsig1);
1194                 } else if (aexp < bexp) {
1195                     shift128RightJamming(asig0, asig1, bexp - aexp,
1196                                          &asig0, &asig1);
1197                     aexp = bexp;
1198                 }
1199                 /* The sign of 2^t - 1 is always that of the result.  */
1200                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1201                 if (asign == bsign) {
1202                     /* Avoid possible carry out of the addition.  */
1203                     shift128RightJamming(asig0, asig1, 1,
1204                                          &asig0, &asig1);
1205                     shift128RightJamming(bsig0, bsig1, 1,
1206                                          &bsig0, &bsig1);
1207                     ++aexp;
1208                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1209                 } else {
1210                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1211                     asign = bsign;
1212                 }
1213             }
1214             env->fp_status.float_rounding_mode = save_mode;
1215             /* This result is inexact.  */
1216             asig1 |= 1;
1217             ST0 = normalizeRoundAndPackFloatx80(80, asign, aexp, asig0, asig1,
1218                                                 &env->fp_status);
1219         }
1220
1221         env->fp_status.floatx80_rounding_precision = save_prec;
1222     }
1223     merge_exception_flags(env, old_flags);
1224 }
1225
1226 void helper_fptan(CPUX86State *env)
1227 {
1228     double fptemp = floatx80_to_double(env, ST0);
1229
1230     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1231         env->fpus |= 0x400;
1232     } else {
1233         fptemp = tan(fptemp);
1234         ST0 = double_to_floatx80(env, fptemp);
1235         fpush(env);
1236         ST0 = floatx80_one;
1237         env->fpus &= ~0x400; /* C2 <-- 0 */
1238         /* the above code is for |arg| < 2**52 only */
1239     }
1240 }
1241
1242 /* Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  */
1243 #define pi_4_exp 0x3ffe
1244 #define pi_4_sig_high 0xc90fdaa22168c234ULL
1245 #define pi_4_sig_low 0xc4c6628b80dc1cd1ULL
1246 #define pi_2_exp 0x3fff
1247 #define pi_2_sig_high 0xc90fdaa22168c234ULL
1248 #define pi_2_sig_low 0xc4c6628b80dc1cd1ULL
1249 #define pi_34_exp 0x4000
1250 #define pi_34_sig_high 0x96cbe3f9990e91a7ULL
1251 #define pi_34_sig_low 0x9394c9e8a0a5159dULL
1252 #define pi_exp 0x4000
1253 #define pi_sig_high 0xc90fdaa22168c234ULL
1254 #define pi_sig_low 0xc4c6628b80dc1cd1ULL
1255
 1256 /*
 1257  * Polynomial coefficients for an approximation to atan(x), with only
 1258  * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
 1259  * for some other approximations, no low part is needed for the first
 1260  * coefficient here to achieve a sufficiently accurate result, because
 1261  * the coefficient in this minimax approximation is very close to
 1262  * exactly 1.)
       *
       * helper_fpatan() evaluates the polynomial by Horner's rule in z^2
       * and multiplies by z, so fpatan_coeff_k is the coefficient of
       * x^(2k+1).  The values are close to, but not identical to, the
       * Taylor coefficients 1, -1/3, 1/5, -1/7, 1/9, -1/11, 1/13.
 1263  */
 1264 #define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL) /* 1 */
 1265 #define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) /* about -1/3 */
 1266 #define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) /* about 1/5 */
 1267 #define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL) /* about -1/7 */
 1268 #define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) /* about 1/9 */
 1269 #define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) /* about -1/11 */
 1270 #define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) /* roughly 1/13 */
1271
1272 struct fpatan_data {
1273     /* High and low parts of atan(x).  */
1274     floatx80 atan_high, atan_low;
1275 };
1276
1277 static const struct fpatan_data fpatan_table[9] = {
1278     { floatx80_zero_init,
1279       floatx80_zero_init },
1280     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1281       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1282     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1283       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1284     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1285       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1286     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1287       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1288     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1289       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1290     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1291       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1292     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1293       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1294     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1295       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1296 };
1297
1298 void helper_fpatan(CPUX86State *env)
1299 {
1300     uint8_t old_flags = save_exception_flags(env);
1301     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1302     int32_t arg0_exp = extractFloatx80Exp(ST0);
1303     bool arg0_sign = extractFloatx80Sign(ST0);
1304     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1305     int32_t arg1_exp = extractFloatx80Exp(ST1);
1306     bool arg1_sign = extractFloatx80Sign(ST1);
1307
1308     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1309         float_raise(float_flag_invalid, &env->fp_status);
1310         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1311     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1312         float_raise(float_flag_invalid, &env->fp_status);
1313         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1314     } else if (floatx80_invalid_encoding(ST0) ||
1315                floatx80_invalid_encoding(ST1)) {
1316         float_raise(float_flag_invalid, &env->fp_status);
1317         ST1 = floatx80_default_nan(&env->fp_status);
1318     } else if (floatx80_is_any_nan(ST0)) {
1319         ST1 = ST0;
1320     } else if (floatx80_is_any_nan(ST1)) {
1321         /* Pass this NaN through.  */
1322     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1323         /* Pass this zero through.  */
1324     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1325                  arg0_exp - arg1_exp >= 80) &&
1326                !arg0_sign) {
1327         /*
1328          * Dividing ST1 by ST0 gives the correct result up to
1329          * rounding, and avoids spurious underflow exceptions that
1330          * might result from passing some small values through the
1331          * polynomial approximation, but if a finite nonzero result of
1332          * division is exact, the result of fpatan is still inexact
1333          * (and underflowing where appropriate).
1334          */
1335         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1336         env->fp_status.floatx80_rounding_precision = 80;
1337         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1338         env->fp_status.floatx80_rounding_precision = save_prec;
1339         if (!floatx80_is_zero(ST1) &&
1340             !(get_float_exception_flags(&env->fp_status) &
1341               float_flag_inexact)) {
1342             /*
1343              * The mathematical result is very slightly closer to zero
1344              * than this exact result.  Round a value with the
1345              * significand adjusted accordingly to get the correct
1346              * exceptions, and possibly an adjusted result depending
1347              * on the rounding mode.
1348              */
1349             uint64_t sig = extractFloatx80Frac(ST1);
1350             int32_t exp = extractFloatx80Exp(ST1);
1351             bool sign = extractFloatx80Sign(ST1);
1352             if (exp == 0) {
1353                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1354             }
1355             ST1 = normalizeRoundAndPackFloatx80(80, sign, exp, sig - 1,
1356                                                 -1, &env->fp_status);
1357         }
1358     } else {
1359         /* The result is inexact.  */
1360         bool rsign = arg1_sign;
1361         int32_t rexp;
1362         uint64_t rsig0, rsig1;
1363         if (floatx80_is_zero(ST1)) {
1364             /*
1365              * ST0 is negative.  The result is pi with the sign of
1366              * ST1.
1367              */
1368             rexp = pi_exp;
1369             rsig0 = pi_sig_high;
1370             rsig1 = pi_sig_low;
1371         } else if (floatx80_is_infinity(ST1)) {
1372             if (floatx80_is_infinity(ST0)) {
1373                 if (arg0_sign) {
1374                     rexp = pi_34_exp;
1375                     rsig0 = pi_34_sig_high;
1376                     rsig1 = pi_34_sig_low;
1377                 } else {
1378                     rexp = pi_4_exp;
1379                     rsig0 = pi_4_sig_high;
1380                     rsig1 = pi_4_sig_low;
1381                 }
1382             } else {
1383                 rexp = pi_2_exp;
1384                 rsig0 = pi_2_sig_high;
1385                 rsig1 = pi_2_sig_low;
1386             }
1387         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1388             rexp = pi_2_exp;
1389             rsig0 = pi_2_sig_high;
1390             rsig1 = pi_2_sig_low;
1391         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1392             /* ST0 is negative.  */
1393             rexp = pi_exp;
1394             rsig0 = pi_sig_high;
1395             rsig1 = pi_sig_low;
1396         } else {
1397             /*
1398              * ST0 and ST1 are finite, nonzero and with exponents not
1399              * too far apart.
1400              */
1401             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1402             int32_t azexp, axexp;
1403             bool adj_sub, ysign, zsign;
1404             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1405             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1406             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1407             uint64_t azsig0, azsig1;
1408             uint64_t azsig2, azsig3, axsig0, axsig1;
1409             floatx80 x8;
1410             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1411             signed char save_prec = env->fp_status.floatx80_rounding_precision;
1412             env->fp_status.float_rounding_mode = float_round_nearest_even;
1413             env->fp_status.floatx80_rounding_precision = 80;
1414
1415             if (arg0_exp == 0) {
1416                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1417             }
1418             if (arg1_exp == 0) {
1419                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1420             }
1421             if (arg0_exp > arg1_exp ||
1422                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1423                 /* Work with abs(ST1) / abs(ST0).  */
1424                 num_exp = arg1_exp;
1425                 num_sig = arg1_sig;
1426                 den_exp = arg0_exp;
1427                 den_sig = arg0_sig;
1428                 if (arg0_sign) {
1429                     /* The result is subtracted from pi.  */
1430                     adj_exp = pi_exp;
1431                     adj_sig0 = pi_sig_high;
1432                     adj_sig1 = pi_sig_low;
1433                     adj_sub = true;
1434                 } else {
1435                     /* The result is used as-is.  */
1436                     adj_exp = 0;
1437                     adj_sig0 = 0;
1438                     adj_sig1 = 0;
1439                     adj_sub = false;
1440                 }
1441             } else {
1442                 /* Work with abs(ST0) / abs(ST1).  */
1443                 num_exp = arg0_exp;
1444                 num_sig = arg0_sig;
1445                 den_exp = arg1_exp;
1446                 den_sig = arg1_sig;
1447                 /* The result is added to or subtracted from pi/2.  */
1448                 adj_exp = pi_2_exp;
1449                 adj_sig0 = pi_2_sig_high;
1450                 adj_sig1 = pi_2_sig_low;
1451                 adj_sub = !arg0_sign;
1452             }
1453
1454             /*
1455              * Compute x = num/den, where 0 < x <= 1 and x is not too
1456              * small.
1457              */
1458             xexp = num_exp - den_exp + 0x3ffe;
1459             remsig0 = num_sig;
1460             remsig1 = 0;
1461             if (den_sig <= remsig0) {
1462                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1463                 ++xexp;
1464             }
1465             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1466             mul64To128(den_sig, xsig0, &msig0, &msig1);
1467             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1468             while ((int64_t) remsig0 < 0) {
1469                 --xsig0;
1470                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1471             }
1472             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1473             /*
1474              * No need to correct any estimation error in xsig1; even
1475              * with such error, it is accurate enough.
1476              */
1477
1478             /*
1479              * Split x as x = t + y, where t = n/8 is the nearest
1480              * multiple of 1/8 to x.
1481              */
1482             x8 = normalizeRoundAndPackFloatx80(80, false, xexp + 3, xsig0,
1483                                                xsig1, &env->fp_status);
1484             n = floatx80_to_int32(x8, &env->fp_status);
1485             if (n == 0) {
1486                 ysign = false;
1487                 yexp = xexp;
1488                 ysig0 = xsig0;
1489                 ysig1 = xsig1;
1490                 texp = 0;
1491                 tsig = 0;
1492             } else {
1493                 int shift = clz32(n) + 32;
1494                 texp = 0x403b - shift;
1495                 tsig = n;
1496                 tsig <<= shift;
1497                 if (texp == xexp) {
1498                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1499                     if ((int64_t) ysig0 >= 0) {
1500                         ysign = false;
1501                         if (ysig0 == 0) {
1502                             if (ysig1 == 0) {
1503                                 yexp = 0;
1504                             } else {
1505                                 shift = clz64(ysig1) + 64;
1506                                 yexp = xexp - shift;
1507                                 shift128Left(ysig0, ysig1, shift,
1508                                              &ysig0, &ysig1);
1509                             }
1510                         } else {
1511                             shift = clz64(ysig0);
1512                             yexp = xexp - shift;
1513                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1514                         }
1515                     } else {
1516                         ysign = true;
1517                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1518                         if (ysig0 == 0) {
1519                             shift = clz64(ysig1) + 64;
1520                         } else {
1521                             shift = clz64(ysig0);
1522                         }
1523                         yexp = xexp - shift;
1524                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1525                     }
1526                 } else {
1527                     /*
1528                      * t's exponent must be greater than x's because t
1529                      * is positive and the nearest multiple of 1/8 to
1530                      * x, and if x has a greater exponent, the power
1531                      * of 2 with that exponent is also a multiple of
1532                      * 1/8.
1533                      */
1534                     uint64_t usig0, usig1;
1535                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1536                                          &usig0, &usig1);
1537                     ysign = true;
1538                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1539                     if (ysig0 == 0) {
1540                         shift = clz64(ysig1) + 64;
1541                     } else {
1542                         shift = clz64(ysig0);
1543                     }
1544                     yexp = texp - shift;
1545                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1546                 }
1547             }
1548
1549             /*
1550              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1551              * arctan(z).
1552              */
1553             zsign = ysign;
1554             if (texp == 0 || yexp == 0) {
1555                 zexp = yexp;
1556                 zsig0 = ysig0;
1557                 zsig1 = ysig1;
1558             } else {
1559                 /*
1560                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1561                  */
1562                 int32_t dexp = texp + xexp - 0x3ffe;
1563                 uint64_t dsig0, dsig1, dsig2;
1564                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1565                 /*
1566                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1567                  * bit).  Add 1 to produce the denominator 1+tx.
1568                  */
1569                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1570                                      &dsig0, &dsig1);
1571                 dsig0 |= 0x8000000000000000ULL;
1572                 zexp = yexp - 1;
1573                 remsig0 = ysig0;
1574                 remsig1 = ysig1;
1575                 remsig2 = 0;
1576                 if (dsig0 <= remsig0) {
1577                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1578                     ++zexp;
1579                 }
1580                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1581                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1582                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1583                        &remsig0, &remsig1, &remsig2);
1584                 while ((int64_t) remsig0 < 0) {
1585                     --zsig0;
1586                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1587                            &remsig0, &remsig1, &remsig2);
1588                 }
1589                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1590                 /* No need to correct any estimation error in zsig1.  */
1591             }
1592
1593             if (zexp == 0) {
1594                 azexp = 0;
1595                 azsig0 = 0;
1596                 azsig1 = 0;
1597             } else {
1598                 floatx80 z2, accum;
1599                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1600                 /* Compute z^2.  */
1601                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1602                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1603                 z2 = normalizeRoundAndPackFloatx80(80, false,
1604                                                    zexp + zexp - 0x3ffe,
1605                                                    z2sig0, z2sig1,
1606                                                    &env->fp_status);
1607
1608                 /* Compute the lower parts of the polynomial expansion.  */
1609                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1610                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1611                 accum = floatx80_mul(accum, z2, &env->fp_status);
1612                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1613                 accum = floatx80_mul(accum, z2, &env->fp_status);
1614                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1615                 accum = floatx80_mul(accum, z2, &env->fp_status);
1616                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1617                 accum = floatx80_mul(accum, z2, &env->fp_status);
1618                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1619                 accum = floatx80_mul(accum, z2, &env->fp_status);
1620
1621                 /*
1622                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1623                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1624                  */
1625                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1626                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1627                                      aexp - extractFloatx80Exp(accum),
1628                                      &asig0, &asig1);
1629                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1630                        &asig0, &asig1);
1631                 /* Multiply by z to compute arctan(z).  */
1632                 azexp = aexp + zexp - 0x3ffe;
1633                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1634                             &azsig2, &azsig3);
1635             }
1636
1637             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1638             if (texp == 0) {
1639                 /* z is positive.  */
1640                 axexp = azexp;
1641                 axsig0 = azsig0;
1642                 axsig1 = azsig1;
1643             } else {
1644                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1645                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1646                 uint64_t low_sig0 =
1647                     extractFloatx80Frac(fpatan_table[n].atan_low);
1648                 uint64_t low_sig1 = 0;
1649                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1650                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1651                 axsig1 = 0;
1652                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1653                                      &low_sig0, &low_sig1);
1654                 if (low_sign) {
1655                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1656                            &axsig0, &axsig1);
1657                 } else {
1658                     add128(axsig0, axsig1, low_sig0, low_sig1,
1659                            &axsig0, &axsig1);
1660                 }
1661                 if (azexp >= axexp) {
1662                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1663                                          &axsig0, &axsig1);
1664                     axexp = azexp + 1;
1665                     shift128RightJamming(azsig0, azsig1, 1,
1666                                          &azsig0, &azsig1);
1667                 } else {
1668                     shift128RightJamming(axsig0, axsig1, 1,
1669                                          &axsig0, &axsig1);
1670                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1671                                          &azsig0, &azsig1);
1672                     ++axexp;
1673                 }
1674                 if (zsign) {
1675                     sub128(axsig0, axsig1, azsig0, azsig1,
1676                            &axsig0, &axsig1);
1677                 } else {
1678                     add128(axsig0, axsig1, azsig0, azsig1,
1679                            &axsig0, &axsig1);
1680                 }
1681             }
1682
1683             if (adj_exp == 0) {
1684                 rexp = axexp;
1685                 rsig0 = axsig0;
1686                 rsig1 = axsig1;
1687             } else {
1688                 /*
1689                  * Add or subtract arctan(x) (exponent axexp,
1690                  * significand axsig0 and axsig1, positive, not
1691                  * necessarily normalized) to the number given by
1692                  * adj_exp, adj_sig0 and adj_sig1, according to
1693                  * adj_sub.
1694                  */
1695                 if (adj_exp >= axexp) {
1696                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1697                                          &axsig0, &axsig1);
1698                     rexp = adj_exp + 1;
1699                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1700                                          &adj_sig0, &adj_sig1);
1701                 } else {
1702                     shift128RightJamming(axsig0, axsig1, 1,
1703                                          &axsig0, &axsig1);
1704                     shift128RightJamming(adj_sig0, adj_sig1,
1705                                          axexp - adj_exp + 1,
1706                                          &adj_sig0, &adj_sig1);
1707                     rexp = axexp + 1;
1708                 }
1709                 if (adj_sub) {
1710                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1711                            &rsig0, &rsig1);
1712                 } else {
1713                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1714                            &rsig0, &rsig1);
1715                 }
1716             }
1717
1718             env->fp_status.float_rounding_mode = save_mode;
1719             env->fp_status.floatx80_rounding_precision = save_prec;
1720         }
1721         /* This result is inexact.  */
1722         rsig1 |= 1;
1723         ST1 = normalizeRoundAndPackFloatx80(80, rsign, rexp,
1724                                             rsig0, rsig1, &env->fp_status);
1725     }
1726
1727     fpop(env);
1728     merge_exception_flags(env, old_flags);
1729 }
1730
1731 void helper_fxtract(CPUX86State *env)
1732 {
1733     uint8_t old_flags = save_exception_flags(env);
1734     CPU_LDoubleU temp;
1735
1736     temp.d = ST0;
1737
1738     if (floatx80_is_zero(ST0)) {
1739         /* Easy way to generate -inf and raising division by 0 exception */
1740         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1741                            &env->fp_status);
1742         fpush(env);
1743         ST0 = temp.d;
1744     } else if (floatx80_invalid_encoding(ST0)) {
1745         float_raise(float_flag_invalid, &env->fp_status);
1746         ST0 = floatx80_default_nan(&env->fp_status);
1747         fpush(env);
1748         ST0 = ST1;
1749     } else if (floatx80_is_any_nan(ST0)) {
1750         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1751             float_raise(float_flag_invalid, &env->fp_status);
1752             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1753         }
1754         fpush(env);
1755         ST0 = ST1;
1756     } else if (floatx80_is_infinity(ST0)) {
1757         fpush(env);
1758         ST0 = ST1;
1759         ST1 = floatx80_infinity;
1760     } else {
1761         int expdif;
1762
1763         if (EXPD(temp) == 0) {
1764             int shift = clz64(temp.l.lower);
1765             temp.l.lower <<= shift;
1766             expdif = 1 - EXPBIAS - shift;
1767             float_raise(float_flag_input_denormal, &env->fp_status);
1768         } else {
1769             expdif = EXPD(temp) - EXPBIAS;
1770         }
1771         /* DP exponent bias */
1772         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1773         fpush(env);
1774         BIASEXPONENT(temp);
1775         ST0 = temp.d;
1776     }
1777     merge_exception_flags(env, old_flags);
1778 }
1779
1780 static void helper_fprem_common(CPUX86State *env, bool mod)
1781 {
1782     uint8_t old_flags = save_exception_flags(env);
1783     uint64_t quotient;
1784     CPU_LDoubleU temp0, temp1;
1785     int exp0, exp1, expdiff;
1786
1787     temp0.d = ST0;
1788     temp1.d = ST1;
1789     exp0 = EXPD(temp0);
1790     exp1 = EXPD(temp1);
1791
1792     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1793     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1794         exp0 == 0x7fff || exp1 == 0x7fff ||
1795         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1796         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1797     } else {
1798         if (exp0 == 0) {
1799             exp0 = 1 - clz64(temp0.l.lower);
1800         }
1801         if (exp1 == 0) {
1802             exp1 = 1 - clz64(temp1.l.lower);
1803         }
1804         expdiff = exp0 - exp1;
1805         if (expdiff < 64) {
1806             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1807             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1808             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1809             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1810         } else {
1811             /*
1812              * Partial remainder.  This choice of how many bits to
1813              * process at once is specified in AMD instruction set
1814              * manuals, and empirically is followed by Intel
1815              * processors as well; it ensures that the final remainder
1816              * operation in a loop does produce the correct low three
1817              * bits of the quotient.  AMD manuals specify that the
1818              * flags other than C2 are cleared, and empirically Intel
1819              * processors clear them as well.
1820              */
1821             int n = 32 + (expdiff % 32);
1822             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1823             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1824             env->fpus |= 0x400;  /* C2 <-- 1 */
1825         }
1826     }
1827     merge_exception_flags(env, old_flags);
1828 }
1829
1830 void helper_fprem1(CPUX86State *env)
1831 {
1832     helper_fprem_common(env, false);
1833 }
1834
1835 void helper_fprem(CPUX86State *env)
1836 {
1837     helper_fprem_common(env, true);
1838 }
1839
/* High and low 64-bit halves of a 128-bit significand of log2(e).  */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low  0xbe87fed0691d3e89ULL

/*
 * Coefficients of the polynomial approximating log2((1+x)/(1-x)).
 * Only odd powers of x appear.  The approximation is meant for x in
 * [2*sqrt(2)-3, 3-2*sqrt(2)], i.e. for logarithms of numbers in
 * [sqrt(2)/2, sqrt(2)].  The leading coefficient is split into
 * fyl2x_coeff_0 plus the small correction fyl2x_coeff_0_low.
 */
#define fyl2x_coeff_0     make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1     make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2     make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3     make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4     make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5     make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6     make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7     make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8     make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9     make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1861
1862 /*
1863  * Compute an approximation of log2(1+arg), where 1+arg is in the
1864  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1865  * function is called, rounding precision is set to 80 and the
1866  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1867  * and must not be so close to zero that underflow might occur.
1868  */
1869 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1870                                 uint64_t *sig0, uint64_t *sig1)
1871 {
1872     uint64_t arg0_sig = extractFloatx80Frac(arg);
1873     int32_t arg0_exp = extractFloatx80Exp(arg);
1874     bool arg0_sign = extractFloatx80Sign(arg);
1875     bool asign;
1876     int32_t dexp, texp, aexp;
1877     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1878     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1879     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1880     floatx80 t2, accum;
1881
1882     /*
1883      * Compute an approximation of arg/(2+arg), with extra precision,
1884      * as the argument to a polynomial approximation.  The extra
1885      * precision is only needed for the first term of the
1886      * approximation, with subsequent terms being significantly
1887      * smaller; the approximation only uses odd exponents, and the
1888      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1889      */
1890     if (arg0_sign) {
1891         dexp = 0x3fff;
1892         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1893         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1894     } else {
1895         dexp = 0x4000;
1896         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1897         dsig0 |= 0x8000000000000000ULL;
1898     }
1899     texp = arg0_exp - dexp + 0x3ffe;
1900     rsig0 = arg0_sig;
1901     rsig1 = 0;
1902     rsig2 = 0;
1903     if (dsig0 <= rsig0) {
1904         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1905         ++texp;
1906     }
1907     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1908     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1909     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1910            &rsig0, &rsig1, &rsig2);
1911     while ((int64_t) rsig0 < 0) {
1912         --tsig0;
1913         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1914                &rsig0, &rsig1, &rsig2);
1915     }
1916     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1917     /*
1918      * No need to correct any estimation error in tsig1; even with
1919      * such error, it is accurate enough.  Now compute the square of
1920      * that approximation.
1921      */
1922     mul128To256(tsig0, tsig1, tsig0, tsig1,
1923                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1924     t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
1925                                        t2sig0, t2sig1, &env->fp_status);
1926
1927     /* Compute the lower parts of the polynomial expansion.  */
1928     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1929     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1930     accum = floatx80_mul(accum, t2, &env->fp_status);
1931     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1932     accum = floatx80_mul(accum, t2, &env->fp_status);
1933     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1934     accum = floatx80_mul(accum, t2, &env->fp_status);
1935     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1936     accum = floatx80_mul(accum, t2, &env->fp_status);
1937     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1938     accum = floatx80_mul(accum, t2, &env->fp_status);
1939     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1940     accum = floatx80_mul(accum, t2, &env->fp_status);
1941     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1942     accum = floatx80_mul(accum, t2, &env->fp_status);
1943     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1944     accum = floatx80_mul(accum, t2, &env->fp_status);
1945     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1946
1947     /*
1948      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1949      * accum has much lower magnitude, and so, in particular, carry
1950      * out of the addition is not possible), multiplied by t.  (This
1951      * expansion is only accurate to about 70 bits, not 128 bits.)
1952      */
1953     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1954     asign = extractFloatx80Sign(fyl2x_coeff_0);
1955     shift128RightJamming(extractFloatx80Frac(accum), 0,
1956                          aexp - extractFloatx80Exp(accum),
1957                          &asig0, &asig1);
1958     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1959     bsig1 = 0;
1960     if (asign == extractFloatx80Sign(accum)) {
1961         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1962     } else {
1963         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1964     }
1965     /* Multiply by t to compute the required result.  */
1966     mul128To256(asig0, asig1, tsig0, tsig1,
1967                 &asig0, &asig1, &asig2, &asig3);
1968     aexp += texp - 0x3ffe;
1969     *exp = aexp;
1970     *sig0 = asig0;
1971     *sig1 = asig1;
1972 }
1973
1974 void helper_fyl2xp1(CPUX86State *env)
1975 {
1976     uint8_t old_flags = save_exception_flags(env);
1977     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1978     int32_t arg0_exp = extractFloatx80Exp(ST0);
1979     bool arg0_sign = extractFloatx80Sign(ST0);
1980     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1981     int32_t arg1_exp = extractFloatx80Exp(ST1);
1982     bool arg1_sign = extractFloatx80Sign(ST1);
1983
1984     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1985         float_raise(float_flag_invalid, &env->fp_status);
1986         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1987     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1988         float_raise(float_flag_invalid, &env->fp_status);
1989         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1990     } else if (floatx80_invalid_encoding(ST0) ||
1991                floatx80_invalid_encoding(ST1)) {
1992         float_raise(float_flag_invalid, &env->fp_status);
1993         ST1 = floatx80_default_nan(&env->fp_status);
1994     } else if (floatx80_is_any_nan(ST0)) {
1995         ST1 = ST0;
1996     } else if (floatx80_is_any_nan(ST1)) {
1997         /* Pass this NaN through.  */
1998     } else if (arg0_exp > 0x3ffd ||
1999                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2000                                                   0x95f619980c4336f7ULL :
2001                                                   0xd413cccfe7799211ULL))) {
2002         /*
2003          * Out of range for the instruction (ST0 must have absolute
2004          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2005          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2006          * to sqrt(2) - 1, which we allow here), treat as invalid.
2007          */
2008         float_raise(float_flag_invalid, &env->fp_status);
2009         ST1 = floatx80_default_nan(&env->fp_status);
2010     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2011                arg1_exp == 0x7fff) {
2012         /*
2013          * One argument is zero, or multiplying by infinity; correct
2014          * result is exact and can be obtained by multiplying the
2015          * arguments.
2016          */
2017         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2018     } else if (arg0_exp < 0x3fb0) {
2019         /*
2020          * Multiplying both arguments and an extra-precision version
2021          * of log2(e) is sufficiently precise.
2022          */
2023         uint64_t sig0, sig1, sig2;
2024         int32_t exp;
2025         if (arg0_exp == 0) {
2026             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2027         }
2028         if (arg1_exp == 0) {
2029             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2030         }
2031         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2032                         &sig0, &sig1, &sig2);
2033         exp = arg0_exp + 1;
2034         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2035         exp += arg1_exp - 0x3ffe;
2036         /* This result is inexact.  */
2037         sig1 |= 1;
2038         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
2039                                             sig0, sig1, &env->fp_status);
2040     } else {
2041         int32_t aexp;
2042         uint64_t asig0, asig1, asig2;
2043         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2044         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2045         env->fp_status.float_rounding_mode = float_round_nearest_even;
2046         env->fp_status.floatx80_rounding_precision = 80;
2047
2048         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2049         /*
2050          * Multiply by the second argument to compute the required
2051          * result.
2052          */
2053         if (arg1_exp == 0) {
2054             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2055         }
2056         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2057         aexp += arg1_exp - 0x3ffe;
2058         /* This result is inexact.  */
2059         asig1 |= 1;
2060         env->fp_status.float_rounding_mode = save_mode;
2061         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp,
2062                                             asig0, asig1, &env->fp_status);
2063         env->fp_status.floatx80_rounding_precision = save_prec;
2064     }
2065     fpop(env);
2066     merge_exception_flags(env, old_flags);
2067 }
2068
2069 void helper_fyl2x(CPUX86State *env)
2070 {
2071     uint8_t old_flags = save_exception_flags(env);
2072     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2073     int32_t arg0_exp = extractFloatx80Exp(ST0);
2074     bool arg0_sign = extractFloatx80Sign(ST0);
2075     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2076     int32_t arg1_exp = extractFloatx80Exp(ST1);
2077     bool arg1_sign = extractFloatx80Sign(ST1);
2078
2079     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2080         float_raise(float_flag_invalid, &env->fp_status);
2081         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2082     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2083         float_raise(float_flag_invalid, &env->fp_status);
2084         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2085     } else if (floatx80_invalid_encoding(ST0) ||
2086                floatx80_invalid_encoding(ST1)) {
2087         float_raise(float_flag_invalid, &env->fp_status);
2088         ST1 = floatx80_default_nan(&env->fp_status);
2089     } else if (floatx80_is_any_nan(ST0)) {
2090         ST1 = ST0;
2091     } else if (floatx80_is_any_nan(ST1)) {
2092         /* Pass this NaN through.  */
2093     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2094         float_raise(float_flag_invalid, &env->fp_status);
2095         ST1 = floatx80_default_nan(&env->fp_status);
2096     } else if (floatx80_is_infinity(ST1)) {
2097         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2098                                              &env->fp_status);
2099         switch (cmp) {
2100         case float_relation_less:
2101             ST1 = floatx80_chs(ST1);
2102             break;
2103         case float_relation_greater:
2104             /* Result is infinity of the same sign as ST1.  */
2105             break;
2106         default:
2107             float_raise(float_flag_invalid, &env->fp_status);
2108             ST1 = floatx80_default_nan(&env->fp_status);
2109             break;
2110         }
2111     } else if (floatx80_is_infinity(ST0)) {
2112         if (floatx80_is_zero(ST1)) {
2113             float_raise(float_flag_invalid, &env->fp_status);
2114             ST1 = floatx80_default_nan(&env->fp_status);
2115         } else if (arg1_sign) {
2116             ST1 = floatx80_chs(ST0);
2117         } else {
2118             ST1 = ST0;
2119         }
2120     } else if (floatx80_is_zero(ST0)) {
2121         if (floatx80_is_zero(ST1)) {
2122             float_raise(float_flag_invalid, &env->fp_status);
2123             ST1 = floatx80_default_nan(&env->fp_status);
2124         } else {
2125             /* Result is infinity with opposite sign to ST1.  */
2126             float_raise(float_flag_divbyzero, &env->fp_status);
2127             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2128                                 0x8000000000000000ULL);
2129         }
2130     } else if (floatx80_is_zero(ST1)) {
2131         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2132             ST1 = floatx80_chs(ST1);
2133         }
2134         /* Otherwise, ST1 is already the correct result.  */
2135     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2136         if (arg1_sign) {
2137             ST1 = floatx80_chs(floatx80_zero);
2138         } else {
2139             ST1 = floatx80_zero;
2140         }
2141     } else {
2142         int32_t int_exp;
2143         floatx80 arg0_m1;
2144         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2145         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2146         env->fp_status.float_rounding_mode = float_round_nearest_even;
2147         env->fp_status.floatx80_rounding_precision = 80;
2148
2149         if (arg0_exp == 0) {
2150             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2151         }
2152         if (arg1_exp == 0) {
2153             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2154         }
2155         int_exp = arg0_exp - 0x3fff;
2156         if (arg0_sig > 0xb504f333f9de6484ULL) {
2157             ++int_exp;
2158         }
2159         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2160                                                &env->fp_status),
2161                                floatx80_one, &env->fp_status);
2162         if (floatx80_is_zero(arg0_m1)) {
2163             /* Exact power of 2; multiply by ST1.  */
2164             env->fp_status.float_rounding_mode = save_mode;
2165             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2166                                ST1, &env->fp_status);
2167         } else {
2168             bool asign = extractFloatx80Sign(arg0_m1);
2169             int32_t aexp;
2170             uint64_t asig0, asig1, asig2;
2171             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2172             if (int_exp != 0) {
2173                 bool isign = (int_exp < 0);
2174                 int32_t iexp;
2175                 uint64_t isig;
2176                 int shift;
2177                 int_exp = isign ? -int_exp : int_exp;
2178                 shift = clz32(int_exp) + 32;
2179                 isig = int_exp;
2180                 isig <<= shift;
2181                 iexp = 0x403e - shift;
2182                 shift128RightJamming(asig0, asig1, iexp - aexp,
2183                                      &asig0, &asig1);
2184                 if (asign == isign) {
2185                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2186                 } else {
2187                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2188                 }
2189                 aexp = iexp;
2190                 asign = isign;
2191             }
2192             /*
2193              * Multiply by the second argument to compute the required
2194              * result.
2195              */
2196             if (arg1_exp == 0) {
2197                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2198             }
2199             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2200             aexp += arg1_exp - 0x3ffe;
2201             /* This result is inexact.  */
2202             asig1 |= 1;
2203             env->fp_status.float_rounding_mode = save_mode;
2204             ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
2205                                                 asig0, asig1, &env->fp_status);
2206         }
2207
2208         env->fp_status.floatx80_rounding_precision = save_prec;
2209     }
2210     fpop(env);
2211     merge_exception_flags(env, old_flags);
2212 }
2213
2214 void helper_fsqrt(CPUX86State *env)
2215 {
2216     uint8_t old_flags = save_exception_flags(env);
2217     if (floatx80_is_neg(ST0)) {
2218         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2219         env->fpus |= 0x400;
2220     }
2221     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2222     merge_exception_flags(env, old_flags);
2223 }
2224
2225 void helper_fsincos(CPUX86State *env)
2226 {
2227     double fptemp = floatx80_to_double(env, ST0);
2228
2229     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2230         env->fpus |= 0x400;
2231     } else {
2232         ST0 = double_to_floatx80(env, sin(fptemp));
2233         fpush(env);
2234         ST0 = double_to_floatx80(env, cos(fptemp));
2235         env->fpus &= ~0x400;  /* C2 <-- 0 */
2236         /* the above code is for |arg| < 2**63 only */
2237     }
2238 }
2239
2240 void helper_frndint(CPUX86State *env)
2241 {
2242     uint8_t old_flags = save_exception_flags(env);
2243     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2244     merge_exception_flags(env, old_flags);
2245 }
2246
2247 void helper_fscale(CPUX86State *env)
2248 {
2249     uint8_t old_flags = save_exception_flags(env);
2250     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2251         float_raise(float_flag_invalid, &env->fp_status);
2252         ST0 = floatx80_default_nan(&env->fp_status);
2253     } else if (floatx80_is_any_nan(ST1)) {
2254         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2255             float_raise(float_flag_invalid, &env->fp_status);
2256         }
2257         ST0 = ST1;
2258         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2259             float_raise(float_flag_invalid, &env->fp_status);
2260             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2261         }
2262     } else if (floatx80_is_infinity(ST1) &&
2263                !floatx80_invalid_encoding(ST0) &&
2264                !floatx80_is_any_nan(ST0)) {
2265         if (floatx80_is_neg(ST1)) {
2266             if (floatx80_is_infinity(ST0)) {
2267                 float_raise(float_flag_invalid, &env->fp_status);
2268                 ST0 = floatx80_default_nan(&env->fp_status);
2269             } else {
2270                 ST0 = (floatx80_is_neg(ST0) ?
2271                        floatx80_chs(floatx80_zero) :
2272                        floatx80_zero);
2273             }
2274         } else {
2275             if (floatx80_is_zero(ST0)) {
2276                 float_raise(float_flag_invalid, &env->fp_status);
2277                 ST0 = floatx80_default_nan(&env->fp_status);
2278             } else {
2279                 ST0 = (floatx80_is_neg(ST0) ?
2280                        floatx80_chs(floatx80_infinity) :
2281                        floatx80_infinity);
2282             }
2283         }
2284     } else {
2285         int n;
2286         signed char save = env->fp_status.floatx80_rounding_precision;
2287         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2288         set_float_exception_flags(0, &env->fp_status);
2289         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2290         set_float_exception_flags(save_flags, &env->fp_status);
2291         env->fp_status.floatx80_rounding_precision = 80;
2292         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2293         env->fp_status.floatx80_rounding_precision = save;
2294     }
2295     merge_exception_flags(env, old_flags);
2296 }
2297
2298 void helper_fsin(CPUX86State *env)
2299 {
2300     double fptemp = floatx80_to_double(env, ST0);
2301
2302     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2303         env->fpus |= 0x400;
2304     } else {
2305         ST0 = double_to_floatx80(env, sin(fptemp));
2306         env->fpus &= ~0x400;  /* C2 <-- 0 */
2307         /* the above code is for |arg| < 2**53 only */
2308     }
2309 }
2310
2311 void helper_fcos(CPUX86State *env)
2312 {
2313     double fptemp = floatx80_to_double(env, ST0);
2314
2315     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2316         env->fpus |= 0x400;
2317     } else {
2318         ST0 = double_to_floatx80(env, cos(fptemp));
2319         env->fpus &= ~0x400;  /* C2 <-- 0 */
2320         /* the above code is for |arg| < 2**63 only */
2321     }
2322 }
2323
2324 void helper_fxam_ST0(CPUX86State *env)
2325 {
2326     CPU_LDoubleU temp;
2327     int expdif;
2328
2329     temp.d = ST0;
2330
2331     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2332     if (SIGND(temp)) {
2333         env->fpus |= 0x200; /* C1 <-- 1 */
2334     }
2335
2336     if (env->fptags[env->fpstt]) {
2337         env->fpus |= 0x4100; /* Empty */
2338         return;
2339     }
2340
2341     expdif = EXPD(temp);
2342     if (expdif == MAXEXPD) {
2343         if (MANTD(temp) == 0x8000000000000000ULL) {
2344             env->fpus |= 0x500; /* Infinity */
2345         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2346             env->fpus |= 0x100; /* NaN */
2347         }
2348     } else if (expdif == 0) {
2349         if (MANTD(temp) == 0) {
2350             env->fpus |=  0x4000; /* Zero */
2351         } else {
2352             env->fpus |= 0x4400; /* Denormal */
2353         }
2354     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2355         env->fpus |= 0x400;
2356     }
2357 }
2358
2359 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2360                       uintptr_t retaddr)
2361 {
2362     int fpus, fptag, exp, i;
2363     uint64_t mant;
2364     CPU_LDoubleU tmp;
2365
2366     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2367     fptag = 0;
2368     for (i = 7; i >= 0; i--) {
2369         fptag <<= 2;
2370         if (env->fptags[i]) {
2371             fptag |= 3;
2372         } else {
2373             tmp.d = env->fpregs[i].d;
2374             exp = EXPD(tmp);
2375             mant = MANTD(tmp);
2376             if (exp == 0 && mant == 0) {
2377                 /* zero */
2378                 fptag |= 1;
2379             } else if (exp == 0 || exp == MAXEXPD
2380                        || (mant & (1LL << 63)) == 0) {
2381                 /* NaNs, infinity, denormal */
2382                 fptag |= 2;
2383             }
2384         }
2385     }
2386     if (data32) {
2387         /* 32 bit */
2388         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2389         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2390         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2391         cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2392         cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2393         cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2394         cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2395     } else {
2396         /* 16 bit */
2397         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2398         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2399         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2400         cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2401         cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2402         cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2403         cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2404     }
2405 }
2406
2407 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2408 {
2409     do_fstenv(env, ptr, data32, GETPC());
2410 }
2411
2412 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2413 {
2414     env->fpstt = (fpus >> 11) & 7;
2415     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2416     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2417 #if !defined(CONFIG_USER_ONLY)
2418     if (!(env->fpus & FPUS_SE)) {
2419         /*
2420          * Here the processor deasserts FERR#; in response, the chipset deasserts
2421          * IGNNE#.
2422          */
2423         cpu_clear_ignne();
2424     }
2425 #endif
2426 }
2427
2428 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2429                       uintptr_t retaddr)
2430 {
2431     int i, fpus, fptag;
2432
2433     if (data32) {
2434         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2435         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2436         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2437     } else {
2438         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2439         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2440         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2441     }
2442     cpu_set_fpus(env, fpus);
2443     for (i = 0; i < 8; i++) {
2444         env->fptags[i] = ((fptag & 3) == 3);
2445         fptag >>= 2;
2446     }
2447 }
2448
2449 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2450 {
2451     do_fldenv(env, ptr, data32, GETPC());
2452 }
2453
2454 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2455 {
2456     floatx80 tmp;
2457     int i;
2458
2459     do_fstenv(env, ptr, data32, GETPC());
2460
2461     ptr += (14 << data32);
2462     for (i = 0; i < 8; i++) {
2463         tmp = ST(i);
2464         helper_fstt(env, tmp, ptr, GETPC());
2465         ptr += 10;
2466     }
2467
2468     /* fninit */
2469     env->fpus = 0;
2470     env->fpstt = 0;
2471     cpu_set_fpuc(env, 0x37f);
2472     env->fptags[0] = 1;
2473     env->fptags[1] = 1;
2474     env->fptags[2] = 1;
2475     env->fptags[3] = 1;
2476     env->fptags[4] = 1;
2477     env->fptags[5] = 1;
2478     env->fptags[6] = 1;
2479     env->fptags[7] = 1;
2480 }
2481
2482 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2483 {
2484     floatx80 tmp;
2485     int i;
2486
2487     do_fldenv(env, ptr, data32, GETPC());
2488     ptr += (14 << data32);
2489
2490     for (i = 0; i < 8; i++) {
2491         tmp = helper_fldt(env, ptr, GETPC());
2492         ST(i) = tmp;
2493         ptr += 10;
2494     }
2495 }
2496
2497 #if defined(CONFIG_USER_ONLY)
2498 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
2499 {
2500     helper_fsave(env, ptr, data32);
2501 }
2502
2503 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
2504 {
2505     helper_frstor(env, ptr, data32);
2506 }
2507 #endif
2508
2509 #define XO(X)  offsetof(X86XSaveArea, X)
2510
2511 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2512 {
2513     int fpus, fptag, i;
2514     target_ulong addr;
2515
2516     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2517     fptag = 0;
2518     for (i = 0; i < 8; i++) {
2519         fptag |= (env->fptags[i] << i);
2520     }
2521
2522     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2523     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2524     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2525
2526     /* In 32-bit mode this is eip, sel, dp, sel.
2527        In 64-bit mode this is rip, rdp.
2528        But in either case we don't write actual data, just zeros.  */
2529     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2530     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2531
2532     addr = ptr + XO(legacy.fpregs);
2533     for (i = 0; i < 8; i++) {
2534         floatx80 tmp = ST(i);
2535         helper_fstt(env, tmp, addr, ra);
2536         addr += 16;
2537     }
2538 }
2539
2540 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2541 {
2542     update_mxcsr_from_sse_status(env);
2543     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2544     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2545 }
2546
2547 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2548 {
2549     int i, nb_xmm_regs;
2550     target_ulong addr;
2551
2552     if (env->hflags & HF_CS64_MASK) {
2553         nb_xmm_regs = 16;
2554     } else {
2555         nb_xmm_regs = 8;
2556     }
2557
2558     addr = ptr + XO(legacy.xmm_regs);
2559     for (i = 0; i < nb_xmm_regs; i++) {
2560         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2561         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2562         addr += 16;
2563     }
2564 }
2565
2566 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2567 {
2568     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2569     int i;
2570
2571     for (i = 0; i < 4; i++, addr += 16) {
2572         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2573         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2574     }
2575 }
2576
2577 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2578 {
2579     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2580                     env->bndcs_regs.cfgu, ra);
2581     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2582                     env->bndcs_regs.sts, ra);
2583 }
2584
2585 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2586 {
2587     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2588 }
2589
2590 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2591 {
2592     uintptr_t ra = GETPC();
2593
2594     /* The operand must be 16 byte aligned */
2595     if (ptr & 0xf) {
2596         raise_exception_ra(env, EXCP0D_GPF, ra);
2597     }
2598
2599     do_xsave_fpu(env, ptr, ra);
2600
2601     if (env->cr[4] & CR4_OSFXSR_MASK) {
2602         do_xsave_mxcsr(env, ptr, ra);
2603         /* Fast FXSAVE leaves out the XMM registers */
2604         if (!(env->efer & MSR_EFER_FFXSR)
2605             || (env->hflags & HF_CPL_MASK)
2606             || !(env->hflags & HF_LMA_MASK)) {
2607             do_xsave_sse(env, ptr, ra);
2608         }
2609     }
2610 }
2611
2612 static uint64_t get_xinuse(CPUX86State *env)
2613 {
2614     uint64_t inuse = -1;
2615
2616     /* For the most part, we don't track XINUSE.  We could calculate it
2617        here for all components, but it's probably less work to simply
2618        indicate in use.  That said, the state of BNDREGS is important
2619        enough to track in HFLAGS, so we might as well use that here.  */
2620     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2621        inuse &= ~XSTATE_BNDREGS_MASK;
2622     }
2623     return inuse;
2624 }
2625
2626 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2627                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2628 {
2629     uint64_t old_bv, new_bv;
2630
2631     /* The OS must have enabled XSAVE.  */
2632     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2633         raise_exception_ra(env, EXCP06_ILLOP, ra);
2634     }
2635
2636     /* The operand must be 64 byte aligned.  */
2637     if (ptr & 63) {
2638         raise_exception_ra(env, EXCP0D_GPF, ra);
2639     }
2640
2641     /* Never save anything not enabled by XCR0.  */
2642     rfbm &= env->xcr0;
2643     opt &= rfbm;
2644
2645     if (opt & XSTATE_FP_MASK) {
2646         do_xsave_fpu(env, ptr, ra);
2647     }
2648     if (rfbm & XSTATE_SSE_MASK) {
2649         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2650         do_xsave_mxcsr(env, ptr, ra);
2651     }
2652     if (opt & XSTATE_SSE_MASK) {
2653         do_xsave_sse(env, ptr, ra);
2654     }
2655     if (opt & XSTATE_BNDREGS_MASK) {
2656         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2657     }
2658     if (opt & XSTATE_BNDCSR_MASK) {
2659         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2660     }
2661     if (opt & XSTATE_PKRU_MASK) {
2662         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2663     }
2664
2665     /* Update the XSTATE_BV field.  */
2666     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2667     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2668     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2669 }
2670
2671 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2672 {
2673     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2674 }
2675
2676 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2677 {
2678     uint64_t inuse = get_xinuse(env);
2679     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2680 }
2681
2682 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2683 {
2684     int i, fpuc, fpus, fptag;
2685     target_ulong addr;
2686
2687     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2688     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2689     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2690     cpu_set_fpuc(env, fpuc);
2691     cpu_set_fpus(env, fpus);
2692     fptag ^= 0xff;
2693     for (i = 0; i < 8; i++) {
2694         env->fptags[i] = ((fptag >> i) & 1);
2695     }
2696
2697     addr = ptr + XO(legacy.fpregs);
2698     for (i = 0; i < 8; i++) {
2699         floatx80 tmp = helper_fldt(env, addr, ra);
2700         ST(i) = tmp;
2701         addr += 16;
2702     }
2703 }
2704
2705 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2706 {
2707     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2708 }
2709
2710 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2711 {
2712     int i, nb_xmm_regs;
2713     target_ulong addr;
2714
2715     if (env->hflags & HF_CS64_MASK) {
2716         nb_xmm_regs = 16;
2717     } else {
2718         nb_xmm_regs = 8;
2719     }
2720
2721     addr = ptr + XO(legacy.xmm_regs);
2722     for (i = 0; i < nb_xmm_regs; i++) {
2723         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2724         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2725         addr += 16;
2726     }
2727 }
2728
2729 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2730 {
2731     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2732     int i;
2733
2734     for (i = 0; i < 4; i++, addr += 16) {
2735         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2736         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2737     }
2738 }
2739
2740 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2741 {
2742     /* FIXME: Extend highest implemented bit of linear address.  */
2743     env->bndcs_regs.cfgu
2744         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2745     env->bndcs_regs.sts
2746         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2747 }
2748
2749 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2750 {
2751     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2752 }
2753
2754 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2755 {
2756     uintptr_t ra = GETPC();
2757
2758     /* The operand must be 16 byte aligned */
2759     if (ptr & 0xf) {
2760         raise_exception_ra(env, EXCP0D_GPF, ra);
2761     }
2762
2763     do_xrstor_fpu(env, ptr, ra);
2764
2765     if (env->cr[4] & CR4_OSFXSR_MASK) {
2766         do_xrstor_mxcsr(env, ptr, ra);
2767         /* Fast FXRSTOR leaves out the XMM registers */
2768         if (!(env->efer & MSR_EFER_FFXSR)
2769             || (env->hflags & HF_CPL_MASK)
2770             || !(env->hflags & HF_LMA_MASK)) {
2771             do_xrstor_sse(env, ptr, ra);
2772         }
2773     }
2774 }
2775
2776 #if defined(CONFIG_USER_ONLY)
2777 void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
2778 {
2779     helper_fxsave(env, ptr);
2780 }
2781
2782 void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
2783 {
2784     helper_fxrstor(env, ptr);
2785 }
2786 #endif
2787
2788 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2789 {
2790     uintptr_t ra = GETPC();
2791     uint64_t xstate_bv, xcomp_bv, reserve0;
2792
2793     rfbm &= env->xcr0;
2794
2795     /* The OS must have enabled XSAVE.  */
2796     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2797         raise_exception_ra(env, EXCP06_ILLOP, ra);
2798     }
2799
2800     /* The operand must be 64 byte aligned.  */
2801     if (ptr & 63) {
2802         raise_exception_ra(env, EXCP0D_GPF, ra);
2803     }
2804
2805     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2806
2807     if ((int64_t)xstate_bv < 0) {
2808         /* FIXME: Compact form.  */
2809         raise_exception_ra(env, EXCP0D_GPF, ra);
2810     }
2811
2812     /* Standard form.  */
2813
2814     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2815     if (xstate_bv & ~env->xcr0) {
2816         raise_exception_ra(env, EXCP0D_GPF, ra);
2817     }
2818
2819     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2820        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2821        describes only XCOMP_BV, but the description of the standard form
2822        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2823        includes the next 64-bit field.  */
2824     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2825     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2826     if (xcomp_bv || reserve0) {
2827         raise_exception_ra(env, EXCP0D_GPF, ra);
2828     }
2829
2830     if (rfbm & XSTATE_FP_MASK) {
2831         if (xstate_bv & XSTATE_FP_MASK) {
2832             do_xrstor_fpu(env, ptr, ra);
2833         } else {
2834             helper_fninit(env);
2835             memset(env->fpregs, 0, sizeof(env->fpregs));
2836         }
2837     }
2838     if (rfbm & XSTATE_SSE_MASK) {
2839         /* Note that the standard form of XRSTOR loads MXCSR from memory
2840            whether or not the XSTATE_BV bit is set.  */
2841         do_xrstor_mxcsr(env, ptr, ra);
2842         if (xstate_bv & XSTATE_SSE_MASK) {
2843             do_xrstor_sse(env, ptr, ra);
2844         } else {
2845             /* ??? When AVX is implemented, we may have to be more
2846                selective in the clearing.  */
2847             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2848         }
2849     }
2850     if (rfbm & XSTATE_BNDREGS_MASK) {
2851         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2852             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2853             env->hflags |= HF_MPX_IU_MASK;
2854         } else {
2855             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2856             env->hflags &= ~HF_MPX_IU_MASK;
2857         }
2858     }
2859     if (rfbm & XSTATE_BNDCSR_MASK) {
2860         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2861             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2862         } else {
2863             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2864         }
2865         cpu_sync_bndcs_hflags(env);
2866     }
2867     if (rfbm & XSTATE_PKRU_MASK) {
2868         uint64_t old_pkru = env->pkru;
2869         if (xstate_bv & XSTATE_PKRU_MASK) {
2870             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2871         } else {
2872             env->pkru = 0;
2873         }
2874         if (env->pkru != old_pkru) {
2875             CPUState *cs = env_cpu(env);
2876             tlb_flush(cs);
2877         }
2878     }
2879 }
2880
2881 #undef XO
2882
2883 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2884 {
2885     /* The OS must have enabled XSAVE.  */
2886     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2887         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2888     }
2889
2890     switch (ecx) {
2891     case 0:
2892         return env->xcr0;
2893     case 1:
2894         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2895             return env->xcr0 & get_xinuse(env);
2896         }
2897         break;
2898     }
2899     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2900 }
2901
2902 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2903 {
2904     uint32_t dummy, ena_lo, ena_hi;
2905     uint64_t ena;
2906
2907     /* The OS must have enabled XSAVE.  */
2908     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2909         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2910     }
2911
2912     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2913     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2914         goto do_gpf;
2915     }
2916
2917     /* Disallow enabling unimplemented features.  */
2918     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2919     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2920     if (mask & ~ena) {
2921         goto do_gpf;
2922     }
2923
2924     /* Disallow enabling only half of MPX.  */
2925     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2926         & XSTATE_BNDCSR_MASK) {
2927         goto do_gpf;
2928     }
2929
2930     env->xcr0 = mask;
2931     cpu_sync_bndcs_hflags(env);
2932     return;
2933
2934  do_gpf:
2935     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2936 }
2937
2938 /* MMX/SSE */
2939 /* XXX: optimize by storing fptt and fptags in the static cpu state */
2940
2941 #define SSE_DAZ             0x0040
2942 #define SSE_RC_MASK         0x6000
2943 #define SSE_RC_NEAR         0x0000
2944 #define SSE_RC_DOWN         0x2000
2945 #define SSE_RC_UP           0x4000
2946 #define SSE_RC_CHOP         0x6000
2947 #define SSE_FZ              0x8000
2948
2949 void update_mxcsr_status(CPUX86State *env)
2950 {
2951     uint32_t mxcsr = env->mxcsr;
2952     int rnd_type;
2953
2954     /* set rounding mode */
2955     switch (mxcsr & SSE_RC_MASK) {
2956     default:
2957     case SSE_RC_NEAR:
2958         rnd_type = float_round_nearest_even;
2959         break;
2960     case SSE_RC_DOWN:
2961         rnd_type = float_round_down;
2962         break;
2963     case SSE_RC_UP:
2964         rnd_type = float_round_up;
2965         break;
2966     case SSE_RC_CHOP:
2967         rnd_type = float_round_to_zero;
2968         break;
2969     }
2970     set_float_rounding_mode(rnd_type, &env->sse_status);
2971
2972     /* Set exception flags.  */
2973     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2974                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2975                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2976                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2977                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2978                               &env->sse_status);
2979
2980     /* set denormals are zero */
2981     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2982
2983     /* set flush to zero */
2984     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2985 }
2986
2987 void update_mxcsr_from_sse_status(CPUX86State *env)
2988 {
2989     if (tcg_enabled()) {
2990         uint8_t flags = get_float_exception_flags(&env->sse_status);
2991         /*
2992          * The MXCSR denormal flag has opposite semantics to
2993          * float_flag_input_denormal (the softfloat code sets that flag
2994          * only when flushing input denormals to zero, but SSE sets it
2995          * only when not flushing them to zero), so is not converted
2996          * here.
2997          */
2998         env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
2999                        (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3000                        (flags & float_flag_overflow ? FPUS_OE : 0) |
3001                        (flags & float_flag_underflow ? FPUS_UE : 0) |
3002                        (flags & float_flag_inexact ? FPUS_PE : 0) |
3003                        (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3004                         0));
3005     }
3006 }
3007
3008 void helper_update_mxcsr(CPUX86State *env)
3009 {
3010     update_mxcsr_from_sse_status(env);
3011 }
3012
3013 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3014 {
3015     cpu_set_mxcsr(env, val);
3016 }
3017
3018 void helper_enter_mmx(CPUX86State *env)
3019 {
3020     env->fpstt = 0;
3021     *(uint32_t *)(env->fptags) = 0;
3022     *(uint32_t *)(env->fptags + 4) = 0;
3023 }
3024
3025 void helper_emms(CPUX86State *env)
3026 {
3027     /* set to empty state */
3028     *(uint32_t *)(env->fptags) = 0x01010101;
3029     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3030 }
3031
3032 /* XXX: suppress */
3033 void helper_movq(CPUX86State *env, void *d, void *s)
3034 {
3035     *(uint64_t *)d = *(uint64_t *)s;
3036 }
3037
3038 #define SHIFT 0
3039 #include "ops_sse.h"
3040
3041 #define SHIFT 1
3042 #include "ops_sse.h"