target/i386/tcg/fpu_helper.c

   1 /*
   2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include <math.h>
  22 #include "cpu.h"
  23 #include "exec/helper-proto.h"
  24 #include "qemu/host-utils.h"
  25 #include "exec/exec-all.h"
  26 #include "exec/cpu_ldst.h"
  27 #include "fpu/softfloat.h"
  28 #include "fpu/softfloat-macros.h"
  29 #include "helper-tcg.h"
  30
  31 #ifdef CONFIG_SOFTMMU
  32 #include "hw/irq.h"
  33 #endif
  34
  35 /* float macros */
/*
 * Accessors for the x87 register file; all of them expect a variable
 * named "env" in scope.
 *   FT0   - the temporary operand register env->ft0
 *   ST0   - the register at the current top-of-stack index env->fpstt
 *   ST(n) - the register n slots from the top, wrapping modulo 8
 *   ST1   - shorthand for ST(1)
 */
#define FT0    (env->ft0)
#define ST0    (env->fpregs[env->fpstt].d)
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)
  40
/*
 * Rounding-control field of the FPU control word (env->fpuc).
 * update_fp_status() maps NEAR/DOWN/UP/CHOP to softfloat's
 * nearest-even/down/up/to-zero rounding modes respectively.
 */
#define FPU_RC_MASK         0xc00
#define FPU_RC_NEAR         0x000
#define FPU_RC_DOWN         0x400
#define FPU_RC_UP           0x800
#define FPU_RC_CHOP         0xc00
  46
/* 2^63 as a double constant. */
#define MAXTAN 9223372036854775808.0
  48
  49 /* the following deal with x86 long double-precision numbers */
  50 #define MAXEXPD 0x7fff
  51 #define EXPBIAS 16383
  52 #define EXPD(fp)        (fp.l.upper & 0x7fff)
  53 #define SIGND(fp)       ((fp.l.upper) & 0x8000)
  54 #define MANTD(fp)       (fp.l.lower)
  55 #define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
  56
/*
 * Bits of the FPU status word (env->fpus).  merge_exception_flags() maps
 * softfloat's invalid, input-denormal, divide-by-zero, overflow, underflow
 * and inexact flags to IE, DE, ZE, OE, UE and PE respectively.
 * fpu_set_exception() sets SE and B together whenever an exception whose
 * mask bit is clear in the control word is pending.
 */
#define FPUS_IE (1 << 0)
#define FPUS_DE (1 << 1)
#define FPUS_ZE (1 << 2)
#define FPUS_OE (1 << 3)
#define FPUS_UE (1 << 4)
#define FPUS_PE (1 << 5)
#define FPUS_SF (1 << 6)
#define FPUS_SE (1 << 7)
#define FPUS_B  (1 << 15)

/* Exception-mask bits of the control word (one per status bit 0..5). */
#define FPUC_EM 0x3f
  68
/*
 * Constants loaded by the FLDxx helpers.  A "_d" variant is selected when
 * the rounding control is DOWN or CHOP and a "_u" variant when it is UP;
 * each differs from its base constant by one unit in the last place of
 * the significand.  The base values of ln2 and pi are defined elsewhere.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
  77
  78 #if !defined(CONFIG_USER_ONLY)
  79 static qemu_irq ferr_irq;
  80
  81 void x86_register_ferr_irq(qemu_irq irq)
  82 {
  83     ferr_irq = irq;
  84 }
  85
  86 static void cpu_clear_ignne(void)
  87 {
  88     CPUX86State *env = &X86_CPU(first_cpu)->env;
  89     env->hflags2 &= ~HF2_IGNNE_MASK;
  90 }
  91
  92 void cpu_set_ignne(void)
  93 {
  94     CPUX86State *env = &X86_CPU(first_cpu)->env;
  95     env->hflags2 |= HF2_IGNNE_MASK;
  96     /*
  97      * We get here in response to a write to port F0h.  The chipset should
  98      * deassert FP_IRQ and FERR# instead should stay signaled until FPSW_SE is
  99      * cleared, because FERR# and FP_IRQ are two separate pins on real
 100      * hardware.  However, we don't model FERR# as a qemu_irq, so we just
 101      * do directly what the chipset would do, i.e. deassert FP_IRQ.
 102      */
 103     qemu_irq_lower(ferr_irq);
 104 }
 105 #endif
 106
 107
 108 static inline void fpush(CPUX86State *env)
 109 {
 110     env->fpstt = (env->fpstt - 1) & 7;
 111     env->fptags[env->fpstt] = 0; /* validate stack entry */
 112 }
 113
 114 static inline void fpop(CPUX86State *env)
 115 {
 116     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
 117     env->fpstt = (env->fpstt + 1) & 7;
 118 }
 119
 120 static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr,
 121                                    uintptr_t retaddr)
 122 {
 123     CPU_LDoubleU temp;
 124
 125     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
 126     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
 127     return temp.d;
 128 }
 129
 130 static inline void helper_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
 131                                uintptr_t retaddr)
 132 {
 133     CPU_LDoubleU temp;
 134
 135     temp.d = f;
 136     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
 137     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
 138 }
 139
 140 /* x87 FPU helpers */
 141
 142 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
 143 {
 144     union {
 145         float64 f64;
 146         double d;
 147     } u;
 148
 149     u.f64 = floatx80_to_float64(a, &env->fp_status);
 150     return u.d;
 151 }
 152
 153 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
 154 {
 155     union {
 156         float64 f64;
 157         double d;
 158     } u;
 159
 160     u.d = a;
 161     return float64_to_floatx80(u.f64, &env->fp_status);
 162 }
 163
 164 static void fpu_set_exception(CPUX86State *env, int mask)
 165 {
 166     env->fpus |= mask;
 167     if (env->fpus & (~env->fpuc & FPUC_EM)) {
 168         env->fpus |= FPUS_SE | FPUS_B;
 169     }
 170 }
 171
 172 static inline uint8_t save_exception_flags(CPUX86State *env)
 173 {
 174     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
 175     set_float_exception_flags(0, &env->fp_status);
 176     return old_flags;
 177 }
 178
 179 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
 180 {
 181     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
 182     float_raise(old_flags, &env->fp_status);
 183     fpu_set_exception(env,
 184                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
 185                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
 186                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
 187                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
 188                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
 189                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
 190 }
 191
 192 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 193 {
 194     uint8_t old_flags = save_exception_flags(env);
 195     floatx80 ret = floatx80_div(a, b, &env->fp_status);
 196     merge_exception_flags(env, old_flags);
 197     return ret;
 198 }
 199
 200 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
 201 {
 202     if (env->cr[0] & CR0_NE_MASK) {
 203         raise_exception_ra(env, EXCP10_COPR, retaddr);
 204     }
 205 #if !defined(CONFIG_USER_ONLY)
 206     else if (ferr_irq && !(env->hflags2 & HF2_IGNNE_MASK)) {
 207         qemu_irq_raise(ferr_irq);
 208     }
 209 #endif
 210 }
 211
 212 void helper_flds_FT0(CPUX86State *env, uint32_t val)
 213 {
 214     uint8_t old_flags = save_exception_flags(env);
 215     union {
 216         float32 f;
 217         uint32_t i;
 218     } u;
 219
 220     u.i = val;
 221     FT0 = float32_to_floatx80(u.f, &env->fp_status);
 222     merge_exception_flags(env, old_flags);
 223 }
 224
 225 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
 226 {
 227     uint8_t old_flags = save_exception_flags(env);
 228     union {
 229         float64 f;
 230         uint64_t i;
 231     } u;
 232
 233     u.i = val;
 234     FT0 = float64_to_floatx80(u.f, &env->fp_status);
 235     merge_exception_flags(env, old_flags);
 236 }
 237
 238 void helper_fildl_FT0(CPUX86State *env, int32_t val)
 239 {
 240     FT0 = int32_to_floatx80(val, &env->fp_status);
 241 }
 242
 243 void helper_flds_ST0(CPUX86State *env, uint32_t val)
 244 {
 245     uint8_t old_flags = save_exception_flags(env);
 246     int new_fpstt;
 247     union {
 248         float32 f;
 249         uint32_t i;
 250     } u;
 251
 252     new_fpstt = (env->fpstt - 1) & 7;
 253     u.i = val;
 254     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
 255     env->fpstt = new_fpstt;
 256     env->fptags[new_fpstt] = 0; /* validate stack entry */
 257     merge_exception_flags(env, old_flags);
 258 }
 259
 260 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
 261 {
 262     uint8_t old_flags = save_exception_flags(env);
 263     int new_fpstt;
 264     union {
 265         float64 f;
 266         uint64_t i;
 267     } u;
 268
 269     new_fpstt = (env->fpstt - 1) & 7;
 270     u.i = val;
 271     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
 272     env->fpstt = new_fpstt;
 273     env->fptags[new_fpstt] = 0; /* validate stack entry */
 274     merge_exception_flags(env, old_flags);
 275 }
 276
 277 void helper_fildl_ST0(CPUX86State *env, int32_t val)
 278 {
 279     int new_fpstt;
 280
 281     new_fpstt = (env->fpstt - 1) & 7;
 282     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
 283     env->fpstt = new_fpstt;
 284     env->fptags[new_fpstt] = 0; /* validate stack entry */
 285 }
 286
 287 void helper_fildll_ST0(CPUX86State *env, int64_t val)
 288 {
 289     int new_fpstt;
 290
 291     new_fpstt = (env->fpstt - 1) & 7;
 292     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
 293     env->fpstt = new_fpstt;
 294     env->fptags[new_fpstt] = 0; /* validate stack entry */
 295 }
 296
 297 uint32_t helper_fsts_ST0(CPUX86State *env)
 298 {
 299     uint8_t old_flags = save_exception_flags(env);
 300     union {
 301         float32 f;
 302         uint32_t i;
 303     } u;
 304
 305     u.f = floatx80_to_float32(ST0, &env->fp_status);
 306     merge_exception_flags(env, old_flags);
 307     return u.i;
 308 }
 309
 310 uint64_t helper_fstl_ST0(CPUX86State *env)
 311 {
 312     uint8_t old_flags = save_exception_flags(env);
 313     union {
 314         float64 f;
 315         uint64_t i;
 316     } u;
 317
 318     u.f = floatx80_to_float64(ST0, &env->fp_status);
 319     merge_exception_flags(env, old_flags);
 320     return u.i;
 321 }
 322
 323 int32_t helper_fist_ST0(CPUX86State *env)
 324 {
 325     uint8_t old_flags = save_exception_flags(env);
 326     int32_t val;
 327
 328     val = floatx80_to_int32(ST0, &env->fp_status);
 329     if (val != (int16_t)val) {
 330         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 331         val = -32768;
 332     }
 333     merge_exception_flags(env, old_flags);
 334     return val;
 335 }
 336
 337 int32_t helper_fistl_ST0(CPUX86State *env)
 338 {
 339     uint8_t old_flags = save_exception_flags(env);
 340     int32_t val;
 341
 342     val = floatx80_to_int32(ST0, &env->fp_status);
 343     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 344         val = 0x80000000;
 345     }
 346     merge_exception_flags(env, old_flags);
 347     return val;
 348 }
 349
 350 int64_t helper_fistll_ST0(CPUX86State *env)
 351 {
 352     uint8_t old_flags = save_exception_flags(env);
 353     int64_t val;
 354
 355     val = floatx80_to_int64(ST0, &env->fp_status);
 356     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 357         val = 0x8000000000000000ULL;
 358     }
 359     merge_exception_flags(env, old_flags);
 360     return val;
 361 }
 362
 363 int32_t helper_fistt_ST0(CPUX86State *env)
 364 {
 365     uint8_t old_flags = save_exception_flags(env);
 366     int32_t val;
 367
 368     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 369     if (val != (int16_t)val) {
 370         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 371         val = -32768;
 372     }
 373     merge_exception_flags(env, old_flags);
 374     return val;
 375 }
 376
 377 int32_t helper_fisttl_ST0(CPUX86State *env)
 378 {
 379     uint8_t old_flags = save_exception_flags(env);
 380     int32_t val;
 381
 382     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 383     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 384         val = 0x80000000;
 385     }
 386     merge_exception_flags(env, old_flags);
 387     return val;
 388 }
 389
 390 int64_t helper_fisttll_ST0(CPUX86State *env)
 391 {
 392     uint8_t old_flags = save_exception_flags(env);
 393     int64_t val;
 394
 395     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
 396     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 397         val = 0x8000000000000000ULL;
 398     }
 399     merge_exception_flags(env, old_flags);
 400     return val;
 401 }
 402
 403 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
 404 {
 405     int new_fpstt;
 406
 407     new_fpstt = (env->fpstt - 1) & 7;
 408     env->fpregs[new_fpstt].d = helper_fldt(env, ptr, GETPC());
 409     env->fpstt = new_fpstt;
 410     env->fptags[new_fpstt] = 0; /* validate stack entry */
 411 }
 412
 413 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
 414 {
 415     helper_fstt(env, ST0, ptr, GETPC());
 416 }
 417
 418 void helper_fpush(CPUX86State *env)
 419 {
 420     fpush(env);
 421 }
 422
 423 void helper_fpop(CPUX86State *env)
 424 {
 425     fpop(env);
 426 }
 427
 428 void helper_fdecstp(CPUX86State *env)
 429 {
 430     env->fpstt = (env->fpstt - 1) & 7;
 431     env->fpus &= ~0x4700;
 432 }
 433
 434 void helper_fincstp(CPUX86State *env)
 435 {
 436     env->fpstt = (env->fpstt + 1) & 7;
 437     env->fpus &= ~0x4700;
 438 }
 439
 440 /* FPU move */
 441
 442 void helper_ffree_STN(CPUX86State *env, int st_index)
 443 {
 444     env->fptags[(env->fpstt + st_index) & 7] = 1;
 445 }
 446
 447 void helper_fmov_ST0_FT0(CPUX86State *env)
 448 {
 449     ST0 = FT0;
 450 }
 451
 452 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
 453 {
 454     FT0 = ST(st_index);
 455 }
 456
 457 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
 458 {
 459     ST0 = ST(st_index);
 460 }
 461
 462 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
 463 {
 464     ST(st_index) = ST0;
 465 }
 466
 467 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 468 {
 469     floatx80 tmp;
 470
 471     tmp = ST(st_index);
 472     ST(st_index) = ST0;
 473     ST0 = tmp;
 474 }
 475
 476 /* FPU operations */
 477
 478 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 479
 480 void helper_fcom_ST0_FT0(CPUX86State *env)
 481 {
 482     uint8_t old_flags = save_exception_flags(env);
 483     FloatRelation ret;
 484
 485     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 486     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 487     merge_exception_flags(env, old_flags);
 488 }
 489
 490 void helper_fucom_ST0_FT0(CPUX86State *env)
 491 {
 492     uint8_t old_flags = save_exception_flags(env);
 493     FloatRelation ret;
 494
 495     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 496     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 497     merge_exception_flags(env, old_flags);
 498 }
 499
 500 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 501
 502 void helper_fcomi_ST0_FT0(CPUX86State *env)
 503 {
 504     uint8_t old_flags = save_exception_flags(env);
 505     int eflags;
 506     FloatRelation ret;
 507
 508     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 509     eflags = cpu_cc_compute_all(env, CC_OP);
 510     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 511     CC_SRC = eflags;
 512     merge_exception_flags(env, old_flags);
 513 }
 514
 515 void helper_fucomi_ST0_FT0(CPUX86State *env)
 516 {
 517     uint8_t old_flags = save_exception_flags(env);
 518     int eflags;
 519     FloatRelation ret;
 520
 521     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 522     eflags = cpu_cc_compute_all(env, CC_OP);
 523     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 524     CC_SRC = eflags;
 525     merge_exception_flags(env, old_flags);
 526 }
 527
 528 void helper_fadd_ST0_FT0(CPUX86State *env)
 529 {
 530     uint8_t old_flags = save_exception_flags(env);
 531     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
 532     merge_exception_flags(env, old_flags);
 533 }
 534
 535 void helper_fmul_ST0_FT0(CPUX86State *env)
 536 {
 537     uint8_t old_flags = save_exception_flags(env);
 538     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
 539     merge_exception_flags(env, old_flags);
 540 }
 541
 542 void helper_fsub_ST0_FT0(CPUX86State *env)
 543 {
 544     uint8_t old_flags = save_exception_flags(env);
 545     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
 546     merge_exception_flags(env, old_flags);
 547 }
 548
 549 void helper_fsubr_ST0_FT0(CPUX86State *env)
 550 {
 551     uint8_t old_flags = save_exception_flags(env);
 552     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
 553     merge_exception_flags(env, old_flags);
 554 }
 555
 556 void helper_fdiv_ST0_FT0(CPUX86State *env)
 557 {
 558     ST0 = helper_fdiv(env, ST0, FT0);
 559 }
 560
 561 void helper_fdivr_ST0_FT0(CPUX86State *env)
 562 {
 563     ST0 = helper_fdiv(env, FT0, ST0);
 564 }
 565
 566 /* fp operations between STN and ST0 */
 567
 568 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
 569 {
 570     uint8_t old_flags = save_exception_flags(env);
 571     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
 572     merge_exception_flags(env, old_flags);
 573 }
 574
 575 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
 576 {
 577     uint8_t old_flags = save_exception_flags(env);
 578     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
 579     merge_exception_flags(env, old_flags);
 580 }
 581
 582 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
 583 {
 584     uint8_t old_flags = save_exception_flags(env);
 585     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
 586     merge_exception_flags(env, old_flags);
 587 }
 588
 589 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
 590 {
 591     uint8_t old_flags = save_exception_flags(env);
 592     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
 593     merge_exception_flags(env, old_flags);
 594 }
 595
 596 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
 597 {
 598     floatx80 *p;
 599
 600     p = &ST(st_index);
 601     *p = helper_fdiv(env, *p, ST0);
 602 }
 603
 604 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
 605 {
 606     floatx80 *p;
 607
 608     p = &ST(st_index);
 609     *p = helper_fdiv(env, ST0, *p);
 610 }
 611
 612 /* misc FPU operations */
 613 void helper_fchs_ST0(CPUX86State *env)
 614 {
 615     ST0 = floatx80_chs(ST0);
 616 }
 617
 618 void helper_fabs_ST0(CPUX86State *env)
 619 {
 620     ST0 = floatx80_abs(ST0);
 621 }
 622
 623 void helper_fld1_ST0(CPUX86State *env)
 624 {
 625     ST0 = floatx80_one;
 626 }
 627
 628 void helper_fldl2t_ST0(CPUX86State *env)
 629 {
 630     switch (env->fpuc & FPU_RC_MASK) {
 631     case FPU_RC_UP:
 632         ST0 = floatx80_l2t_u;
 633         break;
 634     default:
 635         ST0 = floatx80_l2t;
 636         break;
 637     }
 638 }
 639
 640 void helper_fldl2e_ST0(CPUX86State *env)
 641 {
 642     switch (env->fpuc & FPU_RC_MASK) {
 643     case FPU_RC_DOWN:
 644     case FPU_RC_CHOP:
 645         ST0 = floatx80_l2e_d;
 646         break;
 647     default:
 648         ST0 = floatx80_l2e;
 649         break;
 650     }
 651 }
 652
 653 void helper_fldpi_ST0(CPUX86State *env)
 654 {
 655     switch (env->fpuc & FPU_RC_MASK) {
 656     case FPU_RC_DOWN:
 657     case FPU_RC_CHOP:
 658         ST0 = floatx80_pi_d;
 659         break;
 660     default:
 661         ST0 = floatx80_pi;
 662         break;
 663     }
 664 }
 665
 666 void helper_fldlg2_ST0(CPUX86State *env)
 667 {
 668     switch (env->fpuc & FPU_RC_MASK) {
 669     case FPU_RC_DOWN:
 670     case FPU_RC_CHOP:
 671         ST0 = floatx80_lg2_d;
 672         break;
 673     default:
 674         ST0 = floatx80_lg2;
 675         break;
 676     }
 677 }
 678
 679 void helper_fldln2_ST0(CPUX86State *env)
 680 {
 681     switch (env->fpuc & FPU_RC_MASK) {
 682     case FPU_RC_DOWN:
 683     case FPU_RC_CHOP:
 684         ST0 = floatx80_ln2_d;
 685         break;
 686     default:
 687         ST0 = floatx80_ln2;
 688         break;
 689     }
 690 }
 691
 692 void helper_fldz_ST0(CPUX86State *env)
 693 {
 694     ST0 = floatx80_zero;
 695 }
 696
 697 void helper_fldz_FT0(CPUX86State *env)
 698 {
 699     FT0 = floatx80_zero;
 700 }
 701
 702 uint32_t helper_fnstsw(CPUX86State *env)
 703 {
 704     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
 705 }
 706
 707 uint32_t helper_fnstcw(CPUX86State *env)
 708 {
 709     return env->fpuc;
 710 }
 711
 712 void update_fp_status(CPUX86State *env)
 713 {
 714     int rnd_type;
 715
 716     /* set rounding mode */
 717     switch (env->fpuc & FPU_RC_MASK) {
 718     default:
 719     case FPU_RC_NEAR:
 720         rnd_type = float_round_nearest_even;
 721         break;
 722     case FPU_RC_DOWN:
 723         rnd_type = float_round_down;
 724         break;
 725     case FPU_RC_UP:
 726         rnd_type = float_round_up;
 727         break;
 728     case FPU_RC_CHOP:
 729         rnd_type = float_round_to_zero;
 730         break;
 731     }
 732     set_float_rounding_mode(rnd_type, &env->fp_status);
 733     switch ((env->fpuc >> 8) & 3) {
 734     case 0:
 735         rnd_type = 32;
 736         break;
 737     case 2:
 738         rnd_type = 64;
 739         break;
 740     case 3:
 741     default:
 742         rnd_type = 80;
 743         break;
 744     }
 745     set_floatx80_rounding_precision(rnd_type, &env->fp_status);
 746 }
 747
 748 void helper_fldcw(CPUX86State *env, uint32_t val)
 749 {
 750     cpu_set_fpuc(env, val);
 751 }
 752
 753 void helper_fclex(CPUX86State *env)
 754 {
 755     env->fpus &= 0x7f00;
 756 }
 757
 758 void helper_fwait(CPUX86State *env)
 759 {
 760     if (env->fpus & FPUS_SE) {
 761         fpu_raise_exception(env, GETPC());
 762     }
 763 }
 764
 765 void helper_fninit(CPUX86State *env)
 766 {
 767     env->fpus = 0;
 768     env->fpstt = 0;
 769     cpu_set_fpuc(env, 0x37f);
 770     env->fptags[0] = 1;
 771     env->fptags[1] = 1;
 772     env->fptags[2] = 1;
 773     env->fptags[3] = 1;
 774     env->fptags[4] = 1;
 775     env->fptags[5] = 1;
 776     env->fptags[6] = 1;
 777     env->fptags[7] = 1;
 778 }
 779
 780 /* BCD ops */
 781
 782 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
 783 {
 784     floatx80 tmp;
 785     uint64_t val;
 786     unsigned int v;
 787     int i;
 788
 789     val = 0;
 790     for (i = 8; i >= 0; i--) {
 791         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
 792         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
 793     }
 794     tmp = int64_to_floatx80(val, &env->fp_status);
 795     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
 796         tmp = floatx80_chs(tmp);
 797     }
 798     fpush(env);
 799     ST0 = tmp;
 800 }
 801
 802 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
 803 {
 804     uint8_t old_flags = save_exception_flags(env);
 805     int v;
 806     target_ulong mem_ref, mem_end;
 807     int64_t val;
 808     CPU_LDoubleU temp;
 809
 810     temp.d = ST0;
 811
 812     val = floatx80_to_int64(ST0, &env->fp_status);
 813     mem_ref = ptr;
 814     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
 815         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 816         while (mem_ref < ptr + 7) {
 817             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 818         }
 819         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
 820         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 821         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 822         merge_exception_flags(env, old_flags);
 823         return;
 824     }
 825     mem_end = mem_ref + 9;
 826     if (SIGND(temp)) {
 827         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
 828         val = -val;
 829     } else {
 830         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
 831     }
 832     while (mem_ref < mem_end) {
 833         if (val == 0) {
 834             break;
 835         }
 836         v = val % 100;
 837         val = val / 100;
 838         v = ((v / 10) << 4) | (v % 10);
 839         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
 840     }
 841     while (mem_ref < mem_end) {
 842         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 843     }
 844     merge_exception_flags(env, old_flags);
 845 }
 846
/*
 * 128-bit significand of log(2), split into its high and low 64-bit
 * halves.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL
 850
/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * NOTE(review): f2xm1_coeff_0_low has a far smaller exponent than
 * f2xm1_coeff_0, so it is presumably the low-order part of coefficient 0;
 * confirm at the point of use.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
 864
 865 struct f2xm1_data {
 866     /*
 867      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
 868      * are very close to exact floatx80 values.
 869      */
 870     floatx80 t;
 871     /* The value of 2^t.  */
 872     floatx80 exp2;
 873     /* The value of 2^t - 1.  */
 874     floatx80 exp2m1;
 875 };
 876
 877 static const struct f2xm1_data f2xm1_table[65] = {
 878     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
 879       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
 880       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
 881     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
 882       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
 883       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
 884     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
 885       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
 886       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
 887     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
 888       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
 889       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
 890     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
 891       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
 892       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
 893     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
 894       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
 895       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
 896     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
 897       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
 898       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
 899     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
 900       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
 901       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
 902     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
 903       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
 904       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
 905     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
 906       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
 907       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
 908     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
 909       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
 910       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
 911     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
 912       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
 913       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
 914     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
 915       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
 916       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
 917     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
 918       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
 919       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
 920     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
 921       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
 922       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
 923     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
 924       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
 925       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
 926     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
 927       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
 928       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
 929     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
 930       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
 931       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
 932     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
 933       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
 934       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
 935     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
 936       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
 937       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
 938     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
 939       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
 940       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
 941     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
 942       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
 943       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
 944     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
 945       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
 946       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
 947     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
 948       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
 949       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
 950     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
 951       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
 952       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
 953     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
 954       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
 955       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
 956     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
 957       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
 958       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
 959     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
 960       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
 961       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
 962     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
 963       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
 964       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
 965     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
 966       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
 967       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
 968     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
 969       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
 970       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
 971     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
 972       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
 973       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
 974     { floatx80_zero_init,
 975       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
 976       floatx80_zero_init },
 977     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
 978       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
 979       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
 980     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
 981       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
 982       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
 983     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
 984       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
 985       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
 986     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
 987       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
 988       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
 989     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
 990       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
 991       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
 992     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
 993       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
 994       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
 995     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
 996       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
 997       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
 998     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
 999       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
1000       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1001     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1002       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1003       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1004     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1005       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1006       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1007     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1008       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1009       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1010     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1011       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1012       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1013     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1014       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1015       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1016     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1017       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1018       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1019     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1020       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1021       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1022     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1023       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1024       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1025     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1026       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1027       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1028     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1029       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1030       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1031     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1032       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1033       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1034     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1035       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1036       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1037     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1038       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1039       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1040     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1041       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1042       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1043     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1044       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1045       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1046     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1047       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1048       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1049     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1050       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1051       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1052     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1053       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1054       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1055     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1056       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1057       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1058     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1059       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1060       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1061     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1062       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1063       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1064     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1065       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1066       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1067     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1068       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1069       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1070     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1071       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1072       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1073 };
1074
1075 void helper_f2xm1(CPUX86State *env)
1076 {
1077     uint8_t old_flags = save_exception_flags(env);
1078     uint64_t sig = extractFloatx80Frac(ST0);
1079     int32_t exp = extractFloatx80Exp(ST0);
1080     bool sign = extractFloatx80Sign(ST0);
1081
1082     if (floatx80_invalid_encoding(ST0)) {
1083         float_raise(float_flag_invalid, &env->fp_status);
1084         ST0 = floatx80_default_nan(&env->fp_status);
1085     } else if (floatx80_is_any_nan(ST0)) {
1086         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1087             float_raise(float_flag_invalid, &env->fp_status);
1088             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1089         }
1090     } else if (exp > 0x3fff ||
1091                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1092         /* Out of range for the instruction, treat as invalid.  */
1093         float_raise(float_flag_invalid, &env->fp_status);
1094         ST0 = floatx80_default_nan(&env->fp_status);
1095     } else if (exp == 0x3fff) {
1096         /* Argument 1 or -1, exact result 1 or -0.5.  */
1097         if (sign) {
1098             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1099         }
1100     } else if (exp < 0x3fb0) {
1101         if (!floatx80_is_zero(ST0)) {
1102             /*
1103              * Multiplying the argument by an extra-precision version
1104              * of log(2) is sufficiently precise.  Zero arguments are
1105              * returned unchanged.
1106              */
1107             uint64_t sig0, sig1, sig2;
1108             if (exp == 0) {
1109                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1110             }
1111             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1112                             &sig2);
1113             /* This result is inexact.  */
1114             sig1 |= 1;
1115             ST0 = normalizeRoundAndPackFloatx80(80, sign, exp, sig0, sig1,
1116                                                 &env->fp_status);
1117         }
1118     } else {
1119         floatx80 tmp, y, accum;
1120         bool asign, bsign;
1121         int32_t n, aexp, bexp;
1122         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1123         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1124         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1125         env->fp_status.float_rounding_mode = float_round_nearest_even;
1126         env->fp_status.floatx80_rounding_precision = 80;
1127
1128         /* Find the nearest multiple of 1/32 to the argument.  */
1129         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1130         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1131         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1132
1133         if (floatx80_is_zero(y)) {
1134             /*
1135              * Use the value of 2^t - 1 from the table, to avoid
1136              * needing to special-case zero as a result of
1137              * multiplication below.
1138              */
1139             ST0 = f2xm1_table[n].t;
1140             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1141             env->fp_status.float_rounding_mode = save_mode;
1142         } else {
1143             /*
1144              * Compute the lower parts of a polynomial expansion for
1145              * (2^y - 1) / y.
1146              */
1147             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1148             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1149             accum = floatx80_mul(accum, y, &env->fp_status);
1150             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1151             accum = floatx80_mul(accum, y, &env->fp_status);
1152             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1153             accum = floatx80_mul(accum, y, &env->fp_status);
1154             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1155             accum = floatx80_mul(accum, y, &env->fp_status);
1156             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1157             accum = floatx80_mul(accum, y, &env->fp_status);
1158             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1159             accum = floatx80_mul(accum, y, &env->fp_status);
1160             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1161
1162             /*
1163              * The full polynomial expansion is f2xm1_coeff_0 + accum
1164              * (where accum has much lower magnitude, and so, in
1165              * particular, carry out of the addition is not possible).
1166              * (This expansion is only accurate to about 70 bits, not
1167              * 128 bits.)
1168              */
1169             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1170             asign = extractFloatx80Sign(f2xm1_coeff_0);
1171             shift128RightJamming(extractFloatx80Frac(accum), 0,
1172                                  aexp - extractFloatx80Exp(accum),
1173                                  &asig0, &asig1);
1174             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1175             bsig1 = 0;
1176             if (asign == extractFloatx80Sign(accum)) {
1177                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1178             } else {
1179                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1180             }
1181             /* And thus compute an approximation to 2^y - 1.  */
1182             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1183                             &asig0, &asig1, &asig2);
1184             aexp += extractFloatx80Exp(y) - 0x3ffe;
1185             asign ^= extractFloatx80Sign(y);
1186             if (n != 32) {
1187                 /*
1188                  * Multiply this by the precomputed value of 2^t and
1189                  * add that of 2^t - 1.
1190                  */
1191                 mul128By64To192(asig0, asig1,
1192                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1193                                 &asig0, &asig1, &asig2);
1194                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1195                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1196                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1197                 bsig1 = 0;
1198                 if (bexp < aexp) {
1199                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1200                                          &bsig0, &bsig1);
1201                 } else if (aexp < bexp) {
1202                     shift128RightJamming(asig0, asig1, bexp - aexp,
1203                                          &asig0, &asig1);
1204                     aexp = bexp;
1205                 }
1206                 /* The sign of 2^t - 1 is always that of the result.  */
1207                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1208                 if (asign == bsign) {
1209                     /* Avoid possible carry out of the addition.  */
1210                     shift128RightJamming(asig0, asig1, 1,
1211                                          &asig0, &asig1);
1212                     shift128RightJamming(bsig0, bsig1, 1,
1213                                          &bsig0, &bsig1);
1214                     ++aexp;
1215                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1216                 } else {
1217                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1218                     asign = bsign;
1219                 }
1220             }
1221             env->fp_status.float_rounding_mode = save_mode;
1222             /* This result is inexact.  */
1223             asig1 |= 1;
1224             ST0 = normalizeRoundAndPackFloatx80(80, asign, aexp, asig0, asig1,
1225                                                 &env->fp_status);
1226         }
1227
1228         env->fp_status.floatx80_rounding_precision = save_prec;
1229     }
1230     merge_exception_flags(env, old_flags);
1231 }
1232
1233 void helper_fptan(CPUX86State *env)
1234 {
1235     double fptemp = floatx80_to_double(env, ST0);
1236
1237     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1238         env->fpus |= 0x400;
1239     } else {
1240         fptemp = tan(fptemp);
1241         ST0 = double_to_floatx80(env, fptemp);
1242         fpush(env);
1243         ST0 = floatx80_one;
1244         env->fpus &= ~0x400; /* C2 <-- 0 */
1245         /* the above code is for |arg| < 2**52 only */
1246     }
1247 }
1248
 1249 /*
       * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.
       *
       * Each constant is given as a biased exponent (*_exp, bias 0x3fff)
       * plus a 128-bit significand split into two 64-bit halves
       * (*_sig_high, *_sig_low).  pi/4, pi/2 and pi share one significand
       * and differ only in their exponent; 3pi/4 has its own significand.
       */
 1250 #define pi_4_exp 0x3ffe
 1251 #define pi_4_sig_high 0xc90fdaa22168c234ULL
 1252 #define pi_4_sig_low 0xc4c6628b80dc1cd1ULL
 1253 #define pi_2_exp 0x3fff
 1254 #define pi_2_sig_high 0xc90fdaa22168c234ULL
 1255 #define pi_2_sig_low 0xc4c6628b80dc1cd1ULL
 1256 #define pi_34_exp 0x4000
 1257 #define pi_34_sig_high 0x96cbe3f9990e91a7ULL
 1258 #define pi_34_sig_low 0x9394c9e8a0a5159dULL
 1259 #define pi_exp 0x4000
 1260 #define pi_sig_high 0xc90fdaa22168c234ULL
 1261 #define pi_sig_low 0xc4c6628b80dc1cd1ULL
1262
 1263 /*
 1264  * Polynomial coefficients for an approximation to atan(x), with only
 1265  * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
 1266  * for some other approximations, no low part is needed for the first
 1267  * coefficient here to achieve a sufficiently accurate result, because
 1268  * the coefficient in this minimax approximation is very close to
 1269  * exactly 1.)
       *
       * fpatan_coeff_k multiplies x^(2k+1): helper_fpatan folds
       * coefficients 6 down to 1 by Horner's rule in x^2 and then combines
       * the result with fpatan_coeff_0.  The values lie close to the
       * Taylor-series terms (-1)^k / (2k+1); for instance coeff_1 is about
       * -1/3 and coeff_2 about 1/5.
 1270  */
 1271 #define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL)
 1272 #define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
 1273 #define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
 1274 #define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
 1275 #define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
 1276 #define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
 1277 #define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1278
1279 struct fpatan_data {
1280     /* High and low parts of atan(x).  */
1281     floatx80 atan_high, atan_low;
1282 };
1283
 1284 static const struct fpatan_data fpatan_table[9] = {
          /*
           * Entry n holds atan(n/8) for n = 0..8, stored as a high part
           * followed by a much smaller low part (compare the exponents of
           * the two fields in each entry).
           */
          /* n = 0: atan(0) */
 1285     { floatx80_zero_init,
 1286       floatx80_zero_init },
          /* n = 1: atan(1/8) */
 1287     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
 1288       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
          /* n = 2: atan(2/8) */
 1289     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
 1290       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
          /* n = 3: atan(3/8) */
 1291     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
 1292       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
          /* n = 4: atan(4/8) */
 1293     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
 1294       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
          /* n = 5: atan(5/8) */
 1295     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
 1296       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
          /* n = 6: atan(6/8) */
 1297     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
 1298       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
          /* n = 7: atan(7/8) */
 1299     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
 1300       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
          /* n = 8: atan(1), i.e. pi/4 */
 1301     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
 1302       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
 1303 };
1304
1305 void helper_fpatan(CPUX86State *env)
1306 {
1307     uint8_t old_flags = save_exception_flags(env);
1308     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1309     int32_t arg0_exp = extractFloatx80Exp(ST0);
1310     bool arg0_sign = extractFloatx80Sign(ST0);
1311     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1312     int32_t arg1_exp = extractFloatx80Exp(ST1);
1313     bool arg1_sign = extractFloatx80Sign(ST1);
1314
1315     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1316         float_raise(float_flag_invalid, &env->fp_status);
1317         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1318     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1319         float_raise(float_flag_invalid, &env->fp_status);
1320         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1321     } else if (floatx80_invalid_encoding(ST0) ||
1322                floatx80_invalid_encoding(ST1)) {
1323         float_raise(float_flag_invalid, &env->fp_status);
1324         ST1 = floatx80_default_nan(&env->fp_status);
1325     } else if (floatx80_is_any_nan(ST0)) {
1326         ST1 = ST0;
1327     } else if (floatx80_is_any_nan(ST1)) {
1328         /* Pass this NaN through.  */
1329     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1330         /* Pass this zero through.  */
1331     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1332                  arg0_exp - arg1_exp >= 80) &&
1333                !arg0_sign) {
1334         /*
1335          * Dividing ST1 by ST0 gives the correct result up to
1336          * rounding, and avoids spurious underflow exceptions that
1337          * might result from passing some small values through the
1338          * polynomial approximation, but if a finite nonzero result of
1339          * division is exact, the result of fpatan is still inexact
1340          * (and underflowing where appropriate).
1341          */
1342         signed char save_prec = env->fp_status.floatx80_rounding_precision;
1343         env->fp_status.floatx80_rounding_precision = 80;
1344         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1345         env->fp_status.floatx80_rounding_precision = save_prec;
1346         if (!floatx80_is_zero(ST1) &&
1347             !(get_float_exception_flags(&env->fp_status) &
1348               float_flag_inexact)) {
1349             /*
1350              * The mathematical result is very slightly closer to zero
1351              * than this exact result.  Round a value with the
1352              * significand adjusted accordingly to get the correct
1353              * exceptions, and possibly an adjusted result depending
1354              * on the rounding mode.
1355              */
1356             uint64_t sig = extractFloatx80Frac(ST1);
1357             int32_t exp = extractFloatx80Exp(ST1);
1358             bool sign = extractFloatx80Sign(ST1);
1359             if (exp == 0) {
1360                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1361             }
1362             ST1 = normalizeRoundAndPackFloatx80(80, sign, exp, sig - 1,
1363                                                 -1, &env->fp_status);
1364         }
1365     } else {
1366         /* The result is inexact.  */
1367         bool rsign = arg1_sign;
1368         int32_t rexp;
1369         uint64_t rsig0, rsig1;
1370         if (floatx80_is_zero(ST1)) {
1371             /*
1372              * ST0 is negative.  The result is pi with the sign of
1373              * ST1.
1374              */
1375             rexp = pi_exp;
1376             rsig0 = pi_sig_high;
1377             rsig1 = pi_sig_low;
1378         } else if (floatx80_is_infinity(ST1)) {
1379             if (floatx80_is_infinity(ST0)) {
1380                 if (arg0_sign) {
1381                     rexp = pi_34_exp;
1382                     rsig0 = pi_34_sig_high;
1383                     rsig1 = pi_34_sig_low;
1384                 } else {
1385                     rexp = pi_4_exp;
1386                     rsig0 = pi_4_sig_high;
1387                     rsig1 = pi_4_sig_low;
1388                 }
1389             } else {
1390                 rexp = pi_2_exp;
1391                 rsig0 = pi_2_sig_high;
1392                 rsig1 = pi_2_sig_low;
1393             }
1394         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1395             rexp = pi_2_exp;
1396             rsig0 = pi_2_sig_high;
1397             rsig1 = pi_2_sig_low;
1398         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1399             /* ST0 is negative.  */
1400             rexp = pi_exp;
1401             rsig0 = pi_sig_high;
1402             rsig1 = pi_sig_low;
1403         } else {
1404             /*
1405              * ST0 and ST1 are finite, nonzero and with exponents not
1406              * too far apart.
1407              */
1408             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1409             int32_t azexp, axexp;
1410             bool adj_sub, ysign, zsign;
1411             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1412             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1413             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1414             uint64_t azsig0, azsig1;
1415             uint64_t azsig2, azsig3, axsig0, axsig1;
1416             floatx80 x8;
1417             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1418             signed char save_prec = env->fp_status.floatx80_rounding_precision;
1419             env->fp_status.float_rounding_mode = float_round_nearest_even;
1420             env->fp_status.floatx80_rounding_precision = 80;
1421
1422             if (arg0_exp == 0) {
1423                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1424             }
1425             if (arg1_exp == 0) {
1426                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1427             }
1428             if (arg0_exp > arg1_exp ||
1429                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1430                 /* Work with abs(ST1) / abs(ST0).  */
1431                 num_exp = arg1_exp;
1432                 num_sig = arg1_sig;
1433                 den_exp = arg0_exp;
1434                 den_sig = arg0_sig;
1435                 if (arg0_sign) {
1436                     /* The result is subtracted from pi.  */
1437                     adj_exp = pi_exp;
1438                     adj_sig0 = pi_sig_high;
1439                     adj_sig1 = pi_sig_low;
1440                     adj_sub = true;
1441                 } else {
1442                     /* The result is used as-is.  */
1443                     adj_exp = 0;
1444                     adj_sig0 = 0;
1445                     adj_sig1 = 0;
1446                     adj_sub = false;
1447                 }
1448             } else {
1449                 /* Work with abs(ST0) / abs(ST1).  */
1450                 num_exp = arg0_exp;
1451                 num_sig = arg0_sig;
1452                 den_exp = arg1_exp;
1453                 den_sig = arg1_sig;
1454                 /* The result is added to or subtracted from pi/2.  */
1455                 adj_exp = pi_2_exp;
1456                 adj_sig0 = pi_2_sig_high;
1457                 adj_sig1 = pi_2_sig_low;
1458                 adj_sub = !arg0_sign;
1459             }
1460
1461             /*
1462              * Compute x = num/den, where 0 < x <= 1 and x is not too
1463              * small.
1464              */
1465             xexp = num_exp - den_exp + 0x3ffe;
1466             remsig0 = num_sig;
1467             remsig1 = 0;
1468             if (den_sig <= remsig0) {
1469                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1470                 ++xexp;
1471             }
1472             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1473             mul64To128(den_sig, xsig0, &msig0, &msig1);
1474             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1475             while ((int64_t) remsig0 < 0) {
1476                 --xsig0;
1477                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1478             }
1479             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1480             /*
1481              * No need to correct any estimation error in xsig1; even
1482              * with such error, it is accurate enough.
1483              */
1484
1485             /*
1486              * Split x as x = t + y, where t = n/8 is the nearest
1487              * multiple of 1/8 to x.
1488              */
1489             x8 = normalizeRoundAndPackFloatx80(80, false, xexp + 3, xsig0,
1490                                                xsig1, &env->fp_status);
1491             n = floatx80_to_int32(x8, &env->fp_status);
1492             if (n == 0) {
1493                 ysign = false;
1494                 yexp = xexp;
1495                 ysig0 = xsig0;
1496                 ysig1 = xsig1;
1497                 texp = 0;
1498                 tsig = 0;
1499             } else {
1500                 int shift = clz32(n) + 32;
1501                 texp = 0x403b - shift;
1502                 tsig = n;
1503                 tsig <<= shift;
1504                 if (texp == xexp) {
1505                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1506                     if ((int64_t) ysig0 >= 0) {
1507                         ysign = false;
1508                         if (ysig0 == 0) {
1509                             if (ysig1 == 0) {
1510                                 yexp = 0;
1511                             } else {
1512                                 shift = clz64(ysig1) + 64;
1513                                 yexp = xexp - shift;
1514                                 shift128Left(ysig0, ysig1, shift,
1515                                              &ysig0, &ysig1);
1516                             }
1517                         } else {
1518                             shift = clz64(ysig0);
1519                             yexp = xexp - shift;
1520                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1521                         }
1522                     } else {
1523                         ysign = true;
1524                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1525                         if (ysig0 == 0) {
1526                             shift = clz64(ysig1) + 64;
1527                         } else {
1528                             shift = clz64(ysig0);
1529                         }
1530                         yexp = xexp - shift;
1531                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1532                     }
1533                 } else {
1534                     /*
1535                      * t's exponent must be greater than x's because t
1536                      * is positive and the nearest multiple of 1/8 to
1537                      * x, and if x has a greater exponent, the power
1538                      * of 2 with that exponent is also a multiple of
1539                      * 1/8.
1540                      */
1541                     uint64_t usig0, usig1;
1542                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1543                                          &usig0, &usig1);
1544                     ysign = true;
1545                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1546                     if (ysig0 == 0) {
1547                         shift = clz64(ysig1) + 64;
1548                     } else {
1549                         shift = clz64(ysig0);
1550                     }
1551                     yexp = texp - shift;
1552                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1553                 }
1554             }
1555
1556             /*
1557              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1558              * arctan(z).
1559              */
1560             zsign = ysign;
1561             if (texp == 0 || yexp == 0) {
1562                 zexp = yexp;
1563                 zsig0 = ysig0;
1564                 zsig1 = ysig1;
1565             } else {
1566                 /*
1567                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1568                  */
1569                 int32_t dexp = texp + xexp - 0x3ffe;
1570                 uint64_t dsig0, dsig1, dsig2;
1571                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1572                 /*
1573                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1574                  * bit).  Add 1 to produce the denominator 1+tx.
1575                  */
1576                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1577                                      &dsig0, &dsig1);
1578                 dsig0 |= 0x8000000000000000ULL;
1579                 zexp = yexp - 1;
1580                 remsig0 = ysig0;
1581                 remsig1 = ysig1;
1582                 remsig2 = 0;
1583                 if (dsig0 <= remsig0) {
1584                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1585                     ++zexp;
1586                 }
1587                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1588                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1589                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1590                        &remsig0, &remsig1, &remsig2);
1591                 while ((int64_t) remsig0 < 0) {
1592                     --zsig0;
1593                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1594                            &remsig0, &remsig1, &remsig2);
1595                 }
1596                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1597                 /* No need to correct any estimation error in zsig1.  */
1598             }
1599
1600             if (zexp == 0) {
1601                 azexp = 0;
1602                 azsig0 = 0;
1603                 azsig1 = 0;
1604             } else {
1605                 floatx80 z2, accum;
1606                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1607                 /* Compute z^2.  */
1608                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1609                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1610                 z2 = normalizeRoundAndPackFloatx80(80, false,
1611                                                    zexp + zexp - 0x3ffe,
1612                                                    z2sig0, z2sig1,
1613                                                    &env->fp_status);
1614
1615                 /* Compute the lower parts of the polynomial expansion.  */
1616                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1617                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1618                 accum = floatx80_mul(accum, z2, &env->fp_status);
1619                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1620                 accum = floatx80_mul(accum, z2, &env->fp_status);
1621                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1622                 accum = floatx80_mul(accum, z2, &env->fp_status);
1623                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1624                 accum = floatx80_mul(accum, z2, &env->fp_status);
1625                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1626                 accum = floatx80_mul(accum, z2, &env->fp_status);
1627
1628                 /*
1629                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1630                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1631                  */
1632                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1633                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1634                                      aexp - extractFloatx80Exp(accum),
1635                                      &asig0, &asig1);
1636                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1637                        &asig0, &asig1);
1638                 /* Multiply by z to compute arctan(z).  */
1639                 azexp = aexp + zexp - 0x3ffe;
1640                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1641                             &azsig2, &azsig3);
1642             }
1643
1644             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1645             if (texp == 0) {
1646                 /* z is positive.  */
1647                 axexp = azexp;
1648                 axsig0 = azsig0;
1649                 axsig1 = azsig1;
1650             } else {
1651                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1652                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1653                 uint64_t low_sig0 =
1654                     extractFloatx80Frac(fpatan_table[n].atan_low);
1655                 uint64_t low_sig1 = 0;
1656                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1657                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1658                 axsig1 = 0;
1659                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1660                                      &low_sig0, &low_sig1);
1661                 if (low_sign) {
1662                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1663                            &axsig0, &axsig1);
1664                 } else {
1665                     add128(axsig0, axsig1, low_sig0, low_sig1,
1666                            &axsig0, &axsig1);
1667                 }
1668                 if (azexp >= axexp) {
1669                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1670                                          &axsig0, &axsig1);
1671                     axexp = azexp + 1;
1672                     shift128RightJamming(azsig0, azsig1, 1,
1673                                          &azsig0, &azsig1);
1674                 } else {
1675                     shift128RightJamming(axsig0, axsig1, 1,
1676                                          &axsig0, &axsig1);
1677                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1678                                          &azsig0, &azsig1);
1679                     ++axexp;
1680                 }
1681                 if (zsign) {
1682                     sub128(axsig0, axsig1, azsig0, azsig1,
1683                            &axsig0, &axsig1);
1684                 } else {
1685                     add128(axsig0, axsig1, azsig0, azsig1,
1686                            &axsig0, &axsig1);
1687                 }
1688             }
1689
1690             if (adj_exp == 0) {
1691                 rexp = axexp;
1692                 rsig0 = axsig0;
1693                 rsig1 = axsig1;
1694             } else {
1695                 /*
1696                  * Add or subtract arctan(x) (exponent axexp,
1697                  * significand axsig0 and axsig1, positive, not
1698                  * necessarily normalized) to the number given by
1699                  * adj_exp, adj_sig0 and adj_sig1, according to
1700                  * adj_sub.
1701                  */
1702                 if (adj_exp >= axexp) {
1703                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1704                                          &axsig0, &axsig1);
1705                     rexp = adj_exp + 1;
1706                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1707                                          &adj_sig0, &adj_sig1);
1708                 } else {
1709                     shift128RightJamming(axsig0, axsig1, 1,
1710                                          &axsig0, &axsig1);
1711                     shift128RightJamming(adj_sig0, adj_sig1,
1712                                          axexp - adj_exp + 1,
1713                                          &adj_sig0, &adj_sig1);
1714                     rexp = axexp + 1;
1715                 }
1716                 if (adj_sub) {
1717                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1718                            &rsig0, &rsig1);
1719                 } else {
1720                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1721                            &rsig0, &rsig1);
1722                 }
1723             }
1724
1725             env->fp_status.float_rounding_mode = save_mode;
1726             env->fp_status.floatx80_rounding_precision = save_prec;
1727         }
1728         /* This result is inexact.  */
1729         rsig1 |= 1;
1730         ST1 = normalizeRoundAndPackFloatx80(80, rsign, rexp,
1731                                             rsig0, rsig1, &env->fp_status);
1732     }
1733
1734     fpop(env);
1735     merge_exception_flags(env, old_flags);
1736 }
1737
1738 void helper_fxtract(CPUX86State *env)
1739 {
1740     uint8_t old_flags = save_exception_flags(env);
1741     CPU_LDoubleU temp;
1742
1743     temp.d = ST0;
1744
1745     if (floatx80_is_zero(ST0)) {
1746         /* Easy way to generate -inf and raising division by 0 exception */
1747         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1748                            &env->fp_status);
1749         fpush(env);
1750         ST0 = temp.d;
1751     } else if (floatx80_invalid_encoding(ST0)) {
1752         float_raise(float_flag_invalid, &env->fp_status);
1753         ST0 = floatx80_default_nan(&env->fp_status);
1754         fpush(env);
1755         ST0 = ST1;
1756     } else if (floatx80_is_any_nan(ST0)) {
1757         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1758             float_raise(float_flag_invalid, &env->fp_status);
1759             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1760         }
1761         fpush(env);
1762         ST0 = ST1;
1763     } else if (floatx80_is_infinity(ST0)) {
1764         fpush(env);
1765         ST0 = ST1;
1766         ST1 = floatx80_infinity;
1767     } else {
1768         int expdif;
1769
1770         if (EXPD(temp) == 0) {
1771             int shift = clz64(temp.l.lower);
1772             temp.l.lower <<= shift;
1773             expdif = 1 - EXPBIAS - shift;
1774             float_raise(float_flag_input_denormal, &env->fp_status);
1775         } else {
1776             expdif = EXPD(temp) - EXPBIAS;
1777         }
1778         /* DP exponent bias */
1779         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1780         fpush(env);
1781         BIASEXPONENT(temp);
1782         ST0 = temp.d;
1783     }
1784     merge_exception_flags(env, old_flags);
1785 }
1786
1787 static void helper_fprem_common(CPUX86State *env, bool mod)
1788 {
1789     uint8_t old_flags = save_exception_flags(env);
1790     uint64_t quotient;
1791     CPU_LDoubleU temp0, temp1;
1792     int exp0, exp1, expdiff;
1793
1794     temp0.d = ST0;
1795     temp1.d = ST1;
1796     exp0 = EXPD(temp0);
1797     exp1 = EXPD(temp1);
1798
1799     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1800     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1801         exp0 == 0x7fff || exp1 == 0x7fff ||
1802         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1803         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1804     } else {
1805         if (exp0 == 0) {
1806             exp0 = 1 - clz64(temp0.l.lower);
1807         }
1808         if (exp1 == 0) {
1809             exp1 = 1 - clz64(temp1.l.lower);
1810         }
1811         expdiff = exp0 - exp1;
1812         if (expdiff < 64) {
1813             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1814             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1815             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1816             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1817         } else {
1818             /*
1819              * Partial remainder.  This choice of how many bits to
1820              * process at once is specified in AMD instruction set
1821              * manuals, and empirically is followed by Intel
1822              * processors as well; it ensures that the final remainder
1823              * operation in a loop does produce the correct low three
1824              * bits of the quotient.  AMD manuals specify that the
1825              * flags other than C2 are cleared, and empirically Intel
1826              * processors clear them as well.
1827              */
1828             int n = 32 + (expdiff % 32);
1829             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1830             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1831             env->fpus |= 0x400;  /* C2 <-- 1 */
1832         }
1833     }
1834     merge_exception_flags(env, old_flags);
1835 }
1836
1837 void helper_fprem1(CPUX86State *env)
1838 {
1839     helper_fprem_common(env, false);
1840 }
1841
1842 void helper_fprem(CPUX86State *env)
1843 {
1844     helper_fprem_common(env, true);
1845 }
1846
/*
 * 128-bit significand of log2(e), split into its high and low 64-bit
 * halves.  helper_fyl2xp1() multiplies the operand significands by this
 * value when ST0 is small enough that ST1 * ST0 * log2(e) is
 * sufficiently precise.
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * The leading coefficient is carried in two pieces: fyl2x_coeff_0 and
 * the correction fyl2x_coeff_0_low (its sign bit is set, i.e. it is
 * negative).  helper_fyl2x_common() adds the low piece as the last
 * floatx80 step of the Horner evaluation and combines the high piece in
 * 128-bit arithmetic.  Coefficients 9 down to 1 are consumed in that
 * order, each step multiplying by the square of the polynomial argument.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1868
1869 /*
1870  * Compute an approximation of log2(1+arg), where 1+arg is in the
1871  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1872  * function is called, rounding precision is set to 80 and the
1873  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1874  * and must not be so close to zero that underflow might occur.
1875  */
1876 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1877                                 uint64_t *sig0, uint64_t *sig1)
1878 {
1879     uint64_t arg0_sig = extractFloatx80Frac(arg);
1880     int32_t arg0_exp = extractFloatx80Exp(arg);
1881     bool arg0_sign = extractFloatx80Sign(arg);
1882     bool asign;
1883     int32_t dexp, texp, aexp;
1884     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1885     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1886     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1887     floatx80 t2, accum;
1888
1889     /*
1890      * Compute an approximation of arg/(2+arg), with extra precision,
1891      * as the argument to a polynomial approximation.  The extra
1892      * precision is only needed for the first term of the
1893      * approximation, with subsequent terms being significantly
1894      * smaller; the approximation only uses odd exponents, and the
1895      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1896      */
1897     if (arg0_sign) {
1898         dexp = 0x3fff;
1899         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1900         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1901     } else {
1902         dexp = 0x4000;
1903         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1904         dsig0 |= 0x8000000000000000ULL;
1905     }
1906     texp = arg0_exp - dexp + 0x3ffe;
1907     rsig0 = arg0_sig;
1908     rsig1 = 0;
1909     rsig2 = 0;
1910     if (dsig0 <= rsig0) {
1911         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1912         ++texp;
1913     }
1914     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1915     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1916     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1917            &rsig0, &rsig1, &rsig2);
1918     while ((int64_t) rsig0 < 0) {
1919         --tsig0;
1920         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1921                &rsig0, &rsig1, &rsig2);
1922     }
1923     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1924     /*
1925      * No need to correct any estimation error in tsig1; even with
1926      * such error, it is accurate enough.  Now compute the square of
1927      * that approximation.
1928      */
1929     mul128To256(tsig0, tsig1, tsig0, tsig1,
1930                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1931     t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
1932                                        t2sig0, t2sig1, &env->fp_status);
1933
1934     /* Compute the lower parts of the polynomial expansion.  */
1935     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1936     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1937     accum = floatx80_mul(accum, t2, &env->fp_status);
1938     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1939     accum = floatx80_mul(accum, t2, &env->fp_status);
1940     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1941     accum = floatx80_mul(accum, t2, &env->fp_status);
1942     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1943     accum = floatx80_mul(accum, t2, &env->fp_status);
1944     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1945     accum = floatx80_mul(accum, t2, &env->fp_status);
1946     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1947     accum = floatx80_mul(accum, t2, &env->fp_status);
1948     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1949     accum = floatx80_mul(accum, t2, &env->fp_status);
1950     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1951     accum = floatx80_mul(accum, t2, &env->fp_status);
1952     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1953
1954     /*
1955      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1956      * accum has much lower magnitude, and so, in particular, carry
1957      * out of the addition is not possible), multiplied by t.  (This
1958      * expansion is only accurate to about 70 bits, not 128 bits.)
1959      */
1960     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1961     asign = extractFloatx80Sign(fyl2x_coeff_0);
1962     shift128RightJamming(extractFloatx80Frac(accum), 0,
1963                          aexp - extractFloatx80Exp(accum),
1964                          &asig0, &asig1);
1965     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1966     bsig1 = 0;
1967     if (asign == extractFloatx80Sign(accum)) {
1968         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1969     } else {
1970         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1971     }
1972     /* Multiply by t to compute the required result.  */
1973     mul128To256(asig0, asig1, tsig0, tsig1,
1974                 &asig0, &asig1, &asig2, &asig3);
1975     aexp += texp - 0x3ffe;
1976     *exp = aexp;
1977     *sig0 = asig0;
1978     *sig1 = asig1;
1979 }
1980
1981 void helper_fyl2xp1(CPUX86State *env)
1982 {
1983     uint8_t old_flags = save_exception_flags(env);
1984     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1985     int32_t arg0_exp = extractFloatx80Exp(ST0);
1986     bool arg0_sign = extractFloatx80Sign(ST0);
1987     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1988     int32_t arg1_exp = extractFloatx80Exp(ST1);
1989     bool arg1_sign = extractFloatx80Sign(ST1);
1990
1991     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1992         float_raise(float_flag_invalid, &env->fp_status);
1993         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1994     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1995         float_raise(float_flag_invalid, &env->fp_status);
1996         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1997     } else if (floatx80_invalid_encoding(ST0) ||
1998                floatx80_invalid_encoding(ST1)) {
1999         float_raise(float_flag_invalid, &env->fp_status);
2000         ST1 = floatx80_default_nan(&env->fp_status);
2001     } else if (floatx80_is_any_nan(ST0)) {
2002         ST1 = ST0;
2003     } else if (floatx80_is_any_nan(ST1)) {
2004         /* Pass this NaN through.  */
2005     } else if (arg0_exp > 0x3ffd ||
2006                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2007                                                   0x95f619980c4336f7ULL :
2008                                                   0xd413cccfe7799211ULL))) {
2009         /*
2010          * Out of range for the instruction (ST0 must have absolute
2011          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2012          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2013          * to sqrt(2) - 1, which we allow here), treat as invalid.
2014          */
2015         float_raise(float_flag_invalid, &env->fp_status);
2016         ST1 = floatx80_default_nan(&env->fp_status);
2017     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2018                arg1_exp == 0x7fff) {
2019         /*
2020          * One argument is zero, or multiplying by infinity; correct
2021          * result is exact and can be obtained by multiplying the
2022          * arguments.
2023          */
2024         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2025     } else if (arg0_exp < 0x3fb0) {
2026         /*
2027          * Multiplying both arguments and an extra-precision version
2028          * of log2(e) is sufficiently precise.
2029          */
2030         uint64_t sig0, sig1, sig2;
2031         int32_t exp;
2032         if (arg0_exp == 0) {
2033             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2034         }
2035         if (arg1_exp == 0) {
2036             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2037         }
2038         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2039                         &sig0, &sig1, &sig2);
2040         exp = arg0_exp + 1;
2041         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2042         exp += arg1_exp - 0x3ffe;
2043         /* This result is inexact.  */
2044         sig1 |= 1;
2045         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
2046                                             sig0, sig1, &env->fp_status);
2047     } else {
2048         int32_t aexp;
2049         uint64_t asig0, asig1, asig2;
2050         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2051         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2052         env->fp_status.float_rounding_mode = float_round_nearest_even;
2053         env->fp_status.floatx80_rounding_precision = 80;
2054
2055         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2056         /*
2057          * Multiply by the second argument to compute the required
2058          * result.
2059          */
2060         if (arg1_exp == 0) {
2061             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2062         }
2063         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2064         aexp += arg1_exp - 0x3ffe;
2065         /* This result is inexact.  */
2066         asig1 |= 1;
2067         env->fp_status.float_rounding_mode = save_mode;
2068         ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp,
2069                                             asig0, asig1, &env->fp_status);
2070         env->fp_status.floatx80_rounding_precision = save_prec;
2071     }
2072     fpop(env);
2073     merge_exception_flags(env, old_flags);
2074 }
2075
2076 void helper_fyl2x(CPUX86State *env)
2077 {
2078     uint8_t old_flags = save_exception_flags(env);
2079     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2080     int32_t arg0_exp = extractFloatx80Exp(ST0);
2081     bool arg0_sign = extractFloatx80Sign(ST0);
2082     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2083     int32_t arg1_exp = extractFloatx80Exp(ST1);
2084     bool arg1_sign = extractFloatx80Sign(ST1);
2085
2086     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2087         float_raise(float_flag_invalid, &env->fp_status);
2088         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2089     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2090         float_raise(float_flag_invalid, &env->fp_status);
2091         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2092     } else if (floatx80_invalid_encoding(ST0) ||
2093                floatx80_invalid_encoding(ST1)) {
2094         float_raise(float_flag_invalid, &env->fp_status);
2095         ST1 = floatx80_default_nan(&env->fp_status);
2096     } else if (floatx80_is_any_nan(ST0)) {
2097         ST1 = ST0;
2098     } else if (floatx80_is_any_nan(ST1)) {
2099         /* Pass this NaN through.  */
2100     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2101         float_raise(float_flag_invalid, &env->fp_status);
2102         ST1 = floatx80_default_nan(&env->fp_status);
2103     } else if (floatx80_is_infinity(ST1)) {
2104         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2105                                              &env->fp_status);
2106         switch (cmp) {
2107         case float_relation_less:
2108             ST1 = floatx80_chs(ST1);
2109             break;
2110         case float_relation_greater:
2111             /* Result is infinity of the same sign as ST1.  */
2112             break;
2113         default:
2114             float_raise(float_flag_invalid, &env->fp_status);
2115             ST1 = floatx80_default_nan(&env->fp_status);
2116             break;
2117         }
2118     } else if (floatx80_is_infinity(ST0)) {
2119         if (floatx80_is_zero(ST1)) {
2120             float_raise(float_flag_invalid, &env->fp_status);
2121             ST1 = floatx80_default_nan(&env->fp_status);
2122         } else if (arg1_sign) {
2123             ST1 = floatx80_chs(ST0);
2124         } else {
2125             ST1 = ST0;
2126         }
2127     } else if (floatx80_is_zero(ST0)) {
2128         if (floatx80_is_zero(ST1)) {
2129             float_raise(float_flag_invalid, &env->fp_status);
2130             ST1 = floatx80_default_nan(&env->fp_status);
2131         } else {
2132             /* Result is infinity with opposite sign to ST1.  */
2133             float_raise(float_flag_divbyzero, &env->fp_status);
2134             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2135                                 0x8000000000000000ULL);
2136         }
2137     } else if (floatx80_is_zero(ST1)) {
2138         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2139             ST1 = floatx80_chs(ST1);
2140         }
2141         /* Otherwise, ST1 is already the correct result.  */
2142     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2143         if (arg1_sign) {
2144             ST1 = floatx80_chs(floatx80_zero);
2145         } else {
2146             ST1 = floatx80_zero;
2147         }
2148     } else {
2149         int32_t int_exp;
2150         floatx80 arg0_m1;
2151         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2152         signed char save_prec = env->fp_status.floatx80_rounding_precision;
2153         env->fp_status.float_rounding_mode = float_round_nearest_even;
2154         env->fp_status.floatx80_rounding_precision = 80;
2155
2156         if (arg0_exp == 0) {
2157             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2158         }
2159         if (arg1_exp == 0) {
2160             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2161         }
2162         int_exp = arg0_exp - 0x3fff;
2163         if (arg0_sig > 0xb504f333f9de6484ULL) {
2164             ++int_exp;
2165         }
2166         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2167                                                &env->fp_status),
2168                                floatx80_one, &env->fp_status);
2169         if (floatx80_is_zero(arg0_m1)) {
2170             /* Exact power of 2; multiply by ST1.  */
2171             env->fp_status.float_rounding_mode = save_mode;
2172             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2173                                ST1, &env->fp_status);
2174         } else {
2175             bool asign = extractFloatx80Sign(arg0_m1);
2176             int32_t aexp;
2177             uint64_t asig0, asig1, asig2;
2178             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2179             if (int_exp != 0) {
2180                 bool isign = (int_exp < 0);
2181                 int32_t iexp;
2182                 uint64_t isig;
2183                 int shift;
2184                 int_exp = isign ? -int_exp : int_exp;
2185                 shift = clz32(int_exp) + 32;
2186                 isig = int_exp;
2187                 isig <<= shift;
2188                 iexp = 0x403e - shift;
2189                 shift128RightJamming(asig0, asig1, iexp - aexp,
2190                                      &asig0, &asig1);
2191                 if (asign == isign) {
2192                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2193                 } else {
2194                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2195                 }
2196                 aexp = iexp;
2197                 asign = isign;
2198             }
2199             /*
2200              * Multiply by the second argument to compute the required
2201              * result.
2202              */
2203             if (arg1_exp == 0) {
2204                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2205             }
2206             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2207             aexp += arg1_exp - 0x3ffe;
2208             /* This result is inexact.  */
2209             asig1 |= 1;
2210             env->fp_status.float_rounding_mode = save_mode;
2211             ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp,
2212                                                 asig0, asig1, &env->fp_status);
2213         }
2214
2215         env->fp_status.floatx80_rounding_precision = save_prec;
2216     }
2217     fpop(env);
2218     merge_exception_flags(env, old_flags);
2219 }
2220
2221 void helper_fsqrt(CPUX86State *env)
2222 {
2223     uint8_t old_flags = save_exception_flags(env);
2224     if (floatx80_is_neg(ST0)) {
2225         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2226         env->fpus |= 0x400;
2227     }
2228     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2229     merge_exception_flags(env, old_flags);
2230 }
2231
2232 void helper_fsincos(CPUX86State *env)
2233 {
2234     double fptemp = floatx80_to_double(env, ST0);
2235
2236     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2237         env->fpus |= 0x400;
2238     } else {
2239         ST0 = double_to_floatx80(env, sin(fptemp));
2240         fpush(env);
2241         ST0 = double_to_floatx80(env, cos(fptemp));
2242         env->fpus &= ~0x400;  /* C2 <-- 0 */
2243         /* the above code is for |arg| < 2**63 only */
2244     }
2245 }
2246
2247 void helper_frndint(CPUX86State *env)
2248 {
2249     uint8_t old_flags = save_exception_flags(env);
2250     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2251     merge_exception_flags(env, old_flags);
2252 }
2253
2254 void helper_fscale(CPUX86State *env)
2255 {
2256     uint8_t old_flags = save_exception_flags(env);
2257     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2258         float_raise(float_flag_invalid, &env->fp_status);
2259         ST0 = floatx80_default_nan(&env->fp_status);
2260     } else if (floatx80_is_any_nan(ST1)) {
2261         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2262             float_raise(float_flag_invalid, &env->fp_status);
2263         }
2264         ST0 = ST1;
2265         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2266             float_raise(float_flag_invalid, &env->fp_status);
2267             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2268         }
2269     } else if (floatx80_is_infinity(ST1) &&
2270                !floatx80_invalid_encoding(ST0) &&
2271                !floatx80_is_any_nan(ST0)) {
2272         if (floatx80_is_neg(ST1)) {
2273             if (floatx80_is_infinity(ST0)) {
2274                 float_raise(float_flag_invalid, &env->fp_status);
2275                 ST0 = floatx80_default_nan(&env->fp_status);
2276             } else {
2277                 ST0 = (floatx80_is_neg(ST0) ?
2278                        floatx80_chs(floatx80_zero) :
2279                        floatx80_zero);
2280             }
2281         } else {
2282             if (floatx80_is_zero(ST0)) {
2283                 float_raise(float_flag_invalid, &env->fp_status);
2284                 ST0 = floatx80_default_nan(&env->fp_status);
2285             } else {
2286                 ST0 = (floatx80_is_neg(ST0) ?
2287                        floatx80_chs(floatx80_infinity) :
2288                        floatx80_infinity);
2289             }
2290         }
2291     } else {
2292         int n;
2293         signed char save = env->fp_status.floatx80_rounding_precision;
2294         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2295         set_float_exception_flags(0, &env->fp_status);
2296         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2297         set_float_exception_flags(save_flags, &env->fp_status);
2298         env->fp_status.floatx80_rounding_precision = 80;
2299         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2300         env->fp_status.floatx80_rounding_precision = save;
2301     }
2302     merge_exception_flags(env, old_flags);
2303 }
2304
2305 void helper_fsin(CPUX86State *env)
2306 {
2307     double fptemp = floatx80_to_double(env, ST0);
2308
2309     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2310         env->fpus |= 0x400;
2311     } else {
2312         ST0 = double_to_floatx80(env, sin(fptemp));
2313         env->fpus &= ~0x400;  /* C2 <-- 0 */
2314         /* the above code is for |arg| < 2**53 only */
2315     }
2316 }
2317
2318 void helper_fcos(CPUX86State *env)
2319 {
2320     double fptemp = floatx80_to_double(env, ST0);
2321
2322     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2323         env->fpus |= 0x400;
2324     } else {
2325         ST0 = double_to_floatx80(env, cos(fptemp));
2326         env->fpus &= ~0x400;  /* C2 <-- 0 */
2327         /* the above code is for |arg| < 2**63 only */
2328     }
2329 }
2330
2331 void helper_fxam_ST0(CPUX86State *env)
2332 {
2333     CPU_LDoubleU temp;
2334     int expdif;
2335
2336     temp.d = ST0;
2337
2338     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2339     if (SIGND(temp)) {
2340         env->fpus |= 0x200; /* C1 <-- 1 */
2341     }
2342
2343     if (env->fptags[env->fpstt]) {
2344         env->fpus |= 0x4100; /* Empty */
2345         return;
2346     }
2347
2348     expdif = EXPD(temp);
2349     if (expdif == MAXEXPD) {
2350         if (MANTD(temp) == 0x8000000000000000ULL) {
2351             env->fpus |= 0x500; /* Infinity */
2352         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2353             env->fpus |= 0x100; /* NaN */
2354         }
2355     } else if (expdif == 0) {
2356         if (MANTD(temp) == 0) {
2357             env->fpus |=  0x4000; /* Zero */
2358         } else {
2359             env->fpus |= 0x4400; /* Denormal */
2360         }
2361     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2362         env->fpus |= 0x400;
2363     }
2364 }
2365
2366 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2367                       uintptr_t retaddr)
2368 {
2369     int fpus, fptag, exp, i;
2370     uint64_t mant;
2371     CPU_LDoubleU tmp;
2372
2373     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2374     fptag = 0;
2375     for (i = 7; i >= 0; i--) {
2376         fptag <<= 2;
2377         if (env->fptags[i]) {
2378             fptag |= 3;
2379         } else {
2380             tmp.d = env->fpregs[i].d;
2381             exp = EXPD(tmp);
2382             mant = MANTD(tmp);
2383             if (exp == 0 && mant == 0) {
2384                 /* zero */
2385                 fptag |= 1;
2386             } else if (exp == 0 || exp == MAXEXPD
2387                        || (mant & (1LL << 63)) == 0) {
2388                 /* NaNs, infinity, denormal */
2389                 fptag |= 2;
2390             }
2391         }
2392     }
2393     if (data32) {
2394         /* 32 bit */
2395         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2396         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2397         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2398         cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
2399         cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
2400         cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
2401         cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
2402     } else {
2403         /* 16 bit */
2404         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2405         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2406         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2407         cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
2408         cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
2409         cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
2410         cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
2411     }
2412 }
2413
2414 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2415 {
2416     do_fstenv(env, ptr, data32, GETPC());
2417 }
2418
2419 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2420 {
2421     env->fpstt = (fpus >> 11) & 7;
2422     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2423     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2424 #if !defined(CONFIG_USER_ONLY)
2425     if (!(env->fpus & FPUS_SE)) {
2426         /*
2427          * Here the processor deasserts FERR#; in response, the chipset deasserts
2428          * IGNNE#.
2429          */
2430         cpu_clear_ignne();
2431     }
2432 #endif
2433 }
2434
2435 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2436                       uintptr_t retaddr)
2437 {
2438     int i, fpus, fptag;
2439
2440     if (data32) {
2441         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2442         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2443         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2444     } else {
2445         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2446         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2447         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2448     }
2449     cpu_set_fpus(env, fpus);
2450     for (i = 0; i < 8; i++) {
2451         env->fptags[i] = ((fptag & 3) == 3);
2452         fptag >>= 2;
2453     }
2454 }
2455
2456 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2457 {
2458     do_fldenv(env, ptr, data32, GETPC());
2459 }
2460
2461 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2462 {
2463     floatx80 tmp;
2464     int i;
2465
2466     do_fstenv(env, ptr, data32, GETPC());
2467
2468     ptr += (14 << data32);
2469     for (i = 0; i < 8; i++) {
2470         tmp = ST(i);
2471         helper_fstt(env, tmp, ptr, GETPC());
2472         ptr += 10;
2473     }
2474
2475     /* fninit */
2476     env->fpus = 0;
2477     env->fpstt = 0;
2478     cpu_set_fpuc(env, 0x37f);
2479     env->fptags[0] = 1;
2480     env->fptags[1] = 1;
2481     env->fptags[2] = 1;
2482     env->fptags[3] = 1;
2483     env->fptags[4] = 1;
2484     env->fptags[5] = 1;
2485     env->fptags[6] = 1;
2486     env->fptags[7] = 1;
2487 }
2488
2489 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2490 {
2491     floatx80 tmp;
2492     int i;
2493
2494     do_fldenv(env, ptr, data32, GETPC());
2495     ptr += (14 << data32);
2496
2497     for (i = 0; i < 8; i++) {
2498         tmp = helper_fldt(env, ptr, GETPC());
2499         ST(i) = tmp;
2500         ptr += 10;
2501     }
2502 }
2503
#ifdef CONFIG_USER_ONLY
/* User-mode-only entry point that forwards to helper_fsave(). */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    helper_fsave(env, ptr, data32);
}

/* User-mode-only entry point that forwards to helper_frstor(). */
void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    helper_frstor(env, ptr, data32);
}
#endif /* CONFIG_USER_ONLY */
2515
/* Byte offset of member X within the X86XSaveArea layout. */
#define XO(X)  offsetof(X86XSaveArea, X)
2517
2518 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2519 {
2520     int fpus, fptag, i;
2521     target_ulong addr;
2522
2523     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2524     fptag = 0;
2525     for (i = 0; i < 8; i++) {
2526         fptag |= (env->fptags[i] << i);
2527     }
2528
2529     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2530     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2531     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2532
2533     /* In 32-bit mode this is eip, sel, dp, sel.
2534        In 64-bit mode this is rip, rdp.
2535        But in either case we don't write actual data, just zeros.  */
2536     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2537     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2538
2539     addr = ptr + XO(legacy.fpregs);
2540     for (i = 0; i < 8; i++) {
2541         floatx80 tmp = ST(i);
2542         helper_fstt(env, tmp, addr, ra);
2543         addr += 16;
2544     }
2545 }
2546
2547 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2548 {
2549     update_mxcsr_from_sse_status(env);
2550     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2551     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2552 }
2553
2554 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2555 {
2556     int i, nb_xmm_regs;
2557     target_ulong addr;
2558
2559     if (env->hflags & HF_CS64_MASK) {
2560         nb_xmm_regs = 16;
2561     } else {
2562         nb_xmm_regs = 8;
2563     }
2564
2565     addr = ptr + XO(legacy.xmm_regs);
2566     for (i = 0; i < nb_xmm_regs; i++) {
2567         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2568         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2569         addr += 16;
2570     }
2571 }
2572
2573 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2574 {
2575     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2576     int i;
2577
2578     for (i = 0; i < 4; i++, addr += 16) {
2579         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2580         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2581     }
2582 }
2583
2584 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2585 {
2586     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2587                     env->bndcs_regs.cfgu, ra);
2588     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2589                     env->bndcs_regs.sts, ra);
2590 }
2591
2592 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2593 {
2594     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2595 }
2596
2597 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2598 {
2599     uintptr_t ra = GETPC();
2600
2601     /* The operand must be 16 byte aligned */
2602     if (ptr & 0xf) {
2603         raise_exception_ra(env, EXCP0D_GPF, ra);
2604     }
2605
2606     do_xsave_fpu(env, ptr, ra);
2607
2608     if (env->cr[4] & CR4_OSFXSR_MASK) {
2609         do_xsave_mxcsr(env, ptr, ra);
2610         /* Fast FXSAVE leaves out the XMM registers */
2611         if (!(env->efer & MSR_EFER_FFXSR)
2612             || (env->hflags & HF_CPL_MASK)
2613             || !(env->hflags & HF_LMA_MASK)) {
2614             do_xsave_sse(env, ptr, ra);
2615         }
2616     }
2617 }
2618
2619 static uint64_t get_xinuse(CPUX86State *env)
2620 {
2621     uint64_t inuse = -1;
2622
2623     /* For the most part, we don't track XINUSE.  We could calculate it
2624        here for all components, but it's probably less work to simply
2625        indicate in use.  That said, the state of BNDREGS is important
2626        enough to track in HFLAGS, so we might as well use that here.  */
2627     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2628        inuse &= ~XSTATE_BNDREGS_MASK;
2629     }
2630     return inuse;
2631 }
2632
2633 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2634                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2635 {
2636     uint64_t old_bv, new_bv;
2637
2638     /* The OS must have enabled XSAVE.  */
2639     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2640         raise_exception_ra(env, EXCP06_ILLOP, ra);
2641     }
2642
2643     /* The operand must be 64 byte aligned.  */
2644     if (ptr & 63) {
2645         raise_exception_ra(env, EXCP0D_GPF, ra);
2646     }
2647
2648     /* Never save anything not enabled by XCR0.  */
2649     rfbm &= env->xcr0;
2650     opt &= rfbm;
2651
2652     if (opt & XSTATE_FP_MASK) {
2653         do_xsave_fpu(env, ptr, ra);
2654     }
2655     if (rfbm & XSTATE_SSE_MASK) {
2656         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2657         do_xsave_mxcsr(env, ptr, ra);
2658     }
2659     if (opt & XSTATE_SSE_MASK) {
2660         do_xsave_sse(env, ptr, ra);
2661     }
2662     if (opt & XSTATE_BNDREGS_MASK) {
2663         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2664     }
2665     if (opt & XSTATE_BNDCSR_MASK) {
2666         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2667     }
2668     if (opt & XSTATE_PKRU_MASK) {
2669         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2670     }
2671
2672     /* Update the XSTATE_BV field.  */
2673     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2674     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2675     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2676 }
2677
2678 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2679 {
2680     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2681 }
2682
2683 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2684 {
2685     uint64_t inuse = get_xinuse(env);
2686     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2687 }
2688
2689 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2690 {
2691     int i, fpuc, fpus, fptag;
2692     target_ulong addr;
2693
2694     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2695     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2696     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2697     cpu_set_fpuc(env, fpuc);
2698     cpu_set_fpus(env, fpus);
2699     fptag ^= 0xff;
2700     for (i = 0; i < 8; i++) {
2701         env->fptags[i] = ((fptag >> i) & 1);
2702     }
2703
2704     addr = ptr + XO(legacy.fpregs);
2705     for (i = 0; i < 8; i++) {
2706         floatx80 tmp = helper_fldt(env, addr, ra);
2707         ST(i) = tmp;
2708         addr += 16;
2709     }
2710 }
2711
2712 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2713 {
2714     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2715 }
2716
2717 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2718 {
2719     int i, nb_xmm_regs;
2720     target_ulong addr;
2721
2722     if (env->hflags & HF_CS64_MASK) {
2723         nb_xmm_regs = 16;
2724     } else {
2725         nb_xmm_regs = 8;
2726     }
2727
2728     addr = ptr + XO(legacy.xmm_regs);
2729     for (i = 0; i < nb_xmm_regs; i++) {
2730         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2731         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2732         addr += 16;
2733     }
2734 }
2735
2736 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2737 {
2738     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2739     int i;
2740
2741     for (i = 0; i < 4; i++, addr += 16) {
2742         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2743         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2744     }
2745 }
2746
2747 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2748 {
2749     /* FIXME: Extend highest implemented bit of linear address.  */
2750     env->bndcs_regs.cfgu
2751         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2752     env->bndcs_regs.sts
2753         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2754 }
2755
2756 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2757 {
2758     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2759 }
2760
2761 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2762 {
2763     uintptr_t ra = GETPC();
2764
2765     /* The operand must be 16 byte aligned */
2766     if (ptr & 0xf) {
2767         raise_exception_ra(env, EXCP0D_GPF, ra);
2768     }
2769
2770     do_xrstor_fpu(env, ptr, ra);
2771
2772     if (env->cr[4] & CR4_OSFXSR_MASK) {
2773         do_xrstor_mxcsr(env, ptr, ra);
2774         /* Fast FXRSTOR leaves out the XMM registers */
2775         if (!(env->efer & MSR_EFER_FFXSR)
2776             || (env->hflags & HF_CPL_MASK)
2777             || !(env->hflags & HF_LMA_MASK)) {
2778             do_xrstor_sse(env, ptr, ra);
2779         }
2780     }
2781 }
2782
#ifdef CONFIG_USER_ONLY
/* User-mode-only entry point that forwards to helper_fxsave(). */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    helper_fxsave(env, ptr);
}

/* User-mode-only entry point that forwards to helper_fxrstor(). */
void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    helper_fxrstor(env, ptr);
}
#endif /* CONFIG_USER_ONLY */
2794
2795 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2796 {
2797     uintptr_t ra = GETPC();
2798     uint64_t xstate_bv, xcomp_bv, reserve0;
2799
2800     rfbm &= env->xcr0;
2801
2802     /* The OS must have enabled XSAVE.  */
2803     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2804         raise_exception_ra(env, EXCP06_ILLOP, ra);
2805     }
2806
2807     /* The operand must be 64 byte aligned.  */
2808     if (ptr & 63) {
2809         raise_exception_ra(env, EXCP0D_GPF, ra);
2810     }
2811
2812     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2813
2814     if ((int64_t)xstate_bv < 0) {
2815         /* FIXME: Compact form.  */
2816         raise_exception_ra(env, EXCP0D_GPF, ra);
2817     }
2818
2819     /* Standard form.  */
2820
2821     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2822     if (xstate_bv & ~env->xcr0) {
2823         raise_exception_ra(env, EXCP0D_GPF, ra);
2824     }
2825
2826     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2827        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2828        describes only XCOMP_BV, but the description of the standard form
2829        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2830        includes the next 64-bit field.  */
2831     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2832     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2833     if (xcomp_bv || reserve0) {
2834         raise_exception_ra(env, EXCP0D_GPF, ra);
2835     }
2836
2837     if (rfbm & XSTATE_FP_MASK) {
2838         if (xstate_bv & XSTATE_FP_MASK) {
2839             do_xrstor_fpu(env, ptr, ra);
2840         } else {
2841             helper_fninit(env);
2842             memset(env->fpregs, 0, sizeof(env->fpregs));
2843         }
2844     }
2845     if (rfbm & XSTATE_SSE_MASK) {
2846         /* Note that the standard form of XRSTOR loads MXCSR from memory
2847            whether or not the XSTATE_BV bit is set.  */
2848         do_xrstor_mxcsr(env, ptr, ra);
2849         if (xstate_bv & XSTATE_SSE_MASK) {
2850             do_xrstor_sse(env, ptr, ra);
2851         } else {
2852             /* ??? When AVX is implemented, we may have to be more
2853                selective in the clearing.  */
2854             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2855         }
2856     }
2857     if (rfbm & XSTATE_BNDREGS_MASK) {
2858         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2859             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2860             env->hflags |= HF_MPX_IU_MASK;
2861         } else {
2862             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2863             env->hflags &= ~HF_MPX_IU_MASK;
2864         }
2865     }
2866     if (rfbm & XSTATE_BNDCSR_MASK) {
2867         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2868             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2869         } else {
2870             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2871         }
2872         cpu_sync_bndcs_hflags(env);
2873     }
2874     if (rfbm & XSTATE_PKRU_MASK) {
2875         uint64_t old_pkru = env->pkru;
2876         if (xstate_bv & XSTATE_PKRU_MASK) {
2877             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2878         } else {
2879             env->pkru = 0;
2880         }
2881         if (env->pkru != old_pkru) {
2882             CPUState *cs = env_cpu(env);
2883             tlb_flush(cs);
2884         }
2885     }
2886 }
2887
/* XO() is not available past this point. */
#undef XO
2889
2890 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2891 {
2892     /* The OS must have enabled XSAVE.  */
2893     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2894         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2895     }
2896
2897     switch (ecx) {
2898     case 0:
2899         return env->xcr0;
2900     case 1:
2901         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2902             return env->xcr0 & get_xinuse(env);
2903         }
2904         break;
2905     }
2906     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2907 }
2908
2909 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2910 {
2911     uint32_t dummy, ena_lo, ena_hi;
2912     uint64_t ena;
2913
2914     /* The OS must have enabled XSAVE.  */
2915     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2916         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2917     }
2918
2919     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2920     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2921         goto do_gpf;
2922     }
2923
2924     /* Disallow enabling unimplemented features.  */
2925     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2926     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2927     if (mask & ~ena) {
2928         goto do_gpf;
2929     }
2930
2931     /* Disallow enabling only half of MPX.  */
2932     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2933         & XSTATE_BNDCSR_MASK) {
2934         goto do_gpf;
2935     }
2936
2937     env->xcr0 = mask;
2938     cpu_sync_bndcs_hflags(env);
2939     return;
2940
2941  do_gpf:
2942     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2943 }
2944
2945 /* MMX/SSE */
2946 /* XXX: optimize by storing fptt and fptags in the static cpu state */
2947
/* MXCSR control bits decoded by update_mxcsr_status(). */
#define SSE_DAZ             0x0040  /* treat denormal inputs as zero */
#define SSE_RC_MASK         0x6000  /* rounding-control field */
#define SSE_RC_NEAR         0x0000  /* round to nearest even */
#define SSE_RC_DOWN         0x2000  /* round down */
#define SSE_RC_UP           0x4000  /* round up */
#define SSE_RC_CHOP         0x6000  /* round toward zero */
#define SSE_FZ              0x8000  /* flush results to zero */
2955
2956 void update_mxcsr_status(CPUX86State *env)
2957 {
2958     uint32_t mxcsr = env->mxcsr;
2959     int rnd_type;
2960
2961     /* set rounding mode */
2962     switch (mxcsr & SSE_RC_MASK) {
2963     default:
2964     case SSE_RC_NEAR:
2965         rnd_type = float_round_nearest_even;
2966         break;
2967     case SSE_RC_DOWN:
2968         rnd_type = float_round_down;
2969         break;
2970     case SSE_RC_UP:
2971         rnd_type = float_round_up;
2972         break;
2973     case SSE_RC_CHOP:
2974         rnd_type = float_round_to_zero;
2975         break;
2976     }
2977     set_float_rounding_mode(rnd_type, &env->sse_status);
2978
2979     /* Set exception flags.  */
2980     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2981                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2982                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2983                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2984                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2985                               &env->sse_status);
2986
2987     /* set denormals are zero */
2988     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2989
2990     /* set flush to zero */
2991     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2992 }
2993
2994 void update_mxcsr_from_sse_status(CPUX86State *env)
2995 {
2996     uint8_t flags = get_float_exception_flags(&env->sse_status);
2997     /*
2998      * The MXCSR denormal flag has opposite semantics to
2999      * float_flag_input_denormal (the softfloat code sets that flag
3000      * only when flushing input denormals to zero, but SSE sets it
3001      * only when not flushing them to zero), so is not converted
3002      * here.
3003      */
3004     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3005                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3006                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3007                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3008                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3009                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3010                     0));
3011 }
3012
3013 void helper_update_mxcsr(CPUX86State *env)
3014 {
3015     update_mxcsr_from_sse_status(env);
3016 }
3017
3018 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3019 {
3020     cpu_set_mxcsr(env, val);
3021 }
3022
3023 void helper_enter_mmx(CPUX86State *env)
3024 {
3025     env->fpstt = 0;
3026     *(uint32_t *)(env->fptags) = 0;
3027     *(uint32_t *)(env->fptags + 4) = 0;
3028 }
3029
3030 void helper_emms(CPUX86State *env)
3031 {
3032     /* set to empty state */
3033     *(uint32_t *)(env->fptags) = 0x01010101;
3034     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3035 }
3036
3037 /* XXX: suppress */
3038 void helper_movq(CPUX86State *env, void *d, void *s)
3039 {
3040     *(uint64_t *)d = *(uint64_t *)s;
3041 }
3042
/*
 * ops_sse.h is included twice, once for each SHIFT value.
 * NOTE(review): presumably SHIFT 0 and SHIFT 1 select the MMX and
 * SSE variants of the helpers, and the header undefines SHIFT before
 * returning -- confirm against ops_sse.h.
 */
#define SHIFT 0
#include "ops_sse.h"

#define SHIFT 1
#include "ops_sse.h"