target/i386: fix IEEE x87 floating-point exception raising
target/i386/fpu_helper.c
1 /*
2 * x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
4 * Copyright (c) 2003 Fabrice Bellard
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "exec/helper-proto.h"
24 #include "qemu/host-utils.h"
25 #include "exec/exec-all.h"
26 #include "exec/cpu_ldst.h"
27 #include "fpu/softfloat.h"
29 #ifdef CONFIG_SOFTMMU
30 #include "hw/irq.h"
31 #endif
33 #define FPU_RC_MASK 0xc00
34 #define FPU_RC_NEAR 0x000
35 #define FPU_RC_DOWN 0x400
36 #define FPU_RC_UP 0x800
37 #define FPU_RC_CHOP 0xc00
39 #define MAXTAN 9223372036854775808.0
41 /* the following deal with x86 long double-precision numbers */
42 #define MAXEXPD 0x7fff
43 #define EXPBIAS 16383
44 #define EXPD(fp) (fp.l.upper & 0x7fff)
45 #define SIGND(fp) ((fp.l.upper) & 0x8000)
46 #define MANTD(fp) (fp.l.lower)
47 #define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
49 #define FPUS_IE (1 << 0)
50 #define FPUS_DE (1 << 1)
51 #define FPUS_ZE (1 << 2)
52 #define FPUS_OE (1 << 3)
53 #define FPUS_UE (1 << 4)
54 #define FPUS_PE (1 << 5)
55 #define FPUS_SF (1 << 6)
56 #define FPUS_SE (1 << 7)
57 #define FPUS_B (1 << 15)
59 #define FPUC_EM 0x3f
61 #define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
62 #define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
63 #define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
64 #define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
65 #define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
66 #define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
67 #define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
68 #define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
70 #if !defined(CONFIG_USER_ONLY)
71 static qemu_irq ferr_irq;
73 void x86_register_ferr_irq(qemu_irq irq)
75 ferr_irq = irq;
78 static void cpu_clear_ignne(void)
80 CPUX86State *env = &X86_CPU(first_cpu)->env;
81 env->hflags2 &= ~HF2_IGNNE_MASK;
84 void cpu_set_ignne(void)
86 CPUX86State *env = &X86_CPU(first_cpu)->env;
87 env->hflags2 |= HF2_IGNNE_MASK;
89  * We get here in response to a write to port F0h. The chipset should
90  * deassert FP_IRQ; FERR# instead should stay signaled until FPSW_SE is
91 * cleared, because FERR# and FP_IRQ are two separate pins on real
92 * hardware. However, we don't model FERR# as a qemu_irq, so we just
93 * do directly what the chipset would do, i.e. deassert FP_IRQ.
95 qemu_irq_lower(ferr_irq);
97 #endif
100 static inline void fpush(CPUX86State *env)
102 env->fpstt = (env->fpstt - 1) & 7;
103 env->fptags[env->fpstt] = 0; /* validate stack entry */
106 static inline void fpop(CPUX86State *env)
108 env->fptags[env->fpstt] = 1; /* invalidate stack entry */
109 env->fpstt = (env->fpstt + 1) & 7;
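/*
 * Illustrative sketch, not part of the original file: the x87 register
 * stack is eight physical slots addressed modulo 8, so fpush() just
 * decrements the top-of-stack index with wrap-around, and ST(n) lives in
 * physical register (fpstt + n) & 7.  The helper name below is
 * hypothetical and exists only for this example.
 */
static inline unsigned example_st_phys_reg(unsigned fpstt, unsigned n)
{
    /* fpstt == 0, n == 1 -> register 1;  fpstt == 7, n == 1 -> wraps to 0 */
    return (fpstt + n) & 7;
}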
112 static inline floatx80 helper_fldt(CPUX86State *env, target_ulong ptr,
113 uintptr_t retaddr)
115 CPU_LDoubleU temp;
117 temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
118 temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
119 return temp.d;
122 static inline void helper_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
123 uintptr_t retaddr)
125 CPU_LDoubleU temp;
127 temp.d = f;
128 cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
129 cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
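/*
 * Illustrative note, not part of the original file: the ten-byte memory
 * image accessed above is the x87 extended format, a 64-bit significand at
 * ptr followed by a 16-bit sign/exponent word at ptr + 8 (bias 16383, with
 * an explicit integer bit).  For example, 1.0 is stored as significand
 * 0x8000000000000000 and exponent word 0x3fff.  A hypothetical decoder of
 * the unbiased exponent:
 */
static inline int example_unbiased_exp(uint16_t exp_word)
{
    /* 0x3fff (the encoding of 1.0) -> 0 */
    return (exp_word & 0x7fff) - 16383;
}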
132 /* x87 FPU helpers */
134 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
136 union {
137 float64 f64;
138 double d;
139 } u;
141 u.f64 = floatx80_to_float64(a, &env->fp_status);
142 return u.d;
145 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
147 union {
148 float64 f64;
149 double d;
150 } u;
152 u.d = a;
153 return float64_to_floatx80(u.f64, &env->fp_status);
156 static void fpu_set_exception(CPUX86State *env, int mask)
158 env->fpus |= mask;
159 if (env->fpus & (~env->fpuc & FPUC_EM)) {
160 env->fpus |= FPUS_SE | FPUS_B;
164 static inline uint8_t save_exception_flags(CPUX86State *env)
166 uint8_t old_flags = get_float_exception_flags(&env->fp_status);
167 set_float_exception_flags(0, &env->fp_status);
168 return old_flags;
171 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
173 uint8_t new_flags = get_float_exception_flags(&env->fp_status);
174 float_raise(old_flags, &env->fp_status);
175 fpu_set_exception(env,
176 ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
177 (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
178 (new_flags & float_flag_overflow ? FPUS_OE : 0) |
179 (new_flags & float_flag_underflow ? FPUS_UE : 0) |
180 (new_flags & float_flag_inexact ? FPUS_PE : 0) |
181 (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
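/*
 * Illustrative sketch, not part of the original file: the softfloat -> FSW
 * translation performed by merge_exception_flags() above, restated as a
 * standalone pure function.  The flag and bit names are the real ones used
 * in this file; only the function name is hypothetical.
 */
static inline int example_softfloat_to_fsw(uint8_t flags)
{
    return (flags & float_flag_invalid        ? FPUS_IE : 0) |
           (flags & float_flag_divbyzero      ? FPUS_ZE : 0) |
           (flags & float_flag_overflow       ? FPUS_OE : 0) |
           (flags & float_flag_underflow      ? FPUS_UE : 0) |
           (flags & float_flag_inexact        ? FPUS_PE : 0) |
           (flags & float_flag_input_denormal ? FPUS_DE : 0);
}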
184 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
186 uint8_t old_flags = save_exception_flags(env);
187 floatx80 ret = floatx80_div(a, b, &env->fp_status);
188 merge_exception_flags(env, old_flags);
189 return ret;
192 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
194 if (env->cr[0] & CR0_NE_MASK) {
195 raise_exception_ra(env, EXCP10_COPR, retaddr);
197 #if !defined(CONFIG_USER_ONLY)
198 else if (ferr_irq && !(env->hflags2 & HF2_IGNNE_MASK)) {
199 qemu_irq_raise(ferr_irq);
201 #endif
204 void helper_flds_FT0(CPUX86State *env, uint32_t val)
206 uint8_t old_flags = save_exception_flags(env);
207 union {
208 float32 f;
209 uint32_t i;
210 } u;
212 u.i = val;
213 FT0 = float32_to_floatx80(u.f, &env->fp_status);
214 merge_exception_flags(env, old_flags);
217 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
219 uint8_t old_flags = save_exception_flags(env);
220 union {
221 float64 f;
222 uint64_t i;
223 } u;
225 u.i = val;
226 FT0 = float64_to_floatx80(u.f, &env->fp_status);
227 merge_exception_flags(env, old_flags);
230 void helper_fildl_FT0(CPUX86State *env, int32_t val)
232 FT0 = int32_to_floatx80(val, &env->fp_status);
235 void helper_flds_ST0(CPUX86State *env, uint32_t val)
237 uint8_t old_flags = save_exception_flags(env);
238 int new_fpstt;
239 union {
240 float32 f;
241 uint32_t i;
242 } u;
244 new_fpstt = (env->fpstt - 1) & 7;
245 u.i = val;
246 env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
247 env->fpstt = new_fpstt;
248 env->fptags[new_fpstt] = 0; /* validate stack entry */
249 merge_exception_flags(env, old_flags);
252 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
254 uint8_t old_flags = save_exception_flags(env);
255 int new_fpstt;
256 union {
257 float64 f;
258 uint64_t i;
259 } u;
261 new_fpstt = (env->fpstt - 1) & 7;
262 u.i = val;
263 env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
264 env->fpstt = new_fpstt;
265 env->fptags[new_fpstt] = 0; /* validate stack entry */
266 merge_exception_flags(env, old_flags);
269 void helper_fildl_ST0(CPUX86State *env, int32_t val)
271 int new_fpstt;
273 new_fpstt = (env->fpstt - 1) & 7;
274 env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
275 env->fpstt = new_fpstt;
276 env->fptags[new_fpstt] = 0; /* validate stack entry */
279 void helper_fildll_ST0(CPUX86State *env, int64_t val)
281 int new_fpstt;
283 new_fpstt = (env->fpstt - 1) & 7;
284 env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
285 env->fpstt = new_fpstt;
286 env->fptags[new_fpstt] = 0; /* validate stack entry */
289 uint32_t helper_fsts_ST0(CPUX86State *env)
291 uint8_t old_flags = save_exception_flags(env);
292 union {
293 float32 f;
294 uint32_t i;
295 } u;
297 u.f = floatx80_to_float32(ST0, &env->fp_status);
298 merge_exception_flags(env, old_flags);
299 return u.i;
302 uint64_t helper_fstl_ST0(CPUX86State *env)
304 uint8_t old_flags = save_exception_flags(env);
305 union {
306 float64 f;
307 uint64_t i;
308 } u;
310 u.f = floatx80_to_float64(ST0, &env->fp_status);
311 merge_exception_flags(env, old_flags);
312 return u.i;
315 int32_t helper_fist_ST0(CPUX86State *env)
317 uint8_t old_flags = save_exception_flags(env);
318 int32_t val;
320 val = floatx80_to_int32(ST0, &env->fp_status);
321 if (val != (int16_t)val) {
322 set_float_exception_flags(float_flag_invalid, &env->fp_status);
323 val = -32768;
325 merge_exception_flags(env, old_flags);
326 return val;
329 int32_t helper_fistl_ST0(CPUX86State *env)
331 uint8_t old_flags = save_exception_flags(env);
332 int32_t val;
334 val = floatx80_to_int32(ST0, &env->fp_status);
335 if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
336 val = 0x80000000;
338 merge_exception_flags(env, old_flags);
339 return val;
342 int64_t helper_fistll_ST0(CPUX86State *env)
344 uint8_t old_flags = save_exception_flags(env);
345 int64_t val;
347 val = floatx80_to_int64(ST0, &env->fp_status);
348 if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
349 val = 0x8000000000000000ULL;
351 merge_exception_flags(env, old_flags);
352 return val;
355 int32_t helper_fistt_ST0(CPUX86State *env)
357 uint8_t old_flags = save_exception_flags(env);
358 int32_t val;
360 val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
361 if (val != (int16_t)val) {
362 set_float_exception_flags(float_flag_invalid, &env->fp_status);
363 val = -32768;
365 merge_exception_flags(env, old_flags);
366 return val;
369 int32_t helper_fisttl_ST0(CPUX86State *env)
371 uint8_t old_flags = save_exception_flags(env);
372 int32_t val;
374 val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
375 if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
376 val = 0x80000000;
378 merge_exception_flags(env, old_flags);
379 return val;
382 int64_t helper_fisttll_ST0(CPUX86State *env)
384 uint8_t old_flags = save_exception_flags(env);
385 int64_t val;
387 val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
388 if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
389 val = 0x8000000000000000ULL;
391 merge_exception_flags(env, old_flags);
392 return val;
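/*
 * Illustrative sketch, not part of the original file: when a conversion is
 * invalid or out of range, the helpers above store the x87 "integer
 * indefinite" value, i.e. the most negative integer of the destination
 * width (-32768, 0x80000000 or 0x8000000000000000).  A hypothetical helper:
 */
static inline int64_t example_integer_indefinite(int width_bits)
{
    /* 16 -> -32768, 32 -> -2147483648, 64 -> INT64_MIN */
    return width_bits == 64 ? INT64_MIN : -(INT64_C(1) << (width_bits - 1));
}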
395 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
397 int new_fpstt;
399 new_fpstt = (env->fpstt - 1) & 7;
400 env->fpregs[new_fpstt].d = helper_fldt(env, ptr, GETPC());
401 env->fpstt = new_fpstt;
402 env->fptags[new_fpstt] = 0; /* validate stack entry */
405 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
407 helper_fstt(env, ST0, ptr, GETPC());
410 void helper_fpush(CPUX86State *env)
412 fpush(env);
415 void helper_fpop(CPUX86State *env)
417 fpop(env);
420 void helper_fdecstp(CPUX86State *env)
422 env->fpstt = (env->fpstt - 1) & 7;
423 env->fpus &= ~0x4700;
426 void helper_fincstp(CPUX86State *env)
428 env->fpstt = (env->fpstt + 1) & 7;
429 env->fpus &= ~0x4700;
432 /* FPU move */
434 void helper_ffree_STN(CPUX86State *env, int st_index)
436 env->fptags[(env->fpstt + st_index) & 7] = 1;
439 void helper_fmov_ST0_FT0(CPUX86State *env)
441 ST0 = FT0;
444 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
446 FT0 = ST(st_index);
449 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
451 ST0 = ST(st_index);
454 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
456 ST(st_index) = ST0;
459 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
461 floatx80 tmp;
463 tmp = ST(st_index);
464 ST(st_index) = ST0;
465 ST0 = tmp;
468 /* FPU operations */
470 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
472 void helper_fcom_ST0_FT0(CPUX86State *env)
474 uint8_t old_flags = save_exception_flags(env);
475 FloatRelation ret;
477 ret = floatx80_compare(ST0, FT0, &env->fp_status);
478 env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
479 merge_exception_flags(env, old_flags);
482 void helper_fucom_ST0_FT0(CPUX86State *env)
484 uint8_t old_flags = save_exception_flags(env);
485 FloatRelation ret;
487 ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
488 env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
489 merge_exception_flags(env, old_flags);
492 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
494 void helper_fcomi_ST0_FT0(CPUX86State *env)
496 uint8_t old_flags = save_exception_flags(env);
497 int eflags;
498 FloatRelation ret;
500 ret = floatx80_compare(ST0, FT0, &env->fp_status);
501 eflags = cpu_cc_compute_all(env, CC_OP);
502 eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
503 CC_SRC = eflags;
504 merge_exception_flags(env, old_flags);
507 void helper_fucomi_ST0_FT0(CPUX86State *env)
509 uint8_t old_flags = save_exception_flags(env);
510 int eflags;
511 FloatRelation ret;
513 ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
514 eflags = cpu_cc_compute_all(env, CC_OP);
515 eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
516 CC_SRC = eflags;
517 merge_exception_flags(env, old_flags);
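/*
 * Illustrative sketch, not part of the original file: floatx80_compare()
 * returns -1 (less), 0 (equal), 1 (greater) or 2 (unordered), so indexing
 * the tables above with "ret + 1" selects, in that order:
 *   FCOM/FUCOM:   C0=1,   C3=1,   none,   C3=C2=C0=1
 *   FCOMI/FUCOMI: CF=1,   ZF=1,   none,   ZF=PF=CF=1
 * A standalone restatement of the FCOMI mapping (the function name is
 * hypothetical, the table is the same as fcomi_ccval above):
 */
static inline int example_fcomi_eflags(FloatRelation rel)
{
    static const int tbl[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
    return tbl[rel + 1];
}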
520 void helper_fadd_ST0_FT0(CPUX86State *env)
522 uint8_t old_flags = save_exception_flags(env);
523 ST0 = floatx80_add(ST0, FT0, &env->fp_status);
524 merge_exception_flags(env, old_flags);
527 void helper_fmul_ST0_FT0(CPUX86State *env)
529 uint8_t old_flags = save_exception_flags(env);
530 ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
531 merge_exception_flags(env, old_flags);
534 void helper_fsub_ST0_FT0(CPUX86State *env)
536 uint8_t old_flags = save_exception_flags(env);
537 ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
538 merge_exception_flags(env, old_flags);
541 void helper_fsubr_ST0_FT0(CPUX86State *env)
543 uint8_t old_flags = save_exception_flags(env);
544 ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
545 merge_exception_flags(env, old_flags);
548 void helper_fdiv_ST0_FT0(CPUX86State *env)
550 ST0 = helper_fdiv(env, ST0, FT0);
553 void helper_fdivr_ST0_FT0(CPUX86State *env)
555 ST0 = helper_fdiv(env, FT0, ST0);
558 /* fp operations between STN and ST0 */
560 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
562 uint8_t old_flags = save_exception_flags(env);
563 ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
564 merge_exception_flags(env, old_flags);
567 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
569 uint8_t old_flags = save_exception_flags(env);
570 ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
571 merge_exception_flags(env, old_flags);
574 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
576 uint8_t old_flags = save_exception_flags(env);
577 ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
578 merge_exception_flags(env, old_flags);
581 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
583 uint8_t old_flags = save_exception_flags(env);
584 ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
585 merge_exception_flags(env, old_flags);
588 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
590 floatx80 *p;
592 p = &ST(st_index);
593 *p = helper_fdiv(env, *p, ST0);
596 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
598 floatx80 *p;
600 p = &ST(st_index);
601 *p = helper_fdiv(env, ST0, *p);
604 /* misc FPU operations */
605 void helper_fchs_ST0(CPUX86State *env)
607 ST0 = floatx80_chs(ST0);
610 void helper_fabs_ST0(CPUX86State *env)
612 ST0 = floatx80_abs(ST0);
615 void helper_fld1_ST0(CPUX86State *env)
617 ST0 = floatx80_one;
620 void helper_fldl2t_ST0(CPUX86State *env)
622 switch (env->fpuc & FPU_RC_MASK) {
623 case FPU_RC_UP:
624 ST0 = floatx80_l2t_u;
625 break;
626 default:
627 ST0 = floatx80_l2t;
628 break;
632 void helper_fldl2e_ST0(CPUX86State *env)
634 switch (env->fpuc & FPU_RC_MASK) {
635 case FPU_RC_DOWN:
636 case FPU_RC_CHOP:
637 ST0 = floatx80_l2e_d;
638 break;
639 default:
640 ST0 = floatx80_l2e;
641 break;
645 void helper_fldpi_ST0(CPUX86State *env)
647 switch (env->fpuc & FPU_RC_MASK) {
648 case FPU_RC_DOWN:
649 case FPU_RC_CHOP:
650 ST0 = floatx80_pi_d;
651 break;
652 default:
653 ST0 = floatx80_pi;
654 break;
658 void helper_fldlg2_ST0(CPUX86State *env)
660 switch (env->fpuc & FPU_RC_MASK) {
661 case FPU_RC_DOWN:
662 case FPU_RC_CHOP:
663 ST0 = floatx80_lg2_d;
664 break;
665 default:
666 ST0 = floatx80_lg2;
667 break;
671 void helper_fldln2_ST0(CPUX86State *env)
673 switch (env->fpuc & FPU_RC_MASK) {
674 case FPU_RC_DOWN:
675 case FPU_RC_CHOP:
676 ST0 = floatx80_ln2_d;
677 break;
678 default:
679 ST0 = floatx80_ln2;
680 break;
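/*
 * Illustrative sketch, not part of the original file: each constant-load
 * helper above keeps two encodings that differ by one unit in the last
 * place (e.g. floatx80_lg2 vs floatx80_lg2_d) and picks whichever is
 * correctly rounded for the rounding direction currently selected in the
 * control word, so FLDLG2 and friends return a correctly rounded constant
 * in every mode.  The choice for lg2, written out with a hypothetical name:
 */
static inline floatx80 example_pick_lg2(uint16_t fpuc)
{
    switch (fpuc & FPU_RC_MASK) {
    case FPU_RC_DOWN:
    case FPU_RC_CHOP:
        return floatx80_lg2_d;   /* value rounded toward minus infinity */
    default:
        return floatx80_lg2;     /* nearest value (also correct for up) */
    }
}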
684 void helper_fldz_ST0(CPUX86State *env)
686 ST0 = floatx80_zero;
689 void helper_fldz_FT0(CPUX86State *env)
691 FT0 = floatx80_zero;
694 uint32_t helper_fnstsw(CPUX86State *env)
696 return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
699 uint32_t helper_fnstcw(CPUX86State *env)
701 return env->fpuc;
704 void update_fp_status(CPUX86State *env)
706 int rnd_type;
708 /* set rounding mode */
709 switch (env->fpuc & FPU_RC_MASK) {
710 default:
711 case FPU_RC_NEAR:
712 rnd_type = float_round_nearest_even;
713 break;
714 case FPU_RC_DOWN:
715 rnd_type = float_round_down;
716 break;
717 case FPU_RC_UP:
718 rnd_type = float_round_up;
719 break;
720 case FPU_RC_CHOP:
721 rnd_type = float_round_to_zero;
722 break;
724 set_float_rounding_mode(rnd_type, &env->fp_status);
725 switch ((env->fpuc >> 8) & 3) {
726 case 0:
727 rnd_type = 32;
728 break;
729 case 2:
730 rnd_type = 64;
731 break;
732 case 3:
733 default:
734 rnd_type = 80;
735 break;
737 set_floatx80_rounding_precision(rnd_type, &env->fp_status);
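/*
 * Illustrative sketch, not part of the original file: update_fp_status()
 * decodes two control-word fields, RC in bits 11:10 (rounding direction)
 * and PC in bits 9:8 (precision control).  For the reset value 0x037f used
 * by fninit, RC = 0 gives round-to-nearest-even and PC = 3 gives full
 * 80-bit (64-bit significand) precision.  A hypothetical PC decoder:
 */
static inline int example_fpuc_precision_bits(uint16_t fpuc)
{
    switch ((fpuc >> 8) & 3) {
    case 0:
        return 32;   /* single precision */
    case 2:
        return 64;   /* double precision */
    default:
        return 80;   /* extended precision; encoding 1 is reserved */
    }
}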
740 void helper_fldcw(CPUX86State *env, uint32_t val)
742 cpu_set_fpuc(env, val);
745 void helper_fclex(CPUX86State *env)
747 env->fpus &= 0x7f00;
750 void helper_fwait(CPUX86State *env)
752 if (env->fpus & FPUS_SE) {
753 fpu_raise_exception(env, GETPC());
757 void helper_fninit(CPUX86State *env)
759 env->fpus = 0;
760 env->fpstt = 0;
761 cpu_set_fpuc(env, 0x37f);
762 env->fptags[0] = 1;
763 env->fptags[1] = 1;
764 env->fptags[2] = 1;
765 env->fptags[3] = 1;
766 env->fptags[4] = 1;
767 env->fptags[5] = 1;
768 env->fptags[6] = 1;
769 env->fptags[7] = 1;
772 /* BCD ops */
774 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
776 floatx80 tmp;
777 uint64_t val;
778 unsigned int v;
779 int i;
781 val = 0;
782 for (i = 8; i >= 0; i--) {
783 v = cpu_ldub_data_ra(env, ptr + i, GETPC());
784 val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
786 tmp = int64_to_floatx80(val, &env->fp_status);
787 if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
788 tmp = floatx80_chs(tmp);
790 fpush(env);
791 ST0 = tmp;
794 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
796 uint8_t old_flags = save_exception_flags(env);
797 int v;
798 target_ulong mem_ref, mem_end;
799 int64_t val;
800 CPU_LDoubleU temp;
802 temp.d = ST0;
804 val = floatx80_to_int64(ST0, &env->fp_status);
805 mem_ref = ptr;
806 if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
807 set_float_exception_flags(float_flag_invalid, &env->fp_status);
808 while (mem_ref < ptr + 7) {
809 cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
811 cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
812 cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
813 cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
814 merge_exception_flags(env, old_flags);
815 return;
817 mem_end = mem_ref + 9;
818 if (SIGND(temp)) {
819 cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
820 val = -val;
821 } else {
822 cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
824 while (mem_ref < mem_end) {
825 if (val == 0) {
826 break;
828 v = val % 100;
829 val = val / 100;
830 v = ((v / 10) << 4) | (v % 10);
831 cpu_stb_data_ra(env, mem_ref++, v, GETPC());
833 while (mem_ref < mem_end) {
834 cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
836 merge_exception_flags(env, old_flags);
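/*
 * Illustrative sketch, not part of the original file: the ten-byte packed
 * BCD operand used by FBLD/FBST above holds 18 decimal digits, two per
 * byte with the low digit in the low nibble, least significant byte first,
 * and the sign in bit 7 of byte 9.  The decimal value 1234 is therefore
 * stored as bytes 0x34, 0x12, 0x00, ...  A hypothetical one-byte encoder:
 */
static inline uint8_t example_bcd_byte(int two_digits)
{
    /* 34 -> 0x34, matching "((v / 10) << 4) | (v % 10)" in helper_fbst_ST0 */
    return ((two_digits / 10) << 4) | (two_digits % 10);
}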
839 void helper_f2xm1(CPUX86State *env)
841 double val = floatx80_to_double(env, ST0);
843 val = pow(2.0, val) - 1.0;
844 ST0 = double_to_floatx80(env, val);
847 void helper_fyl2x(CPUX86State *env)
849 double fptemp = floatx80_to_double(env, ST0);
851 if (fptemp > 0.0) {
852 fptemp = log(fptemp) / log(2.0); /* log2(ST) */
853 fptemp *= floatx80_to_double(env, ST1);
854 ST1 = double_to_floatx80(env, fptemp);
855 fpop(env);
856 } else {
857 env->fpus &= ~0x4700;
858 env->fpus |= 0x400;
862 void helper_fptan(CPUX86State *env)
864 double fptemp = floatx80_to_double(env, ST0);
866 if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
867 env->fpus |= 0x400;
868 } else {
869 fptemp = tan(fptemp);
870 ST0 = double_to_floatx80(env, fptemp);
871 fpush(env);
872 ST0 = floatx80_one;
873 env->fpus &= ~0x400; /* C2 <-- 0 */
874 /* the above code is for |arg| < 2**52 only */
878 void helper_fpatan(CPUX86State *env)
880 double fptemp, fpsrcop;
882 fpsrcop = floatx80_to_double(env, ST1);
883 fptemp = floatx80_to_double(env, ST0);
884 ST1 = double_to_floatx80(env, atan2(fpsrcop, fptemp));
885 fpop(env);
888 void helper_fxtract(CPUX86State *env)
890 uint8_t old_flags = save_exception_flags(env);
891 CPU_LDoubleU temp;
893 temp.d = ST0;
895 if (floatx80_is_zero(ST0)) {
896         /* Easy way to generate -inf and raise the divide-by-zero exception */
897 ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
898 &env->fp_status);
899 fpush(env);
900 ST0 = temp.d;
901 } else if (floatx80_invalid_encoding(ST0)) {
902 float_raise(float_flag_invalid, &env->fp_status);
903 ST0 = floatx80_default_nan(&env->fp_status);
904 fpush(env);
905 ST0 = ST1;
906 } else if (floatx80_is_any_nan(ST0)) {
907 if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
908 float_raise(float_flag_invalid, &env->fp_status);
909 ST0 = floatx80_silence_nan(ST0, &env->fp_status);
911 fpush(env);
912 ST0 = ST1;
913 } else if (floatx80_is_infinity(ST0)) {
914 fpush(env);
915 ST0 = ST1;
916 ST1 = floatx80_infinity;
917 } else {
918 int expdif;
920 if (EXPD(temp) == 0) {
921 int shift = clz64(temp.l.lower);
922 temp.l.lower <<= shift;
923 expdif = 1 - EXPBIAS - shift;
924 float_raise(float_flag_input_denormal, &env->fp_status);
925 } else {
926 expdif = EXPD(temp) - EXPBIAS;
928 /* DP exponent bias */
929 ST0 = int32_to_floatx80(expdif, &env->fp_status);
930 fpush(env);
931 BIASEXPONENT(temp);
932 ST0 = temp.d;
934 merge_exception_flags(env, old_flags);
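/*
 * Illustrative sketch, not part of the original file: for a finite,
 * non-zero operand FXTRACT leaves the unbiased exponent in ST1 and a
 * significand in [1.0, 2.0) with the original sign in ST0; e.g.
 * 6.0 = 1.5 * 2^2 yields ST1 = 2.0 and ST0 = 1.5.  The BIASEXPONENT()
 * step above simply rewrites the exponent field to EXPBIAS so the stored
 * significand reads as that [1.0, 2.0) value.  A hypothetical check of the
 * exponent using the C library (valid for finite non-zero doubles only):
 */
static inline int example_fxtract_exponent(double x)
{
    int e;

    frexp(x, &e);     /* frexp() normalises to [0.5, 1.0), so ...       */
    return e - 1;     /* ... the FXTRACT exponent is one less: 6.0 -> 2 */
}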
937 void helper_fprem1(CPUX86State *env)
939 double st0, st1, dblq, fpsrcop, fptemp;
940 CPU_LDoubleU fpsrcop1, fptemp1;
941 int expdif;
942 signed long long int q;
944 st0 = floatx80_to_double(env, ST0);
945 st1 = floatx80_to_double(env, ST1);
947 if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
948 ST0 = double_to_floatx80(env, 0.0 / 0.0); /* NaN */
949 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
950 return;
953 fpsrcop = st0;
954 fptemp = st1;
955 fpsrcop1.d = ST0;
956 fptemp1.d = ST1;
957 expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
959 if (expdif < 0) {
960 /* optimisation? taken from the AMD docs */
961 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
962 /* ST0 is unchanged */
963 return;
966 if (expdif < 53) {
967 dblq = fpsrcop / fptemp;
968 /* round dblq towards nearest integer */
969 dblq = rint(dblq);
970 st0 = fpsrcop - fptemp * dblq;
972 /* convert dblq to q by truncating towards zero */
973 if (dblq < 0.0) {
974 q = (signed long long int)(-dblq);
975 } else {
976 q = (signed long long int)dblq;
979 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
980 /* (C0,C3,C1) <-- (q2,q1,q0) */
981 env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */
982 env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
983 env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */
984 } else {
985 env->fpus |= 0x400; /* C2 <-- 1 */
986 fptemp = pow(2.0, expdif - 50);
987 fpsrcop = (st0 / st1) / fptemp;
988 /* fpsrcop = integer obtained by chopping */
989 fpsrcop = (fpsrcop < 0.0) ?
990 -(floor(fabs(fpsrcop))) : floor(fpsrcop);
991 st0 -= (st1 * fpsrcop * fptemp);
993 ST0 = double_to_floatx80(env, st0);
996 void helper_fprem(CPUX86State *env)
998 double st0, st1, dblq, fpsrcop, fptemp;
999 CPU_LDoubleU fpsrcop1, fptemp1;
1000 int expdif;
1001 signed long long int q;
1003 st0 = floatx80_to_double(env, ST0);
1004 st1 = floatx80_to_double(env, ST1);
1006 if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
1007 ST0 = double_to_floatx80(env, 0.0 / 0.0); /* NaN */
1008 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1009 return;
1012 fpsrcop = st0;
1013 fptemp = st1;
1014 fpsrcop1.d = ST0;
1015 fptemp1.d = ST1;
1016 expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
1018 if (expdif < 0) {
1019 /* optimisation? taken from the AMD docs */
1020 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1021 /* ST0 is unchanged */
1022 return;
1025 if (expdif < 53) {
1026 dblq = fpsrcop / fptemp; /* ST0 / ST1 */
1027 /* round dblq towards zero */
1028 dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
1029 st0 = fpsrcop - fptemp * dblq; /* fpsrcop is ST0 */
1031 /* convert dblq to q by truncating towards zero */
1032 if (dblq < 0.0) {
1033 q = (signed long long int)(-dblq);
1034 } else {
1035 q = (signed long long int)dblq;
1038 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1039 /* (C0,C3,C1) <-- (q2,q1,q0) */
1040 env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */
1041 env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
1042 env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */
1043 } else {
1044 int N = 32 + (expdif % 32); /* as per AMD docs */
1046 env->fpus |= 0x400; /* C2 <-- 1 */
1047 fptemp = pow(2.0, (double)(expdif - N));
1048 fpsrcop = (st0 / st1) / fptemp;
1049 /* fpsrcop = integer obtained by chopping */
1050 fpsrcop = (fpsrcop < 0.0) ?
1051 -(floor(fabs(fpsrcop))) : floor(fpsrcop);
1052 st0 -= (st1 * fpsrcop * fptemp);
1054 ST0 = double_to_floatx80(env, st0);
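/*
 * Illustrative sketch, not part of the original file: when the exponent
 * difference is small enough the helpers above finish in one step, clear
 * C2 and expose the low three quotient bits in C0/C3/C1.  E.g. FPREM with
 * ST0 = 17.0, ST1 = 5.0 truncates 17/5 to q = 3, stores remainder 2.0 and
 * sets (C0, C3, C1) = (q bit 2, bit 1, bit 0) = (0, 1, 1).  The bit
 * placement, restated with a hypothetical name:
 */
static inline int example_fprem_quotient_cc(long long q)
{
    return ((q & 0x4) << (8 - 2))       /* q bit 2 -> C0 (FSW bit 8)  */
         | ((q & 0x2) << (14 - 1))      /* q bit 1 -> C3 (FSW bit 14) */
         | ((q & 0x1) << (9 - 0));      /* q bit 0 -> C1 (FSW bit 9)  */
}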
1057 void helper_fyl2xp1(CPUX86State *env)
1059 double fptemp = floatx80_to_double(env, ST0);
1061 if ((fptemp + 1.0) > 0.0) {
1062 fptemp = log(fptemp + 1.0) / log(2.0); /* log2(ST + 1.0) */
1063 fptemp *= floatx80_to_double(env, ST1);
1064 ST1 = double_to_floatx80(env, fptemp);
1065 fpop(env);
1066 } else {
1067 env->fpus &= ~0x4700;
1068 env->fpus |= 0x400;
1072 void helper_fsqrt(CPUX86State *env)
1074 uint8_t old_flags = save_exception_flags(env);
1075 if (floatx80_is_neg(ST0)) {
1076 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1077 env->fpus |= 0x400;
1079 ST0 = floatx80_sqrt(ST0, &env->fp_status);
1080 merge_exception_flags(env, old_flags);
1083 void helper_fsincos(CPUX86State *env)
1085 double fptemp = floatx80_to_double(env, ST0);
1087 if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1088 env->fpus |= 0x400;
1089 } else {
1090 ST0 = double_to_floatx80(env, sin(fptemp));
1091 fpush(env);
1092 ST0 = double_to_floatx80(env, cos(fptemp));
1093 env->fpus &= ~0x400; /* C2 <-- 0 */
1094 /* the above code is for |arg| < 2**63 only */
1098 void helper_frndint(CPUX86State *env)
1100 uint8_t old_flags = save_exception_flags(env);
1101 ST0 = floatx80_round_to_int(ST0, &env->fp_status);
1102 merge_exception_flags(env, old_flags);
1105 void helper_fscale(CPUX86State *env)
1107 uint8_t old_flags = save_exception_flags(env);
1108 if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
1109 float_raise(float_flag_invalid, &env->fp_status);
1110 ST0 = floatx80_default_nan(&env->fp_status);
1111 } else if (floatx80_is_any_nan(ST1)) {
1112 if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1113 float_raise(float_flag_invalid, &env->fp_status);
1115 ST0 = ST1;
1116 if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1117 float_raise(float_flag_invalid, &env->fp_status);
1118 ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1120 } else if (floatx80_is_infinity(ST1) &&
1121 !floatx80_invalid_encoding(ST0) &&
1122 !floatx80_is_any_nan(ST0)) {
1123 if (floatx80_is_neg(ST1)) {
1124 if (floatx80_is_infinity(ST0)) {
1125 float_raise(float_flag_invalid, &env->fp_status);
1126 ST0 = floatx80_default_nan(&env->fp_status);
1127 } else {
1128 ST0 = (floatx80_is_neg(ST0) ?
1129 floatx80_chs(floatx80_zero) :
1130 floatx80_zero);
1132 } else {
1133 if (floatx80_is_zero(ST0)) {
1134 float_raise(float_flag_invalid, &env->fp_status);
1135 ST0 = floatx80_default_nan(&env->fp_status);
1136 } else {
1137 ST0 = (floatx80_is_neg(ST0) ?
1138 floatx80_chs(floatx80_infinity) :
1139 floatx80_infinity);
1142 } else {
1143 int n;
1144 signed char save = env->fp_status.floatx80_rounding_precision;
1145 uint8_t save_flags = get_float_exception_flags(&env->fp_status);
1146 set_float_exception_flags(0, &env->fp_status);
1147 n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
1148 set_float_exception_flags(save_flags, &env->fp_status);
1149 env->fp_status.floatx80_rounding_precision = 80;
1150 ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
1151 env->fp_status.floatx80_rounding_precision = save;
1153 merge_exception_flags(env, old_flags);
1156 void helper_fsin(CPUX86State *env)
1158 double fptemp = floatx80_to_double(env, ST0);
1160 if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1161 env->fpus |= 0x400;
1162 } else {
1163 ST0 = double_to_floatx80(env, sin(fptemp));
1164 env->fpus &= ~0x400; /* C2 <-- 0 */
1165 /* the above code is for |arg| < 2**53 only */
1169 void helper_fcos(CPUX86State *env)
1171 double fptemp = floatx80_to_double(env, ST0);
1173 if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1174 env->fpus |= 0x400;
1175 } else {
1176 ST0 = double_to_floatx80(env, cos(fptemp));
1177 env->fpus &= ~0x400; /* C2 <-- 0 */
1178 /* the above code is for |arg| < 2**63 only */
1182 void helper_fxam_ST0(CPUX86State *env)
1184 CPU_LDoubleU temp;
1185 int expdif;
1187 temp.d = ST0;
1189 env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1190 if (SIGND(temp)) {
1191 env->fpus |= 0x200; /* C1 <-- 1 */
1194 if (env->fptags[env->fpstt]) {
1195 env->fpus |= 0x4100; /* Empty */
1196 return;
1199 expdif = EXPD(temp);
1200 if (expdif == MAXEXPD) {
1201 if (MANTD(temp) == 0x8000000000000000ULL) {
1202 env->fpus |= 0x500; /* Infinity */
1203 } else if (MANTD(temp) & 0x8000000000000000ULL) {
1204 env->fpus |= 0x100; /* NaN */
1206 } else if (expdif == 0) {
1207 if (MANTD(temp) == 0) {
1208 env->fpus |= 0x4000; /* Zero */
1209 } else {
1210 env->fpus |= 0x4400; /* Denormal */
1212 } else if (MANTD(temp) & 0x8000000000000000ULL) {
1213 env->fpus |= 0x400;
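/*
 * Illustrative sketch, not part of the original file: FXAM encodes the
 * operand class in C3/C2/C0 (C1 carries the sign):
 *   0x0100 (C0)      NaN           0x0400 (C2)      normal finite
 *   0x0500 (C2|C0)   infinity      0x4000 (C3)      zero
 *   0x4100 (C3|C0)   empty slot    0x4400 (C3|C2)   denormal
 * A hypothetical classifier over the raw exponent/significand fields,
 * mirroring the tests above (0 stands for "unsupported" encodings):
 */
static inline int example_fxam_class(int exp, uint64_t mant)
{
    if (exp == MAXEXPD) {
        if (mant == (1ULL << 63)) {
            return 0x0500;                      /* infinity */
        }
        return (mant >> 63) ? 0x0100 : 0;       /* NaN */
    }
    if (exp == 0) {
        return mant == 0 ? 0x4000 : 0x4400;     /* zero or denormal */
    }
    return (mant >> 63) ? 0x0400 : 0;           /* normal */
}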
1217 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
1218 uintptr_t retaddr)
1220 int fpus, fptag, exp, i;
1221 uint64_t mant;
1222 CPU_LDoubleU tmp;
1224 fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
1225 fptag = 0;
1226 for (i = 7; i >= 0; i--) {
1227 fptag <<= 2;
1228 if (env->fptags[i]) {
1229 fptag |= 3;
1230 } else {
1231 tmp.d = env->fpregs[i].d;
1232 exp = EXPD(tmp);
1233 mant = MANTD(tmp);
1234 if (exp == 0 && mant == 0) {
1235 /* zero */
1236 fptag |= 1;
1237 } else if (exp == 0 || exp == MAXEXPD
1238 || (mant & (1LL << 63)) == 0) {
1239 /* NaNs, infinity, denormal */
1240 fptag |= 2;
1244 if (data32) {
1245 /* 32 bit */
1246 cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
1247 cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
1248 cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
1249 cpu_stl_data_ra(env, ptr + 12, 0, retaddr); /* fpip */
1250 cpu_stl_data_ra(env, ptr + 16, 0, retaddr); /* fpcs */
1251 cpu_stl_data_ra(env, ptr + 20, 0, retaddr); /* fpoo */
1252 cpu_stl_data_ra(env, ptr + 24, 0, retaddr); /* fpos */
1253 } else {
1254 /* 16 bit */
1255 cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
1256 cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
1257 cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
1258 cpu_stw_data_ra(env, ptr + 6, 0, retaddr);
1259 cpu_stw_data_ra(env, ptr + 8, 0, retaddr);
1260 cpu_stw_data_ra(env, ptr + 10, 0, retaddr);
1261 cpu_stw_data_ra(env, ptr + 12, 0, retaddr);
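/*
 * Illustrative sketch, not part of the original file: the environment
 * image written above is seven fields (FCW, FSW, FTW and the four
 * instruction/operand pointer slots, which this emulation stores as zero),
 * i.e. 28 bytes with 32-bit operands or 14 bytes with 16-bit operands.
 * That is why helper_fsave() below skips "14 << data32" bytes before the
 * register data.  A hypothetical size helper:
 */
static inline int example_fenv_size(int data32)
{
    /* data32 is 0 or 1: 14-byte 16-bit image or 28-byte 32-bit image */
    return 14 << data32;
}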
1265 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
1267 do_fstenv(env, ptr, data32, GETPC());
1270 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
1272 env->fpstt = (fpus >> 11) & 7;
1273 env->fpus = fpus & ~0x3800 & ~FPUS_B;
1274 env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
1275 #if !defined(CONFIG_USER_ONLY)
1276 if (!(env->fpus & FPUS_SE)) {
1278 * Here the processor deasserts FERR#; in response, the chipset deasserts
1279 * IGNNE#.
1281 cpu_clear_ignne();
1283 #endif
1286 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
1287 uintptr_t retaddr)
1289 int i, fpus, fptag;
1291 if (data32) {
1292 cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
1293 fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
1294 fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
1295 } else {
1296 cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
1297 fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
1298 fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
1300 cpu_set_fpus(env, fpus);
1301 for (i = 0; i < 8; i++) {
1302 env->fptags[i] = ((fptag & 3) == 3);
1303 fptag >>= 2;
1307 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
1309 do_fldenv(env, ptr, data32, GETPC());
1312 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
1314 floatx80 tmp;
1315 int i;
1317 do_fstenv(env, ptr, data32, GETPC());
1319 ptr += (14 << data32);
1320 for (i = 0; i < 8; i++) {
1321 tmp = ST(i);
1322 helper_fstt(env, tmp, ptr, GETPC());
1323 ptr += 10;
1326 /* fninit */
1327 env->fpus = 0;
1328 env->fpstt = 0;
1329 cpu_set_fpuc(env, 0x37f);
1330 env->fptags[0] = 1;
1331 env->fptags[1] = 1;
1332 env->fptags[2] = 1;
1333 env->fptags[3] = 1;
1334 env->fptags[4] = 1;
1335 env->fptags[5] = 1;
1336 env->fptags[6] = 1;
1337 env->fptags[7] = 1;
1340 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
1342 floatx80 tmp;
1343 int i;
1345 do_fldenv(env, ptr, data32, GETPC());
1346 ptr += (14 << data32);
1348 for (i = 0; i < 8; i++) {
1349 tmp = helper_fldt(env, ptr, GETPC());
1350 ST(i) = tmp;
1351 ptr += 10;
1355 #if defined(CONFIG_USER_ONLY)
1356 void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
1358 helper_fsave(env, ptr, data32);
1361 void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
1363 helper_frstor(env, ptr, data32);
1365 #endif
1367 #define XO(X) offsetof(X86XSaveArea, X)
1369 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1371 int fpus, fptag, i;
1372 target_ulong addr;
1374 fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
1375 fptag = 0;
1376 for (i = 0; i < 8; i++) {
1377 fptag |= (env->fptags[i] << i);
1380 cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
1381 cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
1382 cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
1384 /* In 32-bit mode this is eip, sel, dp, sel.
1385 In 64-bit mode this is rip, rdp.
1386 But in either case we don't write actual data, just zeros. */
1387 cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
1388 cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
1390 addr = ptr + XO(legacy.fpregs);
1391 for (i = 0; i < 8; i++) {
1392 floatx80 tmp = ST(i);
1393 helper_fstt(env, tmp, addr, ra);
1394 addr += 16;
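/*
 * Illustrative sketch, not part of the original file: FXSAVE/XSAVE store an
 * "abridged" tag word with one bit per register where 1 means valid, while
 * env->fptags[] uses 1 to mean empty; hence the "fptag ^ 0xff" above.  Each
 * 80-bit register also occupies a full 16-byte slot in the legacy area.
 * A hypothetical round trip to the abridged form:
 */
static inline uint8_t example_abridged_ftw(const uint8_t *fptags)
{
    uint8_t ftw = 0;
    int i;

    for (i = 0; i < 8; i++) {
        ftw |= fptags[i] << i;      /* gather the per-register "empty" bits */
    }
    return ftw ^ 0xff;              /* flip so that 1 now means "valid" */
}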
1398 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1400 cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
1401 cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
1404 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1406 int i, nb_xmm_regs;
1407 target_ulong addr;
1409 if (env->hflags & HF_CS64_MASK) {
1410 nb_xmm_regs = 16;
1411 } else {
1412 nb_xmm_regs = 8;
1415 addr = ptr + XO(legacy.xmm_regs);
1416 for (i = 0; i < nb_xmm_regs; i++) {
1417 cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
1418 cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
1419 addr += 16;
1423 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1425 target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
1426 int i;
1428 for (i = 0; i < 4; i++, addr += 16) {
1429 cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
1430 cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
1434 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1436 cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
1437 env->bndcs_regs.cfgu, ra);
1438 cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
1439 env->bndcs_regs.sts, ra);
1442 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1444 cpu_stq_data_ra(env, ptr, env->pkru, ra);
1447 void helper_fxsave(CPUX86State *env, target_ulong ptr)
1449 uintptr_t ra = GETPC();
1451 /* The operand must be 16 byte aligned */
1452 if (ptr & 0xf) {
1453 raise_exception_ra(env, EXCP0D_GPF, ra);
1456 do_xsave_fpu(env, ptr, ra);
1458 if (env->cr[4] & CR4_OSFXSR_MASK) {
1459 do_xsave_mxcsr(env, ptr, ra);
1460 /* Fast FXSAVE leaves out the XMM registers */
1461 if (!(env->efer & MSR_EFER_FFXSR)
1462 || (env->hflags & HF_CPL_MASK)
1463 || !(env->hflags & HF_LMA_MASK)) {
1464 do_xsave_sse(env, ptr, ra);
1469 static uint64_t get_xinuse(CPUX86State *env)
1471 uint64_t inuse = -1;
1473 /* For the most part, we don't track XINUSE. We could calculate it
1474 here for all components, but it's probably less work to simply
1475 indicate in use. That said, the state of BNDREGS is important
1476 enough to track in HFLAGS, so we might as well use that here. */
1477 if ((env->hflags & HF_MPX_IU_MASK) == 0) {
1478 inuse &= ~XSTATE_BNDREGS_MASK;
1480 return inuse;
1483 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
1484 uint64_t inuse, uint64_t opt, uintptr_t ra)
1486 uint64_t old_bv, new_bv;
1488 /* The OS must have enabled XSAVE. */
1489 if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
1490 raise_exception_ra(env, EXCP06_ILLOP, ra);
1493 /* The operand must be 64 byte aligned. */
1494 if (ptr & 63) {
1495 raise_exception_ra(env, EXCP0D_GPF, ra);
1498 /* Never save anything not enabled by XCR0. */
1499 rfbm &= env->xcr0;
1500 opt &= rfbm;
1502 if (opt & XSTATE_FP_MASK) {
1503 do_xsave_fpu(env, ptr, ra);
1505 if (rfbm & XSTATE_SSE_MASK) {
1506 /* Note that saving MXCSR is not suppressed by XSAVEOPT. */
1507 do_xsave_mxcsr(env, ptr, ra);
1509 if (opt & XSTATE_SSE_MASK) {
1510 do_xsave_sse(env, ptr, ra);
1512 if (opt & XSTATE_BNDREGS_MASK) {
1513 do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
1515 if (opt & XSTATE_BNDCSR_MASK) {
1516 do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
1518 if (opt & XSTATE_PKRU_MASK) {
1519 do_xsave_pkru(env, ptr + XO(pkru_state), ra);
1522 /* Update the XSTATE_BV field. */
1523 old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
1524 new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
1525 cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
1528 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
1530 do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
1533 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
1535 uint64_t inuse = get_xinuse(env);
1536 do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
1539 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1541 int i, fpuc, fpus, fptag;
1542 target_ulong addr;
1544 fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
1545 fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
1546 fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
1547 cpu_set_fpuc(env, fpuc);
1548 cpu_set_fpus(env, fpus);
1549 fptag ^= 0xff;
1550 for (i = 0; i < 8; i++) {
1551 env->fptags[i] = ((fptag >> i) & 1);
1554 addr = ptr + XO(legacy.fpregs);
1555 for (i = 0; i < 8; i++) {
1556 floatx80 tmp = helper_fldt(env, addr, ra);
1557 ST(i) = tmp;
1558 addr += 16;
1562 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1564 cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
1567 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1569 int i, nb_xmm_regs;
1570 target_ulong addr;
1572 if (env->hflags & HF_CS64_MASK) {
1573 nb_xmm_regs = 16;
1574 } else {
1575 nb_xmm_regs = 8;
1578 addr = ptr + XO(legacy.xmm_regs);
1579 for (i = 0; i < nb_xmm_regs; i++) {
1580 env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
1581 env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
1582 addr += 16;
1586 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1588 target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
1589 int i;
1591 for (i = 0; i < 4; i++, addr += 16) {
1592 env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
1593 env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
1597 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1599 /* FIXME: Extend highest implemented bit of linear address. */
1600 env->bndcs_regs.cfgu
1601 = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
1602 env->bndcs_regs.sts
1603 = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
1606 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
1608 env->pkru = cpu_ldq_data_ra(env, ptr, ra);
1611 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
1613 uintptr_t ra = GETPC();
1615 /* The operand must be 16 byte aligned */
1616 if (ptr & 0xf) {
1617 raise_exception_ra(env, EXCP0D_GPF, ra);
1620 do_xrstor_fpu(env, ptr, ra);
1622 if (env->cr[4] & CR4_OSFXSR_MASK) {
1623 do_xrstor_mxcsr(env, ptr, ra);
1624 /* Fast FXRSTOR leaves out the XMM registers */
1625 if (!(env->efer & MSR_EFER_FFXSR)
1626 || (env->hflags & HF_CPL_MASK)
1627 || !(env->hflags & HF_LMA_MASK)) {
1628 do_xrstor_sse(env, ptr, ra);
1633 #if defined(CONFIG_USER_ONLY)
1634 void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
1636 helper_fxsave(env, ptr);
1639 void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
1641 helper_fxrstor(env, ptr);
1643 #endif
1645 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
1647 uintptr_t ra = GETPC();
1648 uint64_t xstate_bv, xcomp_bv, reserve0;
1650 rfbm &= env->xcr0;
1652 /* The OS must have enabled XSAVE. */
1653 if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
1654 raise_exception_ra(env, EXCP06_ILLOP, ra);
1657 /* The operand must be 64 byte aligned. */
1658 if (ptr & 63) {
1659 raise_exception_ra(env, EXCP0D_GPF, ra);
1662 xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
1664 if ((int64_t)xstate_bv < 0) {
1665 /* FIXME: Compact form. */
1666 raise_exception_ra(env, EXCP0D_GPF, ra);
1669 /* Standard form. */
1671 /* The XSTATE_BV field must not set bits not present in XCR0. */
1672 if (xstate_bv & ~env->xcr0) {
1673 raise_exception_ra(env, EXCP0D_GPF, ra);
1676 /* The XCOMP_BV field must be zero. Note that, as of the April 2016
1677 revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
1678 describes only XCOMP_BV, but the description of the standard form
1679 of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
1680 includes the next 64-bit field. */
1681 xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
1682 reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
1683 if (xcomp_bv || reserve0) {
1684 raise_exception_ra(env, EXCP0D_GPF, ra);
1687 if (rfbm & XSTATE_FP_MASK) {
1688 if (xstate_bv & XSTATE_FP_MASK) {
1689 do_xrstor_fpu(env, ptr, ra);
1690 } else {
1691 helper_fninit(env);
1692 memset(env->fpregs, 0, sizeof(env->fpregs));
1695 if (rfbm & XSTATE_SSE_MASK) {
1696 /* Note that the standard form of XRSTOR loads MXCSR from memory
1697 whether or not the XSTATE_BV bit is set. */
1698 do_xrstor_mxcsr(env, ptr, ra);
1699 if (xstate_bv & XSTATE_SSE_MASK) {
1700 do_xrstor_sse(env, ptr, ra);
1701 } else {
1702 /* ??? When AVX is implemented, we may have to be more
1703 selective in the clearing. */
1704 memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
1707 if (rfbm & XSTATE_BNDREGS_MASK) {
1708 if (xstate_bv & XSTATE_BNDREGS_MASK) {
1709 do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
1710 env->hflags |= HF_MPX_IU_MASK;
1711 } else {
1712 memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
1713 env->hflags &= ~HF_MPX_IU_MASK;
1716 if (rfbm & XSTATE_BNDCSR_MASK) {
1717 if (xstate_bv & XSTATE_BNDCSR_MASK) {
1718 do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
1719 } else {
1720 memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
1722 cpu_sync_bndcs_hflags(env);
1724 if (rfbm & XSTATE_PKRU_MASK) {
1725 uint64_t old_pkru = env->pkru;
1726 if (xstate_bv & XSTATE_PKRU_MASK) {
1727 do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
1728 } else {
1729 env->pkru = 0;
1731 if (env->pkru != old_pkru) {
1732 CPUState *cs = env_cpu(env);
1733 tlb_flush(cs);
1738 #undef XO
1740 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
1742 /* The OS must have enabled XSAVE. */
1743 if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
1744 raise_exception_ra(env, EXCP06_ILLOP, GETPC());
1747 switch (ecx) {
1748 case 0:
1749 return env->xcr0;
1750 case 1:
1751 if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
1752 return env->xcr0 & get_xinuse(env);
1754 break;
1756 raise_exception_ra(env, EXCP0D_GPF, GETPC());
1759 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
1761 uint32_t dummy, ena_lo, ena_hi;
1762 uint64_t ena;
1764 /* The OS must have enabled XSAVE. */
1765 if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
1766 raise_exception_ra(env, EXCP06_ILLOP, GETPC());
1769 /* Only XCR0 is defined at present; the FPU may not be disabled. */
1770 if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
1771 goto do_gpf;
1774 /* Disallow enabling unimplemented features. */
1775 cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
1776 ena = ((uint64_t)ena_hi << 32) | ena_lo;
1777 if (mask & ~ena) {
1778 goto do_gpf;
1781 /* Disallow enabling only half of MPX. */
1782 if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
1783 & XSTATE_BNDCSR_MASK) {
1784 goto do_gpf;
1787 env->xcr0 = mask;
1788 cpu_sync_bndcs_hflags(env);
1789 return;
1791 do_gpf:
1792 raise_exception_ra(env, EXCP0D_GPF, GETPC());
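/*
 * Illustrative sketch, not part of the original file: the MPX consistency
 * check above exploits XSTATE_BNDCSR_MASK being exactly twice
 * XSTATE_BNDREGS_MASK (bits 4 and 3), so "mask * (BNDCSR / BNDREGS)"
 * shifts the BNDREGS bit onto the BNDCSR position and the XOR is non-zero
 * there only when one of the two features is enabled without the other.
 * The same condition written out plainly:
 */
static inline bool example_mpx_half_enabled(uint64_t mask)
{
    return !(mask & XSTATE_BNDREGS_MASK) != !(mask & XSTATE_BNDCSR_MASK);
}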
1795 /* MMX/SSE */
1796 /* XXX: optimize by storing fptt and fptags in the static cpu state */
1798 #define SSE_DAZ 0x0040
1799 #define SSE_RC_MASK 0x6000
1800 #define SSE_RC_NEAR 0x0000
1801 #define SSE_RC_DOWN 0x2000
1802 #define SSE_RC_UP 0x4000
1803 #define SSE_RC_CHOP 0x6000
1804 #define SSE_FZ 0x8000
1806 void update_mxcsr_status(CPUX86State *env)
1808 uint32_t mxcsr = env->mxcsr;
1809 int rnd_type;
1811 /* set rounding mode */
1812 switch (mxcsr & SSE_RC_MASK) {
1813 default:
1814 case SSE_RC_NEAR:
1815 rnd_type = float_round_nearest_even;
1816 break;
1817 case SSE_RC_DOWN:
1818 rnd_type = float_round_down;
1819 break;
1820 case SSE_RC_UP:
1821 rnd_type = float_round_up;
1822 break;
1823 case SSE_RC_CHOP:
1824 rnd_type = float_round_to_zero;
1825 break;
1827 set_float_rounding_mode(rnd_type, &env->sse_status);
1829 /* set denormals are zero */
1830 set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
1832 /* set flush to zero */
1833     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
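/*
 * Illustrative note, not part of the original file: DAZ and FTZ are two
 * distinct non-IEEE shortcuts, and both belong to the SSE unit
 * (env->sse_status): DAZ (MXCSR bit 6) treats denormal inputs as zero
 * before an operation, while FTZ (MXCSR bit 15) flushes a tiny result to
 * zero instead of producing a denormal.  Hypothetical accessors:
 */
static inline bool example_mxcsr_daz(uint32_t mxcsr)
{
    return (mxcsr & SSE_DAZ) != 0;   /* denormals-are-zero, bit 6 */
}

static inline bool example_mxcsr_ftz(uint32_t mxcsr)
{
    return (mxcsr & SSE_FZ) != 0;    /* flush-to-zero, bit 15 */
}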
1836 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
1838 cpu_set_mxcsr(env, val);
1841 void helper_enter_mmx(CPUX86State *env)
1843 env->fpstt = 0;
1844 *(uint32_t *)(env->fptags) = 0;
1845 *(uint32_t *)(env->fptags + 4) = 0;
1848 void helper_emms(CPUX86State *env)
1850 /* set to empty state */
1851 *(uint32_t *)(env->fptags) = 0x01010101;
1852 *(uint32_t *)(env->fptags + 4) = 0x01010101;
1855 /* XXX: suppress */
1856 void helper_movq(CPUX86State *env, void *d, void *s)
1858 *(uint64_t *)d = *(uint64_t *)s;
1861 #define SHIFT 0
1862 #include "ops_sse.h"
1864 #define SHIFT 1
1865 #include "ops_sse.h"