/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_defs.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex.h"
#include "libvex_trc_values.h"

#include "main_util.h"
#include "host_generic_regs.h"
#include "host_amd64_defs.h"

/* --------- Registers. --------- */

const RRegUniverse* getRRegUniverse_AMD64 ( void )
{
   /* The real-register universe is a big constant, so we just want to
      initialise it once. */
   static RRegUniverse rRegUniverse_AMD64;
   static Bool         rRegUniverse_AMD64_initted = False;

   /* Handy shorthand, nothing more */
   RRegUniverse* ru = &rRegUniverse_AMD64;

   /* This isn't thread-safe.  Sigh. */
   if (LIKELY(rRegUniverse_AMD64_initted))
      return ru;

   RRegUniverse__init(ru);

   /* Add the registers.  The initial segment of this array must be
      those available for allocation by reg-alloc, and those that
      follow are not available for allocation. */
   ru->allocable_start[HRcInt64] = ru->size;
   ru->regs[ru->size++] = hregAMD64_R12();
   ru->regs[ru->size++] = hregAMD64_R13();
   ru->regs[ru->size++] = hregAMD64_R14();
   ru->regs[ru->size++] = hregAMD64_R15();
   ru->regs[ru->size++] = hregAMD64_RBX();
   ru->regs[ru->size++] = hregAMD64_RSI();
   ru->regs[ru->size++] = hregAMD64_RDI();
   ru->regs[ru->size++] = hregAMD64_R8();
   ru->regs[ru->size++] = hregAMD64_R9();
   ru->regs[ru->size++] = hregAMD64_R10();
   ru->allocable_end[HRcInt64] = ru->size - 1;

   ru->allocable_start[HRcVec128] = ru->size;
   ru->regs[ru->size++] = hregAMD64_XMM3();
   ru->regs[ru->size++] = hregAMD64_XMM4();
   ru->regs[ru->size++] = hregAMD64_XMM5();
   ru->regs[ru->size++] = hregAMD64_XMM6();
   ru->regs[ru->size++] = hregAMD64_XMM7();
   ru->regs[ru->size++] = hregAMD64_XMM8();
   ru->regs[ru->size++] = hregAMD64_XMM9();
   ru->regs[ru->size++] = hregAMD64_XMM10();
   ru->regs[ru->size++] = hregAMD64_XMM11();
   ru->regs[ru->size++] = hregAMD64_XMM12();
   ru->allocable_end[HRcVec128] = ru->size - 1;
   ru->allocable = ru->size;

   /* And other regs, not available to the allocator. */
   ru->regs[ru->size++] = hregAMD64_RAX();
   ru->regs[ru->size++] = hregAMD64_RCX();
   ru->regs[ru->size++] = hregAMD64_RDX();
   ru->regs[ru->size++] = hregAMD64_RSP();
   ru->regs[ru->size++] = hregAMD64_RBP();
   ru->regs[ru->size++] = hregAMD64_R11();
   ru->regs[ru->size++] = hregAMD64_XMM0();
   ru->regs[ru->size++] = hregAMD64_XMM1();

   rRegUniverse_AMD64_initted = True;

   RRegUniverse__check_is_sane(ru);
   return ru;
}
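
/* Illustrative note (not part of the original file): a caller that wants to
   walk just the allocatable 64-bit integer registers could do

      const RRegUniverse* univ = getRRegUniverse_AMD64();
      for (UInt j = univ->allocable_start[HRcInt64];
           j <= univ->allocable_end[HRcInt64]; j++) {
         // univ->regs[j] is one of %r12..%r15, %rbx, %rsi, %rdi, %r8..%r10
      }

   Registers added after 'ru->allocable' is fixed (%rax, %rcx, %rdx, %rsp,
   %rbp, %r11, %xmm0, %xmm1) are never handed out by the register
   allocator. */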

UInt ppHRegAMD64 ( HReg reg )
{
   Int r;
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx",  "%rbx",  "%rsp",  "%rbp",  "%rsi",  "%rdi",
         "%r8",  "%r9",  "%r10",  "%r11",  "%r12",  "%r13",  "%r14",  "%r15" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      return ppHReg(reg);
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         return vex_printf("%s", ireg64_names[r]);
      case HRcVec128:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         return vex_printf("%%xmm%d", r);
      default:
         vpanic("ppHRegAMD64");
   }
}

static UInt ppHRegAMD64_lo32 ( HReg reg )
{
   Int r;
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
         "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      UInt written = ppHReg(reg);
      written += vex_printf("d");
      return written;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         return vex_printf("%s", ireg32_names[r]);
      default:
         vpanic("ppHRegAMD64_lo32: invalid regclass");
   }
}

/* --------- Condition codes, Intel encoding. --------- */

const HChar* showAMD64CondCode ( AMD64CondCode cond )
{
   switch (cond) {
      case Acc_O:      return "o";
      case Acc_NO:     return "no";
      case Acc_B:      return "b";
      case Acc_NB:     return "nb";
      case Acc_Z:      return "z";
      case Acc_NZ:     return "nz";
      case Acc_BE:     return "be";
      case Acc_NBE:    return "nbe";
      case Acc_S:      return "s";
      case Acc_NS:     return "ns";
      case Acc_P:      return "p";
      case Acc_NP:     return "np";
      case Acc_L:      return "l";
      case Acc_NL:     return "nl";
      case Acc_LE:     return "le";
      case Acc_NLE:    return "nle";
      case Acc_ALWAYS: return "ALWAYS";
      default: vpanic("ppAMD64CondCode");
   }
}

/* --------- AMD64AMode: memory address expressions. --------- */

AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag        = Aam_IR;
   am->Aam.IR.imm = imm32;
   am->Aam.IR.reg = reg;
   return am;
}
AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag = Aam_IRRS;
   am->Aam.IRRS.imm   = imm32;
   am->Aam.IRRS.base  = base;
   am->Aam.IRRS.index = indEx;
   am->Aam.IRRS.shift = shift;
   vassert(shift >= 0 && shift <= 3);
   return am;
}

void ppAMD64AMode ( AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         if (am->Aam.IR.imm == 0)
            vex_printf("(");
         else
            vex_printf("0x%x(", am->Aam.IR.imm);
         ppHRegAMD64(am->Aam.IR.reg);
         vex_printf(")");
         return;
      case Aam_IRRS:
         vex_printf("0x%x(", am->Aam.IRRS.imm);
         ppHRegAMD64(am->Aam.IRRS.base);
         vex_printf(",");
         ppHRegAMD64(am->Aam.IRRS.index);
         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
         return;
      default:
         vpanic("ppAMD64AMode");
   }
}

static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         addHRegUse(u, HRmRead, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
         return;
      default:
         vpanic("addRegUsage_AMD64AMode");
   }
}

static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
         return;
      default:
         vpanic("mapRegs_AMD64AMode");
   }
}
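
/* Illustrative sketch (not part of the original file): the amode for the
   x86-64 operand 16(%rbp,%rcx,8) would be built as

      AMD64AMode* am = AMD64AMode_IRRS(16, hregAMD64_RBP(), hregAMD64_RCX(), 3);

   since 'shift' holds log2 of the scale factor, so 3 denotes a scale of 8,
   matching the "1 << shift" printed by ppAMD64AMode above. */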

/* --------- Operand, which can be reg, immediate or memory. --------- */

AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
   AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag            = Armi_Imm;
   op->Armi.Imm.imm32 = imm32;
   return op;
}
AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
   AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag          = Armi_Reg;
   op->Armi.Reg.reg = reg;
   return op;
}
AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
   AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag         = Armi_Mem;
   op->Armi.Mem.am = am;
   return op;
}

static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
   switch (op->tag) {
      case Armi_Imm:
         vex_printf("$0x%x", op->Armi.Imm.imm32);
         return;
      case Armi_Reg:
         if (lo32)
            ppHRegAMD64_lo32(op->Armi.Reg.reg);
         else
            ppHRegAMD64(op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         ppAMD64AMode(op->Armi.Mem.am);
         return;
      default:
         vpanic("ppAMD64RMI");
   }
}
void ppAMD64RMI ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, False/*!lo32*/);
}
void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, True/*lo32*/);
}

/* An AMD64RMI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
         return;
      default:
         vpanic("addRegUsage_AMD64RMI");
   }
}

static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
         return;
      default:
         vpanic("mapRegs_AMD64RMI");
   }
}
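
/* Illustrative sketch (not part of the original file): an AMD64RMI is the
   right-hand operand of instructions such as Ain_Alu64R, so the three forms

      AMD64RMI_Imm(0x10)                              -> $0x10
      AMD64RMI_Reg(hregAMD64_RBX())                   -> %rbx
      AMD64RMI_Mem(AMD64AMode_IR(8, hregAMD64_RBP())) -> 0x8(%rbp)

   correspond to the immediate, register and memory operand kinds printed
   by ppAMD64RMI above. */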

/* --------- Operand, which can be reg or immediate only. --------- */

AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
   AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag           = Ari_Imm;
   op->Ari.Imm.imm32 = imm32;
   return op;
}
AMD64RI* AMD64RI_Reg ( HReg reg ) {
   AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag         = Ari_Reg;
   op->Ari.Reg.reg = reg;
   return op;
}

void ppAMD64RI ( AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         vex_printf("$0x%x", op->Ari.Imm.imm32);
         return;
      case Ari_Reg:
         ppHRegAMD64(op->Ari.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RI");
   }
}

/* An AMD64RI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RI");
   }
}

static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RI");
   }
}

/* --------- Operand, which can be reg or memory only. --------- */

AMD64RM* AMD64RM_Reg ( HReg reg ) {
   AMD64RM* op     = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag         = Arm_Reg;
   op->Arm.Reg.reg = reg;
   return op;
}
AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
   AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag        = Arm_Mem;
   op->Arm.Mem.am = am;
   return op;
}

void ppAMD64RM ( AMD64RM* op ) {
   switch (op->tag) {
      case Arm_Mem:
         ppAMD64AMode(op->Arm.Mem.am);
         return;
      case Arm_Reg:
         ppHRegAMD64(op->Arm.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RM");
   }
}

/* Because an AMD64RM can be both a source or destination operand, we
   have to supply a mode -- pertaining to the operand as a whole --
   indicating how it's being used. */
static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
   switch (op->tag) {
      case Arm_Mem:
         /* Memory is read, written or modified.  So we just want to
            know the regs read by the amode. */
         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         /* reg is read, written or modified.  Add it in the
            appropriate way. */
         addHRegUse(u, mode, op->Arm.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RM");
   }
}

static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
{
   switch (op->tag) {
      case Arm_Mem:
         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RM");
   }
}
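
/* Illustrative sketch (not part of the original file): because an AMD64RM may
   be read or written depending on the instruction that uses it, callers pass
   the intended HRegMode; for example the multiply case in
   getRegUsage_AMD64Instr below does

      addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);

   whereas an RM operand used as a destination would be passed HRmWrite or
   HRmModify instead. */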
453 /* --------- Instructions. --------- */
455 static const HChar* showAMD64ScalarSz ( Int sz ) {
456 switch (sz) {
457 case 2: return "w";
458 case 4: return "l";
459 case 8: return "q";
460 default: vpanic("showAMD64ScalarSz");
464 const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
465 switch (op) {
466 case Aun_NOT: return "not";
467 case Aun_NEG: return "neg";
468 default: vpanic("showAMD64UnaryOp");
472 const HChar* showAMD64AluOp ( AMD64AluOp op ) {
473 switch (op) {
474 case Aalu_MOV: return "mov";
475 case Aalu_CMP: return "cmp";
476 case Aalu_ADD: return "add";
477 case Aalu_SUB: return "sub";
478 case Aalu_ADC: return "adc";
479 case Aalu_SBB: return "sbb";
480 case Aalu_AND: return "and";
481 case Aalu_OR: return "or";
482 case Aalu_XOR: return "xor";
483 case Aalu_MUL: return "imul";
484 default: vpanic("showAMD64AluOp");
488 const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
489 switch (op) {
490 case Ash_SHL: return "shl";
491 case Ash_SHR: return "shr";
492 case Ash_SAR: return "sar";
493 default: vpanic("showAMD64ShiftOp");
497 const HChar* showA87FpOp ( A87FpOp op ) {
498 switch (op) {
499 case Afp_SCALE: return "scale";
500 case Afp_ATAN: return "atan";
501 case Afp_YL2X: return "yl2x";
502 case Afp_YL2XP1: return "yl2xp1";
503 case Afp_PREM: return "prem";
504 case Afp_PREM1: return "prem1";
505 case Afp_SQRT: return "sqrt";
506 case Afp_SIN: return "sin";
507 case Afp_COS: return "cos";
508 case Afp_TAN: return "tan";
509 case Afp_ROUND: return "round";
510 case Afp_2XM1: return "2xm1";
511 default: vpanic("showA87FpOp");
515 const HChar* showAMD64SseOp ( AMD64SseOp op ) {
516 switch (op) {
517 case Asse_MOV: return "movups";
518 case Asse_ADDF: return "add";
519 case Asse_SUBF: return "sub";
520 case Asse_MULF: return "mul";
521 case Asse_DIVF: return "div";
522 case Asse_MAXF: return "max";
523 case Asse_MINF: return "min";
524 case Asse_CMPEQF: return "cmpFeq";
525 case Asse_CMPLTF: return "cmpFlt";
526 case Asse_CMPLEF: return "cmpFle";
527 case Asse_CMPUNF: return "cmpFun";
528 case Asse_RCPF: return "rcp";
529 case Asse_RSQRTF: return "rsqrt";
530 case Asse_SQRTF: return "sqrt";
531 case Asse_I2F: return "cvtdq2ps.";
532 case Asse_F2I: return "cvtps2dq.";
533 case Asse_AND: return "and";
534 case Asse_OR: return "or";
535 case Asse_XOR: return "xor";
536 case Asse_ANDN: return "andn";
537 case Asse_ADD8: return "paddb";
538 case Asse_ADD16: return "paddw";
539 case Asse_ADD32: return "paddd";
540 case Asse_ADD64: return "paddq";
541 case Asse_QADD8U: return "paddusb";
542 case Asse_QADD16U: return "paddusw";
543 case Asse_QADD8S: return "paddsb";
544 case Asse_QADD16S: return "paddsw";
545 case Asse_SUB8: return "psubb";
546 case Asse_SUB16: return "psubw";
547 case Asse_SUB32: return "psubd";
548 case Asse_SUB64: return "psubq";
549 case Asse_QSUB8U: return "psubusb";
550 case Asse_QSUB16U: return "psubusw";
551 case Asse_QSUB8S: return "psubsb";
552 case Asse_QSUB16S: return "psubsw";
553 case Asse_MUL16: return "pmullw";
554 case Asse_MULHI16U: return "pmulhuw";
555 case Asse_MULHI16S: return "pmulhw";
556 case Asse_AVG8U: return "pavgb";
557 case Asse_AVG16U: return "pavgw";
558 case Asse_MAX16S: return "pmaxw";
559 case Asse_MAX8U: return "pmaxub";
560 case Asse_MIN16S: return "pminw";
561 case Asse_MIN8U: return "pminub";
562 case Asse_CMPEQ8: return "pcmpeqb";
563 case Asse_CMPEQ16: return "pcmpeqw";
564 case Asse_CMPEQ32: return "pcmpeqd";
565 case Asse_CMPGT8S: return "pcmpgtb";
566 case Asse_CMPGT16S: return "pcmpgtw";
567 case Asse_CMPGT32S: return "pcmpgtd";
568 case Asse_SHL16: return "psllw";
569 case Asse_SHL32: return "pslld";
570 case Asse_SHL64: return "psllq";
571 case Asse_SHL128: return "pslldq";
572 case Asse_SHR16: return "psrlw";
573 case Asse_SHR32: return "psrld";
574 case Asse_SHR64: return "psrlq";
575 case Asse_SHR128: return "psrldq";
576 case Asse_SAR16: return "psraw";
577 case Asse_SAR32: return "psrad";
578 case Asse_PACKSSD: return "packssdw";
579 case Asse_PACKSSW: return "packsswb";
580 case Asse_PACKUSW: return "packuswb";
581 case Asse_UNPCKHB: return "punpckhb";
582 case Asse_UNPCKHW: return "punpckhw";
583 case Asse_UNPCKHD: return "punpckhd";
584 case Asse_UNPCKHQ: return "punpckhq";
585 case Asse_UNPCKLB: return "punpcklb";
586 case Asse_UNPCKLW: return "punpcklw";
587 case Asse_UNPCKLD: return "punpckld";
588 case Asse_UNPCKLQ: return "punpcklq";
589 case Asse_PSHUFB: return "pshufb";
590 case Asse_PMADDUBSW: return "pmaddubsw";
591 case Asse_F32toF16: return "vcvtps2ph(rm_field=$0x4).";
592 case Asse_F16toF32: return "vcvtph2ps.";
593 default: vpanic("showAMD64SseOp");
597 AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
598 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
599 i->tag = Ain_Imm64;
600 i->Ain.Imm64.imm64 = imm64;
601 i->Ain.Imm64.dst = dst;
602 return i;
604 AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
605 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
606 i->tag = Ain_Alu64R;
607 i->Ain.Alu64R.op = op;
608 i->Ain.Alu64R.src = src;
609 i->Ain.Alu64R.dst = dst;
610 return i;
612 AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
613 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
614 i->tag = Ain_Alu64M;
615 i->Ain.Alu64M.op = op;
616 i->Ain.Alu64M.src = src;
617 i->Ain.Alu64M.dst = dst;
618 vassert(op != Aalu_MUL);
619 return i;
621 AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
622 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
623 i->tag = Ain_Sh64;
624 i->Ain.Sh64.op = op;
625 i->Ain.Sh64.src = src;
626 i->Ain.Sh64.dst = dst;
627 return i;
629 AMD64Instr* AMD64Instr_Sh32 ( AMD64ShiftOp op, UInt src, HReg dst ) {
630 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
631 i->tag = Ain_Sh32;
632 i->Ain.Sh32.op = op;
633 i->Ain.Sh32.src = src;
634 i->Ain.Sh32.dst = dst;
635 return i;
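/* Note (illustrative, not from the original source): for Ain_Sh64 and
   Ain_Sh32 a 'src' value of 0 does not mean "shift by zero"; as the
   pretty-printer and getRegUsage_AMD64Instr below show, 0 selects a
   variable shift whose count is taken from %cl, and any nonzero value is
   used as an immediate shift count. */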
637 AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
638 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
639 i->tag = Ain_Test64;
640 i->Ain.Test64.imm32 = imm32;
641 i->Ain.Test64.dst = dst;
642 return i;
644 AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
645 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
646 i->tag = Ain_Unary64;
647 i->Ain.Unary64.op = op;
648 i->Ain.Unary64.dst = dst;
649 return i;
651 AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
652 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
653 i->tag = Ain_Lea64;
654 i->Ain.Lea64.am = am;
655 i->Ain.Lea64.dst = dst;
656 return i;
658 AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
659 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
660 i->tag = Ain_Alu32R;
661 i->Ain.Alu32R.op = op;
662 i->Ain.Alu32R.src = src;
663 i->Ain.Alu32R.dst = dst;
664 switch (op) {
665 case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
666 case Aalu_AND: case Aalu_OR: case Aalu_XOR: break;
667 default: vassert(0);
669 return i;
671 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
672 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
673 i->tag = Ain_MulL;
674 i->Ain.MulL.syned = syned;
675 i->Ain.MulL.src = src;
676 return i;
678 AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
679 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
680 i->tag = Ain_Div;
681 i->Ain.Div.syned = syned;
682 i->Ain.Div.sz = sz;
683 i->Ain.Div.src = src;
684 vassert(sz == 4 || sz == 8);
685 return i;
687 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
688 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
689 i->tag = Ain_Push;
690 i->Ain.Push.src = src;
691 return i;
693 AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
694 RetLoc rloc ) {
695 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
696 i->tag = Ain_Call;
697 i->Ain.Call.cond = cond;
698 i->Ain.Call.target = target;
699 i->Ain.Call.regparms = regparms;
700 i->Ain.Call.rloc = rloc;
701 vassert(regparms >= 0 && regparms <= 6);
702 vassert(is_sane_RetLoc(rloc));
703 return i;
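/* Note (illustrative, not from the original source): 'regparms' counts how
   many integer arguments the callee expects in registers.  Following the
   System V AMD64 ordering used by getRegUsage_AMD64Instr below, a call with
   regparms == 2 marks %rdi and %rsi as read, and regparms == 6 marks
   %rdi, %rsi, %rdx, %rcx, %r8 and %r9. */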
706 AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
707 AMD64CondCode cond, Bool toFastEP ) {
708 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
709 i->tag = Ain_XDirect;
710 i->Ain.XDirect.dstGA = dstGA;
711 i->Ain.XDirect.amRIP = amRIP;
712 i->Ain.XDirect.cond = cond;
713 i->Ain.XDirect.toFastEP = toFastEP;
714 return i;
716 AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
717 AMD64CondCode cond ) {
718 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
719 i->tag = Ain_XIndir;
720 i->Ain.XIndir.dstGA = dstGA;
721 i->Ain.XIndir.amRIP = amRIP;
722 i->Ain.XIndir.cond = cond;
723 return i;
725 AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
726 AMD64CondCode cond, IRJumpKind jk ) {
727 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
728 i->tag = Ain_XAssisted;
729 i->Ain.XAssisted.dstGA = dstGA;
730 i->Ain.XAssisted.amRIP = amRIP;
731 i->Ain.XAssisted.cond = cond;
732 i->Ain.XAssisted.jk = jk;
733 return i;
736 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
737 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
738 i->tag = Ain_CMov64;
739 i->Ain.CMov64.cond = cond;
740 i->Ain.CMov64.src = src;
741 i->Ain.CMov64.dst = dst;
742 vassert(cond != Acc_ALWAYS);
743 return i;
745 AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
746 AMD64AMode* addr, HReg dst ) {
747 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
748 i->tag = Ain_CLoad;
749 i->Ain.CLoad.cond = cond;
750 i->Ain.CLoad.szB = szB;
751 i->Ain.CLoad.addr = addr;
752 i->Ain.CLoad.dst = dst;
753 vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
754 return i;
756 AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
757 HReg src, AMD64AMode* addr ) {
758 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
759 i->tag = Ain_CStore;
760 i->Ain.CStore.cond = cond;
761 i->Ain.CStore.szB = szB;
762 i->Ain.CStore.src = src;
763 i->Ain.CStore.addr = addr;
764 vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
765 return i;
767 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
768 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
769 i->tag = Ain_MovxLQ;
770 i->Ain.MovxLQ.syned = syned;
771 i->Ain.MovxLQ.src = src;
772 i->Ain.MovxLQ.dst = dst;
773 return i;
775 AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
776 AMD64AMode* src, HReg dst ) {
777 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
778 i->tag = Ain_LoadEX;
779 i->Ain.LoadEX.szSmall = szSmall;
780 i->Ain.LoadEX.syned = syned;
781 i->Ain.LoadEX.src = src;
782 i->Ain.LoadEX.dst = dst;
783 vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
784 return i;
786 AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
787 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
788 i->tag = Ain_Store;
789 i->Ain.Store.sz = sz;
790 i->Ain.Store.src = src;
791 i->Ain.Store.dst = dst;
792 vassert(sz == 1 || sz == 2 || sz == 4);
793 return i;
795 AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
796 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
797 i->tag = Ain_Set64;
798 i->Ain.Set64.cond = cond;
799 i->Ain.Set64.dst = dst;
800 return i;
802 AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
803 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
804 i->tag = Ain_Bsfr64;
805 i->Ain.Bsfr64.isFwds = isFwds;
806 i->Ain.Bsfr64.src = src;
807 i->Ain.Bsfr64.dst = dst;
808 return i;
810 AMD64Instr* AMD64Instr_MFence ( void ) {
811 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
812 i->tag = Ain_MFence;
813 return i;
815 AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
816 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
817 i->tag = Ain_ACAS;
818 i->Ain.ACAS.addr = addr;
819 i->Ain.ACAS.sz = sz;
820 vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
821 return i;
823 AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
824 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
825 i->tag = Ain_DACAS;
826 i->Ain.DACAS.addr = addr;
827 i->Ain.DACAS.sz = sz;
828 vassert(sz == 8 || sz == 4);
829 return i;
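/* Note (illustrative, not from the original source): Ain_ACAS and Ain_DACAS
   rely on the fixed register bindings of lock cmpxchg / cmpxchg{8,16}b --
   expected old value in %rax (%rdx:%rax for DACAS), new value in %rbx
   (%rcx:%rbx for DACAS) -- which is why only the address and size are
   explicit operands here and the remaining registers are added implicitly
   by getRegUsage_AMD64Instr below. */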
832 AMD64Instr* AMD64Instr_A87Free ( Int nregs )
834 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
835 i->tag = Ain_A87Free;
836 i->Ain.A87Free.nregs = nregs;
837 vassert(nregs >= 1 && nregs <= 7);
838 return i;
840 AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
842 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
843 i->tag = Ain_A87PushPop;
844 i->Ain.A87PushPop.addr = addr;
845 i->Ain.A87PushPop.isPush = isPush;
846 i->Ain.A87PushPop.szB = szB;
847 vassert(szB == 8 || szB == 4);
848 return i;
850 AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
852 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
853 i->tag = Ain_A87FpOp;
854 i->Ain.A87FpOp.op = op;
855 return i;
857 AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
859 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
860 i->tag = Ain_A87LdCW;
861 i->Ain.A87LdCW.addr = addr;
862 return i;
864 AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
866 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
867 i->tag = Ain_A87StSW;
868 i->Ain.A87StSW.addr = addr;
869 return i;
871 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
872 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
873 i->tag = Ain_LdMXCSR;
874 i->Ain.LdMXCSR.addr = addr;
875 return i;
877 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
878 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
879 i->tag = Ain_SseUComIS;
880 i->Ain.SseUComIS.sz = toUChar(sz);
881 i->Ain.SseUComIS.srcL = srcL;
882 i->Ain.SseUComIS.srcR = srcR;
883 i->Ain.SseUComIS.dst = dst;
884 vassert(sz == 4 || sz == 8);
885 return i;
887 AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
888 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
889 i->tag = Ain_SseSI2SF;
890 i->Ain.SseSI2SF.szS = toUChar(szS);
891 i->Ain.SseSI2SF.szD = toUChar(szD);
892 i->Ain.SseSI2SF.src = src;
893 i->Ain.SseSI2SF.dst = dst;
894 vassert(szS == 4 || szS == 8);
895 vassert(szD == 4 || szD == 8);
896 return i;
898 AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
899 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
900 i->tag = Ain_SseSF2SI;
901 i->Ain.SseSF2SI.szS = toUChar(szS);
902 i->Ain.SseSF2SI.szD = toUChar(szD);
903 i->Ain.SseSF2SI.src = src;
904 i->Ain.SseSF2SI.dst = dst;
905 vassert(szS == 4 || szS == 8);
906 vassert(szD == 4 || szD == 8);
907 return i;
909 AMD64Instr* AMD64Instr_SseSDSS ( Bool from64, HReg src, HReg dst )
911 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
912 i->tag = Ain_SseSDSS;
913 i->Ain.SseSDSS.from64 = from64;
914 i->Ain.SseSDSS.src = src;
915 i->Ain.SseSDSS.dst = dst;
916 return i;
918 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
919 HReg reg, AMD64AMode* addr ) {
920 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
921 i->tag = Ain_SseLdSt;
922 i->Ain.SseLdSt.isLoad = isLoad;
923 i->Ain.SseLdSt.sz = toUChar(sz);
924 i->Ain.SseLdSt.reg = reg;
925 i->Ain.SseLdSt.addr = addr;
926 vassert(sz == 4 || sz == 8 || sz == 16);
927 return i;
929 AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
930 HReg src, AMD64AMode* addr )
932 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
933 i->tag = Ain_SseCStore;
934 i->Ain.SseCStore.cond = cond;
935 i->Ain.SseCStore.src = src;
936 i->Ain.SseCStore.addr = addr;
937 vassert(cond != Acc_ALWAYS);
938 return i;
940 AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
941 AMD64AMode* addr, HReg dst )
943 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
944 i->tag = Ain_SseCLoad;
945 i->Ain.SseCLoad.cond = cond;
946 i->Ain.SseCLoad.addr = addr;
947 i->Ain.SseCLoad.dst = dst;
948 vassert(cond != Acc_ALWAYS);
949 return i;
951 AMD64Instr* AMD64Instr_SseLdzLO ( Int sz, HReg reg, AMD64AMode* addr )
953 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
954 i->tag = Ain_SseLdzLO;
955 i->Ain.SseLdzLO.sz = sz;
956 i->Ain.SseLdzLO.reg = reg;
957 i->Ain.SseLdzLO.addr = addr;
958 vassert(sz == 4 || sz == 8);
959 return i;
961 AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
962 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
963 i->tag = Ain_Sse32Fx4;
964 i->Ain.Sse32Fx4.op = op;
965 i->Ain.Sse32Fx4.src = src;
966 i->Ain.Sse32Fx4.dst = dst;
967 vassert(op != Asse_MOV);
968 return i;
970 AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
971 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
972 i->tag = Ain_Sse32FLo;
973 i->Ain.Sse32FLo.op = op;
974 i->Ain.Sse32FLo.src = src;
975 i->Ain.Sse32FLo.dst = dst;
976 vassert(op != Asse_MOV);
977 return i;
979 AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
980 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
981 i->tag = Ain_Sse64Fx2;
982 i->Ain.Sse64Fx2.op = op;
983 i->Ain.Sse64Fx2.src = src;
984 i->Ain.Sse64Fx2.dst = dst;
985 vassert(op != Asse_MOV);
986 return i;
988 AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
989 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
990 i->tag = Ain_Sse64FLo;
991 i->Ain.Sse64FLo.op = op;
992 i->Ain.Sse64FLo.src = src;
993 i->Ain.Sse64FLo.dst = dst;
994 vassert(op != Asse_MOV);
995 return i;
997 AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
998 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
999 i->tag = Ain_SseReRg;
1000 i->Ain.SseReRg.op = op;
1001 i->Ain.SseReRg.src = re;
1002 i->Ain.SseReRg.dst = rg;
1003 return i;
1005 AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
1006 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1007 i->tag = Ain_SseCMov;
1008 i->Ain.SseCMov.cond = cond;
1009 i->Ain.SseCMov.src = src;
1010 i->Ain.SseCMov.dst = dst;
1011 vassert(cond != Acc_ALWAYS);
1012 return i;
1014 AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
1015 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1016 i->tag = Ain_SseShuf;
1017 i->Ain.SseShuf.order = order;
1018 i->Ain.SseShuf.src = src;
1019 i->Ain.SseShuf.dst = dst;
1020 vassert(order >= 0 && order <= 0xFF);
1021 return i;
1023 AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op,
1024 UInt shiftBits, HReg dst ) {
1025 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1026 i->tag = Ain_SseShiftN;
1027 i->Ain.SseShiftN.op = op;
1028 i->Ain.SseShiftN.shiftBits = shiftBits;
1029 i->Ain.SseShiftN.dst = dst;
1030 return i;
1032 AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) {
1033 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1034 i->tag = Ain_SseMOVQ;
1035 i->Ain.SseMOVQ.gpr = gpr;
1036 i->Ain.SseMOVQ.xmm = xmm;
1037 i->Ain.SseMOVQ.toXMM = toXMM;
1038 vassert(hregClass(gpr) == HRcInt64);
1039 vassert(hregClass(xmm) == HRcVec128);
1040 return i;
1042 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
1043 //uu HReg reg, AMD64AMode* addr ) {
1044 //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1045 //uu i->tag = Ain_AvxLdSt;
1046 //uu i->Ain.AvxLdSt.isLoad = isLoad;
1047 //uu i->Ain.AvxLdSt.reg = reg;
1048 //uu i->Ain.AvxLdSt.addr = addr;
1049 //uu return i;
1050 //uu }
1051 //uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
1052 //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1053 //uu i->tag = Ain_AvxReRg;
1054 //uu i->Ain.AvxReRg.op = op;
1055 //uu i->Ain.AvxReRg.src = re;
1056 //uu i->Ain.AvxReRg.dst = rg;
1057 //uu return i;
1058 //uu }
1059 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
1060 AMD64AMode* amFailAddr ) {
1061 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1062 i->tag = Ain_EvCheck;
1063 i->Ain.EvCheck.amCounter = amCounter;
1064 i->Ain.EvCheck.amFailAddr = amFailAddr;
1065 return i;
1067 AMD64Instr* AMD64Instr_ProfInc ( void ) {
1068 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1069 i->tag = Ain_ProfInc;
1070 return i;
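/* Illustrative sketch (not part of the original file): a client such as the
   instruction selector builds instructions with the constructors above, e.g.

      AMD64Instr* add = AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(1), dst);

   for some 64-bit integer HReg 'dst'; ppAMD64Instr below would render this
   as "addq $0x1," followed by the destination register. */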
1073 void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
1075 vassert(mode64 == True);
1076 switch (i->tag) {
1077 case Ain_Imm64:
1078 vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
1079 ppHRegAMD64(i->Ain.Imm64.dst);
1080 return;
1081 case Ain_Alu64R:
1082 vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
1083 ppAMD64RMI(i->Ain.Alu64R.src);
1084 vex_printf(",");
1085 ppHRegAMD64(i->Ain.Alu64R.dst);
1086 return;
1087 case Ain_Alu64M:
1088 vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
1089 ppAMD64RI(i->Ain.Alu64M.src);
1090 vex_printf(",");
1091 ppAMD64AMode(i->Ain.Alu64M.dst);
1092 return;
1093 case Ain_Sh64:
1094 vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
1095 if (i->Ain.Sh64.src == 0)
1096 vex_printf("%%cl,");
1097 else
1098 vex_printf("$%d,", (Int)i->Ain.Sh64.src);
1099 ppHRegAMD64(i->Ain.Sh64.dst);
1100 return;
1101 case Ain_Sh32:
1102 vex_printf("%sl ", showAMD64ShiftOp(i->Ain.Sh32.op));
1103 if (i->Ain.Sh32.src == 0)
1104 vex_printf("%%cl,");
1105 else
1106 vex_printf("$%d,", (Int)i->Ain.Sh32.src);
1107 ppHRegAMD64_lo32(i->Ain.Sh32.dst);
1108 return;
1109 case Ain_Test64:
1110 vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
1111 ppHRegAMD64(i->Ain.Test64.dst);
1112 return;
1113 case Ain_Unary64:
1114 vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
1115 ppHRegAMD64(i->Ain.Unary64.dst);
1116 return;
1117 case Ain_Lea64:
1118 vex_printf("leaq ");
1119 ppAMD64AMode(i->Ain.Lea64.am);
1120 vex_printf(",");
1121 ppHRegAMD64(i->Ain.Lea64.dst);
1122 return;
1123 case Ain_Alu32R:
1124 vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
1125 ppAMD64RMI_lo32(i->Ain.Alu32R.src);
1126 vex_printf(",");
1127 ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
1128 return;
1129 case Ain_MulL:
1130 vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
1131 ppAMD64RM(i->Ain.MulL.src);
1132 return;
1133 case Ain_Div:
1134 vex_printf("%cdiv%s ",
1135 i->Ain.Div.syned ? 's' : 'u',
1136 showAMD64ScalarSz(i->Ain.Div.sz));
1137 ppAMD64RM(i->Ain.Div.src);
1138 return;
1139 case Ain_Push:
1140 vex_printf("pushq ");
1141 ppAMD64RMI(i->Ain.Push.src);
1142 return;
1143 case Ain_Call:
1144 vex_printf("call%s[%d,",
1145 i->Ain.Call.cond==Acc_ALWAYS
1146 ? "" : showAMD64CondCode(i->Ain.Call.cond),
1147 i->Ain.Call.regparms );
1148 ppRetLoc(i->Ain.Call.rloc);
1149 vex_printf("] 0x%llx", i->Ain.Call.target);
1150 break;
1152 case Ain_XDirect:
1153 vex_printf("(xDirect) ");
1154 vex_printf("if (%%rflags.%s) { ",
1155 showAMD64CondCode(i->Ain.XDirect.cond));
1156 vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
1157 vex_printf("movq %%r11,");
1158 ppAMD64AMode(i->Ain.XDirect.amRIP);
1159 vex_printf("; ");
1160 vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
1161 i->Ain.XDirect.toFastEP ? "fast" : "slow");
1162 return;
1163 case Ain_XIndir:
1164 vex_printf("(xIndir) ");
1165 vex_printf("if (%%rflags.%s) { ",
1166 showAMD64CondCode(i->Ain.XIndir.cond));
1167 vex_printf("movq ");
1168 ppHRegAMD64(i->Ain.XIndir.dstGA);
1169 vex_printf(",");
1170 ppAMD64AMode(i->Ain.XIndir.amRIP);
1171 vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
1172 return;
1173 case Ain_XAssisted:
1174 vex_printf("(xAssisted) ");
1175 vex_printf("if (%%rflags.%s) { ",
1176 showAMD64CondCode(i->Ain.XAssisted.cond));
1177 vex_printf("movq ");
1178 ppHRegAMD64(i->Ain.XAssisted.dstGA);
1179 vex_printf(",");
1180 ppAMD64AMode(i->Ain.XAssisted.amRIP);
1181 vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
1182 (Int)i->Ain.XAssisted.jk);
1183 vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
1184 return;
1186 case Ain_CMov64:
1187 vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
1188 ppHRegAMD64(i->Ain.CMov64.src);
1189 vex_printf(",");
1190 ppHRegAMD64(i->Ain.CMov64.dst);
1191 return;
1192 case Ain_CLoad:
1193 vex_printf("if (%%rflags.%s) { ",
1194 showAMD64CondCode(i->Ain.CLoad.cond));
1195 vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
1196 ppAMD64AMode(i->Ain.CLoad.addr);
1197 vex_printf(", ");
1198 (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1199 (i->Ain.CLoad.dst);
1200 vex_printf(" }");
1201 return;
1202 case Ain_CStore:
1203 vex_printf("if (%%rflags.%s) { ",
1204 showAMD64CondCode(i->Ain.CStore.cond));
1205 vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
1206 (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1207 (i->Ain.CStore.src);
1208 vex_printf(", ");
1209 ppAMD64AMode(i->Ain.CStore.addr);
1210 vex_printf(" }");
1211 return;
1213 case Ain_MovxLQ:
1214 vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
1215 ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
1216 vex_printf(",");
1217 ppHRegAMD64(i->Ain.MovxLQ.dst);
1218 return;
1219 case Ain_LoadEX:
1220 if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
1221 vex_printf("movl ");
1222 ppAMD64AMode(i->Ain.LoadEX.src);
1223 vex_printf(",");
1224 ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
1225 } else {
1226 vex_printf("mov%c%cq ",
1227 i->Ain.LoadEX.syned ? 's' : 'z',
1228 i->Ain.LoadEX.szSmall==1
1229 ? 'b'
1230 : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
1231 ppAMD64AMode(i->Ain.LoadEX.src);
1232 vex_printf(",");
1233 ppHRegAMD64(i->Ain.LoadEX.dst);
1235 return;
1236 case Ain_Store:
1237 vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
1238 : (i->Ain.Store.sz==2 ? 'w' : 'l'));
1239 ppHRegAMD64(i->Ain.Store.src);
1240 vex_printf(",");
1241 ppAMD64AMode(i->Ain.Store.dst);
1242 return;
1243 case Ain_Set64:
1244 vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
1245 ppHRegAMD64(i->Ain.Set64.dst);
1246 return;
1247 case Ain_Bsfr64:
1248 vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
1249 ppHRegAMD64(i->Ain.Bsfr64.src);
1250 vex_printf(",");
1251 ppHRegAMD64(i->Ain.Bsfr64.dst);
1252 return;
1253 case Ain_MFence:
1254 vex_printf("mfence" );
1255 return;
1256 case Ain_ACAS:
1257 vex_printf("lock cmpxchg%c ",
1258 i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
1259 : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
1260 vex_printf("{%%rax->%%rbx},");
1261 ppAMD64AMode(i->Ain.ACAS.addr);
1262 return;
1263 case Ain_DACAS:
1264 vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
1265 (Int)(2 * i->Ain.DACAS.sz));
1266 ppAMD64AMode(i->Ain.DACAS.addr);
1267 return;
1268 case Ain_A87Free:
1269 vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
1270 break;
1271 case Ain_A87PushPop:
1272 vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
1273 i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
1274 ppAMD64AMode(i->Ain.A87PushPop.addr);
1275 break;
1276 case Ain_A87FpOp:
1277 vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
1278 break;
1279 case Ain_A87LdCW:
1280 vex_printf("fldcw ");
1281 ppAMD64AMode(i->Ain.A87LdCW.addr);
1282 break;
1283 case Ain_A87StSW:
1284 vex_printf("fstsw ");
1285 ppAMD64AMode(i->Ain.A87StSW.addr);
1286 break;
1287 case Ain_LdMXCSR:
1288 vex_printf("ldmxcsr ");
1289 ppAMD64AMode(i->Ain.LdMXCSR.addr);
1290 break;
1291 case Ain_SseUComIS:
1292 vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
1293 ppHRegAMD64(i->Ain.SseUComIS.srcL);
1294 vex_printf(",");
1295 ppHRegAMD64(i->Ain.SseUComIS.srcR);
1296 vex_printf(" ; pushfq ; popq ");
1297 ppHRegAMD64(i->Ain.SseUComIS.dst);
1298 break;
1299 case Ain_SseSI2SF:
1300 vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
1301 (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1302 (i->Ain.SseSI2SF.src);
1303 vex_printf(",");
1304 ppHRegAMD64(i->Ain.SseSI2SF.dst);
1305 break;
1306 case Ain_SseSF2SI:
1307 vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
1308 ppHRegAMD64(i->Ain.SseSF2SI.src);
1309 vex_printf(",");
1310 (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1311 (i->Ain.SseSF2SI.dst);
1312 break;
1313 case Ain_SseSDSS:
1314 vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
1315 ppHRegAMD64(i->Ain.SseSDSS.src);
1316 vex_printf(",");
1317 ppHRegAMD64(i->Ain.SseSDSS.dst);
1318 break;
1319 case Ain_SseLdSt:
1320 switch (i->Ain.SseLdSt.sz) {
1321 case 4: vex_printf("movss "); break;
1322 case 8: vex_printf("movsd "); break;
1323 case 16: vex_printf("movups "); break;
1324 default: vassert(0);
1326 if (i->Ain.SseLdSt.isLoad) {
1327 ppAMD64AMode(i->Ain.SseLdSt.addr);
1328 vex_printf(",");
1329 ppHRegAMD64(i->Ain.SseLdSt.reg);
1330 } else {
1331 ppHRegAMD64(i->Ain.SseLdSt.reg);
1332 vex_printf(",");
1333 ppAMD64AMode(i->Ain.SseLdSt.addr);
1335 return;
1336 case Ain_SseCStore:
1337 vex_printf("if (%%rflags.%s) { ",
1338 showAMD64CondCode(i->Ain.SseCStore.cond));
1339 vex_printf("movups ");
1340 ppHRegAMD64(i->Ain.SseCStore.src);
1341 vex_printf(", ");
1342 ppAMD64AMode(i->Ain.SseCStore.addr);
1343 vex_printf(" }");
1344 return;
1345 case Ain_SseCLoad:
1346 vex_printf("if (%%rflags.%s) { ",
1347 showAMD64CondCode(i->Ain.SseCLoad.cond));
1348 vex_printf("movups ");
1349 ppAMD64AMode(i->Ain.SseCLoad.addr);
1350 vex_printf(", ");
1351 ppHRegAMD64(i->Ain.SseCLoad.dst);
1352 vex_printf(" }");
1353 return;
1354 case Ain_SseLdzLO:
1355 vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
1356 ppAMD64AMode(i->Ain.SseLdzLO.addr);
1357 vex_printf(",");
1358 ppHRegAMD64(i->Ain.SseLdzLO.reg);
1359 return;
1360 case Ain_Sse32Fx4:
1361 vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
1362 ppHRegAMD64(i->Ain.Sse32Fx4.src);
1363 vex_printf(",");
1364 ppHRegAMD64(i->Ain.Sse32Fx4.dst);
1365 return;
1366 case Ain_Sse32FLo:
1367 vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
1368 ppHRegAMD64(i->Ain.Sse32FLo.src);
1369 vex_printf(",");
1370 ppHRegAMD64(i->Ain.Sse32FLo.dst);
1371 return;
1372 case Ain_Sse64Fx2:
1373 vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
1374 ppHRegAMD64(i->Ain.Sse64Fx2.src);
1375 vex_printf(",");
1376 ppHRegAMD64(i->Ain.Sse64Fx2.dst);
1377 return;
1378 case Ain_Sse64FLo:
1379 vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
1380 ppHRegAMD64(i->Ain.Sse64FLo.src);
1381 vex_printf(",");
1382 ppHRegAMD64(i->Ain.Sse64FLo.dst);
1383 return;
1384 case Ain_SseReRg:
1385 vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1386 ppHRegAMD64(i->Ain.SseReRg.src);
1387 vex_printf(",");
1388 ppHRegAMD64(i->Ain.SseReRg.dst);
1389 return;
1390 case Ain_SseCMov:
1391 vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
1392 ppHRegAMD64(i->Ain.SseCMov.src);
1393 vex_printf(",");
1394 ppHRegAMD64(i->Ain.SseCMov.dst);
1395 return;
1396 case Ain_SseShuf:
1397 vex_printf("pshufd $0x%x,", (UInt)i->Ain.SseShuf.order);
1398 ppHRegAMD64(i->Ain.SseShuf.src);
1399 vex_printf(",");
1400 ppHRegAMD64(i->Ain.SseShuf.dst);
1401 return;
1402 case Ain_SseShiftN:
1403 vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op),
1404 i->Ain.SseShiftN.shiftBits);
1405 ppHRegAMD64(i->Ain.SseShiftN.dst);
1406 return;
1407 case Ain_SseMOVQ:
1408 vex_printf("movq ");
1409 if (i->Ain.SseMOVQ.toXMM) {
1410 ppHRegAMD64(i->Ain.SseMOVQ.gpr);
1411 vex_printf(",");
1412 ppHRegAMD64(i->Ain.SseMOVQ.xmm);
1413 } else {
1414 ppHRegAMD64(i->Ain.SseMOVQ.xmm);
1415 vex_printf(",");
1416 ppHRegAMD64(i->Ain.SseMOVQ.gpr);
1418 return;
1419 //uu case Ain_AvxLdSt:
1420 //uu vex_printf("vmovups ");
1421 //uu if (i->Ain.AvxLdSt.isLoad) {
1422 //uu ppAMD64AMode(i->Ain.AvxLdSt.addr);
1423 //uu vex_printf(",");
1424 //uu ppHRegAMD64(i->Ain.AvxLdSt.reg);
1425 //uu } else {
1426 //uu ppHRegAMD64(i->Ain.AvxLdSt.reg);
1427 //uu vex_printf(",");
1428 //uu ppAMD64AMode(i->Ain.AvxLdSt.addr);
1429 //uu }
1430 //uu return;
1431 //uu case Ain_AvxReRg:
1432 //uu vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1433 //uu ppHRegAMD64(i->Ain.AvxReRg.src);
1434 //uu vex_printf(",");
1435 //uu ppHRegAMD64(i->Ain.AvxReRg.dst);
1436 //uu return;
1437 case Ain_EvCheck:
1438 vex_printf("(evCheck) decl ");
1439 ppAMD64AMode(i->Ain.EvCheck.amCounter);
1440 vex_printf("; jns nofail; jmp *");
1441 ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
1442 vex_printf("; nofail:");
1443 return;
1444 case Ain_ProfInc:
1445 vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
1446 return;
1447 default:
1448 vpanic("ppAMD64Instr");
1452 /* --------- Helpers for register allocation. --------- */
1454 void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
1456 Bool unary;
1457 vassert(mode64 == True);
1458 initHRegUsage(u);
1459 switch (i->tag) {
1460 case Ain_Imm64:
1461 addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
1462 return;
1463 case Ain_Alu64R:
1464 addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
1465 if (i->Ain.Alu64R.op == Aalu_MOV) {
1466 addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
1468 if (i->Ain.Alu64R.src->tag == Armi_Reg) {
1469 u->isRegRegMove = True;
1470 u->regMoveSrc = i->Ain.Alu64R.src->Armi.Reg.reg;
1471 u->regMoveDst = i->Ain.Alu64R.dst;
1473 return;
1475 if (i->Ain.Alu64R.op == Aalu_CMP) {
1476 addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
1477 return;
1479 addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
1480 return;
1481 case Ain_Alu64M:
1482 addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
1483 addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
1484 return;
1485 case Ain_Sh64:
1486 addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
1487 if (i->Ain.Sh64.src == 0)
1488 addHRegUse(u, HRmRead, hregAMD64_RCX());
1489 return;
1490 case Ain_Sh32:
1491 addHRegUse(u, HRmModify, i->Ain.Sh32.dst);
1492 if (i->Ain.Sh32.src == 0)
1493 addHRegUse(u, HRmRead, hregAMD64_RCX());
1494 return;
1495 case Ain_Test64:
1496 addHRegUse(u, HRmRead, i->Ain.Test64.dst);
1497 return;
1498 case Ain_Unary64:
1499 addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
1500 return;
1501 case Ain_Lea64:
1502 addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
1503 addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
1504 return;
1505 case Ain_Alu32R:
1506 vassert(i->Ain.Alu32R.op != Aalu_MOV);
1507 addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
1508 if (i->Ain.Alu32R.op == Aalu_CMP) {
1509 addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
1510 return;
1512 addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
1513 return;
1514 case Ain_MulL:
1515 addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
1516 addHRegUse(u, HRmModify, hregAMD64_RAX());
1517 addHRegUse(u, HRmWrite, hregAMD64_RDX());
1518 return;
1519 case Ain_Div:
1520 addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
1521 addHRegUse(u, HRmModify, hregAMD64_RAX());
1522 addHRegUse(u, HRmModify, hregAMD64_RDX());
1523 return;
1524 case Ain_Push:
1525 addRegUsage_AMD64RMI(u, i->Ain.Push.src);
1526 addHRegUse(u, HRmModify, hregAMD64_RSP());
1527 return;
1528 case Ain_Call:
1529 /* This is a bit subtle. */
1530 /* First off, claim it trashes all the caller-saved regs
1531 which fall within the register allocator's jurisdiction.
1532 These I believe to be: rax rcx rdx rdi rsi r8 r9 r10
1533 and all the xmm registers. */
1534 addHRegUse(u, HRmWrite, hregAMD64_RAX());
1535 addHRegUse(u, HRmWrite, hregAMD64_RCX());
1536 addHRegUse(u, HRmWrite, hregAMD64_RDX());
1537 addHRegUse(u, HRmWrite, hregAMD64_RDI());
1538 addHRegUse(u, HRmWrite, hregAMD64_RSI());
1539 addHRegUse(u, HRmWrite, hregAMD64_R8());
1540 addHRegUse(u, HRmWrite, hregAMD64_R9());
1541 addHRegUse(u, HRmWrite, hregAMD64_R10());
1542 addHRegUse(u, HRmWrite, hregAMD64_XMM0());
1543 addHRegUse(u, HRmWrite, hregAMD64_XMM1());
1544 addHRegUse(u, HRmWrite, hregAMD64_XMM3());
1545 addHRegUse(u, HRmWrite, hregAMD64_XMM4());
1546 addHRegUse(u, HRmWrite, hregAMD64_XMM5());
1547 addHRegUse(u, HRmWrite, hregAMD64_XMM6());
1548 addHRegUse(u, HRmWrite, hregAMD64_XMM7());
1549 addHRegUse(u, HRmWrite, hregAMD64_XMM8());
1550 addHRegUse(u, HRmWrite, hregAMD64_XMM9());
1551 addHRegUse(u, HRmWrite, hregAMD64_XMM10());
1552 addHRegUse(u, HRmWrite, hregAMD64_XMM11());
1553 addHRegUse(u, HRmWrite, hregAMD64_XMM12());
1555 /* Now we have to state any parameter-carrying registers
1556 which might be read. This depends on the regparmness. */
1557 switch (i->Ain.Call.regparms) {
1558 case 6: addHRegUse(u, HRmRead, hregAMD64_R9()); /*fallthru*/
1559 case 5: addHRegUse(u, HRmRead, hregAMD64_R8()); /*fallthru*/
1560 case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
1561 case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
1562 case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
1563 case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
1564 case 0: break;
1565 default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
1567 /* Finally, there is the issue that the insn trashes a
1568 register because the literal target address has to be
1569 loaded into a register. Fortunately, r11 is stated in the
1570 ABI as a scratch register, and so seems a suitable victim. */
1571 addHRegUse(u, HRmWrite, hregAMD64_R11());
1572 /* Upshot of this is that the assembler really must use r11,
1573 and no other, as a destination temporary. */
1574 return;
1575 /* XDirect/XIndir/XAssisted are also a bit subtle. They
1576 conditionally exit the block. Hence we only need to list (1)
1577 the registers that they read, and (2) the registers that they
1578 write in the case where the block is not exited. (2) is
1579 empty, hence only (1) is relevant here. */
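      /* Note (illustrative, not from the original source): the scratch
         registers these exits clobber (%r11, and %rbp for the assisted
         case) are deliberately kept out of the allocatable part of the
         register universe above, so they never need to be reported here. */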
1580 case Ain_XDirect:
1581 /* Don't bother to mention the write to %r11, since it is not
1582 available to the allocator. */
1583 addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
1584 return;
1585 case Ain_XIndir:
1586 /* Ditto re %r11 */
1587 addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
1588 addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
1589 return;
1590 case Ain_XAssisted:
1591 /* Ditto re %r11 and %rbp (the baseblock ptr) */
1592 addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
1593 addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
1594 return;
1595 case Ain_CMov64:
1596 addHRegUse(u, HRmRead, i->Ain.CMov64.src);
1597 addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
1598 return;
1599 case Ain_CLoad:
1600 addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
1601 addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
1602 return;
1603 case Ain_CStore:
1604 addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
1605 addHRegUse(u, HRmRead, i->Ain.CStore.src);
1606 return;
1607 case Ain_MovxLQ:
1608 addHRegUse(u, HRmRead, i->Ain.MovxLQ.src);
1609 addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
1610 return;
1611 case Ain_LoadEX:
1612 addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
1613 addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
1614 return;
1615 case Ain_Store:
1616 addHRegUse(u, HRmRead, i->Ain.Store.src);
1617 addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
1618 return;
1619 case Ain_Set64:
1620 addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
1621 return;
1622 case Ain_Bsfr64:
1623 addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
1624 addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
1625 return;
1626 case Ain_MFence:
1627 return;
1628 case Ain_ACAS:
1629 addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
1630 addHRegUse(u, HRmRead, hregAMD64_RBX());
1631 addHRegUse(u, HRmModify, hregAMD64_RAX());
1632 return;
1633 case Ain_DACAS:
1634 addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
1635 addHRegUse(u, HRmRead, hregAMD64_RCX());
1636 addHRegUse(u, HRmRead, hregAMD64_RBX());
1637 addHRegUse(u, HRmModify, hregAMD64_RDX());
1638 addHRegUse(u, HRmModify, hregAMD64_RAX());
1639 return;
1640 case Ain_A87Free:
1641 return;
1642 case Ain_A87PushPop:
1643 addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
1644 return;
1645 case Ain_A87FpOp:
1646 return;
1647 case Ain_A87LdCW:
1648 addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
1649 return;
1650 case Ain_A87StSW:
1651 addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
1652 return;
1653 case Ain_LdMXCSR:
1654 addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
1655 return;
1656 case Ain_SseUComIS:
1657 addHRegUse(u, HRmRead, i->Ain.SseUComIS.srcL);
1658 addHRegUse(u, HRmRead, i->Ain.SseUComIS.srcR);
1659 addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
1660 return;
1661 case Ain_SseSI2SF:
1662 addHRegUse(u, HRmRead, i->Ain.SseSI2SF.src);
1663 addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
1664 return;
1665 case Ain_SseSF2SI:
1666 addHRegUse(u, HRmRead, i->Ain.SseSF2SI.src);
1667 addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
1668 return;
1669 case Ain_SseSDSS:
1670 addHRegUse(u, HRmRead, i->Ain.SseSDSS.src);
1671 addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
1672 return;
1673 case Ain_SseLdSt:
1674 addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
1675 addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
1676 i->Ain.SseLdSt.reg);
1677 return;
1678 case Ain_SseCStore:
1679 addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
1680 addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
1681 return;
1682 case Ain_SseCLoad:
1683 addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
1684 addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
1685 return;
1686 case Ain_SseLdzLO:
1687 addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1688 addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1689 return;
1690 case Ain_Sse32Fx4:
1691 vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1692 unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1693 || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1694 || i->Ain.Sse32Fx4.op == Asse_SQRTF
1695 || i->Ain.Sse32Fx4.op == Asse_I2F
1696 || i->Ain.Sse32Fx4.op == Asse_F2I
1697 || i->Ain.Sse32Fx4.op == Asse_F32toF16
1698 || i->Ain.Sse32Fx4.op == Asse_F16toF32 );
1699 addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1700 addHRegUse(u, unary ? HRmWrite : HRmModify,
1701 i->Ain.Sse32Fx4.dst);
1702 return;
1703 case Ain_Sse32FLo:
1704 vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1705 unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1706 || i->Ain.Sse32FLo.op == Asse_RSQRTF
1707 || i->Ain.Sse32FLo.op == Asse_SQRTF );
1708 addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1709 addHRegUse(u, unary ? HRmWrite : HRmModify,
1710 i->Ain.Sse32FLo.dst);
1711 return;
1712 case Ain_Sse64Fx2:
1713 vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1714 unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1715 || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1716 || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1717 addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1718 addHRegUse(u, unary ? HRmWrite : HRmModify,
1719 i->Ain.Sse64Fx2.dst);
1720 return;
1721 case Ain_Sse64FLo:
1722 vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1723 unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1724 || i->Ain.Sse64FLo.op == Asse_RSQRTF
1725 || i->Ain.Sse64FLo.op == Asse_SQRTF );
1726 addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1727 addHRegUse(u, unary ? HRmWrite : HRmModify,
1728 i->Ain.Sse64FLo.dst);
1729 return;
1730 case Ain_SseReRg:
1731 if ( (i->Ain.SseReRg.op == Asse_XOR
1732 || i->Ain.SseReRg.op == Asse_CMPEQ32)
1733 && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
1734 /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1735 r,r' as a write of a value to r, and independent of any
1736 previous value in r */
1737 /* (as opposed to a rite of passage :-) */
1738 addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1739 } else {
1740 addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1741 addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1742 ? HRmWrite : HRmModify,
1743 i->Ain.SseReRg.dst);
1745 if (i->Ain.SseReRg.op == Asse_MOV) {
1746 u->isRegRegMove = True;
1747 u->regMoveSrc = i->Ain.SseReRg.src;
1748 u->regMoveDst = i->Ain.SseReRg.dst;
1751 return;
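      /* Note (illustrative, not from the original source): the special case
         above matters for idioms like AMD64Instr_SseReRg(Asse_XOR, v, v),
         which leaves v all zeroes regardless of its prior contents;
         classifying it as a write rather than a modify lets the register
         allocator treat v as freshly defined. */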
1752 case Ain_SseCMov:
1753 addHRegUse(u, HRmRead, i->Ain.SseCMov.src);
1754 addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1755 return;
1756 case Ain_SseShuf:
1757 addHRegUse(u, HRmRead, i->Ain.SseShuf.src);
1758 addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1759 return;
1760 case Ain_SseShiftN:
1761 addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst);
1762 return;
1763 case Ain_SseMOVQ:
1764 addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmRead : HRmWrite,
1765 i->Ain.SseMOVQ.gpr);
1766 addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmWrite : HRmRead,
1767 i->Ain.SseMOVQ.xmm);
1768 return;
1769 //uu case Ain_AvxLdSt:
1770 //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1771 //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1772 //uu i->Ain.AvxLdSt.reg);
1773 //uu return;
1774 //uu case Ain_AvxReRg:
1775 //uu if ( (i->Ain.AvxReRg.op == Asse_XOR
1776 //uu || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1777 //uu && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1778 //uu /* See comments on the case for Ain_SseReRg. */
1779 //uu addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1780 //uu } else {
1781 //uu addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1782 //uu addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1783 //uu ? HRmWrite : HRmModify,
1784 //uu i->Ain.AvxReRg.dst);
1785 //uu
1786 //uu if (i->Ain.AvxReRg.op == Asse_MOV) {
1787 //uu u->isRegRegMove = True;
1788 //uu u->regMoveSrc = i->Ain.AvxReRg.src;
1789 //uu u->regMoveDst = i->Ain.AvxReRg.dst;
1790 //uu }
1791 //uu }
1792 //uu return;
1793 case Ain_EvCheck:
1794 /* We expect both amodes only to mention %rbp, so this is in
1795 fact pointless, since %rbp isn't allocatable, but anyway.. */
1796 addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1797 addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1798 return;
1799 case Ain_ProfInc:
1800 addHRegUse(u, HRmWrite, hregAMD64_R11());
1801 return;
1802 default:
1803 ppAMD64Instr(i, mode64);
1804 vpanic("getRegUsage_AMD64Instr");
1808 /* local helper */
1809 static inline void mapReg(HRegRemap* m, HReg* r)
1811 *r = lookupHRegRemap(m, *r);
1814 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1816 vassert(mode64 == True);
1817 switch (i->tag) {
1818 case Ain_Imm64:
1819 mapReg(m, &i->Ain.Imm64.dst);
1820 return;
1821 case Ain_Alu64R:
1822 mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1823 mapReg(m, &i->Ain.Alu64R.dst);
1824 return;
1825 case Ain_Alu64M:
1826 mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1827 mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1828 return;
1829 case Ain_Sh64:
1830 mapReg(m, &i->Ain.Sh64.dst);
1831 return;
1832 case Ain_Sh32:
1833 mapReg(m, &i->Ain.Sh32.dst);
1834 return;
1835 case Ain_Test64:
1836 mapReg(m, &i->Ain.Test64.dst);
1837 return;
1838 case Ain_Unary64:
1839 mapReg(m, &i->Ain.Unary64.dst);
1840 return;
1841 case Ain_Lea64:
1842 mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1843 mapReg(m, &i->Ain.Lea64.dst);
1844 return;
1845 case Ain_Alu32R:
1846 mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1847 mapReg(m, &i->Ain.Alu32R.dst);
1848 return;
1849 case Ain_MulL:
1850 mapRegs_AMD64RM(m, i->Ain.MulL.src);
1851 return;
1852 case Ain_Div:
1853 mapRegs_AMD64RM(m, i->Ain.Div.src);
1854 return;
1855 case Ain_Push:
1856 mapRegs_AMD64RMI(m, i->Ain.Push.src);
1857 return;
1858 case Ain_Call:
1859 return;
1860 case Ain_XDirect:
1861 mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1862 return;
1863 case Ain_XIndir:
1864 mapReg(m, &i->Ain.XIndir.dstGA);
1865 mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1866 return;
1867 case Ain_XAssisted:
1868 mapReg(m, &i->Ain.XAssisted.dstGA);
1869 mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1870 return;
1871 case Ain_CMov64:
1872 mapReg(m, &i->Ain.CMov64.src);
1873 mapReg(m, &i->Ain.CMov64.dst);
1874 return;
1875 case Ain_CLoad:
1876 mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
1877 mapReg(m, &i->Ain.CLoad.dst);
1878 return;
1879 case Ain_CStore:
1880 mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
1881 mapReg(m, &i->Ain.CStore.src);
1882 return;
1883 case Ain_MovxLQ:
1884 mapReg(m, &i->Ain.MovxLQ.src);
1885 mapReg(m, &i->Ain.MovxLQ.dst);
1886 return;
1887 case Ain_LoadEX:
1888 mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1889 mapReg(m, &i->Ain.LoadEX.dst);
1890 return;
1891 case Ain_Store:
1892 mapReg(m, &i->Ain.Store.src);
1893 mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1894 return;
1895 case Ain_Set64:
1896 mapReg(m, &i->Ain.Set64.dst);
1897 return;
1898 case Ain_Bsfr64:
1899 mapReg(m, &i->Ain.Bsfr64.src);
1900 mapReg(m, &i->Ain.Bsfr64.dst);
1901 return;
1902 case Ain_MFence:
1903 return;
1904 case Ain_ACAS:
1905 mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1906 return;
1907 case Ain_DACAS:
1908 mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1909 return;
1910 case Ain_A87Free:
1911 return;
1912 case Ain_A87PushPop:
1913 mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1914 return;
1915 case Ain_A87FpOp:
1916 return;
1917 case Ain_A87LdCW:
1918 mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1919 return;
1920 case Ain_A87StSW:
1921 mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1922 return;
1923 case Ain_LdMXCSR:
1924 mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1925 return;
1926 case Ain_SseUComIS:
1927 mapReg(m, &i->Ain.SseUComIS.srcL);
1928 mapReg(m, &i->Ain.SseUComIS.srcR);
1929 mapReg(m, &i->Ain.SseUComIS.dst);
1930 return;
1931 case Ain_SseSI2SF:
1932 mapReg(m, &i->Ain.SseSI2SF.src);
1933 mapReg(m, &i->Ain.SseSI2SF.dst);
1934 return;
1935 case Ain_SseSF2SI:
1936 mapReg(m, &i->Ain.SseSF2SI.src);
1937 mapReg(m, &i->Ain.SseSF2SI.dst);
1938 return;
1939 case Ain_SseSDSS:
1940 mapReg(m, &i->Ain.SseSDSS.src);
1941 mapReg(m, &i->Ain.SseSDSS.dst);
1942 return;
1943 case Ain_SseLdSt:
1944 mapReg(m, &i->Ain.SseLdSt.reg);
1945 mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1946 break;
1947 case Ain_SseCStore:
1948 mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
1949 mapReg(m, &i->Ain.SseCStore.src);
1950 return;
1951 case Ain_SseCLoad:
1952 mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
1953 mapReg(m, &i->Ain.SseCLoad.dst);
1954 return;
1955 case Ain_SseLdzLO:
1956 mapReg(m, &i->Ain.SseLdzLO.reg);
1957 mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
1958 break;
1959 case Ain_Sse32Fx4:
1960 mapReg(m, &i->Ain.Sse32Fx4.src);
1961 mapReg(m, &i->Ain.Sse32Fx4.dst);
1962 return;
1963 case Ain_Sse32FLo:
1964 mapReg(m, &i->Ain.Sse32FLo.src);
1965 mapReg(m, &i->Ain.Sse32FLo.dst);
1966 return;
1967 case Ain_Sse64Fx2:
1968 mapReg(m, &i->Ain.Sse64Fx2.src);
1969 mapReg(m, &i->Ain.Sse64Fx2.dst);
1970 return;
1971 case Ain_Sse64FLo:
1972 mapReg(m, &i->Ain.Sse64FLo.src);
1973 mapReg(m, &i->Ain.Sse64FLo.dst);
1974 return;
1975 case Ain_SseReRg:
1976 mapReg(m, &i->Ain.SseReRg.src);
1977 mapReg(m, &i->Ain.SseReRg.dst);
1978 return;
1979 case Ain_SseCMov:
1980 mapReg(m, &i->Ain.SseCMov.src);
1981 mapReg(m, &i->Ain.SseCMov.dst);
1982 return;
1983 case Ain_SseShuf:
1984 mapReg(m, &i->Ain.SseShuf.src);
1985 mapReg(m, &i->Ain.SseShuf.dst);
1986 return;
1987 case Ain_SseShiftN:
1988 mapReg(m, &i->Ain.SseShiftN.dst);
1989 return;
1990 case Ain_SseMOVQ:
1991 mapReg(m, &i->Ain.SseMOVQ.gpr);
1992 mapReg(m, &i->Ain.SseMOVQ.xmm);
1993 return;
1994 //uu case Ain_AvxLdSt:
1995 //uu mapReg(m, &i->Ain.AvxLdSt.reg);
1996 //uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1997 //uu break;
1998 //uu case Ain_AvxReRg:
1999 //uu mapReg(m, &i->Ain.AvxReRg.src);
2000 //uu mapReg(m, &i->Ain.AvxReRg.dst);
2001 //uu return;
2002 case Ain_EvCheck:
2003 /* We expect both amodes only to mention %rbp, so this is in
2004 fact pointless, since %rbp isn't allocatable, but anyway.. */
2005 mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
2006 mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
2007 return;
2008 case Ain_ProfInc:
2009 /* hardwires r11 -- nothing to modify. */
2010 return;
2011 default:
2012 ppAMD64Instr(i, mode64);
2013 vpanic("mapRegs_AMD64Instr");
2017 /* Generate amd64 spill/reload instructions under the direction of the
2018 register allocator. Note it's critical these don't write the
2019 condition codes. */
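/* For illustration: spilling an Int64 rreg such as %r12 at offsetB 48
   yields "movq %r12, 48(%rbp)" and the matching reload is
   "movq 48(%rbp), %r12"; a Vec128 rreg gives "movups %xmm3, 48(%rbp)"
   and "movups 48(%rbp), %xmm3".  None of these instructions modify
   %rflags, which is why the no-condition-code requirement holds. */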
2021 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
2022 HReg rreg, Int offsetB, Bool mode64 )
2024 AMD64AMode* am;
2025 vassert(offsetB >= 0);
2026 vassert(!hregIsVirtual(rreg));
2027 vassert(mode64 == True);
2028 *i1 = *i2 = NULL;
2029 am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
2030 switch (hregClass(rreg)) {
2031 case HRcInt64:
2032 *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
2033 return;
2034 case HRcVec128:
2035 *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
2036 return;
2037 default:
2038 ppHRegClass(hregClass(rreg));
2039 vpanic("genSpill_AMD64: unimplemented regclass");
2043 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
2044 HReg rreg, Int offsetB, Bool mode64 )
2046 AMD64AMode* am;
2047 vassert(offsetB >= 0);
2048 vassert(!hregIsVirtual(rreg));
2049 vassert(mode64 == True);
2050 *i1 = *i2 = NULL;
2051 am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
2052 switch (hregClass(rreg)) {
2053 case HRcInt64:
2054 *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
2055 return;
2056 case HRcVec128:
2057 *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
2058 return;
2059 default:
2060 ppHRegClass(hregClass(rreg));
2061 vpanic("genReload_AMD64: unimplemented regclass");
2065 AMD64Instr* genMove_AMD64(HReg from, HReg to, Bool mode64)
2067 switch (hregClass(from)) {
2068 case HRcInt64:
2069 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(from), to);
2070 case HRcVec128:
2071 return AMD64Instr_SseReRg(Asse_MOV, from, to);
2072 default:
2073 ppHRegClass(hregClass(from));
2074 vpanic("genMove_AMD64: unimplemented regclass");
2078 AMD64Instr* directReload_AMD64( AMD64Instr* i, HReg vreg, Short spill_off )
2080 vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
2082 /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
2083 Convert to: src=RMI_Mem, dst=Reg
2085 if (i->tag == Ain_Alu64R
2086 && (i->Ain.Alu64R.op == Aalu_MOV || i->Ain.Alu64R.op == Aalu_OR
2087 || i->Ain.Alu64R.op == Aalu_XOR)
2088 && i->Ain.Alu64R.src->tag == Armi_Reg
2089 && sameHReg(i->Ain.Alu64R.src->Armi.Reg.reg, vreg)) {
2090 vassert(! sameHReg(i->Ain.Alu64R.dst, vreg));
2091 return AMD64Instr_Alu64R(
2092 i->Ain.Alu64R.op,
2093 AMD64RMI_Mem( AMD64AMode_IR( spill_off, hregAMD64_RBP())),
2094 i->Ain.Alu64R.dst
2098 /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
2099 Convert to: src=RI_Imm, dst=Mem
2101 if (i->tag == Ain_Alu64R
2102 && (i->Ain.Alu64R.op == Aalu_CMP)
2103 && i->Ain.Alu64R.src->tag == Armi_Imm
2104 && sameHReg(i->Ain.Alu64R.dst, vreg)) {
2105 return AMD64Instr_Alu64M(
2106 i->Ain.Alu64R.op,
2107 AMD64RI_Imm( i->Ain.Alu64R.src->Armi.Imm.imm32 ),
2108 AMD64AMode_IR( spill_off, hregAMD64_RBP())
2112 return NULL;
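/* For illustration: with spill_off == 24, "orq %vreg, %rdi" (src == vreg)
   becomes "orq 24(%rbp), %rdi", and "cmpq $7, %vreg" (dst == vreg)
   becomes "cmpq $7, 24(%rbp)", so no separate reload instruction is
   needed for the spilled vreg. */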
2116 /* --------- The amd64 assembler (bleh.) --------- */
2118 /* Produce the low three bits of an integer register number. */
2119 inline static UInt iregEnc210 ( HReg r )
2121 UInt n;
2122 vassert(hregClass(r) == HRcInt64);
2123 vassert(!hregIsVirtual(r));
2124 n = hregEncoding(r);
2125 vassert(n <= 15);
2126 return n & 7;
2129 /* Produce bit 3 of an integer register number. */
2130 inline static UInt iregEnc3 ( HReg r )
2132 UInt n;
2133 vassert(hregClass(r) == HRcInt64);
2134 vassert(!hregIsVirtual(r));
2135 n = hregEncoding(r);
2136 vassert(n <= 15);
2137 return (n >> 3) & 1;
2140 /* Produce a complete 4-bit integer register number. */
2141 inline static UInt iregEnc3210 ( HReg r )
2143 UInt n;
2144 vassert(hregClass(r) == HRcInt64);
2145 vassert(!hregIsVirtual(r));
2146 n = hregEncoding(r);
2147 vassert(n <= 15);
2148 return n;
2151 /* Produce a complete 4-bit integer register number. */
2152 inline static UInt vregEnc3210 ( HReg r )
2154 UInt n;
2155 vassert(hregClass(r) == HRcVec128);
2156 vassert(!hregIsVirtual(r));
2157 n = hregEncoding(r);
2158 vassert(n <= 15);
2159 return n;
2162 inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
2164 vassert(mod < 4);
2165 vassert((reg|regmem) < 8);
2166 return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
2169 inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
2171 vassert(shift < 4);
2172 vassert((regindex|regbase) < 8);
2173 return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
2176 static UChar* emit32 ( UChar* p, UInt w32 )
2178 *p++ = toUChar((w32) & 0x000000FF);
2179 *p++ = toUChar((w32 >> 8) & 0x000000FF);
2180 *p++ = toUChar((w32 >> 16) & 0x000000FF);
2181 *p++ = toUChar((w32 >> 24) & 0x000000FF);
2182 return p;
2185 static UChar* emit64 ( UChar* p, ULong w64 )
2187 p = emit32(p, toUInt(w64 & 0xFFFFFFFF));
2188 p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
2189 return p;
2192 /* Does a sign-extend of the lowest 8 bits give
2193 the original number? */
2194 static Bool fits8bits ( UInt w32 )
2196 Int i32 = (Int)w32;
2197 return toBool(i32 == ((Int)(w32 << 24) >> 24));
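/* For example: fits8bits(0x7F) and fits8bits(0xFFFFFF80) hold, since
   those values are the sign-extensions of 0x7F and 0x80 respectively;
   fits8bits(0x80) does not, because 0x80 sign-extends to 0xFFFFFF80. */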
2199 /* Can the lower 32 bits be signedly widened to produce the whole
2200 64-bit value? In other words, are the top 33 bits either all 0 or
2201 all 1 ? */
2202 static Bool fitsIn32Bits ( ULong x )
2204 Long y1;
2205 y1 = x << 32;
2206 y1 >>=/*s*/ 32;
2207 return toBool(x == y1);
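/* For example: 0x000000007FFFFFFFULL and 0xFFFFFFFF80000000ULL both fit
   (top 33 bits all 0, respectively all 1), but 0x0000000080000000ULL does
   not, since sign-extending its low 32 bits would set the upper half. */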
2211 /* Forming mod-reg-rm bytes and scale-index-base bytes.
2213 greg, 0(ereg) | ereg is not any of: RSP RBP R12 R13
2214 = 00 greg ereg
2216 greg, d8(ereg) | ereg is neither of: RSP R12
2217 = 01 greg ereg, d8
2219 greg, d32(ereg) | ereg is neither of: RSP R12
2220 = 10 greg ereg, d32
2222 greg, d8(ereg) | ereg is either: RSP R12
2223 = 01 greg 100, 0x24, d8
2224 (lowest bit of rex distinguishes R12/RSP)
2226 greg, d32(ereg) | ereg is either: RSP R12
2227 = 10 greg 100, 0x24, d32
2228 (lowest bit of rex distinguishes R12/RSP)
2230 -----------------------------------------------
2232 greg, d8(base,index,scale)
2233 | index != RSP
2234 = 01 greg 100, scale index base, d8
2236 greg, d32(base,index,scale)
2237 | index != RSP
2238 = 10 greg 100, scale index base, d32
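/* Worked example: greg = %rax (encoding 0), am = 8(%rcx).  %rcx is
   neither RSP nor R12 and 8 fits in 8 bits, so the d8(ereg) form
   applies: mod=01, reg=000, rm=001, i.e. ModRM byte 0x41, followed by
   the disp8 byte 0x08. */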
2240 static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2242 UInt gregEnc210 = gregEnc3210 & 7;
2243 if (am->tag == Aam_IR) {
2244 if (am->Aam.IR.imm == 0
2245 && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2246 && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
2247 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2248 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
2250 *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
2251 return p;
2253 if (fits8bits(am->Aam.IR.imm)
2254 && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2255 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2257 *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
2258 *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2259 return p;
2261 if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2262 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2264 *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
2265 p = emit32(p, am->Aam.IR.imm);
2266 return p;
2268 if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2269 || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
2270 && fits8bits(am->Aam.IR.imm)) {
2271 *p++ = mkModRegRM(1, gregEnc210, 4);
2272 *p++ = 0x24;
2273 *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2274 return p;
2276 if (/* (sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2277 || wait for test case for RSP case */
2278 sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
2279 *p++ = mkModRegRM(2, gregEnc210, 4);
2280 *p++ = 0x24;
2281 p = emit32(p, am->Aam.IR.imm);
2282 return p;
2284 ppAMD64AMode(am);
2285 vpanic("doAMode_M: can't emit amode IR");
2286 /*NOTREACHED*/
2288 if (am->tag == Aam_IRRS) {
2289 if (fits8bits(am->Aam.IRRS.imm)
2290 && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2291 *p++ = mkModRegRM(1, gregEnc210, 4);
2292 *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2293 iregEnc210(am->Aam.IRRS.base));
2294 *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2295 return p;
2297 if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2298 *p++ = mkModRegRM(2, gregEnc210, 4);
2299 *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2300 iregEnc210(am->Aam.IRRS.base));
2301 p = emit32(p, am->Aam.IRRS.imm);
2302 return p;
2304 ppAMD64AMode(am);
2305 vpanic("doAMode_M: can't emit amode IRRS");
2306 /*NOTREACHED*/
2308 vpanic("doAMode_M: unknown amode");
2309 /*NOTREACHED*/
2312 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2314 return doAMode_M__wrk(p, iregEnc3210(greg), am);
2317 static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2319 vassert(gregEnc3210 < 16);
2320 return doAMode_M__wrk(p, gregEnc3210, am);
2324 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2325 inline
2326 static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2328 *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
2329 return p;
2332 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2334 return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
2337 static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
2339 vassert(gregEnc3210 < 16);
2340 return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
2343 static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
2345 vassert(eregEnc3210 < 16);
2346 return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
2349 static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2351 vassert( (gregEnc3210|eregEnc3210) < 16);
2352 return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
2356 /* Clear the W bit on a REX byte, thereby changing the operand size
2357 back to whatever that instruction's default operand size is. */
2358 static inline UChar clearWBit ( UChar rex )
2360 return rex & ~(1<<3);
2363 static inline UChar setWBit ( UChar rex )
2365 return rex | (1<<3);
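/* For example: clearWBit(0x48) == 0x40 (REX.W dropped, so the default
   operand size applies) and setWBit(0x40) == 0x48 (force 64-bit size). */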
2369 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
2370 inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
2372 if (am->tag == Aam_IR) {
2373 UChar W = 1; /* we want 64-bit mode */
2374 UChar R = (gregEnc3210 >> 3) & 1;
2375 UChar X = 0; /* not relevant */
2376 UChar B = iregEnc3(am->Aam.IR.reg);
2377 return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2379 if (am->tag == Aam_IRRS) {
2380 UChar W = 1; /* we want 64-bit mode */
2381 UChar R = (gregEnc3210 >> 3) & 1;
2382 UChar X = iregEnc3(am->Aam.IRRS.index);
2383 UChar B = iregEnc3(am->Aam.IRRS.base);
2384 return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2386 vassert(0);
2387 return 0; /*NOTREACHED*/
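/* Worked example: greg encoding 8 (%r8) with an Aam_IR amode based on
   %rax gives W=1, R=1, X=0, B=0, hence REX byte 0x40 + 0xC = 0x4C. */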
2390 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2392 return rexAMode_M__wrk(iregEnc3210(greg), am);
2395 static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
2397 vassert(gregEnc3210 < 16);
2398 return rexAMode_M__wrk(gregEnc3210, am);
2402 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
2403 inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
2405 UChar W = 1; /* we want 64-bit mode */
2406 UChar R = (gregEnc3210 >> 3) & 1;
2407 UChar X = 0; /* not relevant */
2408 UChar B = (eregEnc3210 >> 3) & 1;
2409 return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2412 static UChar rexAMode_R ( HReg greg, HReg ereg )
2414 return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
2417 static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
2419 vassert(gregEnc3210 < 16);
2420 return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
2423 static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
2425 vassert(eregEnc3210 < 16);
2426 return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
2429 static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
2431 vassert((gregEnc3210|eregEnc3210) < 16);
2432 return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
2436 //uu /* May 2012: this VEX prefix stuff is currently unused, but has
2437 //uu been verified correct (I reckon). Certainly it has been known to
2438 //uu produce correct VEX prefixes during testing. */
2439 //uu
2440 //uu /* Assemble a 2 or 3 byte VEX prefix from parts. rexR, rexX, rexB and
2441 //uu notVvvv need to be not-ed before packing. mmmmm, rexW, L and pp go
2442 //uu in verbatim. There's no range checking on the bits. */
2443 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2444 //uu UInt mmmmm, UInt rexW, UInt notVvvv,
2445 //uu UInt L, UInt pp )
2446 //uu {
2447 //uu UChar byte0 = 0;
2448 //uu UChar byte1 = 0;
2449 //uu UChar byte2 = 0;
2450 //uu if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2451 //uu /* 2 byte encoding is possible. */
2452 //uu byte0 = 0xC5;
2453 //uu byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2454 //uu | (L << 2) | pp;
2455 //uu } else {
2456 //uu /* 3 byte encoding is needed. */
2457 //uu byte0 = 0xC4;
2458 //uu byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2459 //uu | ((rexB ^ 1) << 5) | mmmmm;
2460 //uu byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2461 //uu }
2462 //uu return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2463 //uu }
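//uu /* For example: rexR=0, rexX=0, rexB=0, mmmmm=1, rexW=0, notVvvv=0,
//uu    L=1, pp=0 meets the 2-byte condition, giving byte0 = 0xC5 and
//uu    byte1 = (1 << 7) | (0xF << 3) | (1 << 2) = 0xFC, so the packed
//uu    result is 0x0000FCC5. */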
2464 //uu
2465 //uu /* Make up a VEX prefix for a (greg,amode) pair. First byte in bits
2466 //uu 7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2467 //uu 23:16. Has m-mmmm set to indicate a prefix of 0F, pp set to
2468 //uu indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2469 //uu vvvv=1111 (unused 3rd reg). */
2470 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2471 //uu {
2472 //uu UChar L = 1; /* size = 256 */
2473 //uu UChar pp = 0; /* no SIMD prefix */
2474 //uu UChar mmmmm = 1; /* 0F */
2475 //uu UChar notVvvv = 0; /* unused */
2476 //uu UChar rexW = 0;
2477 //uu UChar rexR = 0;
2478 //uu UChar rexX = 0;
2479 //uu UChar rexB = 0;
2480 //uu /* Same logic as in rexAMode_M. */
2481 //uu if (am->tag == Aam_IR) {
2482 //uu rexR = iregEnc3(greg);
2483 //uu rexX = 0; /* not relevant */
2484 //uu rexB = iregEnc3(am->Aam.IR.reg);
2485 //uu }
2486 //uu else if (am->tag == Aam_IRRS) {
2487 //uu rexR = iregEnc3(greg);
2488 //uu rexX = iregEnc3(am->Aam.IRRS.index);
2489 //uu rexB = iregEnc3(am->Aam.IRRS.base);
2490 //uu } else {
2491 //uu vassert(0);
2492 //uu }
2493 //uu return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2494 //uu }
2495 //uu
2496 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2497 //uu {
2498 //uu switch (vex & 0xFF) {
2499 //uu case 0xC5:
2500 //uu *p++ = 0xC5;
2501 //uu *p++ = (vex >> 8) & 0xFF;
2502 //uu vassert(0 == (vex >> 16));
2503 //uu break;
2504 //uu case 0xC4:
2505 //uu *p++ = 0xC4;
2506 //uu *p++ = (vex >> 8) & 0xFF;
2507 //uu *p++ = (vex >> 16) & 0xFF;
2508 //uu vassert(0 == (vex >> 24));
2509 //uu break;
2510 //uu default:
2511 //uu vassert(0);
2512 //uu }
2513 //uu return p;
2514 //uu }
2517 /* Emit ffree %st(N) */
2518 static UChar* do_ffree_st ( UChar* p, Int n )
2520 vassert(n >= 0 && n <= 7);
2521 *p++ = 0xDD;
2522 *p++ = toUChar(0xC0 + n);
2523 return p;
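/* For example: do_ffree_st(p, 7) emits the two bytes 0xDD 0xC7,
   i.e. "ffree %st(7)". */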
2526 /* Emit an instruction into buf and return the number of bytes used.
2527 Note that buf is not the insn's final place, and therefore it is
2528 imperative to emit position-independent code. If the emitted
2529 instruction was a profiler inc, set *is_profInc to True, else
2530 leave it unchanged. */
2532 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2533 UChar* buf, Int nbuf, const AMD64Instr* i,
2534 Bool mode64, VexEndness endness_host,
2535 const void* disp_cp_chain_me_to_slowEP,
2536 const void* disp_cp_chain_me_to_fastEP,
2537 const void* disp_cp_xindir,
2538 const void* disp_cp_xassisted )
2540 UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2541 UInt xtra;
2542 UInt reg;
2543 UChar rex;
2544 UChar* p = &buf[0];
2545 UChar* ptmp;
2546 Int j;
2547 vassert(nbuf >= 64);
2548 vassert(mode64 == True);
2550 /* vex_printf("asm "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2552 switch (i->tag) {
2554 case Ain_Imm64:
2555 if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2556 /* Use the short form (load into 32 bit reg, + default
2557 widening rule) for constants under 1 million. We could
2558 use this form for the range 0 to 0x7FFFFFFF inclusive, but
2559 limit it to a smaller range for verifiability purposes. */
2560 if (1 & iregEnc3(i->Ain.Imm64.dst))
2561 *p++ = 0x41;
2562 *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
2563 p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2564 } else {
2565 *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
2566 *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
2567 p = emit64(p, i->Ain.Imm64.imm64);
2569 goto done;
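/* For example: imm64 = 0x1234 into %r9 uses the short form
   41 B9 34 12 00 00 ("movl $0x1234, %r9d", zero-extended), whereas
   imm64 = 0x123456789A needs the long form
   49 B9 9A 78 56 34 12 00 00 00 ("movabsq $0x123456789A, %r9"). */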
2571 case Ain_Alu64R:
2572 /* Deal specially with MOV */
2573 if (i->Ain.Alu64R.op == Aalu_MOV) {
2574 switch (i->Ain.Alu64R.src->tag) {
2575 case Armi_Imm:
2576 if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2577 /* Actually we could use this form for constants in
2578 the range 0 through 0x7FFFFFFF inclusive, but
2579 limit it to a small range for verifiability
2580 purposes. */
2581 /* Generate "movl $imm32, 32-bit-register" and let
2582 the default zero-extend rule cause the upper half
2583 of the dst to be zeroed out too. This saves 1
2584 and sometimes 2 bytes compared to the more
2585 obvious encoding in the 'else' branch. */
2586 if (1 & iregEnc3(i->Ain.Alu64R.dst))
2587 *p++ = 0x41;
2588 *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
2589 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2590 } else {
2591 *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
2592 *p++ = 0xC7;
2593 *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
2594 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2596 goto done;
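/* For example: moving $5 into %rdx emits the 5-byte BA 05 00 00 00
   ("movl $5, %edx", upper half zeroed implicitly) rather than the
   7-byte 48 C7 C2 05 00 00 00 ("movq $5, %rdx"). */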
2597 case Armi_Reg:
2598 *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2599 i->Ain.Alu64R.dst );
2600 *p++ = 0x89;
2601 p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2602 i->Ain.Alu64R.dst);
2603 goto done;
2604 case Armi_Mem:
2605 *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2606 i->Ain.Alu64R.src->Armi.Mem.am);
2607 *p++ = 0x8B;
2608 p = doAMode_M(p, i->Ain.Alu64R.dst,
2609 i->Ain.Alu64R.src->Armi.Mem.am);
2610 goto done;
2611 default:
2612 goto bad;
2615 /* MUL */
2616 if (i->Ain.Alu64R.op == Aalu_MUL) {
2617 switch (i->Ain.Alu64R.src->tag) {
2618 case Armi_Reg:
2619 *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2620 i->Ain.Alu64R.src->Armi.Reg.reg);
2621 *p++ = 0x0F;
2622 *p++ = 0xAF;
2623 p = doAMode_R(p, i->Ain.Alu64R.dst,
2624 i->Ain.Alu64R.src->Armi.Reg.reg);
2625 goto done;
2626 case Armi_Mem:
2627 *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2628 i->Ain.Alu64R.src->Armi.Mem.am);
2629 *p++ = 0x0F;
2630 *p++ = 0xAF;
2631 p = doAMode_M(p, i->Ain.Alu64R.dst,
2632 i->Ain.Alu64R.src->Armi.Mem.am);
2633 goto done;
2634 case Armi_Imm:
2635 if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2636 *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2637 *p++ = 0x6B;
2638 p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2639 *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2640 } else {
2641 *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2642 *p++ = 0x69;
2643 p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2644 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2646 goto done;
2647 default:
2648 goto bad;
2651 /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2652 opc = opc_rr = subopc_imm = opc_imma = 0;
2653 switch (i->Ain.Alu64R.op) {
2654 case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2655 subopc_imm = 2; opc_imma = 0x15; break;
2656 case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2657 subopc_imm = 0; opc_imma = 0x05; break;
2658 case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2659 subopc_imm = 5; opc_imma = 0x2D; break;
2660 case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2661 subopc_imm = 3; opc_imma = 0x1D; break;
2662 case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2663 subopc_imm = 4; opc_imma = 0x25; break;
2664 case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2665 subopc_imm = 6; opc_imma = 0x35; break;
2666 case Aalu_OR: opc = 0x0B; opc_rr = 0x09;
2667 subopc_imm = 1; opc_imma = 0x0D; break;
2668 case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2669 subopc_imm = 7; opc_imma = 0x3D; break;
2670 default: goto bad;
2672 switch (i->Ain.Alu64R.src->tag) {
2673 case Armi_Imm:
2674 if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
2675 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2676 goto bad; /* FIXME: awaiting test case */
2677 *p++ = toUChar(opc_imma);
2678 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2679 } else
2680 if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2681 *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
2682 *p++ = 0x83;
2683 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2684 *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2685 } else {
2686 *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
2687 *p++ = 0x81;
2688 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2689 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2691 goto done;
2692 case Armi_Reg:
2693 *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2694 i->Ain.Alu64R.dst);
2695 *p++ = toUChar(opc_rr);
2696 p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2697 i->Ain.Alu64R.dst);
2698 goto done;
2699 case Armi_Mem:
2700 *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2701 i->Ain.Alu64R.src->Armi.Mem.am);
2702 *p++ = toUChar(opc);
2703 p = doAMode_M(p, i->Ain.Alu64R.dst,
2704 i->Ain.Alu64R.src->Armi.Mem.am);
2705 goto done;
2706 default:
2707 goto bad;
2709 break;
2711 case Ain_Alu64M:
2712 /* Deal specially with MOV */
2713 if (i->Ain.Alu64M.op == Aalu_MOV) {
2714 switch (i->Ain.Alu64M.src->tag) {
2715 case Ari_Reg:
2716 *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2717 i->Ain.Alu64M.dst);
2718 *p++ = 0x89;
2719 p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2720 i->Ain.Alu64M.dst);
2721 goto done;
2722 case Ari_Imm:
2723 *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
2724 *p++ = 0xC7;
2725 p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
2726 p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2727 goto done;
2728 default:
2729 goto bad;
2732 /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP. MUL is not
2733 allowed here. (This is derived from the x86 version of same). */
2734 opc = subopc_imm = opc_imma = 0;
2735 switch (i->Ain.Alu64M.op) {
2736 case Aalu_CMP: opc = 0x39; subopc_imm = 7; break;
2737 default: goto bad;
2739 switch (i->Ain.Alu64M.src->tag) {
2740 /*
2741 case Xri_Reg:
2742 *p++ = toUChar(opc);
2743 p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2744 i->Xin.Alu32M.dst);
2745 goto done;
2746 */
2747 case Ari_Imm:
2748 if (fits8bits(i->Ain.Alu64M.src->Ari.Imm.imm32)) {
2749 *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2750 *p++ = 0x83;
2751 p = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2752 *p++ = toUChar(0xFF & i->Ain.Alu64M.src->Ari.Imm.imm32);
2753 goto done;
2754 } else {
2755 *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2756 *p++ = 0x81;
2757 p = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2758 p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2759 goto done;
2761 default:
2762 goto bad;
2765 break;
2767 case Ain_Sh64:
2768 opc_cl = opc_imm = subopc = 0;
2769 switch (i->Ain.Sh64.op) {
2770 case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2771 case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2772 case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2773 default: goto bad;
2775 if (i->Ain.Sh64.src == 0) {
2776 *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2777 *p++ = toUChar(opc_cl);
2778 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2779 goto done;
2780 } else {
2781 *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2782 *p++ = toUChar(opc_imm);
2783 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2784 *p++ = (UChar)(i->Ain.Sh64.src);
2785 goto done;
2787 break;
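/* For example: an SHL of %rcx by the immediate 3 emits 48 C1 E1 03
   ("shlq $3, %rcx"); src == 0 means "shift by %cl" and emits
   48 D3 E1 ("shlq %cl, %rcx"). */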
2789 case Ain_Sh32:
2790 opc_cl = opc_imm = subopc = 0;
2791 switch (i->Ain.Sh32.op) {
2792 case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2793 case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2794 case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2795 default: goto bad;
2797 if (i->Ain.Sh32.src == 0) {
2798 rex = clearWBit( rexAMode_R_enc_reg(0, i->Ain.Sh32.dst) );
2799 if (rex != 0x40) *p++ = rex;
2800 *p++ = toUChar(opc_cl);
2801 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh32.dst);
2802 goto done;
2803 } else {
2804 rex = clearWBit( rexAMode_R_enc_reg(0, i->Ain.Sh32.dst) );
2805 if (rex != 0x40) *p++ = rex;
2806 *p++ = toUChar(opc_imm);
2807 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh32.dst);
2808 *p++ = (UChar)(i->Ain.Sh32.src);
2809 goto done;
2811 break;
2813 case Ain_Test64:
2814 /* testq sign-extend($imm32), %reg */
2815 *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
2816 *p++ = 0xF7;
2817 p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
2818 p = emit32(p, i->Ain.Test64.imm32);
2819 goto done;
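/* For example: Test64 of imm32 0x100 against %rcx emits
   48 F7 C1 00 01 00 00, i.e. "testq $0x100, %rcx". */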
2821 case Ain_Unary64:
2822 if (i->Ain.Unary64.op == Aun_NOT) {
2823 *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2824 *p++ = 0xF7;
2825 p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
2826 goto done;
2828 if (i->Ain.Unary64.op == Aun_NEG) {
2829 *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2830 *p++ = 0xF7;
2831 p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
2832 goto done;
2834 break;
2836 case Ain_Lea64:
2837 *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2838 *p++ = 0x8D;
2839 p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2840 goto done;
2842 case Ain_Alu32R:
2843 /* ADD/SUB/AND/OR/XOR/CMP */
2844 opc = opc_rr = subopc_imm = opc_imma = 0;
2845 switch (i->Ain.Alu32R.op) {
2846 case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2847 subopc_imm = 0; opc_imma = 0x05; break;
2848 case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2849 subopc_imm = 5; opc_imma = 0x2D; break;
2850 case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2851 subopc_imm = 4; opc_imma = 0x25; break;
2852 case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2853 subopc_imm = 6; opc_imma = 0x35; break;
2854 case Aalu_OR: opc = 0x0B; opc_rr = 0x09;
2855 subopc_imm = 1; opc_imma = 0x0D; break;
2856 case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2857 subopc_imm = 7; opc_imma = 0x3D; break;
2858 default: goto bad;
2860 switch (i->Ain.Alu32R.src->tag) {
2861 case Armi_Imm:
2862 if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
2863 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2864 goto bad; /* FIXME: awaiting test case */
2865 *p++ = toUChar(opc_imma);
2866 p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2867 } else
2868 if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2869 rex = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
2870 if (rex != 0x40) *p++ = rex;
2871 *p++ = 0x83;
2872 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2873 *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2874 } else {
2875 rex = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
2876 if (rex != 0x40) *p++ = rex;
2877 *p++ = 0x81;
2878 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2879 p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2881 goto done;
2882 case Armi_Reg:
2883 rex = clearWBit(
2884 rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2885 i->Ain.Alu32R.dst) );
2886 if (rex != 0x40) *p++ = rex;
2887 *p++ = toUChar(opc_rr);
2888 p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2889 i->Ain.Alu32R.dst);
2890 goto done;
2891 case Armi_Mem:
2892 rex = clearWBit(
2893 rexAMode_M( i->Ain.Alu32R.dst,
2894 i->Ain.Alu32R.src->Armi.Mem.am) );
2895 if (rex != 0x40) *p++ = rex;
2896 *p++ = toUChar(opc);
2897 p = doAMode_M(p, i->Ain.Alu32R.dst,
2898 i->Ain.Alu32R.src->Armi.Mem.am);
2899 goto done;
2900 default:
2901 goto bad;
2903 break;
2905 case Ain_MulL:
2906 subopc = i->Ain.MulL.syned ? 5 : 4;
2907 switch (i->Ain.MulL.src->tag) {
2908 case Arm_Mem:
2909 *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
2910 *p++ = 0xF7;
2911 p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
2912 goto done;
2913 case Arm_Reg:
2914 *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
2915 *p++ = 0xF7;
2916 p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
2917 goto done;
2918 default:
2919 goto bad;
2921 break;
2923 case Ain_Div:
2924 subopc = i->Ain.Div.syned ? 7 : 6;
2925 if (i->Ain.Div.sz == 4) {
2926 switch (i->Ain.Div.src->tag) {
2927 case Arm_Mem:
2928 goto bad;
2929 /*FIXME*/
2930 *p++ = 0xF7;
2931 p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2932 goto done;
2933 case Arm_Reg:
2934 *p++ = clearWBit(
2935 rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
2936 *p++ = 0xF7;
2937 p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2938 goto done;
2939 default:
2940 goto bad;
2943 if (i->Ain.Div.sz == 8) {
2944 switch (i->Ain.Div.src->tag) {
2945 case Arm_Mem:
2946 *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
2947 *p++ = 0xF7;
2948 p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2949 goto done;
2950 case Arm_Reg:
2951 *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
2952 *p++ = 0xF7;
2953 p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2954 goto done;
2955 default:
2956 goto bad;
2959 break;
2961 case Ain_Push:
2962 switch (i->Ain.Push.src->tag) {
2963 case Armi_Mem:
2964 *p++ = clearWBit(
2965 rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
2966 *p++ = 0xFF;
2967 p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
2968 goto done;
2969 case Armi_Imm:
2970 *p++ = 0x68;
2971 p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
2972 goto done;
2973 case Armi_Reg:
2974 *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
2975 *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
2976 goto done;
2977 default:
2978 goto bad;
2981 case Ain_Call: {
2982 /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
2983 above, %r11 is used as an address temporary. */
2984 /* If we don't need to do any fixup actions in the case that the
2985 call doesn't happen, just do the simple thing and emit
2986 straight-line code. This is usually the case. */
2987 if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
2988 || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
2989 /* jump over the following two insns if the condition does
2990 not hold */
2991 Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
2992 if (i->Ain.Call.cond != Acc_ALWAYS) {
2993 *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
2994 *p++ = shortImm ? 10 : 13;
2995 /* 10 or 13 bytes in the next two insns */
2997 if (shortImm) {
2998 /* 7 bytes: movl sign-extend(imm32), %r11 */
2999 *p++ = 0x49;
3000 *p++ = 0xC7;
3001 *p++ = 0xC3;
3002 p = emit32(p, (UInt)i->Ain.Call.target);
3003 } else {
3004 /* 10 bytes: movabsq $target, %r11 */
3005 *p++ = 0x49;
3006 *p++ = 0xBB;
3007 p = emit64(p, i->Ain.Call.target);
3009 /* 3 bytes: call *%r11 */
3010 *p++ = 0x41;
3011 *p++ = 0xFF;
3012 *p++ = 0xD3;
3013 } else {
3014 Int delta;
3015 /* Complex case. We have to generate an if-then-else diamond. */
3016 // before:
3017 // j{!cond} else:
3018 // movabsq $target, %r11
3019 // call* %r11
3020 // preElse:
3021 // jmp after:
3022 // else:
3023 // movabsq $0x5555555555555555, %rax // possibly
3024 // movq %rax, %rdx // possibly
3025 // after:
3027 // before:
3028 UChar* pBefore = p;
3030 // j{!cond} else:
3031 *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
3032 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3034 // movabsq $target, %r11
3035 *p++ = 0x49;
3036 *p++ = 0xBB;
3037 p = emit64(p, i->Ain.Call.target);
3039 // call* %r11
3040 *p++ = 0x41;
3041 *p++ = 0xFF;
3042 *p++ = 0xD3;
3044 // preElse:
3045 UChar* pPreElse = p;
3047 // jmp after:
3048 *p++ = 0xEB;
3049 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3051 // else:
3052 UChar* pElse = p;
3054 /* Do the 'else' actions */
3055 switch (i->Ain.Call.rloc.pri) {
3056 case RLPri_Int:
3057 // movabsq $0x5555555555555555, %rax
3058 *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
3059 break;
3060 case RLPri_2Int:
3061 goto bad; //ATC
3062 // movabsq $0x5555555555555555, %rax
3063 *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
3064 // movq %rax, %rdx
3065 *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
3066 break;
3067 case RLPri_V128SpRel:
3068 if (i->Ain.Call.rloc.spOff == 0) {
3069 // We could accept any |spOff| here, but that's more
3070 // hassle and the only value we're ever going to get
3071 // is zero (I believe.) Hence take the easy path :)
3072 // We need a scag register -- r11 can be it.
3073 // movabsq $0x5555555555555555, %r11
3074 *p++ = 0x49; *p++ = 0xBB;
3075 p = emit64(p, 0x5555555555555555ULL);
3076 // movq %r11, 0(%rsp)
3077 *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
3078 // movq %r11, 8(%rsp)
3079 *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
3080 *p++ = 0x08;
3081 break;
3083 goto bad; //ATC for all other spOff values
3084 case RLPri_V256SpRel:
3085 goto bad; //ATC
3086 case RLPri_None: case RLPri_INVALID: default:
3087 vassert(0); // should never get here
3090 // after:
3091 UChar* pAfter = p;
3093 // Fix up the branch offsets. The +2s in the offset
3094 // calculations are there because x86 requires conditional
3095 // branches to have their offset stated relative to the
3096 // instruction immediately following the branch insn. And in
3097 // both cases the branch insns are 2 bytes long.
3099 // First, the "j{!cond} else:" at pBefore.
3100 delta = (Int)(Long)(pElse - (pBefore + 2));
3101 vassert(delta >= 0 && delta < 100/*arbitrary*/);
3102 *(pBefore+1) = (UChar)delta;
3104 // And secondly, the "jmp after:" at pPreElse.
3105 delta = (Int)(Long)(pAfter - (pPreElse + 2));
3106 vassert(delta >= 0 && delta < 100/*arbitrary*/);
3107 *(pPreElse+1) = (UChar)delta;
3109 goto done;
3112 case Ain_XDirect: {
3113 /* NB: what goes on here has to be very closely coordinated with the
3114 chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
3115 /* We're generating chain-me requests here, so we need to be
3116 sure this is actually allowed -- no-redir translations can't
3117 use chain-me's. Hence: */
3118 vassert(disp_cp_chain_me_to_slowEP != NULL);
3119 vassert(disp_cp_chain_me_to_fastEP != NULL);
3121 HReg r11 = hregAMD64_R11();
3123 /* Use ptmp for backpatching conditional jumps. */
3124 ptmp = NULL;
3126 /* First off, if this is conditional, create a conditional
3127 jump over the rest of it. */
3128 if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3129 /* jmp fwds if !condition */
3130 *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
3131 ptmp = p; /* fill in this bit later */
3132 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3135 /* Update the guest RIP. */
3136 if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
3137 /* use a shorter encoding */
3138 /* movl sign-extend(dstGA), %r11 */
3139 *p++ = 0x49;
3140 *p++ = 0xC7;
3141 *p++ = 0xC3;
3142 p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
3143 } else {
3144 /* movabsq $dstGA, %r11 */
3145 *p++ = 0x49;
3146 *p++ = 0xBB;
3147 p = emit64(p, i->Ain.XDirect.dstGA);
3150 /* movq %r11, amRIP */
3151 *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
3152 *p++ = 0x89;
3153 p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
3155 /* --- FIRST PATCHABLE BYTE follows --- */
3156 /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
3157 to) backs up the return address, so as to find the address of
3158 the first patchable byte. So: don't change the length of the
3159 two instructions below. */
3160 /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
3161 *p++ = 0x49;
3162 *p++ = 0xBB;
3163 const void* disp_cp_chain_me
3164 = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
3165 : disp_cp_chain_me_to_slowEP;
3166 p = emit64(p, (Addr)disp_cp_chain_me);
3167 /* call *%r11 */
3168 *p++ = 0x41;
3169 *p++ = 0xFF;
3170 *p++ = 0xD3;
3171 /* --- END of PATCHABLE BYTES --- */
3173 /* Fix up the conditional jump, if there was one. */
3174 if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3175 Int delta = p - ptmp;
3176 vassert(delta > 0 && delta < 40);
3177 *ptmp = toUChar(delta-1);
3179 goto done;
3182 case Ain_XIndir: {
3183 /* We're generating transfers that could lead indirectly to a
3184 chain-me, so we need to be sure this is actually allowed --
3185 no-redir translations are not allowed to reach normal
3186 translations without going through the scheduler. That means
3187 no XDirects or XIndirs out from no-redir translations.
3188 Hence: */
3189 vassert(disp_cp_xindir != NULL);
3191 /* Use ptmp for backpatching conditional jumps. */
3192 ptmp = NULL;
3194 /* First off, if this is conditional, create a conditional
3195 jump over the rest of it. */
3196 if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3197 /* jmp fwds if !condition */
3198 *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
3199 ptmp = p; /* fill in this bit later */
3200 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3203 /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3204 *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3205 *p++ = 0x89;
3206 p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3208 /* get $disp_cp_xindir into %r11 */
3209 if (fitsIn32Bits((Addr)disp_cp_xindir)) {
3210 /* use a shorter encoding */
3211 /* movl sign-extend(disp_cp_xindir), %r11 */
3212 *p++ = 0x49;
3213 *p++ = 0xC7;
3214 *p++ = 0xC3;
3215 p = emit32(p, (UInt)(Addr)disp_cp_xindir);
3216 } else {
3217 /* movabsq $disp_cp_xindir, %r11 */
3218 *p++ = 0x49;
3219 *p++ = 0xBB;
3220 p = emit64(p, (Addr)disp_cp_xindir);
3223 /* jmp *%r11 */
3224 *p++ = 0x41;
3225 *p++ = 0xFF;
3226 *p++ = 0xE3;
3228 /* Fix up the conditional jump, if there was one. */
3229 if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3230 Int delta = p - ptmp;
3231 vassert(delta > 0 && delta < 40);
3232 *ptmp = toUChar(delta-1);
3234 goto done;
3237 case Ain_XAssisted: {
3238 /* Use ptmp for backpatching conditional jumps. */
3239 ptmp = NULL;
3241 /* First off, if this is conditional, create a conditional
3242 jump over the rest of it. */
3243 if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3244 /* jmp fwds if !condition */
3245 *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
3246 ptmp = p; /* fill in this bit later */
3247 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3250 /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3251 *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3252 *p++ = 0x89;
3253 p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3254 /* movl $magic_number, %ebp. Since these numbers are all small positive
3255 integers, we can get away with "movl $N, %ebp" rather than
3256 the longer "movq $N, %rbp". */
3257 UInt trcval = 0;
3258 switch (i->Ain.XAssisted.jk) {
3259 case Ijk_ClientReq: trcval = VEX_TRC_JMP_CLIENTREQ; break;
3260 case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
3261 case Ijk_Sys_int32: trcval = VEX_TRC_JMP_SYS_INT32; break;
3262 case Ijk_Sys_int210: trcval = VEX_TRC_JMP_SYS_INT210; break;
3263 case Ijk_Yield: trcval = VEX_TRC_JMP_YIELD; break;
3264 case Ijk_EmWarn: trcval = VEX_TRC_JMP_EMWARN; break;
3265 case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break;
3266 case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break;
3267 case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
3268 case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break;
3269 case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break;
3270 case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break;
3271 case Ijk_Boring: trcval = VEX_TRC_JMP_BORING; break;
3272 /* We don't expect to see the following being assisted. */
3273 case Ijk_Ret:
3274 case Ijk_Call:
3275 /* fallthrough */
3276 default:
3277 ppIRJumpKind(i->Ain.XAssisted.jk);
3278 vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
3280 vassert(trcval != 0);
3281 *p++ = 0xBD;
3282 p = emit32(p, trcval);
3283 /* movabsq $disp_cp_xassisted, %r11 */
3284 *p++ = 0x49;
3285 *p++ = 0xBB;
3286 p = emit64(p, (Addr)disp_cp_xassisted);
3287 /* jmp *%r11 */
3288 *p++ = 0x41;
3289 *p++ = 0xFF;
3290 *p++ = 0xE3;
3292 /* Fix up the conditional jump, if there was one. */
3293 if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3294 Int delta = p - ptmp;
3295 vassert(delta > 0 && delta < 40);
3296 *ptmp = toUChar(delta-1);
3298 goto done;
3301 case Ain_CMov64:
3302 vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
3303 *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
3304 *p++ = 0x0F;
3305 *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
3306 p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
3307 goto done;
3309 case Ain_CLoad: {
3310 vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
3312 /* Only 32- or 64-bit variants are allowed. */
3313 vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
3315 /* Use ptmp for backpatching conditional jumps. */
3316 ptmp = NULL;
3318 /* jmp fwds if !condition */
3319 *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
3320 ptmp = p; /* fill in this bit later */
3321 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3323 /* Now the load. Either a normal 64 bit load or a normal 32 bit
3324 load, which, by the default zero-extension rule, zeroes out
3325 the upper half of the destination, as required. */
3326 rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3327 *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
3328 *p++ = 0x8B;
3329 p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3331 /* Fix up the conditional branch */
3332 Int delta = p - ptmp;
3333 vassert(delta > 0 && delta < 40);
3334 *ptmp = toUChar(delta-1);
3335 goto done;
3338 case Ain_CStore: {
3339 /* AFAICS this is identical to Ain_CLoad except that the opcode
3340 is 0x89 instead of 0x8B. */
3341 vassert(i->Ain.CStore.cond != Acc_ALWAYS);
3343 /* Only 32- or 64-bit variants are allowed. */
3344 vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
3346 /* Use ptmp for backpatching conditional jumps. */
3347 ptmp = NULL;
3349 /* jmp fwds if !condition */
3350 *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
3351 ptmp = p; /* fill in this bit later */
3352 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3354 /* Now the store. */
3355 rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
3356 *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
3357 *p++ = 0x89;
3358 p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
3360 /* Fix up the conditional branch */
3361 Int delta = p - ptmp;
3362 vassert(delta > 0 && delta < 40);
3363 *ptmp = toUChar(delta-1);
3364 goto done;
3367 case Ain_MovxLQ:
3368 /* No, _don't_ ask me why the sense of the args has to be
3369 different in the S vs Z case. I don't know. */
3370 if (i->Ain.MovxLQ.syned) {
3371 /* Need REX.W = 1 here, but rexAMode_R does that for us. */
3372 *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3373 *p++ = 0x63;
3374 p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3375 } else {
3376 /* Produce a 32-bit reg-reg move, since the implicit
3377 zero-extend does what we want. */
3378 *p++ = clearWBit (
3379 rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
3380 *p++ = 0x89;
3381 p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
3383 goto done;
3385 case Ain_LoadEX:
3386 if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
3387 /* movzbq */
3388 *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3389 *p++ = 0x0F;
3390 *p++ = 0xB6;
3391 p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3392 goto done;
3394 if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
3395 /* movzwq */
3396 *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3397 *p++ = 0x0F;
3398 *p++ = 0xB7;
3399 p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3400 goto done;
3402 if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
3403 /* movzlq */
3404 /* This isn't really an existing AMD64 instruction per se.
3405 Rather, we have to do a 32-bit load. Because a 32-bit
3406 write implicitly clears the upper 32 bits of the target
3407 register, we get what we want. */
3408 *p++ = clearWBit(
3409 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
3410 *p++ = 0x8B;
3411 p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3412 goto done;
3414 break;
3416 case Ain_Set64:
3417 /* Make the destination register be 1 or 0, depending on whether
3418 the relevant condition holds. Complication: the top 56 bits
3419 of the destination should be forced to zero, but doing 'xorq
3420 %r,%r' kills the flag(s) we are about to read. Sigh. So
3421 start off by moving $0 into the dest. */
3422 reg = iregEnc3210(i->Ain.Set64.dst);
3423 vassert(reg < 16);
3425 /* movq $0, %dst */
3426 *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
3427 *p++ = 0xC7;
3428 *p++ = toUChar(0xC0 + (reg & 7));
3429 p = emit32(p, 0);
3431 /* setb lo8(%dst) */
3432 /* note, 8-bit register rex trickiness. Be careful here. */
3433 *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
3434 *p++ = 0x0F;
3435 *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
3436 *p++ = toUChar(0xC0 + (reg & 7));
3437 goto done;
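/* For example (assuming Acc_Z carries the hardware encoding 0x4):
   Set64 of %rax on condition Z emits 48 C7 C0 00 00 00 00
   ("movq $0, %rax") followed by 40 0F 94 C0 ("setz %al"). */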
3439 case Ain_Bsfr64:
3440 *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3441 *p++ = 0x0F;
3442 if (i->Ain.Bsfr64.isFwds) {
3443 *p++ = 0xBC;
3444 } else {
3445 *p++ = 0xBD;
3447 p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3448 goto done;
3450 case Ain_MFence:
3451 /* mfence */
3452 *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
3453 goto done;
3455 case Ain_ACAS:
3456 /* lock */
3457 *p++ = 0xF0;
3458 if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
3459 /* cmpxchg{b,w,l,q} %rbx,mem. Expected-value in %rax, new value
3460 in %rbx. The new-value register is hardwired to be %rbx
3461 since dealing with byte integer registers is too much hassle,
3462 so we force the register operand to %rbx (could equally be
3463 %rcx or %rdx). */
3464 rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3465 if (i->Ain.ACAS.sz != 8)
3466 rex = clearWBit(rex);
3468 *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3469 *p++ = 0x0F;
3470 if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3471 p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
3472 goto done;
3474 case Ain_DACAS:
3475 /* lock */
3476 *p++ = 0xF0;
3477 /* cmpxchg{8,16}b m{64,128}. Expected-value in %rdx:%rax, new
3478 value in %rcx:%rbx. All 4 regs are hardwired in the ISA, so
3479 aren't encoded in the insn. */
3480 rex = rexAMode_M_enc(1, i->Ain.DACAS.addr );
3481 if (i->Ain.DACAS.sz != 8)
3482 rex = clearWBit(rex);
3483 *p++ = rex;
3484 *p++ = 0x0F;
3485 *p++ = 0xC7;
3486 p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
3487 goto done;
3489 case Ain_A87Free:
3490 vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3491 for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3492 p = do_ffree_st(p, 7-j);
3494 goto done;
3496 case Ain_A87PushPop:
3497 vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3498 if (i->Ain.A87PushPop.isPush) {
3499 /* Load from memory into %st(0): flds/fldl amode */
3500 *p++ = clearWBit(
3501 rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
3502 *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3503 p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
3504 } else {
3505 /* Dump %st(0) to memory: fstps/fstpl amode */
3506 *p++ = clearWBit(
3507 rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
3508 *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3509 p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
3510 goto done;
3512 goto done;
3514 case Ain_A87FpOp:
3515 switch (i->Ain.A87FpOp.op) {
3516 case Afp_SQRT: *p++ = 0xD9; *p++ = 0xFA; break;
3517 case Afp_SIN: *p++ = 0xD9; *p++ = 0xFE; break;
3518 case Afp_COS: *p++ = 0xD9; *p++ = 0xFF; break;
3519 case Afp_ROUND: *p++ = 0xD9; *p++ = 0xFC; break;
3520 case Afp_2XM1: *p++ = 0xD9; *p++ = 0xF0; break;
3521 case Afp_SCALE: *p++ = 0xD9; *p++ = 0xFD; break;
3522 case Afp_ATAN: *p++ = 0xD9; *p++ = 0xF3; break;
3523 case Afp_YL2X: *p++ = 0xD9; *p++ = 0xF1; break;
3524 case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3525 case Afp_PREM: *p++ = 0xD9; *p++ = 0xF8; break;
3526 case Afp_PREM1: *p++ = 0xD9; *p++ = 0xF5; break;
3527 case Afp_TAN:
3528 /* fptan pushes 1.0 on the FP stack, except when the
3529 argument is out of range. Hence we have to do the
3530 instruction, then inspect C2 to see if there is an out
3531 of range condition. If there is, we skip the fincstp
3532 that is used by the in-range case to get rid of this
3533 extra 1.0 value. */
3534 *p++ = 0xD9; *p++ = 0xF2; // fptan
3535 *p++ = 0x50; // pushq %rax
3536 *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
3537 *p++ = 0x66; *p++ = 0xA9;
3538 *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
3539 *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
3540 *p++ = 0xD9; *p++ = 0xF7; // fincstp
3541 *p++ = 0x58; // after_fincstp: popq %rax
3542 break;
3543 default:
3544 goto bad;
3546 goto done;
3548 case Ain_A87LdCW:
3549 *p++ = clearWBit(
3550 rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
3551 *p++ = 0xD9;
3552 p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
3553 goto done;
3555 case Ain_A87StSW:
3556 *p++ = clearWBit(
3557 rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
3558 *p++ = 0xDD;
3559 p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
3560 goto done;
3562 case Ain_Store:
3563 if (i->Ain.Store.sz == 2) {
3564 /* This just goes to show the craziness of the instruction
3565 set encoding. We have to insert two prefix bytes, but be
3566 careful to avoid a conflict in what the size should be, by
3567 ensuring that REX.W = 0. */
3568 *p++ = 0x66; /* override to 16-bits */
3569 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3570 *p++ = 0x89;
3571 p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3572 goto done;
3574 if (i->Ain.Store.sz == 4) {
3575 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3576 *p++ = 0x89;
3577 p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3578 goto done;
3580 if (i->Ain.Store.sz == 1) {
3581 /* This is one place where it would be wrong to skip emitting
3582 a rex byte of 0x40, since the mere presence of rex changes
3583 the meaning of the byte register access. Be careful. */
3584 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3585 *p++ = 0x88;
3586 p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3587 goto done;
3589 break;
3591 case Ain_LdMXCSR:
3592 *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
3593 *p++ = 0x0F;
3594 *p++ = 0xAE;
3595 p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
3596 goto done;
3598 case Ain_SseUComIS:
3599 /* ucomi[sd] %srcL, %srcR ; pushfq ; popq %dst */
3600 /* ucomi[sd] %srcL, %srcR */
3601 if (i->Ain.SseUComIS.sz == 8) {
3602 *p++ = 0x66;
3603 } else {
3604 goto bad;
3605 vassert(i->Ain.SseUComIS.sz == 4);
3606 }
3607 *p++ = clearWBit (
3608 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
3609 vregEnc3210(i->Ain.SseUComIS.srcR) ));
3610 *p++ = 0x0F;
3611 *p++ = 0x2E;
3612 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
3613 vregEnc3210(i->Ain.SseUComIS.srcR) );
3614 /* pushfq */
3615 *p++ = 0x9C;
3616 /* popq %dst */
3617 *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
3618 *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
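/* Illustration: for dst == %r9 the two bytes above are 41 59, i.e. popq %r9; for a low register such as %rdx they are 40 5A, the 0x40 REX being redundant but harmless. */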
3619 goto done;
3621 case Ain_SseSI2SF:
3622 /* cvtsi2s[sd] %src, %dst */
3623 rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
3624 i->Ain.SseSI2SF.src );
3625 *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3626 *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3627 *p++ = 0x0F;
3628 *p++ = 0x2A;
3629 p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
3630 i->Ain.SseSI2SF.src );
3631 goto done;
3633 case Ain_SseSF2SI:
3634 /* cvts[sd]2si %src, %dst */
3635 rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
3636 vregEnc3210(i->Ain.SseSF2SI.src) );
3637 *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3638 *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3639 *p++ = 0x0F;
3640 *p++ = 0x2D;
3641 p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
3642 vregEnc3210(i->Ain.SseSF2SI.src) );
3643 goto done;
3645 case Ain_SseSDSS:
3646 /* cvtsd2ss/cvtss2sd %src, %dst */
3647 *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3648 *p++ = clearWBit(
3649 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
3650 vregEnc3210(i->Ain.SseSDSS.src) ));
3651 *p++ = 0x0F;
3652 *p++ = 0x5A;
3653 p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
3654 vregEnc3210(i->Ain.SseSDSS.src) );
3655 goto done;
3657 case Ain_SseLdSt:
3658 if (i->Ain.SseLdSt.sz == 8) {
3659 *p++ = 0xF2;
3660 } else
3661 if (i->Ain.SseLdSt.sz == 4) {
3662 *p++ = 0xF3;
3663 } else
3664 if (i->Ain.SseLdSt.sz != 16) {
3665 vassert(0);
3666 }
3667 *p++ = clearWBit(
3668 rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
3669 i->Ain.SseLdSt.addr));
3670 *p++ = 0x0F;
3671 *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3672 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
3673 i->Ain.SseLdSt.addr);
3674 goto done;
3676 case Ain_SseCStore: {
3677 vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
3679 /* Use ptmp for backpatching conditional jumps. */
3680 ptmp = NULL;
3682 /* jmp fwds if !condition */
3683 *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
3684 ptmp = p; /* fill in this bit later */
3685 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
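/* Illustration: the AMD64CondCode values follow the x86 condition-code encoding, so 0x70+cond is the matching short Jcc and xor-ing with 1 negates it -- e.g. a 'Z' (zero) condition emits jnz (0x75) here, skipping the store when the condition does not hold. */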
3687 /* Now the store. */
3688 *p++ = clearWBit(
3689 rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
3690 i->Ain.SseCStore.addr));
3691 *p++ = 0x0F;
3692 *p++ = toUChar(0x11);
3693 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
3694 i->Ain.SseCStore.addr);
3696 /* Fix up the conditional branch */
3697 Int delta = p - ptmp;
3698 vassert(delta > 0 && delta < 40);
3699 *ptmp = toUChar(delta-1);
3700 goto done;
3701 }
3703 case Ain_SseCLoad: {
3704 vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
3706 /* Use ptmp for backpatching conditional jumps. */
3707 ptmp = NULL;
3709 /* jmp fwds if !condition */
3710 *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
3711 ptmp = p; /* fill in this bit later */
3712 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3714 /* Now the load. */
3715 *p++ = clearWBit(
3716 rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
3717 i->Ain.SseCLoad.addr));
3718 *p++ = 0x0F;
3719 *p++ = toUChar(0x10);
3720 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
3721 i->Ain.SseCLoad.addr);
3723 /* Fix up the conditional branch */
3724 Int delta = p - ptmp;
3725 vassert(delta > 0 && delta < 40);
3726 *ptmp = toUChar(delta-1);
3727 goto done;
3728 }
3730 case Ain_SseLdzLO:
3731 vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3732 /* movs[sd] amode, %xmm-dst */
3733 *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3734 *p++ = clearWBit(
3735 rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
3736 i->Ain.SseLdzLO.addr));
3737 *p++ = 0x0F;
3738 *p++ = 0x10;
3739 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
3740 i->Ain.SseLdzLO.addr);
3741 goto done;
3743 case Ain_Sse32Fx4: {
3744 UInt srcRegNo = vregEnc3210(i->Ain.Sse32Fx4.src);
3745 UInt dstRegNo = vregEnc3210(i->Ain.Sse32Fx4.dst);
3746 // VEX encoded cases
3747 switch (i->Ain.Sse32Fx4.op) {
3748 case Asse_F16toF32: { // vcvtph2ps %xmmS, %xmmD
3749 UInt s = srcRegNo;
3750 UInt d = dstRegNo;
3751 // VCVTPH2PS %xmmS, %xmmD (s and d are both xmm regs, range 0 .. 15)
3752 // 0xC4 : ~d3 1 ~s3 0 0 0 1 0 : 0x79 : 0x13 : 1 1 d2 d1 d0 s2 s1 s0
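// Worked example (illustration): s == 1, d == 2 gives bytes C4 E2 79 13 D1,
// i.e. vcvtph2ps %xmm1, %xmm2.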
3753 UInt byte2 = ((((~d)>>3)&1)<<7) | (1<<6)
3754 | ((((~s)>>3)&1)<<5) | (1<<1);
3755 UInt byte5 = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
3756 *p++ = 0xC4;
3757 *p++ = byte2;
3758 *p++ = 0x79;
3759 *p++ = 0x13;
3760 *p++ = byte5;
3761 goto done;
3762 }
3763 case Asse_F32toF16: { // vcvtps2ph $4, %xmmS, %xmmD
3764 UInt s = srcRegNo;
3765 UInt d = dstRegNo;
3766 // VCVTPS2PH $4, %xmmS, %xmmD (s and d both xmm regs, range 0 .. 15)
3767 // 0xC4 : ~s3 1 ~d3 0 0 0 1 1 : 0x79
3768 // : 0x1D : 11 s2 s1 s0 d2 d1 d0 : 0x4
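// Worked example (illustration): s == 1, d == 2 gives bytes C4 E3 79 1D CA 04,
// i.e. vcvtps2ph $4, %xmm1, %xmm2.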
3769 UInt byte2 = ((((~s)>>3)&1)<<7) | (1<<6)
3770 | ((((~d)>>3)&1)<<5) | (1<<1) | (1 << 0);
3771 UInt byte5 = (1<<7) | (1<<6) | ((s&7) << 3) | ((d&7) << 0);
3772 *p++ = 0xC4;
3773 *p++ = byte2;
3774 *p++ = 0x79;
3775 *p++ = 0x1D;
3776 *p++ = byte5;
3777 *p++ = 0x04;
3778 goto done;
3779 }
3780 default: break;
3781 }
3782 // After this point, REX encoded cases only
3783 xtra = 0;
3784 switch (i->Ain.Sse32Fx4.op) {
3785 case Asse_F2I: *p++ = 0x66; break;
3786 default: break;
3787 }
3788 *p++ = clearWBit(rexAMode_R_enc_enc(dstRegNo, srcRegNo));
3789 *p++ = 0x0F;
3790 switch (i->Ain.Sse32Fx4.op) {
3791 case Asse_ADDF: *p++ = 0x58; break;
3792 case Asse_DIVF: *p++ = 0x5E; break;
3793 case Asse_MAXF: *p++ = 0x5F; break;
3794 case Asse_MINF: *p++ = 0x5D; break;
3795 case Asse_MULF: *p++ = 0x59; break;
3796 case Asse_RCPF: *p++ = 0x53; break;
3797 case Asse_RSQRTF: *p++ = 0x52; break;
3798 case Asse_SQRTF: *p++ = 0x51; break;
3799 case Asse_I2F: *p++ = 0x5B; break; // cvtdq2ps; no 0x66 pfx
3800 case Asse_F2I: *p++ = 0x5B; break; // cvtps2dq; with 0x66 pfx
3801 case Asse_SUBF: *p++ = 0x5C; break;
3802 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3803 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3804 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3805 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3806 default: goto bad;
3807 }
3808 p = doAMode_R_enc_enc(p, dstRegNo, srcRegNo);
3809 if (xtra & 0x100)
3810 *p++ = toUChar(xtra & 0xFF);
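/* Illustration: the 0x100 bit in xtra only flags that a predicate immediate follows; e.g. Asse_CMPLTF emits 0F C2 /r with trailing immediate 01, i.e. cmpltps. */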
3811 goto done;
3812 }
3814 case Ain_Sse64Fx2:
3815 xtra = 0;
3816 *p++ = 0x66;
3817 *p++ = clearWBit(
3818 rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
3819 vregEnc3210(i->Ain.Sse64Fx2.src) ));
3820 *p++ = 0x0F;
3821 switch (i->Ain.Sse64Fx2.op) {
3822 case Asse_ADDF: *p++ = 0x58; break;
3823 case Asse_DIVF: *p++ = 0x5E; break;
3824 case Asse_MAXF: *p++ = 0x5F; break;
3825 case Asse_MINF: *p++ = 0x5D; break;
3826 case Asse_MULF: *p++ = 0x59; break;
3827 case Asse_SQRTF: *p++ = 0x51; break;
3828 case Asse_SUBF: *p++ = 0x5C; break;
3829 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3830 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3831 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3832 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3833 default: goto bad;
3834 }
3835 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
3836 vregEnc3210(i->Ain.Sse64Fx2.src) );
3837 if (xtra & 0x100)
3838 *p++ = toUChar(xtra & 0xFF);
3839 goto done;
3841 case Ain_Sse32FLo:
3842 xtra = 0;
3843 *p++ = 0xF3;
3844 *p++ = clearWBit(
3845 rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
3846 vregEnc3210(i->Ain.Sse32FLo.src) ));
3847 *p++ = 0x0F;
3848 switch (i->Ain.Sse32FLo.op) {
3849 case Asse_ADDF: *p++ = 0x58; break;
3850 case Asse_DIVF: *p++ = 0x5E; break;
3851 case Asse_MAXF: *p++ = 0x5F; break;
3852 case Asse_MINF: *p++ = 0x5D; break;
3853 case Asse_MULF: *p++ = 0x59; break;
3854 case Asse_RCPF: *p++ = 0x53; break;
3855 case Asse_RSQRTF: *p++ = 0x52; break;
3856 case Asse_SQRTF: *p++ = 0x51; break;
3857 case Asse_SUBF: *p++ = 0x5C; break;
3858 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3859 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3860 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3861 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3862 default: goto bad;
3863 }
3864 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
3865 vregEnc3210(i->Ain.Sse32FLo.src) );
3866 if (xtra & 0x100)
3867 *p++ = toUChar(xtra & 0xFF);
3868 goto done;
3870 case Ain_Sse64FLo:
3871 xtra = 0;
3872 *p++ = 0xF2;
3873 *p++ = clearWBit(
3874 rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
3875 vregEnc3210(i->Ain.Sse64FLo.src) ));
3876 *p++ = 0x0F;
3877 switch (i->Ain.Sse64FLo.op) {
3878 case Asse_ADDF: *p++ = 0x58; break;
3879 case Asse_DIVF: *p++ = 0x5E; break;
3880 case Asse_MAXF: *p++ = 0x5F; break;
3881 case Asse_MINF: *p++ = 0x5D; break;
3882 case Asse_MULF: *p++ = 0x59; break;
3883 case Asse_SQRTF: *p++ = 0x51; break;
3884 case Asse_SUBF: *p++ = 0x5C; break;
3885 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3886 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3887 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3888 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3889 default: goto bad;
3890 }
3891 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
3892 vregEnc3210(i->Ain.Sse64FLo.src) );
3893 if (xtra & 0x100)
3894 *p++ = toUChar(xtra & 0xFF);
3895 goto done;
3897 case Ain_SseReRg:
3898 # define XX(_n) *p++ = (_n)
3900 rex = clearWBit(
3901 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
3902 vregEnc3210(i->Ain.SseReRg.src) ));
3904 switch (i->Ain.SseReRg.op) {
3905 case Asse_MOV: /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3906 case Asse_OR: XX(rex); XX(0x0F); XX(0x56); break;
3907 case Asse_XOR: XX(rex); XX(0x0F); XX(0x57); break;
3908 case Asse_AND: XX(rex); XX(0x0F); XX(0x54); break;
3909 case Asse_ANDN: XX(rex); XX(0x0F); XX(0x55); break;
3910 case Asse_PACKSSD: XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3911 case Asse_PACKSSW: XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3912 case Asse_PACKUSW: XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3913 case Asse_ADD8: XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3914 case Asse_ADD16: XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3915 case Asse_ADD32: XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3916 case Asse_ADD64: XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3917 case Asse_QADD8S: XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3918 case Asse_QADD16S: XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3919 case Asse_QADD8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3920 case Asse_QADD16U: XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3921 case Asse_AVG8U: XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3922 case Asse_AVG16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3923 case Asse_CMPEQ8: XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3924 case Asse_CMPEQ16: XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3925 case Asse_CMPEQ32: XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3926 case Asse_CMPGT8S: XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3927 case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3928 case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3929 case Asse_MAX16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3930 case Asse_MAX8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3931 case Asse_MIN16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3932 case Asse_MIN8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3933 case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3934 case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3935 case Asse_MUL16: XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3936 case Asse_SHL16: XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3937 case Asse_SHL32: XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
3938 case Asse_SHL64: XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
3939 case Asse_SAR16: XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
3940 case Asse_SAR32: XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
3941 case Asse_SHR16: XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
3942 case Asse_SHR32: XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
3943 case Asse_SHR64: XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
3944 case Asse_SUB8: XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
3945 case Asse_SUB16: XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
3946 case Asse_SUB32: XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
3947 case Asse_SUB64: XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
3948 case Asse_QSUB8S: XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
3949 case Asse_QSUB16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
3950 case Asse_QSUB8U: XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
3951 case Asse_QSUB16U: XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
3952 case Asse_UNPCKHB: XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
3953 case Asse_UNPCKHW: XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
3954 case Asse_UNPCKHD: XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
3955 case Asse_UNPCKHQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
3956 case Asse_UNPCKLB: XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
3957 case Asse_UNPCKLW: XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
3958 case Asse_UNPCKLD: XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
3959 case Asse_UNPCKLQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
3960 case Asse_PSHUFB: XX(0x66); XX(rex);
3961 XX(0x0F); XX(0x38); XX(0x00); break;
3962 case Asse_PMADDUBSW:XX(0x66); XX(rex);
3963 XX(0x0F); XX(0x38); XX(0x04); break;
3964 default: goto bad;
3965 }
3966 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
3967 vregEnc3210(i->Ain.SseReRg.src) );
3968 # undef XX
3969 goto done;
3971 case Ain_SseCMov:
3972 /* jmp fwds if !condition */
3973 *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
3974 *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
3975 ptmp = p;
3977 /* movaps %src, %dst */
3978 *p++ = clearWBit(
3979 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
3980 vregEnc3210(i->Ain.SseCMov.src) ));
3981 *p++ = 0x0F;
3982 *p++ = 0x28;
3983 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
3984 vregEnc3210(i->Ain.SseCMov.src) );
3986 /* Fill in the jump offset. */
3987 *(ptmp-1) = toUChar(p - ptmp);
3988 goto done;
3990 case Ain_SseShuf:
3991 *p++ = 0x66;
3992 *p++ = clearWBit(
3993 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
3994 vregEnc3210(i->Ain.SseShuf.src) ));
3995 *p++ = 0x0F;
3996 *p++ = 0x70;
3997 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
3998 vregEnc3210(i->Ain.SseShuf.src) );
3999 *p++ = (UChar)(i->Ain.SseShuf.order);
4000 goto done;
4002 case Ain_SseShiftN: {
4003 UInt limit = 0;
4004 UInt shiftImm = i->Ain.SseShiftN.shiftBits;
4005 switch (i->Ain.SseShiftN.op) {
4006 case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
4007 case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
4008 case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
4009 case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
4010 case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
4011 case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
4012 case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
4013 case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
4014 case Asse_SHL128:
4015 if ((shiftImm & 7) != 0) goto bad;
4016 shiftImm >>= 3;
4017 limit = 15; opc = 0x73; subopc_imm = 7;
4018 break;
4019 case Asse_SHR128:
4020 if ((shiftImm & 7) != 0) goto bad;
4021 shiftImm >>= 3;
4022 limit = 15; opc = 0x73; subopc_imm = 3;
4023 break;
4024 default:
4025 // This should never happen .. SSE2 only offers the above 10 insns
4026 // for the "shift with immediate" case
4027 goto bad;
4028 }
4029 vassert(limit > 0 && opc > 0 && subopc_imm > 0);
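/* Illustration: Asse_SHL128 with shiftBits == 16 reaches here as shiftImm == 2 and is emitted as pslldq $2 (opcode 0x73, /7), a byte-granularity shift; bit counts that are not a multiple of 8 were rejected above. */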
4030 if (shiftImm > limit) goto bad;
4031 *p++ = 0x66;
4032 *p++ = clearWBit(
4033 rexAMode_R_enc_enc( subopc_imm,
4034 vregEnc3210(i->Ain.SseShiftN.dst) ));
4035 *p++ = 0x0F;
4036 *p++ = opc;
4037 p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
4038 *p++ = shiftImm;
4039 goto done;
4040 }
4042 case Ain_SseMOVQ: {
4043 Bool toXMM = i->Ain.SseMOVQ.toXMM;
4044 HReg gpr = i->Ain.SseMOVQ.gpr;
4045 HReg xmm = i->Ain.SseMOVQ.xmm;
4046 *p++ = 0x66;
4047 *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) );
4048 *p++ = 0x0F;
4049 *p++ = toXMM ? 0x6E : 0x7E;
4050 p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) );
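/* Illustration: toXMM == True with gpr == %rax, xmm == %xmm0 gives 66 48 0F 6E C0, i.e. movq %rax,%xmm0; toXMM == False only flips the opcode to 0x7E, giving movq %xmm0,%rax. */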
4051 goto done;
4052 }
4054 //uu case Ain_AvxLdSt: {
4055 //uu UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
4056 //uu i->Ain.AvxLdSt.addr );
4057 //uu p = emitVexPrefix(p, vex);
4058 //uu *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
4059 //uu p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
4060 //uu goto done;
4061 //uu }
4063 case Ain_EvCheck: {
4064 /* We generate:
4065 (3 bytes) decl 8(%rbp) 8 == offsetof(host_EvC_COUNTER)
4066 (2 bytes) jns nofail expected taken
4067 (3 bytes) jmp* 0(%rbp) 0 == offsetof(host_EvC_FAILADDR)
4068 nofail:
4069 */
4070 /* This is heavily asserted re instruction lengths. It needs to
4071 be. If we get given unexpected forms of .amCounter or
4072 .amFailAddr -- basically, anything that's not of the form
4073 uimm7(%rbp) -- the assertions here are likely to fail. */
4074 /* Note also that after the decl we must be very careful not to
4075 read the carry flag, else we get a partial flags stall.
4076 js/jns avoids that, though. */
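/* Worked example (illustration), assuming amCounter == 8(%rbp) and amFailAddr == 0(%rbp) as in the sketch above: the 8 bytes emitted are FF 4D 08 (decl 8(%rbp)), 79 03 (jns +3), FF 65 00 (jmp* 0(%rbp)) -- matching evCheckSzB_AMD64() == 8. */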
4077 UChar* p0 = p;
4078 /* --- decl 8(%rbp) --- */
4079 /* Need to compute the REX byte for the decl in order to prove
4080 that we don't need it, since this is a 32-bit decl and all
4081 registers involved in the amode are < r8. "1" because
4082 there's no register in this encoding; instead the register
4083 field is used as a sub opcode. The encoding for "decl r/m32"
4084 is FF /1, hence the "1". */
4085 rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
4086 if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
4087 *p++ = 0xFF;
4088 p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
4089 vassert(p - p0 == 3);
4090 /* --- jns nofail --- */
4091 *p++ = 0x79;
4092 *p++ = 0x03; /* need to check this 0x03 after the next insn */
4093 vassert(p - p0 == 5);
4094 /* --- jmp* 0(%rbp) --- */
4095 /* Once again, verify we don't need REX. The encoding is FF /4.
4096 We don't need REX.W since by default FF /4 in 64-bit mode
4097 implies a 64 bit load. */
4098 rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
4099 if (rex != 0x40) goto bad;
4100 *p++ = 0xFF;
4101 p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
4102 vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
4103 /* And crosscheck .. */
4104 vassert(evCheckSzB_AMD64() == 8);
4105 goto done;
4106 }
4108 case Ain_ProfInc: {
4109 /* We generate movabsq $0, %r11
4110 incq (%r11)
4111 in the expectation that a later call to LibVEX_patchProfCtr
4112 will be used to fill in the immediate field once the right
4113 value is known.
4114 49 BB 00 00 00 00 00 00 00 00
4115 49 FF 03
4116 */
4117 *p++ = 0x49; *p++ = 0xBB;
4118 *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
4119 *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
4120 *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
4121 /* Tell the caller .. */
4122 vassert(!(*is_profInc));
4123 *is_profInc = True;
4124 goto done;
4125 }
4127 default:
4128 goto bad;
4129 }
4131 bad:
4132 ppAMD64Instr(i, mode64);
4133 vpanic("emit_AMD64Instr");
4134 /*NOTREACHED*/
4136 done:
4137 vassert(p - &buf[0] <= 64);
4138 return p - &buf[0];
4139 }
4142 /* How big is an event check? See case for Ain_EvCheck in
4143 emit_AMD64Instr just above. That crosschecks what this returns, so
4144 we can tell if we're inconsistent. */
4145 Int evCheckSzB_AMD64 (void)
4146 {
4147 return 8;
4148 }
4151 /* NB: what goes on here has to be very closely coordinated with the
4152 emitInstr case for XDirect, above. */
4153 VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
4154 void* place_to_chain,
4155 const void* disp_cp_chain_me_EXPECTED,
4156 const void* place_to_jump_to )
4157 {
4158 vassert(endness_host == VexEndnessLE);
4160 /* What we're expecting to see is:
4161 movabsq $disp_cp_chain_me_EXPECTED, %r11
4162 call *%r11
4164 49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
4165 41 FF D3
4166 */
4167 UChar* p = (UChar*)place_to_chain;
4168 vassert(p[0] == 0x49);
4169 vassert(p[1] == 0xBB);
4170 vassert(read_misaligned_ULong_LE(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
4171 vassert(p[10] == 0x41);
4172 vassert(p[11] == 0xFF);
4173 vassert(p[12] == 0xD3);
4174 /* And what we want to change it to is either:
4175 (general case):
4176 movabsq $place_to_jump_to, %r11
4177 jmpq *%r11
4179 49 BB <8 bytes value == place_to_jump_to>
4180 41 FF E3
4181 So it's the same length (convenient, huh) and we don't
4182 need to change all the bits.
4183 ---OR---
4184 in the case where the displacement falls within 32 bits
4185 jmpq disp32 where disp32 is relative to the next insn
4186 ud2; ud2; ud2; ud2
4188 E9 <4 bytes == disp32>
4189 0F 0B 0F 0B 0F 0B 0F 0B
4191 In both cases the replacement has the same length as the original.
4192 To remain sane & verifiable,
4193 (1) limit the displacement for the short form to
4194 (say) +/- one billion, so as to avoid wraparound
4195 off-by-ones
4196 (2) even if the short form is applicable, once every (say)
4197 1024 times use the long form anyway, so as to maintain
4198 verifiability
4199 */
4200 /* This is the delta we need to put into a JMP d32 insn. It's
4201 relative to the start of the next insn, hence the -5. */
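/* Worked example (illustration): if p == 0x1000 and place_to_jump_to == 0x2000, delta == 0xFFB, and the short form below becomes E9 FB 0F 00 00 -- a jump whose next-insn address 0x1005 plus 0xFFB lands on 0x2000. */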
4202 Long delta = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
4203 Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
4205 static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
4206 if (shortOK) {
4207 shortCTR++; // not thread-safe, but a lost update here is harmless
4208 if (0 == (shortCTR & 0x3FF)) {
4209 shortOK = False;
4210 if (0)
4211 vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
4212 "using long jmp\n", shortCTR);
4216 /* And make the modifications. */
4217 if (shortOK) {
4218 p[0] = 0xE9;
4219 write_misaligned_UInt_LE(&p[1], (UInt)(Int)delta);
4220 p[5] = 0x0F; p[6] = 0x0B;
4221 p[7] = 0x0F; p[8] = 0x0B;
4222 p[9] = 0x0F; p[10] = 0x0B;
4223 p[11] = 0x0F; p[12] = 0x0B;
4224 /* sanity check on the delta -- top 32 are all 0 or all 1 */
4225 delta >>= 32;
4226 vassert(delta == 0LL || delta == -1LL);
4227 } else {
4228 /* Minimal modifications from the starting sequence. */
4229 write_misaligned_ULong_LE(&p[2], (ULong)(Addr)place_to_jump_to);
4230 p[12] = 0xE3;
4231 }
4232 VexInvalRange vir = { (HWord)place_to_chain, 13 };
4233 return vir;
4234 }
4237 /* NB: what goes on here has to be very closely coordinated with the
4238 emitInstr case for XDirect, above. */
4239 VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
4240 void* place_to_unchain,
4241 const void* place_to_jump_to_EXPECTED,
4242 const void* disp_cp_chain_me )
4243 {
4244 vassert(endness_host == VexEndnessLE);
4246 /* What we're expecting to see is either:
4247 (general case)
4248 movabsq $place_to_jump_to_EXPECTED, %r11
4249 jmpq *%r11
4251 49 BB <8 bytes value == place_to_jump_to_EXPECTED>
4252 41 FF E3
4253 ---OR---
4254 in the case where the displacement falls within 32 bits
4255 jmpq d32
4256 ud2; ud2; ud2; ud2
4258 E9 <4 bytes == disp32>
4259 0F 0B 0F 0B 0F 0B 0F 0B
4260 */
4261 UChar* p = (UChar*)place_to_unchain;
4262 Bool valid = False;
4263 if (p[0] == 0x49 && p[1] == 0xBB
4264 && read_misaligned_ULong_LE(&p[2])
4265 == (ULong)(Addr)place_to_jump_to_EXPECTED
4266 && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
4267 /* it's the long form */
4268 valid = True;
4269 }
4270 else
4271 if (p[0] == 0xE9
4272 && p[5] == 0x0F && p[6] == 0x0B
4273 && p[7] == 0x0F && p[8] == 0x0B
4274 && p[9] == 0x0F && p[10] == 0x0B
4275 && p[11] == 0x0F && p[12] == 0x0B) {
4276 /* It's the short form. Check the offset is right. */
4277 Int s32 = (Int)read_misaligned_UInt_LE(&p[1]);
4278 Long s64 = (Long)s32;
4279 if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
4280 valid = True;
4281 if (0)
4282 vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
4283 }
4284 }
4285 vassert(valid);
4286 /* And what we want to change it to is:
4287 movabsq $disp_cp_chain_me, %r11
4288 call *%r11
4290 49 BB <8 bytes value == disp_cp_chain_me>
4291 41 FF D3
4292 So it's the same length (convenient, huh).
4293 */
4294 p[0] = 0x49;
4295 p[1] = 0xBB;
4296 write_misaligned_ULong_LE(&p[2], (ULong)(Addr)disp_cp_chain_me);
4297 p[10] = 0x41;
4298 p[11] = 0xFF;
4299 p[12] = 0xD3;
4300 VexInvalRange vir = { (HWord)place_to_unchain, 13 };
4301 return vir;
4302 }
4305 /* Patch the counter address into a profile inc point, as previously
4306 created by the Ain_ProfInc case for emit_AMD64Instr. */
4307 VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
4308 void* place_to_patch,
4309 const ULong* location_of_counter )
4310 {
4311 vassert(endness_host == VexEndnessLE);
4312 vassert(sizeof(ULong*) == 8);
4313 UChar* p = (UChar*)place_to_patch;
4314 vassert(p[0] == 0x49);
4315 vassert(p[1] == 0xBB);
4316 vassert(p[2] == 0x00);
4317 vassert(p[3] == 0x00);
4318 vassert(p[4] == 0x00);
4319 vassert(p[5] == 0x00);
4320 vassert(p[6] == 0x00);
4321 vassert(p[7] == 0x00);
4322 vassert(p[8] == 0x00);
4323 vassert(p[9] == 0x00);
4324 vassert(p[10] == 0x49);
4325 vassert(p[11] == 0xFF);
4326 vassert(p[12] == 0x03);
4327 ULong imm64 = (ULong)(Addr)location_of_counter;
4328 p[2] = imm64 & 0xFF; imm64 >>= 8;
4329 p[3] = imm64 & 0xFF; imm64 >>= 8;
4330 p[4] = imm64 & 0xFF; imm64 >>= 8;
4331 p[5] = imm64 & 0xFF; imm64 >>= 8;
4332 p[6] = imm64 & 0xFF; imm64 >>= 8;
4333 p[7] = imm64 & 0xFF; imm64 >>= 8;
4334 p[8] = imm64 & 0xFF; imm64 >>= 8;
4335 p[9] = imm64 & 0xFF; imm64 >>= 8;
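/* Illustration: for location_of_counter == 0x1122334455667788 the byte stores above leave p[2..9] == 88 77 66 55 44 33 22 11 (little-endian), completing "movabsq $0x1122334455667788, %r11". */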
4336 VexInvalRange vir = { (HWord)place_to_patch, 13 };
4337 return vir;
4338 }
4341 /*---------------------------------------------------------------*/
4342 /*--- end host_amd64_defs.c ---*/
4343 /*---------------------------------------------------------------*/