2 /*---------------------------------------------------------------*/
3 /*--- begin guest_amd64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 #include "libvex_basictypes.h"
35 #include "libvex_emnote.h"
36 #include "libvex_guest_amd64.h"
37 #include "libvex_ir.h"
38 #include "libvex.h"
40 #include "main_util.h"
41 #include "main_globals.h"
42 #include "guest_generic_bb_to_IR.h"
43 #include "guest_amd64_defs.h"
44 #include "guest_generic_x87.h"
47 /* This file contains helper functions for amd64 guest code.
48 Calls to these functions are generated by the back end.
49 These calls are of course in the host machine code and
50 this file will be compiled to host machine code, so that
51 all makes sense.
53 Only change the signatures of these helper functions very
54 carefully. If you change the signature here, you'll have to change
55 the parameters passed to it in the IR calls constructed by
56 guest-amd64/toIR.c.
58 The convention used is that all functions called from generated
59 code are named amd64g_<something>, and any function whose name lacks
60 that prefix is not called from generated code. Note that some
61 LibVEX_* functions can however be called by VEX's client, but that
62 is not the same as calling them from VEX-generated code.
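/* Illustration only (a sketch of the calling convention, not a definitive
   description): these helpers read the lazily-maintained "flags thunk" --
   an operation tag plus up to three operands -- that toIR.c keeps in the
   guest state.  Conceptually, after a 32-bit add the thunk holds

      CC_OP   = AMD64G_CC_OP_ADDL
      CC_DEP1 = argL
      CC_DEP2 = argR
      CC_NDEP = (unused for plain ADD)

   and a later flag-consuming instruction turns into a call such as
   amd64g_calculate_rflags_all(CC_OP, CC_DEP1, CC_DEP2, CC_NDEP). */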
66 /* Set to 1 to get detailed profiling info about use of the flag
67 machinery. */
68 #define PROFILE_RFLAGS 0
71 /*---------------------------------------------------------------*/
72 /*--- %rflags run-time helpers. ---*/
73 /*---------------------------------------------------------------*/
75 /* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
76 after imulq/mulq. */
78 static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
80 const Long halfMask = 0xFFFFFFFFLL;
81 ULong u0, v0, w0;
82 Long u1, v1, w1, w2, t;
83 u0 = u & halfMask;
84 u1 = u >> 32;
85 v0 = v & halfMask;
86 v1 = v >> 32;
87 w0 = u0 * v0;
88 t = u1 * v0 + (w0 >> 32);
89 w1 = t & halfMask;
90 w2 = t >> 32;
91 w1 = u0 * v1 + w1;
92 *rHi = u1 * v1 + w2 + (w1 >> 32);
93 *rLo = (Long)((ULong)u * (ULong)v);
96 static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
98 const ULong halfMask = 0xFFFFFFFFULL;
99 ULong u0, v0, w0;
100 ULong u1, v1, w1,w2,t;
101 u0 = u & halfMask;
102 u1 = u >> 32;
103 v0 = v & halfMask;
104 v1 = v >> 32;
105 w0 = u0 * v0;
106 t = u1 * v0 + (w0 >> 32);
107 w1 = t & halfMask;
108 w2 = t >> 32;
109 w1 = u0 * v1 + w1;
110 *rHi = u1 * v1 + w2 + (w1 >> 32);
111 *rLo = u * v;
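/* Illustration only, not part of the build: on compilers that provide
   unsigned __int128, the schoolbook decomposition in mullU64 can be
   cross-checked directly.  A minimal sketch, assuming such a compiler:

      static void check_mullU64 ( ULong u, ULong v )
      {
         ULong hi, lo;
         mullU64(u, v, &hi, &lo);
         unsigned __int128 p = (unsigned __int128)u * v;
         vassert(lo == (ULong)p);
         vassert(hi == (ULong)(p >> 64));
      }
*/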
115 static const UChar parity_table[256] = {
116 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
117 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
118 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
119 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
120 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
121 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
122 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
123 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
124 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
125 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
126 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
127 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
128 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
129 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
130 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
131 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
132 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
133 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
134 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
135 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
136 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
137 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
138 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
139 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
140 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
141 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
142 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
143 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
144 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
145 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
146 AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
147 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
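/* Each entry above is AMD64G_CC_MASK_P when its index has an even number
   of 1 bits, and 0 otherwise: x86 PF is the even parity of the result's
   low byte.  A sketch (illustration only; "table" is a hypothetical
   writable scratch array) of how the entries could be generated:

      for (UInt i = 0; i < 256; i++)
         table[i] = (__builtin_popcount(i) & 1) ? 0 : AMD64G_CC_MASK_P;
*/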
150 /* generalised left-shifter */
151 static inline Long lshift ( Long x, Int n )
153 if (n >= 0)
154 return (ULong)x << n;
155 else
156 return x >> (-n);
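/* For example, with DATA_BITS == 32 the idiom "lshift(res, 8 - DATA_BITS)"
   used below is a right shift by 24, moving result bit 31 down to bit 7
   where the SF mask (0x80) expects it; with DATA_BITS == 8 the shift count
   is 0 and the value is used as-is. */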
159 /* identity on ULong */
160 static inline ULong idULong ( ULong x )
162 return x;
166 #define PREAMBLE(__data_bits) \
167 /* const */ ULong DATA_MASK \
168 = __data_bits==8 \
169 ? 0xFFULL \
170 : (__data_bits==16 \
171 ? 0xFFFFULL \
172 : (__data_bits==32 \
173 ? 0xFFFFFFFFULL \
174 : 0xFFFFFFFFFFFFFFFFULL)); \
175 /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \
176 /* const */ ULong CC_DEP1 = cc_dep1_formal; \
177 /* const */ ULong CC_DEP2 = cc_dep2_formal; \
178 /* const */ ULong CC_NDEP = cc_ndep_formal; \
179 /* Four bogus assignments, which hopefully gcc can */ \
180 /* optimise away, and which stop it complaining about */ \
181 /* unused variables. */ \
182 SIGN_MASK = SIGN_MASK; \
183 DATA_MASK = DATA_MASK; \
184 CC_DEP2 = CC_DEP2; \
185 CC_NDEP = CC_NDEP;
188 /*-------------------------------------------------------------*/
190 #define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
192 PREAMBLE(DATA_BITS); \
193 { ULong cf, pf, af, zf, sf, of; \
194 ULong argL, argR, res; \
195 argL = CC_DEP1; \
196 argR = CC_DEP2; \
197 res = argL + argR; \
198 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
199 pf = parity_table[(UChar)res]; \
200 af = (res ^ argL ^ argR) & 0x10; \
201 zf = ((DATA_UTYPE)res == 0) << 6; \
202 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
203 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
204 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
205 return cf | pf | af | zf | sf | of; \
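/* Worked example (illustration only): an 8-bit add of 0x7F + 0x01 gives
   res == 0x80.  Then cf == 0 (no unsigned wraparound), af == 0x10 (carry
   out of bit 3), zf == 0, sf == 0x80, pf == 0 (0x80 has odd parity), and
   of == AMD64G_CC_MASK_O, since both operands are non-negative but the
   result is negative -- signed overflow. */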
209 /*-------------------------------------------------------------*/
211 #define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
213 PREAMBLE(DATA_BITS); \
214 { ULong cf, pf, af, zf, sf, of; \
215 ULong argL, argR, res; \
216 argL = CC_DEP1; \
217 argR = CC_DEP2; \
218 res = argL - argR; \
219 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
220 pf = parity_table[(UChar)res]; \
221 af = (res ^ argL ^ argR) & 0x10; \
222 zf = ((DATA_UTYPE)res == 0) << 6; \
223 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
224 of = lshift((argL ^ argR) & (argL ^ res), \
225 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
226 return cf | pf | af | zf | sf | of; \
230 /*-------------------------------------------------------------*/
232 #define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
234 PREAMBLE(DATA_BITS); \
235 { ULong cf, pf, af, zf, sf, of; \
236 ULong argL, argR, oldC, res; \
237 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
238 argL = CC_DEP1; \
239 argR = CC_DEP2 ^ oldC; \
240 res = (argL + argR) + oldC; \
241 if (oldC) \
242 cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
243 else \
244 cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
245 pf = parity_table[(UChar)res]; \
246 af = (res ^ argL ^ argR) & 0x10; \
247 zf = ((DATA_UTYPE)res == 0) << 6; \
248 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
249 of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
250 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
251 return cf | pf | af | zf | sf | of; \
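/* Note: CC_DEP2 as supplied holds the second operand xor'd with the old
   carry, so "CC_DEP2 ^ oldC" above recovers the true argR.  For example
   (illustration only), with oldC == 1 and a true argR of 0x10 the thunk
   carries CC_DEP2 == 0x11, and the computation becomes
   res = argL + 0x10 + 1, as expected for ADC. */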
255 /*-------------------------------------------------------------*/
257 #define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
259 PREAMBLE(DATA_BITS); \
260 { ULong cf, pf, af, zf, sf, of; \
261 ULong argL, argR, oldC, res; \
262 oldC = CC_NDEP & AMD64G_CC_MASK_C; \
263 argL = CC_DEP1; \
264 argR = CC_DEP2 ^ oldC; \
265 res = (argL - argR) - oldC; \
266 if (oldC) \
267 cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
268 else \
269 cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
270 pf = parity_table[(UChar)res]; \
271 af = (res ^ argL ^ argR) & 0x10; \
272 zf = ((DATA_UTYPE)res == 0) << 6; \
273 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
274 of = lshift((argL ^ argR) & (argL ^ res), \
275 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
276 return cf | pf | af | zf | sf | of; \
280 /*-------------------------------------------------------------*/
282 #define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
284 PREAMBLE(DATA_BITS); \
285 { ULong cf, pf, af, zf, sf, of; \
286 cf = 0; \
287 pf = parity_table[(UChar)CC_DEP1]; \
288 af = 0; \
289 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
290 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
291 of = 0; \
292 return cf | pf | af | zf | sf | of; \
296 /*-------------------------------------------------------------*/
298 #define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
300 PREAMBLE(DATA_BITS); \
301 { ULong cf, pf, af, zf, sf, of; \
302 ULong argL, argR, res; \
303 res = CC_DEP1; \
304 argL = res - 1; \
305 argR = 1; \
306 cf = CC_NDEP & AMD64G_CC_MASK_C; \
307 pf = parity_table[(UChar)res]; \
308 af = (res ^ argL ^ argR) & 0x10; \
309 zf = ((DATA_UTYPE)res == 0) << 6; \
310 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
311 of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
312 return cf | pf | af | zf | sf | of; \
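/* For example (8-bit INC, illustration only): incrementing 0x7F yields
   res == 0x80 == SIGN_MASK, the only case in which INC overflows, so "of"
   is set.  INC and DEC leave CF untouched, which is why "cf" is taken from
   CC_NDEP (the old flags) rather than recomputed. */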
316 /*-------------------------------------------------------------*/
318 #define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
320 PREAMBLE(DATA_BITS); \
321 { ULong cf, pf, af, zf, sf, of; \
322 ULong argL, argR, res; \
323 res = CC_DEP1; \
324 argL = res + 1; \
325 argR = 1; \
326 cf = CC_NDEP & AMD64G_CC_MASK_C; \
327 pf = parity_table[(UChar)res]; \
328 af = (res ^ argL ^ argR) & 0x10; \
329 zf = ((DATA_UTYPE)res == 0) << 6; \
330 sf = lshift(res, 8 - DATA_BITS) & 0x80; \
331 of = ((res & DATA_MASK) \
332 == ((ULong)SIGN_MASK - 1)) << 11; \
333 return cf | pf | af | zf | sf | of; \
337 /*-------------------------------------------------------------*/
339 #define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
341 PREAMBLE(DATA_BITS); \
342 { ULong cf, pf, af, zf, sf, of; \
343 cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \
344 pf = parity_table[(UChar)CC_DEP1]; \
345 af = 0; /* undefined */ \
346 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
347 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
348 /* of is defined if shift count == 1 */ \
349 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
350 & AMD64G_CC_MASK_O; \
351 return cf | pf | af | zf | sf | of; \
355 /*-------------------------------------------------------------*/
357 #define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
359 PREAMBLE(DATA_BITS); \
360 { ULong cf, pf, af, zf, sf, of; \
361 cf = CC_DEP2 & 1; \
362 pf = parity_table[(UChar)CC_DEP1]; \
363 af = 0; /* undefined */ \
364 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
365 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
366 /* of is defined if shift count == 1 */ \
367 of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
368 & AMD64G_CC_MASK_O; \
369 return cf | pf | af | zf | sf | of; \
373 /*-------------------------------------------------------------*/
375 /* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
376 /* DEP1 = result, NDEP = old flags */
377 #define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
379 PREAMBLE(DATA_BITS); \
380 { ULong fl \
381 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
382 | (AMD64G_CC_MASK_C & CC_DEP1) \
383 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
384 11-(DATA_BITS-1)) \
385 ^ lshift(CC_DEP1, 11))); \
386 return fl; \
390 /*-------------------------------------------------------------*/
392 /* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
393 /* DEP1 = result, NDEP = old flags */
394 #define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
396 PREAMBLE(DATA_BITS); \
397 { ULong fl \
398 = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
399 | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
400 | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
401 11-(DATA_BITS-1)) \
402 ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
403 return fl; \
407 /*-------------------------------------------------------------*/
409 #define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
410 DATA_U2TYPE, NARROWto2U) \
412 PREAMBLE(DATA_BITS); \
413 { ULong cf, pf, af, zf, sf, of; \
414 DATA_UTYPE hi; \
415 DATA_UTYPE lo \
416 = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
417 * ((DATA_UTYPE)CC_DEP2) ); \
418 DATA_U2TYPE rr \
419 = NARROWto2U( \
420 ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
421 * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
422 hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
423 cf = (hi != 0); \
424 pf = parity_table[(UChar)lo]; \
425 af = 0; /* undefined */ \
426 zf = (lo == 0) << 6; \
427 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
428 of = cf << 11; \
429 return cf | pf | af | zf | sf | of; \
433 /*-------------------------------------------------------------*/
435 #define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
436 DATA_S2TYPE, NARROWto2S) \
438 PREAMBLE(DATA_BITS); \
439 { ULong cf, pf, af, zf, sf, of; \
440 DATA_STYPE hi; \
441 DATA_STYPE lo \
442 = NARROWtoS( ((DATA_S2TYPE)(DATA_STYPE)CC_DEP1) \
443 * ((DATA_S2TYPE)(DATA_STYPE)CC_DEP2) ); \
444 DATA_S2TYPE rr \
445 = NARROWto2S( \
446 ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
447 * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
448 hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
449 cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
450 pf = parity_table[(UChar)lo]; \
451 af = 0; /* undefined */ \
452 zf = (lo == 0) << 6; \
453 sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
454 of = cf << 11; \
455 return cf | pf | af | zf | sf | of; \
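/* Worked example (8-bit SMUL, illustration only): (-2) * 3 == -6 fits in a
   signed byte, so hi is -1, equal to the sign-replication (lo >>s 7), and
   cf == of == 0.  By contrast 100 * 3 == 300 does not fit: hi == 1 differs
   from the sign-replication of lo (0x2C), so cf == of == 1.  In the
   unsigned variants above, cf/of are simply "hi != 0". */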
459 /*-------------------------------------------------------------*/
461 #define ACTIONS_UMULQ \
463 PREAMBLE(64); \
464 { ULong cf, pf, af, zf, sf, of; \
465 ULong lo, hi; \
466 mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \
467 cf = (hi != 0); \
468 pf = parity_table[(UChar)lo]; \
469 af = 0; /* undefined */ \
470 zf = (lo == 0) << 6; \
471 sf = lshift(lo, 8 - 64) & 0x80; \
472 of = cf << 11; \
473 return cf | pf | af | zf | sf | of; \
477 /*-------------------------------------------------------------*/
479 #define ACTIONS_SMULQ \
481 PREAMBLE(64); \
482 { ULong cf, pf, af, zf, sf, of; \
483 Long lo, hi; \
484 mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \
485 cf = (hi != (lo >>/*s*/ (64-1))); \
486 pf = parity_table[(UChar)lo]; \
487 af = 0; /* undefined */ \
488 zf = (lo == 0) << 6; \
489 sf = lshift(lo, 8 - 64) & 0x80; \
490 of = cf << 11; \
491 return cf | pf | af | zf | sf | of; \
495 /*-------------------------------------------------------------*/
497 #define ACTIONS_ANDN(DATA_BITS,DATA_UTYPE) \
499 PREAMBLE(DATA_BITS); \
500 { ULong cf, pf, af, zf, sf, of; \
501 cf = 0; \
502 pf = 0; \
503 af = 0; \
504 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
505 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
506 of = 0; \
507 return cf | pf | af | zf | sf | of; \
511 /*-------------------------------------------------------------*/
513 #define ACTIONS_BLSI(DATA_BITS,DATA_UTYPE) \
515 PREAMBLE(DATA_BITS); \
516 { ULong cf, pf, af, zf, sf, of; \
517 cf = ((DATA_UTYPE)CC_DEP2 != 0); \
518 pf = 0; \
519 af = 0; \
520 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
521 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
522 of = 0; \
523 return cf | pf | af | zf | sf | of; \
527 /*-------------------------------------------------------------*/
529 #define ACTIONS_BLSMSK(DATA_BITS,DATA_UTYPE) \
531 PREAMBLE(DATA_BITS); \
532 { Long cf, pf, af, zf, sf, of; \
533 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
534 pf = 0; \
535 af = 0; \
536 zf = 0; \
537 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
538 of = 0; \
539 return cf | pf | af | zf | sf | of; \
543 /*-------------------------------------------------------------*/
545 #define ACTIONS_BLSR(DATA_BITS,DATA_UTYPE) \
547 PREAMBLE(DATA_BITS); \
548 { ULong cf, pf, af, zf, sf, of; \
549 cf = ((DATA_UTYPE)CC_DEP2 == 0); \
550 pf = 0; \
551 af = 0; \
552 zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
553 sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
554 of = 0; \
555 return cf | pf | af | zf | sf | of; \
559 /*-------------------------------------------------------------*/
561 #define ACTIONS_ADX(DATA_BITS,DATA_UTYPE,FLAGNAME) \
563 PREAMBLE(DATA_BITS); \
564 { ULong ocf; /* o or c */ \
565 ULong argL, argR, oldOC, res; \
566 oldOC = (CC_NDEP >> AMD64G_CC_SHIFT_##FLAGNAME) & 1; \
567 argL = CC_DEP1; \
568 argR = CC_DEP2 ^ oldOC; \
569 res = (argL + argR) + oldOC; \
570 if (oldOC) \
571 ocf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
572 else \
573 ocf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
574 return (CC_NDEP & ~AMD64G_CC_MASK_##FLAGNAME) \
575 | (ocf << AMD64G_CC_SHIFT_##FLAGNAME); \
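/* ADCX/ADOX update exactly one flag (CF or OF respectively) and leave the
   rest of RFLAGS alone, which is why the result is spliced into CC_NDEP
   (the old flags) rather than rebuilt.  For example (illustration only):
   with an incoming flag of 1 and argL == argR == 0x8000000000000000ULL,
   res == 1 with a carry out of bit 63, so the returned value is CC_NDEP
   with just the selected flag bit forced to 1. */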
579 /*-------------------------------------------------------------*/
582 #if PROFILE_RFLAGS
584 static Bool initted = False;
586 /* C flag, fast route */
587 static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
588 /* C flag, slow route */
589 static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
590 /* table for calculate_cond */
591 static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
592 /* total entry counts for calc_all, calc_c, calc_cond. */
593 static UInt n_calc_all = 0;
594 static UInt n_calc_c = 0;
595 static UInt n_calc_cond = 0;
597 #define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
600 static void showCounts ( void )
602 Int op, co;
603 HChar ch;
604 vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
605 n_calc_all, n_calc_cond, n_calc_c);
607 vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
608 " S NS P NP L NL LE NLE\n");
609 vex_printf(" -----------------------------------------------------"
610 "----------------------------------------\n");
611 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
613 ch = ' ';
614 if (op > 0 && (op-1) % 4 == 0)
615 ch = 'B';
616 if (op > 0 && (op-1) % 4 == 1)
617 ch = 'W';
618 if (op > 0 && (op-1) % 4 == 2)
619 ch = 'L';
620 if (op > 0 && (op-1) % 4 == 3)
621 ch = 'Q';
623 vex_printf("%2d%c: ", op, ch);
624 vex_printf("%6u ", tabc_slow[op]);
625 vex_printf("%6u ", tabc_fast[op]);
626 for (co = 0; co < 16; co++) {
627 Int n = tab_cond[op][co];
628 if (n >= 1000) {
629 vex_printf(" %3dK", n / 1000);
630 } else
631 if (n >= 0) {
632 vex_printf(" %3d ", n );
633 } else {
634 vex_printf(" ");
637 vex_printf("\n");
639 vex_printf("\n");
642 static void initCounts ( void )
644 Int op, co;
645 initted = True;
646 for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
647 tabc_fast[op] = tabc_slow[op] = 0;
648 for (co = 0; co < 16; co++)
649 tab_cond[op][co] = 0;
653 #endif /* PROFILE_RFLAGS */
/* Calculate all 6 flags from the supplied thunk parameters.
   Worker function, not directly called from generated code; the CLEAN
   HELPER entry points further down call it. */
659 static
660 ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
661 ULong cc_dep1_formal,
662 ULong cc_dep2_formal,
663 ULong cc_ndep_formal )
665 switch (cc_op) {
666 case AMD64G_CC_OP_COPY:
667 return cc_dep1_formal
668 & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
669 | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
671 case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
672 case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
673 case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
674 case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong );
676 case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
677 case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
678 case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
679 case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong );
681 case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
682 case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
683 case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
684 case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong );
686 case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
687 case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
688 case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
689 case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong );
691 case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
692 case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
693 case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
694 case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong );
696 case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
697 case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
698 case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
699 case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong );
701 case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
702 case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
703 case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
704 case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong );
706 case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
707 case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
708 case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
709 case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong );
711 case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
712 case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
713 case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
714 case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong );
716 case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
717 case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
718 case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
719 case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong );
721 case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
722 case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
723 case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
724 case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong );
726 case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
727 UShort, toUShort );
728 case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
729 UInt, toUInt );
730 case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
731 ULong, idULong );
733 case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ;
735 case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
736 Short, toUShort );
737 case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
738 Int, toUInt );
739 case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
740 Long, idULong );
742 case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ;
744 case AMD64G_CC_OP_ANDN32: ACTIONS_ANDN( 32, UInt );
745 case AMD64G_CC_OP_ANDN64: ACTIONS_ANDN( 64, ULong );
747 case AMD64G_CC_OP_BLSI32: ACTIONS_BLSI( 32, UInt );
748 case AMD64G_CC_OP_BLSI64: ACTIONS_BLSI( 64, ULong );
750 case AMD64G_CC_OP_BLSMSK32: ACTIONS_BLSMSK( 32, UInt );
751 case AMD64G_CC_OP_BLSMSK64: ACTIONS_BLSMSK( 64, ULong );
753 case AMD64G_CC_OP_BLSR32: ACTIONS_BLSR( 32, UInt );
754 case AMD64G_CC_OP_BLSR64: ACTIONS_BLSR( 64, ULong );
756 case AMD64G_CC_OP_ADCX32: ACTIONS_ADX( 32, UInt, C );
757 case AMD64G_CC_OP_ADCX64: ACTIONS_ADX( 64, ULong, C );
759 case AMD64G_CC_OP_ADOX32: ACTIONS_ADX( 32, UInt, O );
760 case AMD64G_CC_OP_ADOX64: ACTIONS_ADX( 64, ULong, O );
762 default:
763 /* shouldn't really make these calls from generated code */
764 vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
765 "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
766 cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
767 vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
771 #if defined(VGO_freebsd) || defined(VGO_darwin)
773 /* This dummy function exists only to provide an address immediately after
774 amd64g_calculate_rflags_all_WRK. */
776 static
777 void _______VVVVVVVV_amd64g_calculate_rflags_all_WRK_VVVVVVVV_______ (void)
781 /* Export addresses of amd64g_calculate_rflags_all_WRK and
782 _______VVVVVVVV_amd64g_calculate_rflags_all_WRK_VVVVVVVV_______
783 Used in syswrap-main.c / VG_(post_syscall) in the case where
784 the above function was interrupted and we need to work out
785 what needs to be done for the resumption */
787 Addr addr_amd64g_calculate_rflags_all_WRK = (Addr)amd64g_calculate_rflags_all_WRK;
788 Addr addr________VVVVVVVV_amd64g_calculate_rflags_all_WRK_VVVVVVVV_______ = (Addr)_______VVVVVVVV_amd64g_calculate_rflags_all_WRK_VVVVVVVV_______;
789 #endif
791 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
792 /* Calculate all 6 flags from the supplied thunk parameters. */
793 ULong amd64g_calculate_rflags_all ( ULong cc_op,
794 ULong cc_dep1,
795 ULong cc_dep2,
796 ULong cc_ndep )
798 # if PROFILE_RFLAGS
799 if (!initted) initCounts();
800 n_calc_all++;
801 if (SHOW_COUNTS_NOW) showCounts();
802 # endif
803 return
804 amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
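/* Usage sketch (illustration only): for a thunk describing the 32-bit add
   0xFFFFFFFF + 1,

      ULong fl = amd64g_calculate_rflags_all(AMD64G_CC_OP_ADDL,
                                             0xFFFFFFFFULL, 1ULL, 0ULL);

   returns a value with AMD64G_CC_MASK_C, AMD64G_CC_MASK_Z, AMD64G_CC_MASK_A
   and AMD64G_CC_MASK_P set (wraparound to zero, carries out of bits 3 and
   31, and a zero low byte has even parity), with O and S clear. */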
808 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
809 /* Calculate just the carry flag from the supplied thunk parameters. */
810 ULong amd64g_calculate_rflags_c ( ULong cc_op,
811 ULong cc_dep1,
812 ULong cc_dep2,
813 ULong cc_ndep )
815 # if PROFILE_RFLAGS
816 if (!initted) initCounts();
817 n_calc_c++;
818 tabc_fast[cc_op]++;
819 if (SHOW_COUNTS_NOW) showCounts();
820 # endif
822 /* Fast-case some common ones. */
823 switch (cc_op) {
824 case AMD64G_CC_OP_COPY:
825 return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
826 case AMD64G_CC_OP_LOGICQ:
827 case AMD64G_CC_OP_LOGICL:
828 case AMD64G_CC_OP_LOGICW:
829 case AMD64G_CC_OP_LOGICB:
830 return 0;
831 // case AMD64G_CC_OP_SUBL:
832 // return ((UInt)cc_dep1) < ((UInt)cc_dep2)
833 // ? AMD64G_CC_MASK_C : 0;
834 // case AMD64G_CC_OP_SUBW:
835 // return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
836 // ? AMD64G_CC_MASK_C : 0;
837 // case AMD64G_CC_OP_SUBB:
838 // return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
839 // ? AMD64G_CC_MASK_C : 0;
840 // case AMD64G_CC_OP_INCL:
841 // case AMD64G_CC_OP_DECL:
842 // return cc_ndep & AMD64G_CC_MASK_C;
843 default:
844 break;
847 # if PROFILE_RFLAGS
848 tabc_fast[cc_op]--;
849 tabc_slow[cc_op]++;
850 # endif
852 return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
853 & AMD64G_CC_MASK_C;
857 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
858 /* returns 1 or 0 */
859 ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
860 ULong cc_op,
861 ULong cc_dep1,
862 ULong cc_dep2,
863 ULong cc_ndep )
865 ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
866 cc_dep2, cc_ndep);
867 ULong of,sf,zf,cf,pf;
868 ULong inv = cond & 1;
870 # if PROFILE_RFLAGS
871 if (!initted) initCounts();
872 tab_cond[cc_op][cond]++;
873 n_calc_cond++;
874 if (SHOW_COUNTS_NOW) showCounts();
875 # endif
877 switch (cond) {
878 case AMD64CondNO:
879 case AMD64CondO: /* OF == 1 */
880 of = rflags >> AMD64G_CC_SHIFT_O;
881 return 1 & (inv ^ of);
883 case AMD64CondNZ:
884 case AMD64CondZ: /* ZF == 1 */
885 zf = rflags >> AMD64G_CC_SHIFT_Z;
886 return 1 & (inv ^ zf);
888 case AMD64CondNB:
889 case AMD64CondB: /* CF == 1 */
890 cf = rflags >> AMD64G_CC_SHIFT_C;
891 return 1 & (inv ^ cf);
892 break;
894 case AMD64CondNBE:
895 case AMD64CondBE: /* (CF or ZF) == 1 */
896 cf = rflags >> AMD64G_CC_SHIFT_C;
897 zf = rflags >> AMD64G_CC_SHIFT_Z;
898 return 1 & (inv ^ (cf | zf));
899 break;
901 case AMD64CondNS:
902 case AMD64CondS: /* SF == 1 */
903 sf = rflags >> AMD64G_CC_SHIFT_S;
904 return 1 & (inv ^ sf);
906 case AMD64CondNP:
907 case AMD64CondP: /* PF == 1 */
908 pf = rflags >> AMD64G_CC_SHIFT_P;
909 return 1 & (inv ^ pf);
911 case AMD64CondNL:
912 case AMD64CondL: /* (SF xor OF) == 1 */
913 sf = rflags >> AMD64G_CC_SHIFT_S;
914 of = rflags >> AMD64G_CC_SHIFT_O;
915 return 1 & (inv ^ (sf ^ of));
916 break;
918 case AMD64CondNLE:
919 case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
920 sf = rflags >> AMD64G_CC_SHIFT_S;
921 of = rflags >> AMD64G_CC_SHIFT_O;
922 zf = rflags >> AMD64G_CC_SHIFT_Z;
923 return 1 & (inv ^ ((sf ^ of) | zf));
924 break;
926 default:
927 /* shouldn't really make these calls from generated code */
928 vex_printf("amd64g_calculate_condition"
929 "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
930 cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
931 vpanic("amd64g_calculate_condition");
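/* Illustration only: condition codes come in even/odd pairs whose low bit
   selects negation, which is what "inv = cond & 1" exploits above.  For
   instance AMD64CondZ and AMD64CondNZ share the same ZF extraction, so

      amd64g_calculate_condition(AMD64CondZ, AMD64G_CC_OP_SUBL, 7, 7, 0)

   returns 1 (7 - 7 == 0), while the same call with AMD64CondNZ returns 0. */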
936 /* VISIBLE TO LIBVEX CLIENT */
937 ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/const VexGuestAMD64State* vex_state )
939 ULong rflags = amd64g_calculate_rflags_all_WRK(
940 vex_state->guest_CC_OP,
941 vex_state->guest_CC_DEP1,
942 vex_state->guest_CC_DEP2,
943 vex_state->guest_CC_NDEP
945 Long dflag = vex_state->guest_DFLAG;
946 vassert(dflag == 1 || dflag == -1);
947 if (dflag == -1)
948 rflags |= (1<<10);
949 if (vex_state->guest_IDFLAG == 1)
950 rflags |= (1<<21);
951 if (vex_state->guest_ACFLAG == 1)
952 rflags |= (1<<18);
954 return rflags;
957 /* VISIBLE TO LIBVEX CLIENT */
958 void
959 LibVEX_GuestAMD64_put_rflags ( ULong rflags,
960 /*MOD*/VexGuestAMD64State* vex_state )
962 /* D flag */
963 if (rflags & AMD64G_CC_MASK_D) {
964 vex_state->guest_DFLAG = -1;
965 rflags &= ~AMD64G_CC_MASK_D;
967 else
968 vex_state->guest_DFLAG = 1;
970 /* ID flag */
971 if (rflags & AMD64G_CC_MASK_ID) {
972 vex_state->guest_IDFLAG = 1;
973 rflags &= ~AMD64G_CC_MASK_ID;
975 else
976 vex_state->guest_IDFLAG = 0;
978 /* AC flag */
979 if (rflags & AMD64G_CC_MASK_AC) {
980 vex_state->guest_ACFLAG = 1;
981 rflags &= ~AMD64G_CC_MASK_AC;
983 else
984 vex_state->guest_ACFLAG = 0;
986 UInt cc_mask = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z |
987 AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P;
988 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
989 vex_state->guest_CC_DEP1 = rflags & cc_mask;
990 vex_state->guest_CC_DEP2 = 0;
991 vex_state->guest_CC_NDEP = 0;
994 /* VISIBLE TO LIBVEX CLIENT */
995 void
996 LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
997 /*MOD*/VexGuestAMD64State* vex_state )
999 ULong oszacp = amd64g_calculate_rflags_all_WRK(
1000 vex_state->guest_CC_OP,
1001 vex_state->guest_CC_DEP1,
1002 vex_state->guest_CC_DEP2,
1003 vex_state->guest_CC_NDEP
1005 if (new_carry_flag & 1) {
1006 oszacp |= AMD64G_CC_MASK_C;
1007 } else {
1008 oszacp &= ~AMD64G_CC_MASK_C;
1010 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
1011 vex_state->guest_CC_DEP1 = oszacp;
1012 vex_state->guest_CC_DEP2 = 0;
1013 vex_state->guest_CC_NDEP = 0;
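/* Usage sketch (illustration only; "gst" is a hypothetical
   VexGuestAMD64State owned by the client): a client that wants to flip the
   guest's carry flag could combine the getter and setters above:

      ULong fl = LibVEX_GuestAMD64_get_rflags(&gst);
      LibVEX_GuestAMD64_put_rflag_c((fl & AMD64G_CC_MASK_C) ? 0 : 1, &gst);

   Both setters squash the thunk to the COPY encoding, so later flag reads
   simply return the OSZACP bits stored in CC_DEP1. */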
1016 #if defined(VGO_freebsd) || defined(VGO_darwin)
1017 /* Used in syswrap-main.c / VG_(post_syscall) in the case where
1018 the above function was interrupted and we need to work out
1019 what needs to be done for the resumption. These functions
1020 are extern so no need for 'addr' global variables */
1021 void _______VVVVVVVV_after_GuestAMD64_put_rflag_c_VVVVVVVV_______ (void)
1024 #endif
1026 /*---------------------------------------------------------------*/
1027 /*--- %rflags translation-time function specialisers. ---*/
1028 /*--- These help iropt specialise calls the above run-time ---*/
1029 /*--- %rflags functions. ---*/
1030 /*---------------------------------------------------------------*/
1032 /* Used by the optimiser to try specialisations. Returns an
1033 equivalent expression, or NULL if none. */
1035 static inline Bool isU64 ( IRExpr* e, ULong n )
1037 return e->tag == Iex_Const
1038 && e->Iex.Const.con->tag == Ico_U64
1039 && e->Iex.Const.con->Ico.U64 == n;
1042 /* Returns N if W64 is a value of the form 1 << N for N in 1 to 31,
1043 and zero in any other case. */
1044 static Int isU64_1_shl_N_literal ( ULong w64 )
1046 if (w64 < (1ULL << 1) || w64 > (1ULL << 31))
1047 return 0;
1048 if ((w64 & (w64 - 1)) != 0)
1049 return 0;
1050 /* At this point, we know w64 is a power of two in the range 2^1 .. 2^31,
1051 and we only need to find out which one it is. */
1052 for (Int n = 1; n <= 31; n++) {
1053 if (w64 == (1ULL << n))
1054 return n;
1056 /* Consequently we should never get here. */
1057 /*UNREACHED*/
1058 vassert(0);
1059 return 0;
1062 /* Returns N if E is an immediate of the form 1 << N for N in 1 to 31,
1063 and zero in any other case. */
1064 static Int isU64_1_shl_N ( IRExpr* e )
1066 if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1067 return 0;
1068 ULong w64 = e->Iex.Const.con->Ico.U64;
1069 return isU64_1_shl_N_literal(w64);
1072 /* Returns N if E is an immediate of the form (1 << N) - 1 for N in 1 to 31,
1073 and zero in any other case. */
1074 static Int isU64_1_shl_N_minus_1 ( IRExpr* e )
1076 if (e->tag != Iex_Const || e->Iex.Const.con->tag != Ico_U64)
1077 return 0;
1078 ULong w64 = e->Iex.Const.con->Ico.U64;
1079 // This isn't actually necessary since isU64_1_shl_N_literal will return
1080 // zero given a zero argument, but still ..
1081 if (w64 == 0xFFFFFFFFFFFFFFFFULL)
1082 return 0;
1083 return isU64_1_shl_N_literal(w64 + 1);
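/* For example: a U64 constant of 0x1000 makes isU64_1_shl_N return 12, and
   a constant of 7 makes isU64_1_shl_N_minus_1 return 3.  Non-matching or
   out-of-range values (0, 1, non-powers-of-two, anything past 2^31) yield
   0, meaning "no match". */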
1086 IRExpr* guest_amd64_spechelper ( const HChar* function_name,
1087 IRExpr** args,
1088 IRStmt** precedingStmts,
1089 Int n_precedingStmts )
1091 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1092 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1093 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1094 # define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1095 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
1097 Int i, arity = 0;
1098 for (i = 0; args[i]; i++)
1099 arity++;
1100 # if 0
1101 vex_printf("spec request:\n");
1102 vex_printf(" %s ", function_name);
1103 for (i = 0; i < arity; i++) {
1104 vex_printf(" ");
1105 ppIRExpr(args[i]);
1107 vex_printf("\n");
1108 # endif
1110 /* --------- specialising "amd64g_calculate_condition" --------- */
1112 if (vex_streq(function_name, "amd64g_calculate_condition")) {
1113 /* specialise calls to above "calculate condition" function */
1114 IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
1115 vassert(arity == 5);
1116 cond = args[0];
1117 cc_op = args[1];
1118 cc_dep1 = args[2];
1119 cc_dep2 = args[3];
1121 /*---------------- ADDQ ----------------*/
1123 /* 4, */
1124 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
1125 /* long long add, then Z --> test (dst+src == 0) */
1126 return unop(Iop_1Uto64,
1127 binop(Iop_CmpEQ64,
1128 binop(Iop_Add64, cc_dep1, cc_dep2),
1129 mkU64(0)));
1132 /* 8, */
1133 if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondS)) {
1134 /* long long add, then S (negative)
1135 --> (dst+src)[63]
1136 --> ((dst + src) >>u 63) & 1
1138 return binop(Iop_And64,
1139 binop(Iop_Shr64,
1140 binop(Iop_Add64, cc_dep1, cc_dep2),
1141 mkU8(63)),
1142 mkU64(1));
1145 /*---------------- ADDL ----------------*/
1147 /* 0, */
1148 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondO)) {
1149 /* This is very commonly generated by Javascript JITs, for
1150 the idiom "do a 32-bit add and jump to out-of-line code if
1151 an overflow occurs". */
1152 /* long add, then O (overflow)
1153 --> ((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 + dep2)))[31]
1154 --> (((dep1 ^ dep2 ^ -1) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1155 --> (((not(dep1 ^ dep2)) & (dep1 ^ (dep1 +64 dep2))) >>u 31) & 1
1157 vassert(isIRAtom(cc_dep1));
1158 vassert(isIRAtom(cc_dep2));
1159 return
1160 binop(Iop_And64,
1161 binop(Iop_Shr64,
1162 binop(Iop_And64,
1163 unop(Iop_Not64,
1164 binop(Iop_Xor64, cc_dep1, cc_dep2)),
1165 binop(Iop_Xor64,
1166 cc_dep1,
1167 binop(Iop_Add64, cc_dep1, cc_dep2))),
1168 mkU8(31)),
1169 mkU64(1));
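/* The expression just above is the classic bitwise test for 32-bit signed
   overflow of an addition.  A standalone C sketch of the same identity
   (illustration only, not part of the build):

      static int addl_overflows ( UInt a, UInt b )
      {
         ULong sum = (ULong)a + (ULong)b;   // 64-bit add, as in the IR
         return (int)((~(a ^ b) & (a ^ (UInt)sum)) >> 31) & 1;
      }

   e.g. addl_overflows(0x7FFFFFFF, 1) == 1 and addl_overflows(1, 1) == 0. */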
1173 /* 8, 9 */
1174 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondS)) {
1175 /* long add, then S (negative)
1176 --> (dst+src)[31]
1177 --> ((dst +64 src) >>u 31) & 1
1178 Pointless to narrow the args to 32 bit before the add. */
1179 return binop(Iop_And64,
1180 binop(Iop_Shr64,
1181 binop(Iop_Add64, cc_dep1, cc_dep2),
1182 mkU8(31)),
1183 mkU64(1));
1185 if (isU64(cc_op, AMD64G_CC_OP_ADDL) && isU64(cond, AMD64CondNS)) {
1186 /* long add, then NS (not negative)
1187 --> (dst+src)[31] ^ 1
1188 --> (((dst +64 src) >>u 31) & 1) ^ 1
1189 Pointless to narrow the args to 32 bit before the add. */
1190 return binop(Iop_Xor64,
1191 binop(Iop_And64,
1192 binop(Iop_Shr64,
1193 binop(Iop_Add64, cc_dep1, cc_dep2),
1194 mkU8(31)),
1195 mkU64(1)),
1196 mkU64(1));
1199 /*---------------- SUBQ ----------------*/
1201 /* 0, */
1202 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondO)) {
1203 /* long long sub/cmp, then O (overflow)
1204 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[63]
1205 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2))) >>u 63
1207 vassert(isIRAtom(cc_dep1));
1208 vassert(isIRAtom(cc_dep2));
1209 return binop(Iop_Shr64,
1210 binop(Iop_And64,
1211 binop(Iop_Xor64, cc_dep1, cc_dep2),
1212 binop(Iop_Xor64,
1213 cc_dep1,
1214 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1215 mkU8(63));
1217 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNO)) {
1218 /* No action. Never yet found a test case. */
1221 /* 2, 3 */
1222 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
1223 /* long long sub/cmp, then B (unsigned less than)
1224 --> test dst <u src */
1225 return unop(Iop_1Uto64,
1226 binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1228 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
1229 /* long long sub/cmp, then NB (unsigned greater than or equal)
1230 --> test src <=u dst */
1231 /* Note, args are opposite way round from the usual */
1232 return unop(Iop_1Uto64,
1233 binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1236 /* 4, 5 */
1237 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
1238 /* long long sub/cmp, then Z --> test dst==src */
1239 return unop(Iop_1Uto64,
1240 binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
1242 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
1243 /* long long sub/cmp, then NZ --> test dst!=src */
1244 return unop(Iop_1Uto64,
1245 binop(Iop_CmpNE64,cc_dep1,cc_dep2));
1248 /* 6, 7 */
1249 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
1250 /* long long sub/cmp, then BE (unsigned less than or equal)
1251 --> test dst <=u src */
1252 return unop(Iop_1Uto64,
1253 binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1255 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNBE)) {
1256 /* long long sub/cmp, then NBE (unsigned greater than)
1257 --> test !(dst <=u src) */
1258 return binop(Iop_Xor64,
1259 unop(Iop_1Uto64,
1260 binop(Iop_CmpLE64U, cc_dep1, cc_dep2)),
1261 mkU64(1));
1264 /* 8, 9 */
1265 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondS)) {
1266 /* long long sub/cmp, then S (negative)
1267 --> (dst-src)[63]
1268 --> (dst-src) >>u 63 */
1269 return binop(Iop_Shr64,
1270 binop(Iop_Sub64, cc_dep1, cc_dep2),
1271 mkU8(63));
1273 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNS)) {
1274 /* long long sub/cmp, then NS (not negative)
1275 --> (dst-src)[63] ^ 1
1276 --> ((dst-src) >>u 63) ^ 1 */
1277 return binop(Iop_Xor64,
1278 binop(Iop_Shr64,
1279 binop(Iop_Sub64, cc_dep1, cc_dep2),
1280 mkU8(63)),
1281 mkU64(1));
1284 /* 12, 13 */
1285 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
1286 /* long long sub/cmp, then L (signed less than)
1287 --> test dst <s src */
1288 return unop(Iop_1Uto64,
1289 binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1291 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNL)) {
1292 /* long long sub/cmp, then NL (signed greater than or equal)
1293 --> test dst >=s src
1294 --> test src <=s dst */
1295 return unop(Iop_1Uto64,
1296 binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1299 /* 14, 15 */
1300 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondLE)) {
1301 /* long long sub/cmp, then LE (signed less than or equal)
1302 --> test dst <=s src */
1303 return unop(Iop_1Uto64,
1304 binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1306 if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNLE)) {
1307 /* long sub/cmp, then NLE (signed greater than)
1308 --> test !(dst <=s src)
1309 --> test (dst >s src)
1310 --> test (src <s dst) */
1311 return unop(Iop_1Uto64,
1312 binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1316 /*---------------- SUBL ----------------*/
1318 /* 0, */
1319 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondO)) {
1320 /* This is very commonly generated by Javascript JITs, for
1321 the idiom "do a 32-bit subtract and jump to out-of-line
1322 code if an overflow occurs". */
1323 /* long sub/cmp, then O (overflow)
1324 --> ((dep1 ^ dep2) & (dep1 ^ (dep1 - dep2)))[31]
1325 --> (((dep1 ^ dep2) & (dep1 ^ (dep1 -64 dep2))) >>u 31) & 1
1327 vassert(isIRAtom(cc_dep1));
1328 vassert(isIRAtom(cc_dep2));
1329 return
1330 binop(Iop_And64,
1331 binop(Iop_Shr64,
1332 binop(Iop_And64,
1333 binop(Iop_Xor64, cc_dep1, cc_dep2),
1334 binop(Iop_Xor64,
1335 cc_dep1,
1336 binop(Iop_Sub64, cc_dep1, cc_dep2))),
1337 mkU8(31)),
1338 mkU64(1));
1340 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNO)) {
1341 /* No action. Never yet found a test case. */
1344 /* 2, 3 */
1346 /* It appears that LLVM 5.0 and later have a new way to find out
1347 whether the top N bits of a word W are all zero, by computing
1349 W <u 0---(N-1)---0 1 0---0 or
1350 W <=u 0---(N-1)---0 0 1---1
1352 In particular, the result will be defined if the top N bits of W
1353 are defined, even if the trailing bits -- those corresponding to
1354 the rightmost 0---0 / 1---1 section -- are undefined. Rather than
1355 make Memcheck more complex, we detect this case where we can and
1356 shift out the irrelevant and potentially undefined bits. */
1357 Int n = 0;
1358 Bool is_NB_or_NBE = False;
1359 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
1360 if (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB)) {
1361 /* long sub/cmp, then B (unsigned less than),
1362 where dep2 is a power of 2:
1363 -> CmpLT32U(dep1, 1 << N)
1364 -> CmpEQ32(dep1 >>u N, 0)
1366 long sub/cmp, then NB (unsigned greater than or equal),
1367 where dep2 is a power of 2:
1368 -> CmpGE32U(dep1, 1 << N)
1369 -> CmpNE32(dep1 >>u N, 0)
1370 This avoids CmpLT32U/CmpGE32U being applied to potentially
1371 uninitialised bits in the area being shifted out. */
1372 n = isU64_1_shl_N(cc_dep2);
1373 is_NB_or_NBE = isU64(cond, AMD64CondNB);
1374 } else if (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE)) {
1375 /* long sub/cmp, then BE (unsigned less than or equal),
1376 where dep2 is a power of 2 minus 1:
1377 -> CmpLE32U(dep1, (1 << N) - 1)
1378 -> CmpEQ32(dep1 >>u N, 0)
1380 long sub/cmp, then NBE (unsigned greater than),
1381 where dep2 is a power of 2 minus 1:
1382 -> CmpGT32U(dep1, (1 << N) - 1)
1383 -> CmpNE32(dep1 >>u N, 0)
1384 This avoids CmpLE32U/CmpGT32U being applied to potentially
1385 uninitialised bits in the area being shifted out. */
1386 n = isU64_1_shl_N_minus_1(cc_dep2);
1387 is_NB_or_NBE = isU64(cond, AMD64CondNBE);
1390 if (n > 0) {
1391 vassert(n >= 1 && n <= 31);
1392 return unop(Iop_1Uto64,
1393 binop(is_NB_or_NBE ? Iop_CmpNE32 : Iop_CmpEQ32,
1394 binop(Iop_Shr32, unop(Iop_64to32, cc_dep1),
1395 mkU8(n)),
1396 mkU32(0)));
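/* The rewrites above rely on the identities

      W <u  (1 << N)         <==>  (W >>u N) == 0
      W <=u ((1 << N) - 1)   <==>  (W >>u N) == 0

   For N == 8, say, any W below 0x100 shifts down to zero and any W of
   0x100 or more does not, so the test never inspects W's low 8 bits --
   which is exactly what keeps Memcheck quiet when those bits are
   undefined. */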
1399 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondB)) {
1400 /* long sub/cmp, then B (unsigned less than)
1401 --> test dst <u src */
1402 return unop(Iop_1Uto64,
1403 binop(Iop_CmpLT32U,
1404 unop(Iop_64to32, cc_dep1),
1405 unop(Iop_64to32, cc_dep2)));
1407 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNB)) {
1408 /* long sub/cmp, then NB (unsigned greater than or equal)
1409 --> test src <=u dst */
1410 /* Note, args are opposite way round from the usual */
1411 return unop(Iop_1Uto64,
1412 binop(Iop_CmpLE32U,
1413 unop(Iop_64to32, cc_dep2),
1414 unop(Iop_64to32, cc_dep1)));
1417 /* 4, 5 */
1418 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
1419 /* long sub/cmp, then Z --> test dst==src */
1420 return unop(Iop_1Uto64,
1421 binop(Iop_CmpEQ32,
1422 unop(Iop_64to32, cc_dep1),
1423 unop(Iop_64to32, cc_dep2)));
1425 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
1426 /* long sub/cmp, then NZ --> test dst!=src */
1427 return unop(Iop_1Uto64,
1428 binop(Iop_CmpNE32,
1429 unop(Iop_64to32, cc_dep1),
1430 unop(Iop_64to32, cc_dep2)));
1433 /* 6, 7 */
1434 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
1435 /* long sub/cmp, then BE (unsigned less than or equal)
1436 --> test dst <=u src */
1437 return unop(Iop_1Uto64,
1438 binop(Iop_CmpLE32U,
1439 unop(Iop_64to32, cc_dep1),
1440 unop(Iop_64to32, cc_dep2)));
1442 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
1443 /* long sub/cmp, then NBE (unsigned greater than)
1444 --> test src <u dst */
1445 /* Note, args are opposite way round from the usual */
1446 return unop(Iop_1Uto64,
1447 binop(Iop_CmpLT32U,
1448 unop(Iop_64to32, cc_dep2),
1449 unop(Iop_64to32, cc_dep1)));
1452 /* 8, 9 */
1453 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
1454 /* long sub/cmp, then S (negative)
1455 --> (dst-src)[31]
1456 --> ((dst -64 src) >>u 31) & 1
1457 Pointless to narrow the args to 32 bit before the subtract. */
1458 return binop(Iop_And64,
1459 binop(Iop_Shr64,
1460 binop(Iop_Sub64, cc_dep1, cc_dep2),
1461 mkU8(31)),
1462 mkU64(1));
1464 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNS)) {
1465 /* long sub/cmp, then NS (not negative)
1466 --> (dst-src)[31] ^ 1
1467 --> (((dst -64 src) >>u 31) & 1) ^ 1
1468 Pointless to narrow the args to 32 bit before the subtract. */
1469 return binop(Iop_Xor64,
1470 binop(Iop_And64,
1471 binop(Iop_Shr64,
1472 binop(Iop_Sub64, cc_dep1, cc_dep2),
1473 mkU8(31)),
1474 mkU64(1)),
1475 mkU64(1));
1478 /* 12, 13 */
1479 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
1480 /* long sub/cmp, then L (signed less than)
1481 --> test dst <s src */
1482 return unop(Iop_1Uto64,
1483 binop(Iop_CmpLT32S,
1484 unop(Iop_64to32, cc_dep1),
1485 unop(Iop_64to32, cc_dep2)));
1487 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNL)) {
1488 /* long sub/cmp, then NL (signed greater than or equal)
1489 --> test dst >=s src
1490 --> test src <=s dst */
1491 return unop(Iop_1Uto64,
1492 binop(Iop_CmpLE32S,
1493 unop(Iop_64to32, cc_dep2),
1494 unop(Iop_64to32, cc_dep1)));
1497 /* 14, 15 */
1498 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
1499 /* long sub/cmp, then LE (signed less than or equal)
1500 --> test dst <=s src */
1501 return unop(Iop_1Uto64,
1502 binop(Iop_CmpLE32S,
1503 unop(Iop_64to32, cc_dep1),
1504 unop(Iop_64to32, cc_dep2)));
1507 if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
1508 /* long sub/cmp, then NLE (signed greater than)
1509 --> test !(dst <=s src)
1510 --> test (dst >s src)
1511 --> test (src <s dst) */
1512 return unop(Iop_1Uto64,
1513 binop(Iop_CmpLT32S,
1514 unop(Iop_64to32, cc_dep2),
1515 unop(Iop_64to32, cc_dep1)));
1519 /*---------------- SUBW ----------------*/
1521 /* 4, 5 */
1522 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
1523 /* word sub/cmp, then Z --> test dst==src */
1524 return unop(Iop_1Uto64,
1525 binop(Iop_CmpEQ16,
1526 unop(Iop_64to16,cc_dep1),
1527 unop(Iop_64to16,cc_dep2)));
1529 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
1530 /* word sub/cmp, then NZ --> test dst!=src */
1531 return unop(Iop_1Uto64,
1532 binop(Iop_CmpNE16,
1533 unop(Iop_64to16,cc_dep1),
1534 unop(Iop_64to16,cc_dep2)));
1537 /* 6, */
1538 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondBE)) {
1539 /* word sub/cmp, then BE (unsigned less than or equal)
1540 --> test dst <=u src */
1541 return unop(Iop_1Uto64,
1542 binop(Iop_CmpLE64U,
1543 binop(Iop_Shl64, cc_dep1, mkU8(48)),
1544 binop(Iop_Shl64, cc_dep2, mkU8(48))));
1547 /* 8, 9 */
1548 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondS)
1549 && isU64(cc_dep2, 0)) {
1550 /* word sub/cmp of zero, then S --> test (dst-0 <s 0)
1551 --> test dst <s 0
1552 --> (ULong)dst[15]
1553 This is yet another scheme by which clang figures out if the
1554 top bit of a word is 1 or 0. See also LOGICB/CondS below. */
1555 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1556 for a 16-bit comparison, since the args to the helper
1557 function are always U64s. */
1558 return binop(Iop_And64,
1559 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1560 mkU64(1));
1562 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNS)
1563 && isU64(cc_dep2, 0)) {
1564 /* word sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1565 --> test !(dst <s 0)
1566 --> (ULong) !dst[15]
1568 return binop(Iop_Xor64,
1569 binop(Iop_And64,
1570 binop(Iop_Shr64,cc_dep1,mkU8(15)),
1571 mkU64(1)),
1572 mkU64(1));
1575 /* 14, */
1576 if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
1577 /* word sub/cmp, then LE (signed less than or equal)
1578 --> test dst <=s src */
1579 return unop(Iop_1Uto64,
1580 binop(Iop_CmpLE64S,
1581 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1582 binop(Iop_Shl64,cc_dep2,mkU8(48))));
1586 /*---------------- SUBB ----------------*/
1588 /* 2, 3 */
1589 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondB)) {
1590 /* byte sub/cmp, then B (unsigned less than)
1591 --> test dst <u src */
1592 return unop(Iop_1Uto64,
1593 binop(Iop_CmpLT64U,
1594 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1595 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1597 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNB)) {
1598 /* byte sub/cmp, then NB (unsigned greater than or equal)
1599 --> test src <=u dst */
1600 /* Note, args are opposite way round from the usual */
1601 return unop(Iop_1Uto64,
1602 binop(Iop_CmpLE64U,
1603 binop(Iop_And64, cc_dep2, mkU64(0xFF)),
1604 binop(Iop_And64, cc_dep1, mkU64(0xFF))));
1607 /* 4, 5 */
1608 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
1609 /* byte sub/cmp, then Z --> test dst==src */
1610 return unop(Iop_1Uto64,
1611 binop(Iop_CmpEQ8,
1612 unop(Iop_64to8,cc_dep1),
1613 unop(Iop_64to8,cc_dep2)));
1615 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
1616 /* byte sub/cmp, then NZ --> test dst!=src */
1617 return unop(Iop_1Uto64,
1618 binop(Iop_CmpNE8,
1619 unop(Iop_64to8,cc_dep1),
1620 unop(Iop_64to8,cc_dep2)));
1623 /* 6, */
1624 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondBE)) {
1625 /* byte sub/cmp, then BE (unsigned less than or equal)
1626 --> test dst <=u src */
1627 return unop(Iop_1Uto64,
1628 binop(Iop_CmpLE64U,
1629 binop(Iop_And64, cc_dep1, mkU64(0xFF)),
1630 binop(Iop_And64, cc_dep2, mkU64(0xFF))));
1633 /* 8, 9 */
1634 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
1635 && isU64(cc_dep2, 0)) {
1636 /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
1637 --> test dst <s 0
1638 --> (ULong)dst[7]
1639 This is yet another scheme by which gcc figures out if the
1640 top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
1641 /* Note: isU64(cc_dep2, 0) is correct, even though this is
1642 for an 8-bit comparison, since the args to the helper
1643 function are always U64s. */
1644 return binop(Iop_And64,
1645 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1646 mkU64(1));
1648 if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
1649 && isU64(cc_dep2, 0)) {
1650 /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
1651 --> test !(dst <s 0)
1652 --> (ULong) !dst[7]
1654 return binop(Iop_Xor64,
1655 binop(Iop_And64,
1656 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1657 mkU64(1)),
1658 mkU64(1));
1661 /*---------------- LOGICQ ----------------*/
1663 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
1664 /* long long and/or/xor, then Z --> test dst==0 */
1665 return unop(Iop_1Uto64,
1666 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1668 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNZ)) {
1669 /* long long and/or/xor, then NZ --> test dst!=0 */
1670 return unop(Iop_1Uto64,
1671 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1674 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
1675 /* long long and/or/xor, then L
1676 LOGIC sets SF and ZF according to the
1677 result and makes OF be zero. L computes SF ^ OF, but
1678 OF is zero, so this reduces to SF -- which will be 1 iff
1679 the result is < signed 0. Hence ...
1681 return unop(Iop_1Uto64,
1682 binop(Iop_CmpLT64S,
1683 cc_dep1,
1684 mkU64(0)));
1687 // Verified
1688 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondS)) {
1689 /* long long and/or/xor, then S --> (ULong)result[63] */
1690 return binop(Iop_Shr64, cc_dep1, mkU8(63));
1692 // Verified
1693 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondNS)) {
1694 /* long long and/or/xor, then NS --> (ULong) ~ result[63] */
1695 return binop(Iop_Xor64,
1696 binop(Iop_Shr64, cc_dep1, mkU8(63)),
1697 mkU64(1));
1700 /*---------------- LOGICL ----------------*/
1702 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
1703 /* long and/or/xor, then Z --> test dst==0 */
1704 return unop(Iop_1Uto64,
1705 binop(Iop_CmpEQ32,
1706 unop(Iop_64to32, cc_dep1),
1707 mkU32(0)));
1709 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
1710 /* long and/or/xor, then NZ --> test dst!=0 */
1711 return unop(Iop_1Uto64,
1712 binop(Iop_CmpNE32,
1713 unop(Iop_64to32, cc_dep1),
1714 mkU32(0)));
1717 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
1718 /* long and/or/xor, then LE
1719 This is pretty subtle. LOGIC sets SF and ZF according to the
1720 result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
1721 OF is zero, so this reduces to SF | ZF -- which will be 1 iff
1722 the result is <=signed 0. Hence ...
1724 return unop(Iop_1Uto64,
1725 binop(Iop_CmpLE32S,
1726 unop(Iop_64to32, cc_dep1),
1727 mkU32(0)));
1730 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondS)) {
1731 /* long and/or/xor, then S --> (ULong)result[31] */
1732 return binop(Iop_And64,
1733 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1734 mkU64(1));
1736 if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNS)) {
1737 /* long and/or/xor, then NS --> (ULong) ~ result[31] */
1738 return binop(Iop_Xor64,
1739 binop(Iop_And64,
1740 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1741 mkU64(1)),
1742 mkU64(1));
1745 /*---------------- LOGICW ----------------*/
1747 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondZ)) {
1748 /* word and/or/xor, then Z --> test dst==0 */
1749 // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1750 // it exactly at EdcAUTO.
1751 return unop(Iop_1Uto64,
1752 binop(Iop_CmpEQ32,
1753 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1754 mkU32(0)));
1756 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNZ)) {
1757 /* word and/or/xor, then NZ --> test dst!=0 */
1758 // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1759 // it exactly at EdcAUTO.
1760 return unop(Iop_1Uto64,
1761 binop(Iop_CmpNE32,
1762 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1763 mkU32(0)));
1766 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondS)) {
1767 /* word and/or/xor, then S --> (ULong)result[15] */
1768 return binop(Iop_And64,
1769 binop(Iop_Shr64, cc_dep1, mkU8(15)),
1770 mkU64(1));
1772 if (isU64(cc_op, AMD64G_CC_OP_LOGICW) && isU64(cond, AMD64CondNS)) {
1773 /* word and/or/xor, then NS --> (ULong) ~ result[15] */
1774 return binop(Iop_Xor64,
1775 binop(Iop_And64,
1776 binop(Iop_Shr64, cc_dep1, mkU8(15)),
1777 mkU64(1)),
1778 mkU64(1));
1781 /*---------------- LOGICB ----------------*/
1783 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
1784 /* byte and/or/xor, then Z --> test dst==0 */
1785 // Use CmpEQ32 rather than CmpEQ64 here, so that Memcheck instruments
1786 // it exactly at EdcAUTO.
1787 return unop(Iop_1Uto64,
1788 binop(Iop_CmpEQ32,
1789 unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1790 mkU32(0)));
1792 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
1793 /* byte and/or/xor, then NZ --> test dst!=0 */
1794 // Use CmpNE32 rather than CmpNE64 here, so that Memcheck instruments
1795 // it exactly at EdcAUTO.
1796 return unop(Iop_1Uto64,
1797 binop(Iop_CmpNE32,
1798 unop(Iop_8Uto32, unop(Iop_64to8, cc_dep1)),
1799 mkU32(0)));
1802 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
1803 /* this is an idiom gcc sometimes uses to find out if the top
1804 bit of a byte register is set: eg testb %al,%al; js ..
1805 Since it just depends on the top bit of the byte, extract
1806 that bit and explicitly get rid of all the rest. This
1807 helps memcheck avoid false positives in the case where any
1808 of the other bits in the byte are undefined. */
1809 /* byte and/or/xor, then S --> (UInt)result[7] */
1810 return binop(Iop_And64,
1811 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1812 mkU64(1));
1814 if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNS)) {
1815 /* byte and/or/xor, then NS --> (UInt)!result[7] */
1816 return binop(Iop_Xor64,
1817 binop(Iop_And64,
1818 binop(Iop_Shr64,cc_dep1,mkU8(7)),
1819 mkU64(1)),
1820 mkU64(1));
1823 /*---------------- INCB ----------------*/
1825 if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
1826 /* 8-bit inc, then LE --> sign bit of the arg */
1827 return binop(Iop_And64,
1828 binop(Iop_Shr64,
1829 binop(Iop_Sub64, cc_dep1, mkU64(1)),
1830 mkU8(7)),
1831 mkU64(1));
1834 /*---------------- INCW ----------------*/
1836 if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
1837 /* 16-bit inc, then Z --> test dst == 0 */
1838 return unop(Iop_1Uto64,
1839 binop(Iop_CmpEQ64,
1840 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1841 mkU64(0)));
1844 /*---------------- DECL ----------------*/
1846 if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
1847 /* dec L, then Z --> test dst == 0 */
1848 return unop(Iop_1Uto64,
1849 binop(Iop_CmpEQ32,
1850 unop(Iop_64to32, cc_dep1),
1851 mkU32(0)));
1854 /*---------------- DECW ----------------*/
1856 if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
1857 /* 16-bit dec, then NZ --> test dst != 0 */
1858 return unop(Iop_1Uto64,
1859 binop(Iop_CmpNE64,
1860 binop(Iop_Shl64,cc_dep1,mkU8(48)),
1861 mkU64(0)));
1864 /*---------------- SHRQ ----------------*/
1866 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondZ)) {
1867 /* SHRQ, then Z --> test result[63:0] == 0 */
1868 return unop(Iop_1Uto64,
1869 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1871 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNZ)) {
1872 /* SHRQ, then NZ --> test result[63:0] != 0 */
1873 return unop(Iop_1Uto64,
1874 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1877 if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondS)) {
1878 /* SHRQ, then S --> (ULong)result[63] (result is in dep1) */
1879 return binop(Iop_Shr64, cc_dep1, mkU8(63));
1881 // No known test case for this, hence disabled:
1882 //if (isU64(cc_op, AMD64G_CC_OP_SHRQ) && isU64(cond, AMD64CondNS)) {
1883 // /* SHRQ, then NS --> (ULong) ~ result[63] */
1884 // vassert(0);
1887 /*---------------- SHRL ----------------*/
1889 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondZ)) {
1890 /* SHRL, then Z --> test dep1 == 0 */
1891 return unop(Iop_1Uto64,
1892 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1893 mkU32(0)));
1895 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNZ)) {
1896 /* SHRL, then NZ --> test dep1 != 0 */
1897 return unop(Iop_1Uto64,
1898 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1899 mkU32(0)));
1902 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondS)) {
1903 /* SHRL/SARL, then S --> (ULong)result[31] */
1904 return binop(Iop_And64,
1905 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1906 mkU64(1));
1908 if (isU64(cc_op, AMD64G_CC_OP_SHRL) && isU64(cond, AMD64CondNS)) {
1909 /* SHRL/SARL, then NS --> (ULong) ~ result[31] */
1910 return binop(Iop_Xor64,
1911 binop(Iop_And64,
1912 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1913 mkU64(1)),
1914 mkU64(1));
1917 /*---------------- SHRW ----------------*/
1919 if (isU64(cc_op, AMD64G_CC_OP_SHRW) && isU64(cond, AMD64CondZ)) {
1920 /* SHRW, then Z --> test dep1 == 0 */
1921 return unop(Iop_1Uto64,
1922 binop(Iop_CmpEQ32,
1923 unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1924 mkU32(0)));
1926 // No known test case for this, hence disabled:
1927 //if (isU64(cc_op, AMD64G_CC_OP_SHRW) && isU64(cond, AMD64CondNZ)) {
1928 // /* SHRW, then NZ --> test dep1 != 0 */
1929 // return unop(Iop_1Uto64,
1930 // binop(Iop_CmpNE32,
1931 // unop(Iop_16Uto32, unop(Iop_64to16, cc_dep1)),
1932 // mkU32(0)));
1935 /*---------------- SHLQ ----------------*/
1937 if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondZ)) {
1938 /* SHLQ, then Z --> test dep1 == 0 */
1939 return unop(Iop_1Uto64,
1940 binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1942 if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondNZ)) {
1943 /* SHLQ, then NZ --> test dep1 != 0 */
1944 return unop(Iop_1Uto64,
1945 binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1948 // Verified
1949 if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondS)) {
1950 /* SHLQ, then S --> (ULong)result[63] */
1951 return binop(Iop_Shr64, cc_dep1, mkU8(63));
1953 // No known test case
1954 //if (isU64(cc_op, AMD64G_CC_OP_SHLQ) && isU64(cond, AMD64CondNS)) {
1955 // /* SHLQ, then NS --> (ULong) ~ result[63] */
1956 // vassert(0);
1959 /*---------------- SHLL ----------------*/
1961 if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondZ)) {
1962 /* SHLL, then Z --> test result[31:0] == 0 */
1963 return unop(Iop_1Uto64,
1964 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1965 mkU32(0)));
1967 // Verified
1968 if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondNZ)) {
1969 /* SHLL, then NZ --> test dep1 != 0 */
1970 return unop(Iop_1Uto64,
1971 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1972 mkU32(0)));
1975 if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondS)) {
1976 /* SHLL, then S --> (ULong)result[31] */
1977 return binop(Iop_And64,
1978 binop(Iop_Shr64, cc_dep1, mkU8(31)),
1979 mkU64(1));
1981 // No known test case
1982 //if (isU64(cc_op, AMD64G_CC_OP_SHLL) && isU64(cond, AMD64CondNS)) {
1983 // /* SHLL, then NS --> (ULong) ~ result[31] */
1984 // vassert(0);
1987 /*---------------- COPY ----------------*/
1988 /* This can happen, as a result of amd64 FP compares: "comisd ... ;
1989 jbe" for example. */
1991 if (isU64(cc_op, AMD64G_CC_OP_COPY)
1992 && (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
1993 /* COPY, then BE --> extract C and Z from dep1, and test (C
1994 or Z == 1). */
1995 /* COPY, then NBE --> extract C and Z from dep1, and test (C
1996 or Z == 0). */
1997 ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
1998 return
1999 unop(
2000 Iop_1Uto64,
2001 binop(
2002 Iop_CmpEQ64,
2003 binop(
2004 Iop_And64,
2005 binop(
2006 Iop_Or64,
2007 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
2008 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
2010 mkU64(1)
2012 mkU64(nnn)
2017 if (isU64(cc_op, AMD64G_CC_OP_COPY)
2018 && (isU64(cond, AMD64CondB) || isU64(cond, AMD64CondNB))) {
2019 /* COPY, then B --> extract C from dep1, and test (C == 1). */
2020 /* COPY, then NB --> extract C from dep1, and test (C == 0). */
2021 ULong nnn = isU64(cond, AMD64CondB) ? 1 : 0;
2022 return
2023 unop(
2024 Iop_1Uto64,
2025 binop(
2026 Iop_CmpEQ64,
2027 binop(
2028 Iop_And64,
2029 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
2030 mkU64(1)
2032 mkU64(nnn)
2037 if (isU64(cc_op, AMD64G_CC_OP_COPY)
2038 && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
2039 /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
2040 /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
2041 ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
2042 return
2043 unop(
2044 Iop_1Uto64,
2045 binop(
2046 Iop_CmpEQ64,
2047 binop(
2048 Iop_And64,
2049 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
2050 mkU64(1)
2052 mkU64(nnn)
2057 if (isU64(cc_op, AMD64G_CC_OP_COPY)
2058 && (isU64(cond, AMD64CondP) || isU64(cond, AMD64CondNP))) {
2059 /* COPY, then P --> extract P from dep1, and test (P == 1). */
2060 /* COPY, then NP --> extract P from dep1, and test (P == 0). */
2061 ULong nnn = isU64(cond, AMD64CondP) ? 1 : 0;
2062 return
2063 unop(
2064 Iop_1Uto64,
2065 binop(
2066 Iop_CmpEQ64,
2067 binop(
2068 Iop_And64,
2069 binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
2070 mkU64(1)
2072 mkU64(nnn)
2077 # if 0
2078 if (cond->tag == Iex_Const && cc_op->tag == Iex_Const) {
2079 vex_printf("spec request failed: ");
2080 vex_printf(" %s ", function_name);
2081 for (i = 0; i < 2/*arity*/; i++) {
2082 vex_printf(" ");
2083 ppIRExpr(args[i]);
2085 vex_printf("\n");
2087 # endif
2089 return NULL;
2092 /* --------- specialising "amd64g_calculate_rflags_c" --------- */
2094 if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
2095 /* specialise calls to above "calculate_rflags_c" function */
2096 IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
2097 vassert(arity == 4);
2098 cc_op = args[0];
2099 cc_dep1 = args[1];
2100 cc_dep2 = args[2];
2101 cc_ndep = args[3];
2103 if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
2104 /* C after sub denotes unsigned less than */
2105 return unop(Iop_1Uto64,
2106 binop(Iop_CmpLT64U,
2107 cc_dep1,
2108 cc_dep2));
2110 if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
2111 /* C after sub denotes unsigned less than */
2112 return unop(Iop_1Uto64,
2113 binop(Iop_CmpLT32U,
2114 unop(Iop_64to32, cc_dep1),
2115 unop(Iop_64to32, cc_dep2)));
2117 if (isU64(cc_op, AMD64G_CC_OP_SUBW)) {
2118 /* C after sub denotes unsigned less than */
2119 return unop(Iop_1Uto64,
2120 binop(Iop_CmpLT64U,
2121 binop(Iop_And64,cc_dep1,mkU64(0xFFFF)),
2122 binop(Iop_And64,cc_dep2,mkU64(0xFFFF))));
2124 if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
2125 /* C after sub denotes unsigned less than */
2126 return unop(Iop_1Uto64,
2127 binop(Iop_CmpLT64U,
2128 binop(Iop_And64,cc_dep1,mkU64(0xFF)),
2129 binop(Iop_And64,cc_dep2,mkU64(0xFF))));
2131 if (isU64(cc_op, AMD64G_CC_OP_ADDQ)) {
2132 /* C after add denotes sum <u either arg */
2133 return unop(Iop_1Uto64,
2134 binop(Iop_CmpLT64U,
2135 binop(Iop_Add64, cc_dep1, cc_dep2),
2136 cc_dep1));
2138 if (isU64(cc_op, AMD64G_CC_OP_ADDL)) {
2139 /* C after add denotes sum <u either arg */
2140 return unop(Iop_1Uto64,
2141 binop(Iop_CmpLT32U,
2142 unop(Iop_64to32, binop(Iop_Add64, cc_dep1, cc_dep2)),
2143 unop(Iop_64to32, cc_dep1)));
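/* Worked examples for the two carry reductions above (illustrative
   only; operand values are arbitrary):
     SUB:          dep1=5, dep2=7          -> borrow,    C=1 ; 5 <u 7 is 1
                   dep1=7, dep2=5          -> no borrow, C=0 ; 7 <u 5 is 0
     ADD (32-bit): dep1=0xFFFFFFFF, dep2=2 -> sum wraps to 1, C=1 ;
                                              1 <u 0xFFFFFFFF is 1
                   dep1=3, dep2=4          -> sum=7, no carry, C=0 ;
                                              7 <u 3 is 0 */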
2145 if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
2146 || isU64(cc_op, AMD64G_CC_OP_LOGICL)
2147 || isU64(cc_op, AMD64G_CC_OP_LOGICW)
2148 || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
2149 /* cflag after logic is zero */
2150 return mkU64(0);
2152 if (isU64(cc_op, AMD64G_CC_OP_DECL)
2153 || isU64(cc_op, AMD64G_CC_OP_INCL)
2154 || isU64(cc_op, AMD64G_CC_OP_DECQ)
2155 || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
2156 /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
2157 return cc_ndep;
2160 # if 0
2161 if (cc_op->tag == Iex_Const) {
2162 vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
2164 # endif
2166 # if 0
2167 if (cc_op->tag == Iex_Const) {
2168 vex_printf("spec request failed: ");
2169 vex_printf(" %s ", function_name);
2170 for (i = 0; i < 2/*arity*/; i++) {
2171 vex_printf(" ");
2172 ppIRExpr(args[i]);
2174 vex_printf("\n");
2176 # endif
2178 return NULL;
2181 # undef unop
2182 # undef binop
2183 # undef mkU64
2184 # undef mkU32
2185 # undef mkU8
2187 return NULL;
2191 /*---------------------------------------------------------------*/
2192 /*--- Supporting functions for x87 FPU activities. ---*/
2193 /*---------------------------------------------------------------*/
2195 static inline Bool host_is_little_endian ( void )
2197 UInt x = 0x76543210;
2198 UChar* p = (UChar*)(&x);
2199 return toBool(*p == 0x10);
2202 /* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
2203 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
2204 ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
2206 Bool mantissaIsZero;
2207 Int bexp;
2208 UChar sign;
2209 UChar* f64;
2211 vassert(host_is_little_endian());
2213 /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
2215 f64 = (UChar*)(&dbl);
2216 sign = toUChar( (f64[7] >> 7) & 1 );
2218 /* First off, if the tag indicates the register was empty,
2219 return 1,0,sign,1 */
2220 if (tag == 0) {
2221 /* vex_printf("Empty\n"); */
2222 return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
2223 | AMD64G_FC_MASK_C0;
2226 bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
2227 bexp &= 0x7FF;
2229 mantissaIsZero
2230 = toBool(
2231 (f64[6] & 0x0F) == 0
2232 && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
2235 /* If both exponent and mantissa are zero, the value is zero.
2236 Return 1,0,sign,0. */
2237 if (bexp == 0 && mantissaIsZero) {
2238 /* vex_printf("Zero\n"); */
2239 return AMD64G_FC_MASK_C3 | 0
2240 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2243 /* If exponent is zero but mantissa isn't, it's a denormal.
2244 Return 1,1,sign,0. */
2245 if (bexp == 0 && !mantissaIsZero) {
2246 /* vex_printf("Denormal\n"); */
2247 return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
2248 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2251 /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
2252 Return 0,1,sign,1. */
2253 if (bexp == 0x7FF && mantissaIsZero) {
2254 /* vex_printf("Inf\n"); */
2255 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
2256 | AMD64G_FC_MASK_C0;
2259 /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
2260 Return 0,0,sign,1. */
2261 if (bexp == 0x7FF && !mantissaIsZero) {
2262 /* vex_printf("NaN\n"); */
2263 return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
2266 /* Uh, ok, we give up. It must be a normal finite number.
2267 Return 0,1,sign,0.
2269 /* vex_printf("normal\n"); */
2270 return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
2274 /* This is used to implement both 'frstor' and 'fldenv'. The latter
2275 appears to differ from the former only in that the 8 FP registers
2276 themselves are not transferred into the guest state. */
2277 static
2278 VexEmNote do_put_x87 ( Bool moveRegs,
2279 /*IN*/Fpu_State* x87_state,
2280 /*OUT*/VexGuestAMD64State* vex_state )
2282 Int stno, preg;
2283 UInt tag;
2284 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2285 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2286 UInt ftop = (x87_state->env[FP_ENV_STAT] >> 11) & 7;
2287 UInt tagw = x87_state->env[FP_ENV_TAG];
2288 UInt fpucw = x87_state->env[FP_ENV_CTRL];
2289 UInt c3210 = x87_state->env[FP_ENV_STAT] & 0x4700;
2290 VexEmNote ew;
2291 UInt fpround;
2292 ULong pair;
2294 /* Copy registers and tags */
2295 for (stno = 0; stno < 8; stno++) {
2296 preg = (stno + ftop) & 7;
2297 tag = (tagw >> (2*preg)) & 3;
2298 if (tag == 3) {
2299 /* register is empty */
2300 /* hmm, if it's empty, does it still get written? Probably
2301 safer to say it does. If we don't, memcheck could get out
2302 of sync, in that it thinks all FP registers are defined by
2303 this helper, but in reality some have not been updated. */
2304 if (moveRegs)
2305 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2306 vexTags[preg] = 0;
2307 } else {
2308 /* register is non-empty */
2309 if (moveRegs)
2310 convert_f80le_to_f64le( &x87_state->reg[10*stno],
2311 (UChar*)&vexRegs[preg] );
2312 vexTags[preg] = 1;
2316 /* stack pointer */
2317 vex_state->guest_FTOP = ftop;
2319 /* status word */
2320 vex_state->guest_FC3210 = c3210;
2322 /* handle the control word, setting FPROUND and detecting any
2323 emulation warnings. */
2324 pair = amd64g_check_fldcw ( (ULong)fpucw );
2325 fpround = (UInt)pair & 0xFFFFFFFFULL;
2326 ew = (VexEmNote)(pair >> 32);
2328 vex_state->guest_FPROUND = fpround & 3;
2330 /* emulation warnings --> caller */
2331 return ew;
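/* Concretely (see the dirty helpers further below):
   amd64g_dirtyhelper_FRSTOR and amd64g_dirtyhelper_XRSTOR_COMPONENT_0
   call this with moveRegs == True, whereas amd64g_dirtyhelper_FLDENV
   calls it with moveRegs == False. */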
2335 /* Create an x87 FPU state from the guest state, as close as
2336 we can approximate it. */
2337 static
2338 void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
2339 /*OUT*/Fpu_State* x87_state )
2341 Int i, stno, preg;
2342 UInt tagw;
2343 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2344 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2345 UInt ftop = vex_state->guest_FTOP;
2346 UInt c3210 = vex_state->guest_FC3210;
2348 for (i = 0; i < 14; i++)
2349 x87_state->env[i] = 0;
2351 x87_state->env[1] = x87_state->env[3] = x87_state->env[5]
2352 = x87_state->env[13] = 0xFFFF;
2353 x87_state->env[FP_ENV_STAT]
2354 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2355 x87_state->env[FP_ENV_CTRL]
2356 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2358 /* Dump the register stack in ST order. */
2359 tagw = 0;
2360 for (stno = 0; stno < 8; stno++) {
2361 preg = (stno + ftop) & 7;
2362 if (vexTags[preg] == 0) {
2363 /* register is empty */
2364 tagw |= (3 << (2*preg));
2365 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2366 &x87_state->reg[10*stno] );
2367 } else {
2368 /* register is full. */
2369 tagw |= (0 << (2*preg));
2370 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2371 &x87_state->reg[10*stno] );
2374 x87_state->env[FP_ENV_TAG] = toUShort(tagw);
2378 /*---------------------------------------------------------------*/
2379 /*--- Supporting functions for XSAVE/FXSAVE. ---*/
2380 /*---------------------------------------------------------------*/
2382 /* CALLED FROM GENERATED CODE */
2383 /* DIRTY HELPER (reads guest state, writes guest mem) */
2384 /* XSAVE component 0 is the x87 FPU state. */
2385 void amd64g_dirtyhelper_XSAVE_COMPONENT_0
2386 ( VexGuestAMD64State* gst, HWord addr )
2388 /* Derived from values obtained from
2389 vendor_id : AuthenticAMD
2390 cpu family : 15
2391 model : 12
2392 model name : AMD Athlon(tm) 64 Processor 3200+
2393 stepping : 0
2394 cpu MHz : 2200.000
2395 cache size : 512 KB
2397 /* Somewhat roundabout, but at least it's simple. */
2398 Fpu_State tmp;
2399 UShort* addrS = (UShort*)addr;
2400 UChar* addrC = (UChar*)addr;
2401 UShort fp_tags;
2402 UInt summary_tags;
2403 Int r, stno;
2404 UShort *srcS, *dstS;
2406 do_get_x87( gst, &tmp );
2408 /* Now build the proper fxsave x87 image from the fsave x87 image
2409 we just made. */
2411 addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
2412 addrS[1] = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
2414 /* set addrS[2] in an endian-independent way */
2415 summary_tags = 0;
2416 fp_tags = tmp.env[FP_ENV_TAG];
2417 for (r = 0; r < 8; r++) {
2418 if ( ((fp_tags >> (2*r)) & 3) != 3 )
2419 summary_tags |= (1 << r);
2421 addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
2422 addrC[5] = 0; /* pad */
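/* Worked example (illustrative only): if the full tag word has physical
   regs 0 and 1 in use (tag pairs != 3) and regs 2..7 empty (tag pairs
   == 3), i.e. fp_tags == 0xFFF0, the loop above sets bits 0 and 1 only,
   so addrC[4] ends up as 0x03. */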
2424 /* FOP: faulting fpu opcode. From experimentation, the real CPU
2425 does not write this field. (?!) */
2426 addrS[3] = 0; /* BOGUS */
2428 /* RIP (Last x87 instruction pointer). From experimentation, the
2429 real CPU does not write this field. (?!) */
2430 addrS[4] = 0; /* BOGUS */
2431 addrS[5] = 0; /* BOGUS */
2432 addrS[6] = 0; /* BOGUS */
2433 addrS[7] = 0; /* BOGUS */
2435 /* RDP (Last x87 data pointer). From experimentation, the real CPU
2436 does not write this field. (?!) */
2437 addrS[8] = 0; /* BOGUS */
2438 addrS[9] = 0; /* BOGUS */
2439 addrS[10] = 0; /* BOGUS */
2440 addrS[11] = 0; /* BOGUS */
2442 /* addrS[13,12] are MXCSR -- not written */
2443 /* addrS[15,14] are MXCSR_MASK -- not written */
2445 /* Copy in the FP registers, in ST order. */
2446 for (stno = 0; stno < 8; stno++) {
2447 srcS = (UShort*)(&tmp.reg[10*stno]);
2448 dstS = (UShort*)(&addrS[16 + 8*stno]);
2449 dstS[0] = srcS[0];
2450 dstS[1] = srcS[1];
2451 dstS[2] = srcS[2];
2452 dstS[3] = srcS[3];
2453 dstS[4] = srcS[4];
2454 dstS[5] = 0;
2455 dstS[6] = 0;
2456 dstS[7] = 0;
2461 /* CALLED FROM GENERATED CODE */
2462 /* DIRTY HELPER (reads guest state, writes guest mem) */
2463 /* XSAVE component 1 is the SSE state. */
2464 void amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
2465 ( VexGuestAMD64State* gst, HWord addr )
2467 UShort* addrS = (UShort*)addr;
2468 UInt mxcsr;
2470 /* The only non-register parts of the SSE state are MXCSR and
2471 MXCSR_MASK. */
2472 mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
2474 addrS[12] = toUShort(mxcsr); /* MXCSR */
2475 addrS[13] = toUShort(mxcsr >> 16);
2477 addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
2478 addrS[15] = 0x0000; /* MXCSR mask (hi16) */
2482 /* VISIBLE TO LIBVEX CLIENT */
2483 /* Do FXSAVE from the supplied VexGuestAMD64State structure and store
2484 the result at the given address which represents a buffer of at
2485 least 416 bytes.
2487 This function is not called from generated code. FXSAVE is dealt
2488 with by the amd64 front end by calling the XSAVE_COMPONENT_{0,1}
2489 functions above plus some in-line IR. This function is merely a
2490 convenience function for VEX's users.
2492 void LibVEX_GuestAMD64_fxsave ( /*IN*/VexGuestAMD64State* gst,
2493 /*OUT*/HWord fp_state )
2495 /* Do the x87 part */
2496 amd64g_dirtyhelper_XSAVE_COMPONENT_0(gst, fp_state);
2498 /* And now the SSE part, except for the registers themselves. */
2499 amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2501 /* That's the first 160 bytes of the image done. */
2502 /* Now only %xmm0 .. %xmm15 remain to be copied. If the host is
2503 big-endian, these need to be byte-swapped. */
2504 U128 *xmm = (U128 *)(fp_state + 160);
2505 vassert(host_is_little_endian());
2507 # define COPY_U128(_dst,_src) \
2508 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2509 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2510 while (0)
2512 COPY_U128( xmm[0], gst->guest_YMM0 );
2513 COPY_U128( xmm[1], gst->guest_YMM1 );
2514 COPY_U128( xmm[2], gst->guest_YMM2 );
2515 COPY_U128( xmm[3], gst->guest_YMM3 );
2516 COPY_U128( xmm[4], gst->guest_YMM4 );
2517 COPY_U128( xmm[5], gst->guest_YMM5 );
2518 COPY_U128( xmm[6], gst->guest_YMM6 );
2519 COPY_U128( xmm[7], gst->guest_YMM7 );
2520 COPY_U128( xmm[8], gst->guest_YMM8 );
2521 COPY_U128( xmm[9], gst->guest_YMM9 );
2522 COPY_U128( xmm[10], gst->guest_YMM10 );
2523 COPY_U128( xmm[11], gst->guest_YMM11 );
2524 COPY_U128( xmm[12], gst->guest_YMM12 );
2525 COPY_U128( xmm[13], gst->guest_YMM13 );
2526 COPY_U128( xmm[14], gst->guest_YMM14 );
2527 COPY_U128( xmm[15], gst->guest_YMM15 );
2528 # undef COPY_U128
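/* A minimal client-side usage sketch for the convenience function above.
   'gst' and 'buf' are illustrative names, not part of this file:

     UChar buf[416];        // at least 416 bytes, as documented above
     LibVEX_GuestAMD64_fxsave( &gst, (HWord)&buf[0] );
     // buf[0..159]   : x87 env, MXCSR/MXCSR_MASK and %st(0)..%st(7)
     // buf[160..415] : %xmm0 .. %xmm15, 16 bytes each
*/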
2532 /*---------------------------------------------------------------*/
2533 /*--- Supporting functions for XRSTOR/FXRSTOR. ---*/
2534 /*---------------------------------------------------------------*/
2536 /* CALLED FROM GENERATED CODE */
2537 /* DIRTY HELPER (writes guest state, reads guest mem) */
2538 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_0
2539 ( VexGuestAMD64State* gst, HWord addr )
2541 Fpu_State tmp;
2542 UShort* addrS = (UShort*)addr;
2543 UChar* addrC = (UChar*)addr;
2544 UShort fp_tags;
2545 Int r, stno, i;
2547 /* Copy the x87 registers out of the image, into a temporary
2548 Fpu_State struct. */
2549 for (i = 0; i < 14; i++) tmp.env[i] = 0;
2550 for (i = 0; i < 80; i++) tmp.reg[i] = 0;
2551 /* fill in tmp.reg[0..7] */
2552 for (stno = 0; stno < 8; stno++) {
2553 UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
2554 UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
2555 dstS[0] = srcS[0];
2556 dstS[1] = srcS[1];
2557 dstS[2] = srcS[2];
2558 dstS[3] = srcS[3];
2559 dstS[4] = srcS[4];
2561 /* fill in tmp.env[0..13] */
2562 tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
2563 tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
2565 fp_tags = 0;
2566 for (r = 0; r < 8; r++) {
2567 if (addrC[4] & (1<<r))
2568 fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
2569 else
2570 fp_tags |= (3 << (2*r)); /* EMPTY */
2572 tmp.env[FP_ENV_TAG] = fp_tags;
2574 /* Now write 'tmp' into the guest state. */
2575 VexEmNote warnX87 = do_put_x87( True/*moveRegs*/, &tmp, gst );
2577 return warnX87;
2581 /* CALLED FROM GENERATED CODE */
2582 /* DIRTY HELPER (writes guest state, reads guest mem) */
2583 VexEmNote amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
2584 ( VexGuestAMD64State* gst, HWord addr )
2586 UShort* addrS = (UShort*)addr;
2587 UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
2588 | ((((UInt)addrS[13]) & 0xFFFF) << 16);
2589 ULong w64 = amd64g_check_ldmxcsr( (ULong)w32 );
2591 VexEmNote warnXMM = (VexEmNote)(w64 >> 32);
2593 gst->guest_SSEROUND = w64 & 0xFFFFFFFFULL;
2594 return warnXMM;
2598 /* VISIBLE TO LIBVEX CLIENT */
2599 /* Do FXRSTOR from the supplied address and store the values read
2600 into the given VexGuestAMD64State structure.
2602 This function is not called from generated code. FXRSTOR is dealt
2603 with by the amd64 front end by calling the XRSTOR_COMPONENT_{0,1}
2604 functions above plus some in-line IR. This function is merely a
2605 convenience function for VEX's users.
2607 VexEmNote LibVEX_GuestAMD64_fxrstor ( /*IN*/HWord fp_state,
2608 /*MOD*/VexGuestAMD64State* gst )
2610 /* Restore %xmm0 .. %xmm15. If the host is big-endian, these need
2611 to be byte-swapped. */
2612 U128 *xmm = (U128 *)(fp_state + 160);
2614 vassert(host_is_little_endian());
2616 # define COPY_U128(_dst,_src) \
2617 do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
2618 _dst[2] = _src[2]; _dst[3] = _src[3]; } \
2619 while (0)
2621 COPY_U128( gst->guest_YMM0, xmm[0] );
2622 COPY_U128( gst->guest_YMM1, xmm[1] );
2623 COPY_U128( gst->guest_YMM2, xmm[2] );
2624 COPY_U128( gst->guest_YMM3, xmm[3] );
2625 COPY_U128( gst->guest_YMM4, xmm[4] );
2626 COPY_U128( gst->guest_YMM5, xmm[5] );
2627 COPY_U128( gst->guest_YMM6, xmm[6] );
2628 COPY_U128( gst->guest_YMM7, xmm[7] );
2629 COPY_U128( gst->guest_YMM8, xmm[8] );
2630 COPY_U128( gst->guest_YMM9, xmm[9] );
2631 COPY_U128( gst->guest_YMM10, xmm[10] );
2632 COPY_U128( gst->guest_YMM11, xmm[11] );
2633 COPY_U128( gst->guest_YMM12, xmm[12] );
2634 COPY_U128( gst->guest_YMM13, xmm[13] );
2635 COPY_U128( gst->guest_YMM14, xmm[14] );
2636 COPY_U128( gst->guest_YMM15, xmm[15] );
2638 # undef COPY_U128
2640 VexEmNote warnXMM
2641 = amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS(gst, fp_state);
2642 VexEmNote warnX87
2643 = amd64g_dirtyhelper_XRSTOR_COMPONENT_0(gst, fp_state);
2645 /* Prefer an X87 emwarn over an XMM one, if both exist. */
2646 if (warnX87 != EmNote_NONE)
2647 return warnX87;
2648 else
2649 return warnXMM;
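/* A minimal round-trip sketch using the two convenience functions;
   'gst' and 'buf' are illustrative names:

     UChar buf[416];
     LibVEX_GuestAMD64_fxsave ( &gst, (HWord)&buf[0] );
     VexEmNote note = LibVEX_GuestAMD64_fxrstor( (HWord)&buf[0], &gst );
     // per the code above, an x87 warning is reported in preference
     // to an XMM one
*/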
2653 /*---------------------------------------------------------------*/
2654 /*--- Supporting functions for FSAVE/FRSTOR ---*/
2655 /*---------------------------------------------------------------*/
2657 /* DIRTY HELPER (writes guest state) */
2658 /* Initialise the x87 FPU state as per 'finit'. */
2659 void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
2661 Int i;
2662 gst->guest_FTOP = 0;
2663 for (i = 0; i < 8; i++) {
2664 gst->guest_FPTAG[i] = 0; /* empty */
2665 gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
2667 gst->guest_FPROUND = (ULong)Irrm_NEAREST;
2668 gst->guest_FC3210 = 0;
2672 /* CALLED FROM GENERATED CODE */
2673 /* DIRTY HELPER (reads guest memory) */
2674 ULong amd64g_dirtyhelper_loadF80le ( Addr addrU )
2676 ULong f64;
2677 convert_f80le_to_f64le ( (UChar*)addrU, (UChar*)&f64 );
2678 return f64;
2681 /* CALLED FROM GENERATED CODE */
2682 /* DIRTY HELPER (writes guest memory) */
2683 void amd64g_dirtyhelper_storeF80le ( Addr addrU, ULong f64 )
2685 convert_f64le_to_f80le( (UChar*)&f64, (UChar*)addrU );
2689 /* CALLED FROM GENERATED CODE */
2690 /* CLEAN HELPER */
2691 /* mxcsr[15:0] contains an SSE native format MXCSR value.
2692 Extract from it the required SSEROUND value and any resulting
2693 emulation warning, and return (warn << 32) | sseround value.
2695 ULong amd64g_check_ldmxcsr ( ULong mxcsr )
2697 /* Decide on a rounding mode. mxcsr[14:13] holds it. */
2698 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2699 ULong rmode = (mxcsr >> 13) & 3;
2701 /* Detect any required emulation warnings. */
2702 VexEmNote ew = EmNote_NONE;
2704 if ((mxcsr & 0x1F80) != 0x1F80) {
2705 /* unmasked exceptions! */
2706 ew = EmWarn_X86_sseExns;
2708 else
2709 if (mxcsr & (1<<15)) {
2710 /* FZ is set */
2711 ew = EmWarn_X86_fz;
2713 else
2714 if (mxcsr & (1<<6)) {
2715 /* DAZ is set */
2716 ew = EmWarn_X86_daz;
2719 return (((ULong)ew) << 32) | ((ULong)rmode);
2723 /* CALLED FROM GENERATED CODE */
2724 /* CLEAN HELPER */
2725 /* Given sseround as an IRRoundingMode value, create a suitable SSE
2726 native format MXCSR value. */
2727 ULong amd64g_create_mxcsr ( ULong sseround )
2729 sseround &= 3;
2730 return 0x1F80 | (sseround << 13);
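/* How the two MXCSR helpers pair up (illustrative sketch only).
   amd64g_create_mxcsr builds 0x1F80 | (rmode << 13): all exceptions
   masked, FZ (bit 15) and DAZ (bit 6) clear.  Feeding that back through
   amd64g_check_ldmxcsr therefore recovers the same rounding mode with no
   emulation warning:

     ULong pair   = amd64g_check_ldmxcsr( amd64g_create_mxcsr(rmode) );
     ULong rmode2 = pair & 0xFFFFFFFFULL;       // == rmode
     VexEmNote ew = (VexEmNote)(pair >> 32);    // == EmNote_NONE
*/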
2734 /* CLEAN HELPER */
2735 /* fpucw[15:0] contains an x87 native format FPU control word.
2736 Extract from it the required FPROUND value and any resulting
2737 emulation warning, and return (warn << 32) | fpround value.
2739 ULong amd64g_check_fldcw ( ULong fpucw )
2741 /* Decide on a rounding mode. fpucw[11:10] holds it. */
2742 /* NOTE, encoded exactly as per enum IRRoundingMode. */
2743 ULong rmode = (fpucw >> 10) & 3;
2745 /* Detect any required emulation warnings. */
2746 VexEmNote ew = EmNote_NONE;
2748 if ((fpucw & 0x3F) != 0x3F) {
2749 /* unmasked exceptions! */
2750 ew = EmWarn_X86_x87exns;
2752 else
2753 if (((fpucw >> 8) & 3) != 3) {
2754 /* unsupported precision */
2755 ew = EmWarn_X86_x87precision;
2758 return (((ULong)ew) << 32) | ((ULong)rmode);
2762 /* CLEAN HELPER */
2763 /* Given fpround as an IRRoundingMode value, create a suitable x87
2764 native format FPU control word. */
2765 ULong amd64g_create_fpucw ( ULong fpround )
2767 fpround &= 3;
2768 return 0x037F | (fpround << 10);
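/* The x87 control-word helpers pair up the same way (illustrative sketch,
   mirroring how do_put_x87 unpacks the result of amd64g_check_fldcw).
   amd64g_create_fpucw builds 0x037F | (rmode << 10) -- all exceptions
   masked, precision field == 3 -- so pushing it back through
   amd64g_check_fldcw yields the same rounding mode and no warning:

     ULong pair    = amd64g_check_fldcw( amd64g_create_fpucw(rmode) );
     UInt  fpround = (UInt)pair & 0xFFFFFFFFULL;  // == rmode
     VexEmNote ew  = (VexEmNote)(pair >> 32);     // == EmNote_NONE
*/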
2772 /* This is used to implement 'fldenv'.
2773 Reads 28 bytes at x87_state[0 .. 27]. */
2774 /* CALLED FROM GENERATED CODE */
2775 /* DIRTY HELPER */
2776 VexEmNote amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
2777 /*IN*/HWord x87_state)
2779 return do_put_x87( False, (Fpu_State*)x87_state, vex_state );
2783 /* CALLED FROM GENERATED CODE */
2784 /* DIRTY HELPER */
2785 /* Create an x87 FPU env from the guest state, as close as we can
2786 approximate it. Writes 28 bytes at x87_state[0..27]. */
2787 void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
2788 /*OUT*/HWord x87_state )
2790 Int i, stno, preg;
2791 UInt tagw;
2792 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2793 Fpu_State* x87 = (Fpu_State*)x87_state;
2794 UInt ftop = vex_state->guest_FTOP;
2795 ULong c3210 = vex_state->guest_FC3210;
2797 for (i = 0; i < 14; i++)
2798 x87->env[i] = 0;
2800 x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
2801 x87->env[FP_ENV_STAT]
2802 = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
2803 x87->env[FP_ENV_CTRL]
2804 = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
2806 /* Compute the x87 tag word. */
2807 tagw = 0;
2808 for (stno = 0; stno < 8; stno++) {
2809 preg = (stno + ftop) & 7;
2810 if (vexTags[preg] == 0) {
2811 /* register is empty */
2812 tagw |= (3 << (2*preg));
2813 } else {
2814 /* register is full. */
2815 tagw |= (0 << (2*preg));
2818 x87->env[FP_ENV_TAG] = toUShort(tagw);
2820 /* We don't dump the x87 registers, tho. */
2824 /* This is used to implement 'fnsave'.
2825 Writes 108 bytes at x87_state[0 .. 107]. */
2826 /* CALLED FROM GENERATED CODE */
2827 /* DIRTY HELPER */
2828 void amd64g_dirtyhelper_FNSAVE ( /*IN*/VexGuestAMD64State* vex_state,
2829 /*OUT*/HWord x87_state)
2831 do_get_x87( vex_state, (Fpu_State*)x87_state );
2835 /* This is used to implement 'fnsaves'.
2836 Writes 94 bytes at x87_state[0 .. 93]. */
2837 /* CALLED FROM GENERATED CODE */
2838 /* DIRTY HELPER */
2839 void amd64g_dirtyhelper_FNSAVES ( /*IN*/VexGuestAMD64State* vex_state,
2840 /*OUT*/HWord x87_state)
2842 Int i, stno, preg;
2843 UInt tagw;
2844 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2845 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2846 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2847 UInt ftop = vex_state->guest_FTOP;
2848 UInt c3210 = vex_state->guest_FC3210;
2850 for (i = 0; i < 7; i++)
2851 x87->env[i] = 0;
2853 x87->env[FPS_ENV_STAT]
2854 = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
2855 x87->env[FPS_ENV_CTRL]
2856 = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
2858 /* Dump the register stack in ST order. */
2859 tagw = 0;
2860 for (stno = 0; stno < 8; stno++) {
2861 preg = (stno + ftop) & 7;
2862 if (vexTags[preg] == 0) {
2863 /* register is empty */
2864 tagw |= (3 << (2*preg));
2865 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2866 &x87->reg[10*stno] );
2867 } else {
2868 /* register is full. */
2869 tagw |= (0 << (2*preg));
2870 convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
2871 &x87->reg[10*stno] );
2874 x87->env[FPS_ENV_TAG] = toUShort(tagw);
2878 /* This is used to implement 'frstor'.
2879 Reads 108 bytes at x87_state[0 .. 107]. */
2880 /* CALLED FROM GENERATED CODE */
2881 /* DIRTY HELPER */
2882 VexEmNote amd64g_dirtyhelper_FRSTOR ( /*OUT*/VexGuestAMD64State* vex_state,
2883 /*IN*/HWord x87_state)
2885 return do_put_x87( True, (Fpu_State*)x87_state, vex_state );
2889 /* This is used to implement 'frstors'.
2890 Reads 94 bytes at x87_state[0 .. 93]. */
2891 /* CALLED FROM GENERATED CODE */
2892 /* DIRTY HELPER */
2893 VexEmNote amd64g_dirtyhelper_FRSTORS ( /*OUT*/VexGuestAMD64State* vex_state,
2894 /*IN*/HWord x87_state)
2896 Int stno, preg;
2897 UInt tag;
2898 ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
2899 UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
2900 Fpu_State_16* x87 = (Fpu_State_16*)x87_state;
2901 UInt ftop = (x87->env[FPS_ENV_STAT] >> 11) & 7;
2902 UInt tagw = x87->env[FPS_ENV_TAG];
2903 UInt fpucw = x87->env[FPS_ENV_CTRL];
2904 UInt c3210 = x87->env[FPS_ENV_STAT] & 0x4700;
2905 VexEmNote ew;
2906 UInt fpround;
2907 ULong pair;
2909 /* Copy registers and tags */
2910 for (stno = 0; stno < 8; stno++) {
2911 preg = (stno + ftop) & 7;
2912 tag = (tagw >> (2*preg)) & 3;
2913 if (tag == 3) {
2914 /* register is empty */
2915 /* hmm, if it's empty, does it still get written? Probably
2916 safer to say it does. If we don't, memcheck could get out
2917 of sync, in that it thinks all FP registers are defined by
2918 this helper, but in reality some have not been updated. */
2919 vexRegs[preg] = 0; /* IEEE754 64-bit zero */
2920 vexTags[preg] = 0;
2921 } else {
2922 /* register is non-empty */
2923 convert_f80le_to_f64le( &x87->reg[10*stno],
2924 (UChar*)&vexRegs[preg] );
2925 vexTags[preg] = 1;
2929 /* stack pointer */
2930 vex_state->guest_FTOP = ftop;
2932 /* status word */
2933 vex_state->guest_FC3210 = c3210;
2935 /* handle the control word, setting FPROUND and detecting any
2936 emulation warnings. */
2937 pair = amd64g_check_fldcw ( (ULong)fpucw );
2938 fpround = (UInt)pair & 0xFFFFFFFFULL;
2939 ew = (VexEmNote)(pair >> 32);
2941 vex_state->guest_FPROUND = fpround & 3;
2943 /* emulation warnings --> caller */
2944 return ew;
2948 /*---------------------------------------------------------------*/
2949 /*--- CPUID helpers. ---*/
2950 /*---------------------------------------------------------------*/
2952 /* Claim to be the following CPU, which is probably representative of
2953 the lowliest (earliest) amd64 offerings. It can do neither sse3
2954 nor cx16.
2956 vendor_id : AuthenticAMD
2957 cpu family : 15
2958 model : 5
2959 model name : AMD Opteron (tm) Processor 848
2960 stepping : 10
2961 cpu MHz : 1797.682
2962 cache size : 1024 KB
2963 fpu : yes
2964 fpu_exception : yes
2965 cpuid level : 1
2966 wp : yes
2967 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
2968 mtrr pge mca cmov pat pse36 clflush mmx fxsr
2969 sse sse2 syscall nx mmxext lm 3dnowext 3dnow
2970 bogomips : 3600.62
2971 TLB size : 1088 4K pages
2972 clflush size : 64
2973 cache_alignment : 64
2974 address sizes : 40 bits physical, 48 bits virtual
2975 power management: ts fid vid ttp
2977 2012-Feb-21: don't claim 3dnow or 3dnowext, since in fact
2978 we don't support them. See #291568. 3dnow is 80000001.EDX.31
2979 and 3dnowext is 80000001.EDX.30.
2981 void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
2983 # define SET_ABCD(_a,_b,_c,_d) \
2984 do { st->guest_RAX = (ULong)(_a); \
2985 st->guest_RBX = (ULong)(_b); \
2986 st->guest_RCX = (ULong)(_c); \
2987 st->guest_RDX = (ULong)(_d); \
2988 } while (0)
2990 switch (0xFFFFFFFF & st->guest_RAX) {
2991 case 0x00000000:
2992 SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
2993 break;
2994 case 0x00000001:
2995 SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
2996 break;
2997 case 0x80000000:
2998 SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
2999 break;
3000 case 0x80000001:
3001 /* Don't claim to support 3dnow or 3dnowext. 0xe1d3fbff is
3002 the original it-is-supported value that the h/w provides.
3003 See #291568. */
3004 SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, /*0xe1d3fbff*/
3005 0x21d3fbff);
3006 break;
3007 case 0x80000002:
3008 SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
3009 break;
3010 case 0x80000003:
3011 SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
3012 break;
3013 case 0x80000004:
3014 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3015 break;
3016 case 0x80000005:
3017 SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
3018 break;
3019 case 0x80000006:
3020 SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
3021 break;
3022 case 0x80000007:
3023 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
3024 break;
3025 case 0x80000008:
3026 SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
3027 break;
3028 default:
3029 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3030 break;
3032 # undef SET_ABCD
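/* Sketch of how these CPUID helpers are driven (illustrative only): the
   leaf comes from the guest's RAX (and, for the later helpers, the
   subleaf from RCX), and the results are written back to RAX..RDX.
   For example, with the baseline helper and 'st' a pointer to the
   guest state:

     st->guest_RAX = 0x80000001ULL;
     amd64g_dirtyhelper_CPUID_baseline( st );
     // st->guest_RDX now holds 0x21d3fbff, i.e. with the 3dnow (bit 31)
     // and 3dnowext (bit 30) bits cleared, as noted above.
*/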
3036 /* Claim to be the following CPU (2 x ...), which is sse3 and cx16
3037 capable.
3039 vendor_id : GenuineIntel
3040 cpu family : 6
3041 model : 15
3042 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
3043 stepping : 6
3044 cpu MHz : 2394.000
3045 cache size : 4096 KB
3046 physical id : 0
3047 siblings : 2
3048 core id : 0
3049 cpu cores : 2
3050 fpu : yes
3051 fpu_exception : yes
3052 cpuid level : 10
3053 wp : yes
3054 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3055 mtrr pge mca cmov pat pse36 clflush dts acpi
3056 mmx fxsr sse sse2 ss ht tm syscall nx lm
3057 constant_tsc pni monitor ds_cpl vmx est tm2
3058 cx16 xtpr lahf_lm
3059 bogomips : 4798.78
3060 clflush size : 64
3061 cache_alignment : 64
3062 address sizes : 36 bits physical, 48 bits virtual
3063 power management:
3065 void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
3067 # define SET_ABCD(_a,_b,_c,_d) \
3068 do { st->guest_RAX = (ULong)(_a); \
3069 st->guest_RBX = (ULong)(_b); \
3070 st->guest_RCX = (ULong)(_c); \
3071 st->guest_RDX = (ULong)(_d); \
3072 } while (0)
3074 switch (0xFFFFFFFF & st->guest_RAX) {
3075 case 0x00000000:
3076 SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
3077 break;
3078 case 0x00000001:
3079 SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
3080 break;
3081 case 0x00000002:
3082 SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
3083 break;
3084 case 0x00000003:
3085 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3086 break;
3087 case 0x00000004: {
3088 switch (0xFFFFFFFF & st->guest_RCX) {
3089 case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
3090 0x0000003f, 0x00000001); break;
3091 case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
3092 0x0000003f, 0x00000001); break;
3093 case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
3094 0x00000fff, 0x00000001); break;
3095 default: SET_ABCD(0x00000000, 0x00000000,
3096 0x00000000, 0x00000000); break;
3098 break;
3100 case 0x00000005:
3101 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
3102 break;
3103 case 0x00000006:
3104 SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
3105 break;
3106 case 0x00000007:
3107 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3108 break;
3109 case 0x00000008:
3110 SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
3111 break;
3112 case 0x00000009:
3113 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3114 break;
3115 case 0x0000000a:
3116 unhandled_eax_value:
3117 SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
3118 break;
3119 case 0x80000000:
3120 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3121 break;
3122 case 0x80000001:
3123 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
3124 break;
3125 case 0x80000002:
3126 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3127 break;
3128 case 0x80000003:
3129 SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
3130 break;
3131 case 0x80000004:
3132 SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
3133 break;
3134 case 0x80000005:
3135 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3136 break;
3137 case 0x80000006:
3138 SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
3139 break;
3140 case 0x80000007:
3141 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3142 break;
3143 case 0x80000008:
3144 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3145 break;
3146 default:
3147 goto unhandled_eax_value;
3149 # undef SET_ABCD
3153 /* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
3154 capable.
3156 vendor_id : GenuineIntel
3157 cpu family : 6
3158 model : 37
3159 model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
3160 stepping : 2
3161 cpu MHz : 3334.000
3162 cache size : 4096 KB
3163 physical id : 0
3164 siblings : 4
3165 core id : 0
3166 cpu cores : 2
3167 apicid : 0
3168 initial apicid : 0
3169 fpu : yes
3170 fpu_exception : yes
3171 cpuid level : 11
3172 wp : yes
3173 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3174 mtrr pge mca cmov pat pse36 clflush dts acpi
3175 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3176 lm constant_tsc arch_perfmon pebs bts rep_good
3177 xtopology nonstop_tsc aperfmperf pni pclmulqdq
3178 dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
3179 xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
3180 arat tpr_shadow vnmi flexpriority ept vpid
3181 bogomips : 6957.57
3182 clflush size : 64
3183 cache_alignment : 64
3184 address sizes : 36 bits physical, 48 bits virtual
3185 power management:
3187 void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
3189 # define SET_ABCD(_a,_b,_c,_d) \
3190 do { st->guest_RAX = (ULong)(_a); \
3191 st->guest_RBX = (ULong)(_b); \
3192 st->guest_RCX = (ULong)(_c); \
3193 st->guest_RDX = (ULong)(_d); \
3194 } while (0)
3196 UInt old_eax = (UInt)st->guest_RAX;
3197 UInt old_ecx = (UInt)st->guest_RCX;
3199 switch (old_eax) {
3200 case 0x00000000:
3201 SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
3202 break;
3203 case 0x00000001:
3204 SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff, 0xbfebfbff);
3205 break;
3206 case 0x00000002:
3207 SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
3208 break;
3209 case 0x00000003:
3210 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3211 break;
3212 case 0x00000004:
3213 switch (old_ecx) {
3214 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3215 0x0000003f, 0x00000000); break;
3216 case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
3217 0x0000007f, 0x00000000); break;
3218 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3219 0x000001ff, 0x00000000); break;
3220 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3221 0x00000fff, 0x00000002); break;
3222 default: SET_ABCD(0x00000000, 0x00000000,
3223 0x00000000, 0x00000000); break;
3225 break;
3226 case 0x00000005:
3227 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3228 break;
3229 case 0x00000006:
3230 SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
3231 break;
3232 case 0x00000007:
3233 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3234 break;
3235 case 0x00000008:
3236 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3237 break;
3238 case 0x00000009:
3239 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3240 break;
3241 case 0x0000000a:
3242 SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
3243 break;
3244 case 0x0000000b:
3245 switch (old_ecx) {
3246 case 0x00000000:
3247 SET_ABCD(0x00000001, 0x00000002,
3248 0x00000100, 0x00000000); break;
3249 case 0x00000001:
3250 SET_ABCD(0x00000004, 0x00000004,
3251 0x00000201, 0x00000000); break;
3252 default:
3253 SET_ABCD(0x00000000, 0x00000000,
3254 old_ecx, 0x00000000); break;
3256 break;
3257 case 0x0000000c:
3258 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3259 break;
3260 case 0x0000000d:
3261 switch (old_ecx) {
3262 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3263 0x00000100, 0x00000000); break;
3264 case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
3265 0x00000201, 0x00000000); break;
3266 default: SET_ABCD(0x00000000, 0x00000000,
3267 old_ecx, 0x00000000); break;
3269 break;
3270 case 0x80000000:
3271 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3272 break;
3273 case 0x80000001:
3274 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3275 break;
3276 case 0x80000002:
3277 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3278 break;
3279 case 0x80000003:
3280 SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
3281 break;
3282 case 0x80000004:
3283 SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
3284 break;
3285 case 0x80000005:
3286 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3287 break;
3288 case 0x80000006:
3289 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3290 break;
3291 case 0x80000007:
3292 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3293 break;
3294 case 0x80000008:
3295 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3296 break;
3297 default:
3298 SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
3299 break;
3301 # undef SET_ABCD
3305 /* Claim to be the following CPU (4 x ...), which is AVX and cx16
3306 capable. Plus (kludge!) it "supports" HTM.
3308 Also with the following change: claim that XSaveOpt is not
3309 available, by cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1
3310 on the real CPU. Consequently, programs that correctly observe
3311 these CPUID values should only try to use 3 of the 8 XSave-family
3312 instructions: XGETBV, XSAVE and XRSTOR. In particular this avoids
3313 having to implement the compacted or optimised save/restore
3314 variants.
3316 vendor_id : GenuineIntel
3317 cpu family : 6
3318 model : 42
3319 model name : Intel(R) Core(TM) i5-2300 CPU @ 2.80GHz
3320 stepping : 7
3321 cpu MHz : 1600.000
3322 cache size : 6144 KB
3323 physical id : 0
3324 siblings : 4
3325 core id : 3
3326 cpu cores : 4
3327 apicid : 6
3328 initial apicid : 6
3329 fpu : yes
3330 fpu_exception : yes
3331 cpuid level : 13
3332 wp : yes
3333 flags : fpu vme de pse tsc msr pae mce cx8 apic sep
3334 mtrr pge mca cmov pat pse36 clflush dts acpi
3335 mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
3336 lm constant_tsc arch_perfmon pebs bts rep_good
3337 nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq
3338 dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16
3339 xtpr pdcm sse4_1 sse4_2 popcnt aes xsave avx
3340 lahf_lm ida arat epb xsaveopt pln pts dts
3341 tpr_shadow vnmi flexpriority ept vpid
3343 bogomips : 5768.94
3344 clflush size : 64
3345 cache_alignment : 64
3346 address sizes : 36 bits physical, 48 bits virtual
3347 power management:
3349 void amd64g_dirtyhelper_CPUID_avx_and_cx16 ( VexGuestAMD64State* st,
3350 ULong hasF16C, ULong hasRDRAND,
3351 ULong hasRDSEED )
3353 vassert((hasF16C >> 1) == 0ULL);
3354 vassert((hasRDRAND >> 1) == 0ULL);
3355 # define SET_ABCD(_a,_b,_c,_d) \
3356 do { st->guest_RAX = (ULong)(_a); \
3357 st->guest_RBX = (ULong)(_b); \
3358 st->guest_RCX = (ULong)(_c); \
3359 st->guest_RDX = (ULong)(_d); \
3360 } while (0)
3362 UInt old_eax = (UInt)st->guest_RAX;
3363 UInt old_ecx = (UInt)st->guest_RCX;
3365 switch (old_eax) {
3366 case 0x00000000:
3367 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3368 break;
3369 case 0x00000001: {
3370 // As a baseline, advertise neither F16C (ecx:29) nor RDRAND (ecx:30),
3371 // but patch in support for them as directed by the caller.
3372 UInt ecx_extra
3373 = (hasF16C ? (1U << 29) : 0) | (hasRDRAND ? (1U << 30) : 0);
3374 SET_ABCD(0x000206a7, 0x00100800, (0x1f9ae3bf | ecx_extra), 0xbfebfbff);
3375 break;
3377 case 0x00000002:
3378 SET_ABCD(0x76035a01, 0x00f0b0ff, 0x00000000, 0x00ca0000);
3379 break;
3380 case 0x00000003:
3381 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3382 break;
3383 case 0x00000004:
3384 switch (old_ecx) {
3385 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3386 0x0000003f, 0x00000000); break;
3387 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3388 0x0000003f, 0x00000000); break;
3389 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3390 0x000001ff, 0x00000000); break;
3391 case 0x00000003: SET_ABCD(0x1c03c163, 0x02c0003f,
3392 0x00001fff, 0x00000006); break;
3393 default: SET_ABCD(0x00000000, 0x00000000,
3394 0x00000000, 0x00000000); break;
3396 break;
3397 case 0x00000005:
3398 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
3399 break;
3400 case 0x00000006:
3401 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3402 break;
3403 case 0x00000007: {
3404 UInt ebx_extra = 0;
3405 if (old_ecx == 0)
3406 ebx_extra = hasRDSEED ? (1U << 18) : 0;
3407 SET_ABCD(0x00000000, 0x00000800 | ebx_extra, 0x00000000,
3408 0x00000000);
3409 break;
3411 case 0x00000008:
3412 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3413 break;
3414 case 0x00000009:
3415 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3416 break;
3417 case 0x0000000a:
3418 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3419 break;
3420 case 0x0000000b:
3421 switch (old_ecx) {
3422 case 0x00000000:
3423 SET_ABCD(0x00000001, 0x00000001,
3424 0x00000100, 0x00000000); break;
3425 case 0x00000001:
3426 SET_ABCD(0x00000004, 0x00000004,
3427 0x00000201, 0x00000000); break;
3428 default:
3429 SET_ABCD(0x00000000, 0x00000000,
3430 old_ecx, 0x00000000); break;
3432 break;
3433 case 0x0000000c:
3434 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3435 break;
3436 case 0x0000000d:
3437 switch (old_ecx) {
3438 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3439 0x00000340, 0x00000000); break;
3440 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3441 0x00000000, 0x00000000); break;
3442 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3443 0x00000000, 0x00000000); break;
3444 default: SET_ABCD(0x00000000, 0x00000000,
3445 0x00000000, 0x00000000); break;
3447 break;
3448 case 0x0000000e:
3449 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3450 break;
3451 case 0x0000000f:
3452 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3453 break;
3454 case 0x80000000:
3455 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3456 break;
3457 case 0x80000001:
3458 SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
3459 break;
3460 case 0x80000002:
3461 SET_ABCD(0x20202020, 0x20202020, 0x65746e49, 0x2952286c);
3462 break;
3463 case 0x80000003:
3464 SET_ABCD(0x726f4320, 0x4d542865, 0x35692029, 0x3033322d);
3465 break;
3466 case 0x80000004:
3467 SET_ABCD(0x50432030, 0x20402055, 0x30382e32, 0x007a4847);
3468 break;
3469 case 0x80000005:
3470 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3471 break;
3472 case 0x80000006:
3473 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3474 break;
3475 case 0x80000007:
3476 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3477 break;
3478 case 0x80000008:
3479 SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
3480 break;
3481 default:
3482 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3483 break;
3485 # undef SET_ABCD
3489 /* Claim to be the following CPU (4 x ...), which is AVX2 capable.
3491 With the following change: claim that XSaveOpt is not available, by
3492 cpuid(eax=0xD,ecx=1).eax[0] returns 0, compared to 1 on the real
3493 CPU. Consequently, programs that correctly observe these CPUID
3494 values should only try to use 3 of the 8 XSave-family instructions:
3495 XGETBV, XSAVE and XRSTOR. In particular this avoids having to
3496 implement the compacted or optimised save/restore variants.
3498 vendor_id : GenuineIntel
3499 cpu family : 6
3500 model : 60
3501 model name : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
3502 stepping : 3
3503 microcode : 0x1c
3504 cpu MHz : 919.957
3505 cache size : 8192 KB
3506 physical id : 0
3507 siblings : 4
3508 core id : 3
3509 cpu cores : 4
3510 apicid : 6
3511 initial apicid : 6
3512 fpu : yes
3513 fpu_exception : yes
3514 cpuid level : 13
3515 wp : yes
3516 flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
3517 cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht
3518 tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
3519 arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
3520 aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl
3521 vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1
3522 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave
3523 avx f16c rdrand lahf_lm abm ida arat epb pln pts dtherm
3524 tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust
3525 bmi1 avx2 smep bmi2 erms invpcid xsaveopt
3526 bugs :
3527 bogomips : 5786.68
3528 clflush size : 64
3529 cache_alignment : 64
3530 address sizes : 39 bits physical, 48 bits virtual
3531 power management:
3533 void amd64g_dirtyhelper_CPUID_avx2 ( VexGuestAMD64State* st,
3534 ULong hasF16C, ULong hasRDRAND,
3535 ULong hasRDSEED )
3537 vassert((hasF16C >> 1) == 0ULL);
3538 vassert((hasRDRAND >> 1) == 0ULL);
3539 # define SET_ABCD(_a,_b,_c,_d) \
3540 do { st->guest_RAX = (ULong)(_a); \
3541 st->guest_RBX = (ULong)(_b); \
3542 st->guest_RCX = (ULong)(_c); \
3543 st->guest_RDX = (ULong)(_d); \
3544 } while (0)
3546 UInt old_eax = (UInt)st->guest_RAX;
3547 UInt old_ecx = (UInt)st->guest_RCX;
3549 switch (old_eax) {
3550 case 0x00000000:
3551 SET_ABCD(0x0000000d, 0x756e6547, 0x6c65746e, 0x49656e69);
3552 break;
3553 case 0x00000001: {
3554 // As a baseline, advertise neither F16C (ecx:29) nor RDRAND (ecx:30),
3555 // but patch in support for them as directed by the caller.
3556 UInt ecx_extra
3557 = (hasF16C ? (1U << 29) : 0) | (hasRDRAND ? (1U << 30) : 0);
3558 SET_ABCD(0x000306c3, 0x02100800, (0x1ffafbff | ecx_extra), 0xbfebfbff);
3559 break;
3561 case 0x00000002:
3562 SET_ABCD(0x76036301, 0x00f0b6ff, 0x00000000, 0x00c10000);
3563 break;
3564 case 0x00000003:
3565 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3566 break;
3567 case 0x00000004:
3568 switch (old_ecx) {
3569 case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
3570 0x0000003f, 0x00000000); break;
3571 case 0x00000001: SET_ABCD(0x1c004122, 0x01c0003f,
3572 0x0000003f, 0x00000000); break;
3573 case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
3574 0x000001ff, 0x00000000); break;
3575 case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
3576 0x00001fff, 0x00000006); break;
3577 default: SET_ABCD(0x00000000, 0x00000000,
3578 0x00000000, 0x00000000); break;
3580 break;
3581 case 0x00000005:
3582 SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00042120);
3583 break;
3584 case 0x00000006:
3585 SET_ABCD(0x00000077, 0x00000002, 0x00000009, 0x00000000);
3586 break;
3587 case 0x00000007:
3588 switch (old_ecx) {
3589 /* Don't advertise FSGSBASE support, bit 0 in EBX. */
3591 case 0x00000000: {
3592 UInt ebx_extra = hasRDSEED ? (1U << 18) : 0;
3593 SET_ABCD(0x00000000, 0x000027aa | ebx_extra,
3594 0x00000000, 0x00000000); break;
3596 default: SET_ABCD(0x00000000, 0x00000000,
3597 0x00000000, 0x00000000); break;
3599 break;
3600 case 0x00000008:
3601 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3602 break;
3603 case 0x00000009:
3604 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3605 break;
3606 case 0x0000000a:
3607 SET_ABCD(0x07300803, 0x00000000, 0x00000000, 0x00000603);
3608 break;
3609 case 0x0000000b:
3610 switch (old_ecx) {
3611 case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
3612 0x00000100, 0x00000002); break;
3613 case 0x00000001: SET_ABCD(0x00000004, 0x00000008,
3614 0x00000201, 0x00000002); break;
3615 default: SET_ABCD(0x00000000, 0x00000000,
3616 old_ecx, 0x00000002); break;
3618 break;
3619 case 0x0000000c:
3620 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3621 break;
3622 case 0x0000000d:
3623 switch (old_ecx) {
3624 case 0x00000000: SET_ABCD(0x00000007, 0x00000340,
3625 0x00000340, 0x00000000); break;
3626 case 0x00000001: SET_ABCD(0x00000000, 0x00000000,
3627 0x00000000, 0x00000000); break;
3628 case 0x00000002: SET_ABCD(0x00000100, 0x00000240,
3629 0x00000000, 0x00000000); break;
3630 default: SET_ABCD(0x00000000, 0x00000000,
3631 0x00000000, 0x00000000); break;
3633 break;
3634 case 0x80000000:
3635 SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
3636 break;
3637 case 0x80000001:
3638 SET_ABCD(0x00000000, 0x00000000, 0x00000021, 0x2c100800);
3639 break;
3640 case 0x80000002:
3641 SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
3642 break;
3643 case 0x80000003:
3644 SET_ABCD(0x37692029, 0x3139342d, 0x20514d30, 0x20555043);
3645 break;
3646 case 0x80000004:
3647 SET_ABCD(0x2e322040, 0x48473039, 0x0000007a, 0x00000000);
3648 break;
3649 case 0x80000005:
3650 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
3651 break;
3652 case 0x80000006:
3653 SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
3654 break;
3655 case 0x80000007:
3656 SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
3657 break;
3658 case 0x80000008:
3659 SET_ABCD(0x00003027, 0x00000000, 0x00000000, 0x00000000);
3660 break;
3661 default:
3662 SET_ABCD(0x00000007, 0x00000340, 0x00000340, 0x00000000);
3663 break;
3665 # undef SET_ABCD
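/* For reference, the constant leaves above decode as follows: leaf 0x0
   returns "GenuineIntel" in EBX:EDX:ECX (0x756e6547 = "Genu",
   0x49656e69 = "ineI", 0x6c65746e = "ntel"), and leaves 0x80000002..4
   spell out the brand string "Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz",
   so the table apparently models a Haswell-class mobile i7. */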
3669 /*---------------------------------------------------------------*/
3670 /*--- Misc integer helpers, including rotates and crypto. ---*/
3671 /*---------------------------------------------------------------*/
3673 ULong amd64g_calculate_RCR ( ULong arg,
3674 ULong rot_amt,
3675 ULong rflags_in,
3676 Long szIN )
3678 Bool wantRflags = toBool(szIN < 0);
3679 ULong sz = wantRflags ? (-szIN) : szIN;
3680 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3681 ULong cf=0, of=0, tempcf;
3683 switch (sz) {
3684 case 8:
3685 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3686 of = ((arg >> 63) ^ cf) & 1;
3687 while (tempCOUNT > 0) {
3688 tempcf = arg & 1;
3689 arg = (arg >> 1) | (cf << 63);
3690 cf = tempcf;
3691 tempCOUNT--;
3693 break;
3694 case 4:
3695 while (tempCOUNT >= 33) tempCOUNT -= 33;
3696 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3697 of = ((arg >> 31) ^ cf) & 1;
3698 while (tempCOUNT > 0) {
3699 tempcf = arg & 1;
3700 arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
3701 cf = tempcf;
3702 tempCOUNT--;
3704 break;
3705 case 2:
3706 while (tempCOUNT >= 17) tempCOUNT -= 17;
3707 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3708 of = ((arg >> 15) ^ cf) & 1;
3709 while (tempCOUNT > 0) {
3710 tempcf = arg & 1;
3711 arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
3712 cf = tempcf;
3713 tempCOUNT--;
3715 break;
3716 case 1:
3717 while (tempCOUNT >= 9) tempCOUNT -= 9;
3718 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3719 of = ((arg >> 7) ^ cf) & 1;
3720 while (tempCOUNT > 0) {
3721 tempcf = arg & 1;
3722 arg = ((arg >> 1) & 0x7FULL) | (cf << 7);
3723 cf = tempcf;
3724 tempCOUNT--;
3726 break;
3727 default:
3728 vpanic("calculate_RCR(amd64g): invalid size");
3731 cf &= 1;
3732 of &= 1;
3733 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3734 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3736 /* caller can ask to have back either the resulting flags or
3737 resulting value, but not both */
3738 return wantRflags ? rflags_in : arg;
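/* Worked example for the 1-byte case: RCR of 0x01 by 1 with CF=0 moves
   the old CF into bit 7 and the old bit 0 into CF, giving arg = 0x00,
   CF = 1 and OF = (old bit 7) ^ (old CF) = 0. */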
3741 ULong amd64g_calculate_RCL ( ULong arg,
3742 ULong rot_amt,
3743 ULong rflags_in,
3744 Long szIN )
3746 Bool wantRflags = toBool(szIN < 0);
3747 ULong sz = wantRflags ? (-szIN) : szIN;
3748 ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
3749 ULong cf=0, of=0, tempcf;
3751 switch (sz) {
3752 case 8:
3753 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3754 while (tempCOUNT > 0) {
3755 tempcf = (arg >> 63) & 1;
3756 arg = (arg << 1) | (cf & 1);
3757 cf = tempcf;
3758 tempCOUNT--;
3760 of = ((arg >> 63) ^ cf) & 1;
3761 break;
3762 case 4:
3763 while (tempCOUNT >= 33) tempCOUNT -= 33;
3764 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3765 while (tempCOUNT > 0) {
3766 tempcf = (arg >> 31) & 1;
3767 arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
3768 cf = tempcf;
3769 tempCOUNT--;
3771 of = ((arg >> 31) ^ cf) & 1;
3772 break;
3773 case 2:
3774 while (tempCOUNT >= 17) tempCOUNT -= 17;
3775 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3776 while (tempCOUNT > 0) {
3777 tempcf = (arg >> 15) & 1;
3778 arg = 0xFFFFULL & ((arg << 1) | (cf & 1));
3779 cf = tempcf;
3780 tempCOUNT--;
3782 of = ((arg >> 15) ^ cf) & 1;
3783 break;
3784 case 1:
3785 while (tempCOUNT >= 9) tempCOUNT -= 9;
3786 cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
3787 while (tempCOUNT > 0) {
3788 tempcf = (arg >> 7) & 1;
3789 arg = 0xFFULL & ((arg << 1) | (cf & 1));
3790 cf = tempcf;
3791 tempCOUNT--;
3793 of = ((arg >> 7) ^ cf) & 1;
3794 break;
3795 default:
3796 vpanic("calculate_RCL(amd64g): invalid size");
3799 cf &= 1;
3800 of &= 1;
3801 rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
3802 rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
3804 return wantRflags ? rflags_in : arg;
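/* As with RCR, a negative szIN requests the updated rflags rather than
   the rotated value, so the front end presumably calls the helper twice
   per instruction.  Worked 1-byte example: RCL of 0x80 by 1 with CF=0
   gives arg = 0x00, CF = 1 and OF = (new bit 7) ^ CF = 1. */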
3807 /* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
3808 * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
3810 ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
3812 ULong hi, lo, tmp, A[16];
3814 A[0] = 0; A[1] = a;
3815 A[2] = A[1] << 1; A[3] = A[2] ^ a;
3816 A[4] = A[2] << 1; A[5] = A[4] ^ a;
3817 A[6] = A[3] << 1; A[7] = A[6] ^ a;
3818 A[8] = A[4] << 1; A[9] = A[8] ^ a;
3819 A[10] = A[5] << 1; A[11] = A[10] ^ a;
3820 A[12] = A[6] << 1; A[13] = A[12] ^ a;
3821 A[14] = A[7] << 1; A[15] = A[14] ^ a;
3823 lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
3824 hi = lo >> 56;
3825 lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
3826 hi = (hi << 8) | (lo >> 56);
3827 lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
3828 hi = (hi << 8) | (lo >> 56);
3829 lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
3830 hi = (hi << 8) | (lo >> 56);
3831 lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
3832 hi = (hi << 8) | (lo >> 56);
3833 lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
3834 hi = (hi << 8) | (lo >> 56);
3835 lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
3836 hi = (hi << 8) | (lo >> 56);
3837 lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
3839 ULong m0 = -1;
3840 m0 /= 255;
3841 tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
3842 tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
3843 tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
3844 tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
3845 tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
3846 tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
3847 tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
3849 return which ? hi : lo;
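/* The code above is a 4-bits-at-a-time carryless (GF(2)[x]) multiply; the
   trailing mask-and-xor lines repair the high half for the top bits of 'a'
   that the 64-bit table entries cannot hold.  Small sanity check: a
   carryless 3 * 3 is (x+1)*(x+1) = x^2 + 1, so
   amd64g_calculate_pclmul(3, 3, 0) should yield 5, with a zero high half. */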
3853 /* CALLED FROM GENERATED CODE */
3854 /* DIRTY HELPER (non-referentially-transparent) */
3855 /* Horrible hack. On non-amd64 platforms, return 1. */
3856 ULong amd64g_dirtyhelper_RDTSC ( void )
3858 # if defined(__x86_64__)
3859 UInt eax, edx;
3860 __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
3861 return (((ULong)edx) << 32) | ((ULong)eax);
3862 # else
3863 return 1ULL;
3864 # endif
3867 /* CALLED FROM GENERATED CODE */
3868 /* DIRTY HELPER (non-referentially-transparent) */
3869 /* Horrible hack. On non-amd64 platforms, return 1. */
3870 /* This uses a different calling convention from _RDTSC just above
3871 only because of the difficulty of returning 96 bits from a C
3872 function -- on amd64, RDTSC returns only 64 bits and so is simple
3873 by comparison. */
3874 void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* st )
3876 # if defined(__x86_64__)
3877 UInt eax, ecx, edx;
3878 __asm__ __volatile__("rdtscp" : "=a" (eax), "=d" (edx), "=c" (ecx));
3879 st->guest_RAX = (ULong)eax;
3880 st->guest_RCX = (ULong)ecx;
3881 st->guest_RDX = (ULong)edx;
3882 # else
3883 /* Do nothing. */
3884 # endif
3887 /* CALLED FROM GENERATED CODE */
3888 /* DIRTY HELPER (non-referentially-transparent) */
3889 /* Horrible hack. On non-amd64 platforms, return 0. */
3890 ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
3892 # if defined(__x86_64__)
3893 ULong r = 0;
3894 portno &= 0xFFFF;
3895 switch (sz) {
3896 case 4:
3897 __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
3898 : "=a" (r) : "Nd" (portno));
3899 break;
3900 case 2:
3901 __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
3902 : "=a" (r) : "Nd" (portno));
3903 break;
3904 case 1:
3905 __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
3906 : "=a" (r) : "Nd" (portno));
3907 break;
3908 default:
3909 break; /* note: no 64-bit version of insn exists */
3911 return r;
3912 # else
3913 return 0;
3914 # endif
3918 /* CALLED FROM GENERATED CODE */
3919 /* DIRTY HELPER (non-referentially-transparent) */
3920 /* Horrible hack. On non-amd64 platforms, do nothing. */
3921 void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
3923 # if defined(__x86_64__)
3924 portno &= 0xFFFF;
3925 switch (sz) {
3926 case 4:
3927 __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
3928 : : "a" (data), "Nd" (portno));
3929 break;
3930 case 2:
3931 __asm__ __volatile__("outw %w0, %w1"
3932 : : "a" (data), "Nd" (portno));
3933 break;
3934 case 1:
3935 __asm__ __volatile__("outb %b0, %w1"
3936 : : "a" (data), "Nd" (portno));
3937 break;
3938 default:
3939 break; /* note: no 64-bit version of insn exists */
3941 # else
3942 /* do nothing */
3943 # endif
3946 /* CALLED FROM GENERATED CODE */
3947 /* DIRTY HELPER (non-referentially-transparent) */
3948 /* Horrible hack. On non-amd64 platforms, fake a zeroed-out result. */
3949 /* op = 0: call the native SGDT instruction.
3950 op = 1: call the native SIDT instruction.
3952 void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
3953 # if defined(__x86_64__)
3954 switch (op) {
3955 case 0:
3956 __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
3957 break;
3958 case 1:
3959 __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
3960 break;
3961 default:
3962 vpanic("amd64g_dirtyhelper_SxDT");
3964 # else
3965 /* Fake a result: zero out the 10-byte descriptor the caller expects. */
3966 UChar* p = (UChar*)address;
3967 p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
3968 p[6] = p[7] = p[8] = p[9] = 0;
3969 # endif
3972 /* CALLED FROM GENERATED CODE */
3973 /* DIRTY HELPER (non-referentially-transparent) */
3974 /* Horrible hack. On amd64 targets, get a 32 bit random number using
3975 RDRAND and return it together with the associated rflags.C value; on
3976 non-amd64 platforms, return a fixed fallback value. */
3977 ULong amd64g_dirtyhelper_RDRAND ( void ) {
3978 # if defined(__x86_64__)
3979 ULong res = 0;
3980 ULong cflag = 0;
3981 __asm__ __volatile__(
3982 "movq $0, %%r11 ; "
3983 "movq $0, %%r12 ; "
3984 "rdrand %%r11d ; "
3985 "setc %%r12b ; "
3986 "movq %%r11, %0 ; "
3987 "movq %%r12, %1"
3988 : "=r"(res), "=r"(cflag) : : "r11", "r12"
3990 res &= 0xFFFFFFFFULL;
3991 cflag &= 1ULL;
3992 return (cflag << 32) | res;
3993 # else
3994 /* There's nothing we can sensibly do. Return a value denoting
3995 "I succeeded, and the random bits are all zero" :-/ */
3996 return 1ULL << 32;
3997 # endif
4000 ULong amd64g_dirtyhelper_RDSEED ( void ) {
4001 # if defined(__x86_64__)
4002 ULong res = 0;
4003 ULong cflag = 0;
4004 __asm__ __volatile__(
4005 "movq $0, %%r11 ; "
4006 "movq $0, %%r12 ; "
4007 "rdseed %%r11d ; "
4008 "setc %%r12b ; "
4009 "movq %%r11, %0 ; "
4010 "movq %%r12, %1"
4011 : "=r"(res), "=r"(cflag) : : "r11", "r12"
4013 res &= 0xFFFFFFFFULL;
4014 cflag &= 1ULL;
4015 return (cflag << 32) | res;
4016 # else
4017 /* There's nothing we can sensibly do. Return a value denoting
4018 "I succeeded, and the random bits are all zero" :-/ */
4019 return 1ULL << 32;
4020 # endif
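/* For both the RDRAND and RDSEED helpers the return value packs the 32-bit
   random result in bits 31:0 and the instruction's carry ("success") flag
   in bit 32, so a caller can split it as, for example,
     UInt  value = (UInt)(res & 0xFFFFFFFFULL);
     ULong carry = (res >> 32) & 1;
*/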
4023 /*---------------------------------------------------------------*/
4024 /*--- Helpers for MMX/SSE/SSE2. ---*/
4025 /*---------------------------------------------------------------*/
4027 static inline UChar abdU8 ( UChar xx, UChar yy ) {
4028 return toUChar(xx>yy ? xx-yy : yy-xx);
4031 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
4032 return (((ULong)w1) << 32) | ((ULong)w0);
4035 static inline UShort sel16x4_3 ( ULong w64 ) {
4036 UInt hi32 = toUInt(w64 >> 32);
4037 return toUShort(hi32 >> 16);
4039 static inline UShort sel16x4_2 ( ULong w64 ) {
4040 UInt hi32 = toUInt(w64 >> 32);
4041 return toUShort(hi32);
4043 static inline UShort sel16x4_1 ( ULong w64 ) {
4044 UInt lo32 = toUInt(w64);
4045 return toUShort(lo32 >> 16);
4047 static inline UShort sel16x4_0 ( ULong w64 ) {
4048 UInt lo32 = toUInt(w64);
4049 return toUShort(lo32);
4052 static inline UChar sel8x8_7 ( ULong w64 ) {
4053 UInt hi32 = toUInt(w64 >> 32);
4054 return toUChar(hi32 >> 24);
4056 static inline UChar sel8x8_6 ( ULong w64 ) {
4057 UInt hi32 = toUInt(w64 >> 32);
4058 return toUChar(hi32 >> 16);
4060 static inline UChar sel8x8_5 ( ULong w64 ) {
4061 UInt hi32 = toUInt(w64 >> 32);
4062 return toUChar(hi32 >> 8);
4064 static inline UChar sel8x8_4 ( ULong w64 ) {
4065 UInt hi32 = toUInt(w64 >> 32);
4066 return toUChar(hi32 >> 0);
4068 static inline UChar sel8x8_3 ( ULong w64 ) {
4069 UInt lo32 = toUInt(w64);
4070 return toUChar(lo32 >> 24);
4072 static inline UChar sel8x8_2 ( ULong w64 ) {
4073 UInt lo32 = toUInt(w64);
4074 return toUChar(lo32 >> 16);
4076 static inline UChar sel8x8_1 ( ULong w64 ) {
4077 UInt lo32 = toUInt(w64);
4078 return toUChar(lo32 >> 8);
4080 static inline UChar sel8x8_0 ( ULong w64 ) {
4081 UInt lo32 = toUInt(w64);
4082 return toUChar(lo32 >> 0);
4085 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4086 ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
4088 return
4089 mk32x2(
4090 (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
4091 + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
4092 (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
4093 + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
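/* Example: with xx holding the 16-bit lanes {1,2,3,4} (low to high) and yy
   holding {10,20,30,40}, the result is mk32x2(3*30 + 4*40, 1*10 + 2*20)
   = mk32x2(250, 50). */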
4097 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4098 ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
4100 UInt t = 0;
4101 t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
4102 t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
4103 t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
4104 t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
4105 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
4106 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
4107 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
4108 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
4109 t &= 0xFFFF;
4110 return (ULong)t;
4113 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4114 ULong amd64g_calculate_sse_phminposuw ( ULong sLo, ULong sHi )
4116 UShort t, min;
4117 UInt idx;
4118 t = sel16x4_0(sLo); if (True) { min = t; idx = 0; }
4119 t = sel16x4_1(sLo); if (t < min) { min = t; idx = 1; }
4120 t = sel16x4_2(sLo); if (t < min) { min = t; idx = 2; }
4121 t = sel16x4_3(sLo); if (t < min) { min = t; idx = 3; }
4122 t = sel16x4_0(sHi); if (t < min) { min = t; idx = 4; }
4123 t = sel16x4_1(sHi); if (t < min) { min = t; idx = 5; }
4124 t = sel16x4_2(sHi); if (t < min) { min = t; idx = 6; }
4125 t = sel16x4_3(sHi); if (t < min) { min = t; idx = 7; }
4126 return ((ULong)(idx << 16)) | ((ULong)min);
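/* The result packs the minimum unsigned 16-bit value in bits 15:0 and its
   lane index in bits 18:16, as PHMINPOSUW requires.  Example: if the eight
   lanes are {5,3,9,7,4,3,8,1} (low to high), the helper returns
   (7 << 16) | 1 = 0x70001. */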
4129 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4130 ULong amd64g_calc_crc32b ( ULong crcIn, ULong b )
4132 UInt i;
4133 ULong crc = (b & 0xFFULL) ^ crcIn;
4134 for (i = 0; i < 8; i++)
4135 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
4136 return crc;
4139 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4140 ULong amd64g_calc_crc32w ( ULong crcIn, ULong w )
4142 UInt i;
4143 ULong crc = (w & 0xFFFFULL) ^ crcIn;
4144 for (i = 0; i < 16; i++)
4145 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
4146 return crc;
4149 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4150 ULong amd64g_calc_crc32l ( ULong crcIn, ULong l )
4152 UInt i;
4153 ULong crc = (l & 0xFFFFFFFFULL) ^ crcIn;
4154 for (i = 0; i < 32; i++)
4155 crc = (crc >> 1) ^ ((crc & 1) ? 0x82f63b78ULL : 0);
4156 return crc;
4159 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4160 ULong amd64g_calc_crc32q ( ULong crcIn, ULong q )
4162 ULong crc = amd64g_calc_crc32l(crcIn, q);
4163 return amd64g_calc_crc32l(crc, q >> 32);
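/* All four helpers use the bit-reflected CRC-32C (Castagnoli) polynomial
   0x82F63B78 (the reversed form of 0x1EDC6F41), one bit per iteration, as
   the SSE4.2 CRC32 instruction specifies; the 64-bit variant simply chains
   the two 32-bit halves. */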
4167 /* .. helper for next fn .. */
4168 static inline ULong sad_8x4 ( ULong xx, ULong yy )
4170 UInt t = 0;
4171 t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
4172 t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
4173 t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
4174 t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
4175 return (ULong)t;
4178 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4179 ULong amd64g_calc_mpsadbw ( ULong sHi, ULong sLo,
4180 ULong dHi, ULong dLo,
4181 ULong imm_and_return_control_bit )
4183 UInt imm8 = imm_and_return_control_bit & 7;
4184 Bool calcHi = (imm_and_return_control_bit >> 7) & 1;
4185 UInt srcOffsL = imm8 & 3; /* src offs in 32-bit (L) chunks */
4186 UInt dstOffsL = (imm8 >> 2) & 1; /* dst offs in ditto chunks */
4187 /* For src we only need 32 bits, so get them into the
4188 lower half of a 64 bit word. */
4189 ULong src = ((srcOffsL & 2) ? sHi : sLo) >> (32 * (srcOffsL & 1));
4190 /* For dst we need to get hold of 56 bits (7 bytes) from a total of
4191 11 bytes. If calculating the low part of the result, need bytes
4192 dstOffsL * 4 + (0 .. 6); if calculating the high part,
4193 dstOffsL * 4 + (4 .. 10). */
4194 ULong dst;
4195 /* dstOffL = 0, Lo -> 0 .. 6
4196 dstOffL = 1, Lo -> 4 .. 10
4197 dstOffL = 0, Hi -> 4 .. 10
4198 dstOffL = 1, Hi -> 8 .. 14
4200 if (calcHi && dstOffsL) {
4201 /* 8 .. 14 */
4202 dst = dHi & 0x00FFFFFFFFFFFFFFULL;
4204 else if (!calcHi && !dstOffsL) {
4205 /* 0 .. 6 */
4206 dst = dLo & 0x00FFFFFFFFFFFFFFULL;
4208 else {
4209 /* 4 .. 10 */
4210 dst = (dLo >> 32) | ((dHi & 0x00FFFFFFULL) << 32);
4212 ULong r0 = sad_8x4( dst >> 0, src );
4213 ULong r1 = sad_8x4( dst >> 8, src );
4214 ULong r2 = sad_8x4( dst >> 16, src );
4215 ULong r3 = sad_8x4( dst >> 24, src );
4216 ULong res = (r3 << 48) | (r2 << 32) | (r1 << 16) | r0;
4217 return res;
4220 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4221 ULong amd64g_calculate_pext ( ULong src_masked, ULong mask )
4223 ULong dst = 0;
4224 ULong src_bit;
4225 ULong dst_bit = 1;
4226 for (src_bit = 1; src_bit; src_bit <<= 1) {
4227 if (mask & src_bit) {
4228 if (src_masked & src_bit) dst |= dst_bit;
4229 dst_bit <<= 1;
4232 return dst;
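/* Example: amd64g_calculate_pext(0xA0, 0xF0) gathers bits 7:4 of the
   (pre-masked) source into the low bits of the result, giving 0xA. */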
4235 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
4236 ULong amd64g_calculate_pdep ( ULong src, ULong mask )
4238 ULong dst = 0;
4239 ULong dst_bit;
4240 ULong src_bit = 1;
4241 for (dst_bit = 1; dst_bit; dst_bit <<= 1) {
4242 if (mask & dst_bit) {
4243 if (src & src_bit) dst |= dst_bit;
4244 src_bit <<= 1;
4247 return dst;
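/* Example: amd64g_calculate_pdep(0xA, 0xF0) scatters the low four source
   bits into the mask positions 7:4, giving 0xA0. */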
4250 /*---------------------------------------------------------------*/
4251 /*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
4252 /*---------------------------------------------------------------*/
4254 static UInt zmask_from_V128 ( V128* arg )
4256 UInt i, res = 0;
4257 for (i = 0; i < 16; i++) {
4258 res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
4260 return res;
4263 static UInt zmask_from_V128_wide ( V128* arg )
4265 UInt i, res = 0;
4266 for (i = 0; i < 8; i++) {
4267 res |= ((arg->w16[i] == 0) ? 1 : 0) << i;
4269 return res;
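/* Each zmask has bit i set iff element i of the vector is zero.  These
   provide the validity information for the implicit-length (ISTRx) forms;
   the explicit-length (ESTRx) forms instead derive equivalent masks from
   the edx/eax lengths below. */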
4272 /* Helps with PCMP{I,E}STR{I,M}.
4274 CALLED FROM GENERATED CODE: DIRTY HELPER(s). (Not really dirty --
4275 it could be a clean helper, except that we can't pass 2 x V128 by
4276 value to a clean helper, nor have one returned.)
4277 Reads guest state, writes to guest state for the xSTRM cases, no
4278 accesses of memory, is a pure function.
4280 opc4_and_imm contains ((4th byte of opcode) << 8) | the imm8 byte, so
4281 the callee knows which I/E and I/M variant it is dealing with and
4282 what the specific operation is. 4th byte of opcode is in the range
4283 0x60 to 0x63:
4284 istri 66 0F 3A 63
4285 istrm 66 0F 3A 62
4286 estri 66 0F 3A 61
4287 estrm 66 0F 3A 60
4289 gstOffL and gstOffR are the guest state offsets for the two XMM
4290 register inputs. We never have to deal with the memory case since
4291 that is handled by pre-loading the relevant value into the fake
4292 XMM16 register.
4294 For ESTRx variants, edxIN and eaxIN hold the values of those two
4295 registers.
4297 In all cases, the bottom 16 bits of the result contain the new
4298 OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
4299 result hold the new %ecx value. For xSTRM variants, the helper
4300 writes the result directly to the guest XMM0.
4302 Declarable side effects: in all cases, reads guest state at
4303 [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
4304 guest_XMM0.
4306 Is expected to be called only with opc4_and_imm combinations which
4307 have actually been validated, and will assert otherwise. The front
4308 end should ensure we're only called with verified values.
4310 ULong amd64g_dirtyhelper_PCMPxSTRx (
4311 VexGuestAMD64State* gst,
4312 HWord opc4_and_imm,
4313 HWord gstOffL, HWord gstOffR,
4314 HWord edxIN, HWord eaxIN
4317 HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
4318 HWord imm8 = opc4_and_imm & 0xFF;
4319 HWord isISTRx = opc4 & 2;
4320 HWord isxSTRM = (opc4 & 1) ^ 1;
4321 vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
4322 HWord wide = (imm8 & 1);
4324 // where the args are
4325 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4326 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4328 /* Create the arg validity masks, either from the vectors
4329 themselves or from the supplied edx/eax values. */
4330 // FIXME: this is only right for the 8-bit data cases.
4331 // At least that is asserted above.
4332 UInt zmaskL, zmaskR;
4334 // temp spot for the resulting flags and vector.
4335 V128 resV;
4336 UInt resOSZACP;
4338 // for checking whether case was handled
4339 Bool ok = False;
4341 if (wide) {
4342 if (isISTRx) {
4343 zmaskL = zmask_from_V128_wide(argL);
4344 zmaskR = zmask_from_V128_wide(argR);
4345 } else {
4346 Int tmp;
4347 tmp = edxIN & 0xFFFFFFFF;
4348 if (tmp < -8) tmp = -8;
4349 if (tmp > 8) tmp = 8;
4350 if (tmp < 0) tmp = -tmp;
4351 vassert(tmp >= 0 && tmp <= 8);
4352 zmaskL = (1 << tmp) & 0xFF;
4353 tmp = eaxIN & 0xFFFFFFFF;
4354 if (tmp < -8) tmp = -8;
4355 if (tmp > 8) tmp = 8;
4356 if (tmp < 0) tmp = -tmp;
4357 vassert(tmp >= 0 && tmp <= 8);
4358 zmaskR = (1 << tmp) & 0xFF;
4360 // do the math
4361 ok = compute_PCMPxSTRx_wide (
4362 &resV, &resOSZACP, argL, argR,
4363 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4365 } else {
4366 if (isISTRx) {
4367 zmaskL = zmask_from_V128(argL);
4368 zmaskR = zmask_from_V128(argR);
4369 } else {
4370 Int tmp;
4371 tmp = edxIN & 0xFFFFFFFF;
4372 if (tmp < -16) tmp = -16;
4373 if (tmp > 16) tmp = 16;
4374 if (tmp < 0) tmp = -tmp;
4375 vassert(tmp >= 0 && tmp <= 16);
4376 zmaskL = (1 << tmp) & 0xFFFF;
4377 tmp = eaxIN & 0xFFFFFFFF;
4378 if (tmp < -16) tmp = -16;
4379 if (tmp > 16) tmp = 16;
4380 if (tmp < 0) tmp = -tmp;
4381 vassert(tmp >= 0 && tmp <= 16);
4382 zmaskR = (1 << tmp) & 0xFFFF;
4384 // do the math
4385 ok = compute_PCMPxSTRx (
4386 &resV, &resOSZACP, argL, argR,
4387 zmaskL, zmaskR, imm8, (Bool)isxSTRM
4391 // front end shouldn't pass us any imm8 variants we can't
4392 // handle. Hence:
4393 vassert(ok);
4395 // So, finally we need to get the results back to the caller.
4396 // In all cases, the new OSZACP value is the lowest 16 of
4397 // the return value.
4398 if (isxSTRM) {
4399 gst->guest_YMM0[0] = resV.w32[0];
4400 gst->guest_YMM0[1] = resV.w32[1];
4401 gst->guest_YMM0[2] = resV.w32[2];
4402 gst->guest_YMM0[3] = resV.w32[3];
4403 return resOSZACP & 0x8D5;
4404 } else {
4405 UInt newECX = resV.w32[0] & 0xFFFF;
4406 return (newECX << 16) | (resOSZACP & 0x8D5);
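/* 0x8D5 selects exactly the OSZACP bits at their rflags positions:
   (1<<11)|(1<<7)|(1<<6)|(1<<4)|(1<<2)|(1<<0) = 0x8D5 for O,S,Z,A,P,C. */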
4410 /*---------------------------------------------------------------*/
4411 /*--- AES primitives and helpers ---*/
4412 /*---------------------------------------------------------------*/
4413 /* a 16 x 16 matrix */
4414 static const UChar sbox[256] = { // row nr
4415 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, // 1
4416 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
4417 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, // 2
4418 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
4419 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, // 3
4420 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
4421 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, // 4
4422 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
4423 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, // 5
4424 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4425 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, // 6
4426 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
4427 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, // 7
4428 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
4429 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, // 8
4430 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
4431 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, // 9
4432 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
4433 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, //10
4434 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4435 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, //11
4436 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
4437 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, //12
4438 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
4439 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, //13
4440 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
4441 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, //14
4442 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
4443 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, //15
4444 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4445 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, //16
4446 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
4448 static void SubBytes (V128* v)
4450 V128 r;
4451 UInt i;
4452 for (i = 0; i < 16; i++)
4453 r.w8[i] = sbox[v->w8[i]];
4454 *v = r;
4457 /* a 16 x 16 matrix */
4458 static const UChar invsbox[256] = { // row nr
4459 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, // 1
4460 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
4461 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, // 2
4462 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
4463 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, // 3
4464 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
4465 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, // 4
4466 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
4467 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, // 5
4468 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
4469 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, // 6
4470 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
4471 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, // 7
4472 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
4473 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, // 8
4474 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
4475 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, // 9
4476 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
4477 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, //10
4478 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
4479 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, //11
4480 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
4481 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, //12
4482 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
4483 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, //13
4484 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
4485 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, //14
4486 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
4487 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, //15
4488 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
4489 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, //16
4490 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
4492 static void InvSubBytes (V128* v)
4494 V128 r;
4495 UInt i;
4496 for (i = 0; i < 16; i++)
4497 r.w8[i] = invsbox[v->w8[i]];
4498 *v = r;
4501 static const UChar ShiftRows_op[16] =
4502 {11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0};
4503 static void ShiftRows (V128* v)
4505 V128 r;
4506 UInt i;
4507 for (i = 0; i < 16; i++)
4508 r.w8[i] = v->w8[ShiftRows_op[15-i]];
4509 *v = r;
4512 static const UChar InvShiftRows_op[16] =
4513 {3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0};
4514 static void InvShiftRows (V128* v)
4516 V128 r;
4517 UInt i;
4518 for (i = 0; i < 16; i++)
4519 r.w8[i] = v->w8[InvShiftRows_op[15-i]];
4520 *v = r;
4523 /* Multiplication of the finite field elements of AES.
4524 See "A Specification for The AES Algorithm Rijndael
4525 (by Joan Daemen & Vincent Rijmen)"
4526 Dr. Brian Gladman, v3.1, 3rd March 2001. */
4527 /* N values such that (hex) xy = 0x03^N.
4528 N is undefined for xy = 0x00; we store 0xff in that slot. */
4529 /* a 16 x 16 matrix */
4530 static const UChar Nxy[256] = { // row nr
4531 0xff, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, // 1
4532 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
4533 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, // 2
4534 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
4535 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, // 3
4536 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
4537 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, // 4
4538 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
4539 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, // 5
4540 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
4541 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, // 6
4542 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
4543 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, // 7
4544 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
4545 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, // 8
4546 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
4547 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, // 9
4548 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
4549 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, //10
4550 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
4551 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, //11
4552 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
4553 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, //12
4554 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
4555 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, //13
4556 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
4557 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, //14
4558 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
4559 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, //15
4560 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
4561 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, //16
4562 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07
4565 /* E values so that E = 0x03^xy. */
4566 static const UChar Exy[256] = { // row nr
4567 0x01, 0x03, 0x05, 0x0f, 0x11, 0x33, 0x55, 0xff, // 1
4568 0x1a, 0x2e, 0x72, 0x96, 0xa1, 0xf8, 0x13, 0x35,
4569 0x5f, 0xe1, 0x38, 0x48, 0xd8, 0x73, 0x95, 0xa4, // 2
4570 0xf7, 0x02, 0x06, 0x0a, 0x1e, 0x22, 0x66, 0xaa,
4571 0xe5, 0x34, 0x5c, 0xe4, 0x37, 0x59, 0xeb, 0x26, // 3
4572 0x6a, 0xbe, 0xd9, 0x70, 0x90, 0xab, 0xe6, 0x31,
4573 0x53, 0xf5, 0x04, 0x0c, 0x14, 0x3c, 0x44, 0xcc, // 4
4574 0x4f, 0xd1, 0x68, 0xb8, 0xd3, 0x6e, 0xb2, 0xcd,
4575 0x4c, 0xd4, 0x67, 0xa9, 0xe0, 0x3b, 0x4d, 0xd7, // 5
4576 0x62, 0xa6, 0xf1, 0x08, 0x18, 0x28, 0x78, 0x88,
4577 0x83, 0x9e, 0xb9, 0xd0, 0x6b, 0xbd, 0xdc, 0x7f, // 6
4578 0x81, 0x98, 0xb3, 0xce, 0x49, 0xdb, 0x76, 0x9a,
4579 0xb5, 0xc4, 0x57, 0xf9, 0x10, 0x30, 0x50, 0xf0, // 7
4580 0x0b, 0x1d, 0x27, 0x69, 0xbb, 0xd6, 0x61, 0xa3,
4581 0xfe, 0x19, 0x2b, 0x7d, 0x87, 0x92, 0xad, 0xec, // 8
4582 0x2f, 0x71, 0x93, 0xae, 0xe9, 0x20, 0x60, 0xa0,
4583 0xfb, 0x16, 0x3a, 0x4e, 0xd2, 0x6d, 0xb7, 0xc2, // 9
4584 0x5d, 0xe7, 0x32, 0x56, 0xfa, 0x15, 0x3f, 0x41,
4585 0xc3, 0x5e, 0xe2, 0x3d, 0x47, 0xc9, 0x40, 0xc0, //10
4586 0x5b, 0xed, 0x2c, 0x74, 0x9c, 0xbf, 0xda, 0x75,
4587 0x9f, 0xba, 0xd5, 0x64, 0xac, 0xef, 0x2a, 0x7e, //11
4588 0x82, 0x9d, 0xbc, 0xdf, 0x7a, 0x8e, 0x89, 0x80,
4589 0x9b, 0xb6, 0xc1, 0x58, 0xe8, 0x23, 0x65, 0xaf, //12
4590 0xea, 0x25, 0x6f, 0xb1, 0xc8, 0x43, 0xc5, 0x54,
4591 0xfc, 0x1f, 0x21, 0x63, 0xa5, 0xf4, 0x07, 0x09, //13
4592 0x1b, 0x2d, 0x77, 0x99, 0xb0, 0xcb, 0x46, 0xca,
4593 0x45, 0xcf, 0x4a, 0xde, 0x79, 0x8b, 0x86, 0x91, //14
4594 0xa8, 0xe3, 0x3e, 0x42, 0xc6, 0x51, 0xf3, 0x0e,
4595 0x12, 0x36, 0x5a, 0xee, 0x29, 0x7b, 0x8d, 0x8c, //15
4596 0x8f, 0x8a, 0x85, 0x94, 0xa7, 0xf2, 0x0d, 0x17,
4597 0x39, 0x4b, 0xdd, 0x7c, 0x84, 0x97, 0xa2, 0xfd, //16
4598 0x1c, 0x24, 0x6c, 0xb4, 0xc7, 0x52, 0xf6, 0x01};
4600 static inline UChar ff_mul(UChar u1, UChar u2)
4602 if ((u1 > 0) && (u2 > 0)) {
4603 UInt ui = Nxy[u1] + Nxy[u2];
4604 if (ui >= 255)
4605 ui = ui - 255;
4606 return Exy[ui];
4607 } else {
4608 return 0;
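/* ff_mul multiplies via the log/antilog tables above.  For instance,
   ff_mul(0x02, 0x03) looks up Nxy[0x02] = 0x19 and Nxy[0x03] = 0x01, sums
   them to 0x1a, and returns Exy[0x1a] = 0x06, which is indeed 0x02 * 0x03
   in the AES field. */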
4612 static void MixColumns (V128* v)
4614 V128 r;
4615 Int j;
4616 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4617 for (j = 0; j < 4; j++) {
4618 P(&r,j,0) = ff_mul(0x02, P(v,j,0)) ^ ff_mul(0x03, P(v,j,1))
4619 ^ P(v,j,2) ^ P(v,j,3);
4620 P(&r,j,1) = P(v,j,0) ^ ff_mul( 0x02, P(v,j,1) )
4621 ^ ff_mul(0x03, P(v,j,2) ) ^ P(v,j,3);
4622 P(&r,j,2) = P(v,j,0) ^ P(v,j,1) ^ ff_mul( 0x02, P(v,j,2) )
4623 ^ ff_mul(0x03, P(v,j,3) );
4624 P(&r,j,3) = ff_mul(0x03, P(v,j,0) ) ^ P(v,j,1) ^ P(v,j,2)
4625 ^ ff_mul( 0x02, P(v,j,3) );
4627 *v = r;
4628 #undef P
4631 static void InvMixColumns (V128* v)
4633 V128 r;
4634 Int j;
4635 #define P(x,row,col) (x)->w8[((row)*4+(col))]
4636 for (j = 0; j < 4; j++) {
4637 P(&r,j,0) = ff_mul(0x0e, P(v,j,0) ) ^ ff_mul(0x0b, P(v,j,1) )
4638 ^ ff_mul(0x0d,P(v,j,2) ) ^ ff_mul(0x09, P(v,j,3) );
4639 P(&r,j,1) = ff_mul(0x09, P(v,j,0) ) ^ ff_mul(0x0e, P(v,j,1) )
4640 ^ ff_mul(0x0b,P(v,j,2) ) ^ ff_mul(0x0d, P(v,j,3) );
4641 P(&r,j,2) = ff_mul(0x0d, P(v,j,0) ) ^ ff_mul(0x09, P(v,j,1) )
4642 ^ ff_mul(0x0e,P(v,j,2) ) ^ ff_mul(0x0b, P(v,j,3) );
4643 P(&r,j,3) = ff_mul(0x0b, P(v,j,0) ) ^ ff_mul(0x0d, P(v,j,1) )
4644 ^ ff_mul(0x09,P(v,j,2) ) ^ ff_mul(0x0e, P(v,j,3) );
4646 *v = r;
4647 #undef P
4651 /* For description, see definition in guest_amd64_defs.h */
4652 void amd64g_dirtyhelper_AES (
4653 VexGuestAMD64State* gst,
4654 HWord opc4, HWord gstOffD,
4655 HWord gstOffL, HWord gstOffR
4658 // where the args are
4659 V128* argD = (V128*)( ((UChar*)gst) + gstOffD );
4660 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4661 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4662 V128 r;
4664 switch (opc4) {
4665 case 0xDC: /* AESENC */
4666 case 0xDD: /* AESENCLAST */
4667 r = *argR;
4668 ShiftRows (&r);
4669 SubBytes (&r);
4670 if (opc4 == 0xDC)
4671 MixColumns (&r);
4672 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4673 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4674 break;
4676 case 0xDE: /* AESDEC */
4677 case 0xDF: /* AESDECLAST */
4678 r = *argR;
4679 InvShiftRows (&r);
4680 InvSubBytes (&r);
4681 if (opc4 == 0xDE)
4682 InvMixColumns (&r);
4683 argD->w64[0] = r.w64[0] ^ argL->w64[0];
4684 argD->w64[1] = r.w64[1] ^ argL->w64[1];
4685 break;
4687 case 0xDB: /* AESIMC */
4688 *argD = *argL;
4689 InvMixColumns (argD);
4690 break;
4691 default: vassert(0);
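/* Note the operand roles implied above: argR is the state being
   transformed, argL supplies the round key that is XORed in last, and the
   "LAST" opcodes simply skip the (Inv)MixColumns step, matching the
   AESENC/AESENCLAST/AESDEC/AESDECLAST definitions. */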
4695 static inline UInt RotWord (UInt w32)
4697 return ((w32 >> 8) | (w32 << 24));
4700 static inline UInt SubWord (UInt w32)
4702 UChar *w8;
4703 UChar *r8;
4704 UInt res;
4705 w8 = (UChar*) &w32;
4706 r8 = (UChar*) &res;
4707 r8[0] = sbox[w8[0]];
4708 r8[1] = sbox[w8[1]];
4709 r8[2] = sbox[w8[2]];
4710 r8[3] = sbox[w8[3]];
4711 return res;
4714 /* For description, see definition in guest_amd64_defs.h */
4715 extern void amd64g_dirtyhelper_AESKEYGENASSIST (
4716 VexGuestAMD64State* gst,
4717 HWord imm8,
4718 HWord gstOffL, HWord gstOffR
4721 // where the args are
4722 V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
4723 V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
4725 // We have to create the result in a temporary in the
4726 // case where the src and dst regs are the same. See #341698.
4727 V128 tmp;
4729 tmp.w32[3] = RotWord (SubWord (argL->w32[3])) ^ imm8;
4730 tmp.w32[2] = SubWord (argL->w32[3]);
4731 tmp.w32[1] = RotWord (SubWord (argL->w32[1])) ^ imm8;
4732 tmp.w32[0] = SubWord (argL->w32[1]);
4734 argR->w32[3] = tmp.w32[3];
4735 argR->w32[2] = tmp.w32[2];
4736 argR->w32[1] = tmp.w32[1];
4737 argR->w32[0] = tmp.w32[0];
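/* This follows the AESKEYGENASSIST definition: each result half is built
   from source dword 3 or 1 respectively as
   { RotWord(SubWord(X)) ^ imm8, SubWord(X) }; the temporary exists so the
   helper still works when source and destination registers coincide
   (see #341698 above). */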
4742 /*---------------------------------------------------------------*/
4743 /*--- Helpers for dealing with, and describing, ---*/
4744 /*--- guest state as a whole. ---*/
4745 /*---------------------------------------------------------------*/
4747 /* Initialise the entire amd64 guest state. */
4748 /* VISIBLE TO LIBVEX CLIENT */
4749 void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
4751 vex_state->host_EvC_FAILADDR = 0;
4752 vex_state->host_EvC_COUNTER = 0;
4753 vex_state->pad0 = 0;
4755 vex_state->guest_RAX = 0;
4756 vex_state->guest_RCX = 0;
4757 vex_state->guest_RDX = 0;
4758 vex_state->guest_RBX = 0;
4759 vex_state->guest_RSP = 0;
4760 vex_state->guest_RBP = 0;
4761 vex_state->guest_RSI = 0;
4762 vex_state->guest_RDI = 0;
4763 vex_state->guest_R8 = 0;
4764 vex_state->guest_R9 = 0;
4765 vex_state->guest_R10 = 0;
4766 vex_state->guest_R11 = 0;
4767 vex_state->guest_R12 = 0;
4768 vex_state->guest_R13 = 0;
4769 vex_state->guest_R14 = 0;
4770 vex_state->guest_R15 = 0;
4772 vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
4773 vex_state->guest_CC_DEP1 = 0;
4774 vex_state->guest_CC_DEP2 = 0;
4775 vex_state->guest_CC_NDEP = 0;
4777 vex_state->guest_DFLAG = 1; /* forwards */
4778 vex_state->guest_IDFLAG = 0;
4779 vex_state->guest_ACFLAG = 0;
4781 /* HACK: represent the offset associated with a constant %fs value.
4782 On Linux this typically assumes that %fs only ever holds zero (main
4783 thread) or 0x63. */
4784 vex_state->guest_FS_CONST = 0;
4786 vex_state->guest_RIP = 0;
4788 /* Initialise the simulated FPU */
4789 amd64g_dirtyhelper_FINIT( vex_state );
4791 /* Initialise the AVX state. */
4792 # define AVXZERO(_ymm) \
4793 do { _ymm[0]=_ymm[1]=_ymm[2]=_ymm[3] = 0; \
4794 _ymm[4]=_ymm[5]=_ymm[6]=_ymm[7] = 0; \
4795 } while (0)
4796 vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
4797 AVXZERO(vex_state->guest_YMM0);
4798 AVXZERO(vex_state->guest_YMM1);
4799 AVXZERO(vex_state->guest_YMM2);
4800 AVXZERO(vex_state->guest_YMM3);
4801 AVXZERO(vex_state->guest_YMM4);
4802 AVXZERO(vex_state->guest_YMM5);
4803 AVXZERO(vex_state->guest_YMM6);
4804 AVXZERO(vex_state->guest_YMM7);
4805 AVXZERO(vex_state->guest_YMM8);
4806 AVXZERO(vex_state->guest_YMM9);
4807 AVXZERO(vex_state->guest_YMM10);
4808 AVXZERO(vex_state->guest_YMM11);
4809 AVXZERO(vex_state->guest_YMM12);
4810 AVXZERO(vex_state->guest_YMM13);
4811 AVXZERO(vex_state->guest_YMM14);
4812 AVXZERO(vex_state->guest_YMM15);
4813 AVXZERO(vex_state->guest_YMM16);
4815 # undef AVXZERO
4817 vex_state->guest_EMNOTE = EmNote_NONE;
4819 /* These should never be either read or written, but we initialise
4820 them anyway. */
4821 vex_state->guest_CMSTART = 0;
4822 vex_state->guest_CMLEN = 0;
4824 vex_state->guest_NRADDR = 0;
4825 vex_state->guest_SC_CLASS = 0;
4826 vex_state->guest_GS_CONST = 0;
4828 vex_state->guest_IP_AT_SYSCALL = 0;
4829 vex_state->pad1 = 0;
4833 /* Figure out if any part of the guest state contained in minoff
4834 .. maxoff requires precise memory exceptions. If in doubt return
4835 True (but this generates significantly slower code).
4837 By default we enforce precise exns for guest %RSP, %RBP and %RIP
4838 only. These are the minimum needed to extract correct stack
4839 backtraces from amd64 code.
4841 Only %RSP is needed in mode VexRegUpdSpAtMemAccess.
4843 Bool guest_amd64_state_requires_precise_mem_exns (
4844 Int minoff, Int maxoff, VexRegisterUpdates pxControl
4847 Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
4848 Int rbp_max = rbp_min + 8 - 1;
4849 Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
4850 Int rsp_max = rsp_min + 8 - 1;
4851 Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
4852 Int rip_max = rip_min + 8 - 1;
4854 if (maxoff < rsp_min || minoff > rsp_max) {
4855 /* no overlap with rsp */
4856 if (pxControl == VexRegUpdSpAtMemAccess)
4857 return False; // We only need to check stack pointer.
4858 } else {
4859 return True;
4862 if (maxoff < rbp_min || minoff > rbp_max) {
4863 /* no overlap with rbp */
4864 } else {
4865 return True;
4868 if (maxoff < rip_min || minoff > rip_max) {
4869 /* no overlap with rip */
4870 } else {
4871 return True;
4874 return False;
4878 #define ALWAYSDEFD(field) \
4879 { offsetof(VexGuestAMD64State, field), \
4880 (sizeof ((VexGuestAMD64State*)0)->field) }
4882 VexGuestLayout
4883 amd64guest_layout
4885 /* Total size of the guest state, in bytes. */
4886 .total_sizeB = sizeof(VexGuestAMD64State),
4888 /* Describe the stack pointer. */
4889 .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
4890 .sizeof_SP = 8,
4892 /* Describe the frame pointer. */
4893 .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
4894 .sizeof_FP = 8,
4896 /* Describe the instruction pointer. */
4897 .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
4898 .sizeof_IP = 8,
4900 /* Describe any sections to be regarded by Memcheck as
4901 'always-defined'. */
4902 .n_alwaysDefd = 16,
4904 /* flags thunk: OP and NDEP are always defd, whereas DEP1
4905 and DEP2 have to be tracked. See detailed comment in
4906 gdefs.h on meaning of thunk fields. */
4907 .alwaysDefd
4908 = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
4909 /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
4910 /* 2 */ ALWAYSDEFD(guest_DFLAG),
4911 /* 3 */ ALWAYSDEFD(guest_IDFLAG),
4912 /* 4 */ ALWAYSDEFD(guest_RIP),
4913 /* 5 */ ALWAYSDEFD(guest_FS_CONST),
4914 /* 6 */ ALWAYSDEFD(guest_FTOP),
4915 /* 7 */ ALWAYSDEFD(guest_FPTAG),
4916 /* 8 */ ALWAYSDEFD(guest_FPROUND),
4917 /* 9 */ ALWAYSDEFD(guest_FC3210),
4918 // /* */ ALWAYSDEFD(guest_CS),
4919 // /* */ ALWAYSDEFD(guest_DS),
4920 // /* */ ALWAYSDEFD(guest_ES),
4921 // /* */ ALWAYSDEFD(guest_FS),
4922 // /* */ ALWAYSDEFD(guest_GS),
4923 // /* */ ALWAYSDEFD(guest_SS),
4924 // /* */ ALWAYSDEFD(guest_LDT),
4925 // /* */ ALWAYSDEFD(guest_GDT),
4926 /* 10 */ ALWAYSDEFD(guest_EMNOTE),
4927 /* 11 */ ALWAYSDEFD(guest_SSEROUND),
4928 /* 12 */ ALWAYSDEFD(guest_CMSTART),
4929 /* 13 */ ALWAYSDEFD(guest_CMLEN),
4930 /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
4931 /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
4936 /*---------------------------------------------------------------*/
4937 /*--- end guest_amd64_helpers.c ---*/
4938 /*---------------------------------------------------------------*/