1 /* -*- mode: C; c-basic-offset: 3; -*- */
3 /*--------------------------------------------------------------------*/
4 /*--- begin guest_arm64_toIR.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of Valgrind, a dynamic binary instrumentation
9 framework.
11 Copyright (C) 2013-2017 OpenWorks
12 info@open-works.net
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 /* KNOWN LIMITATIONS 2014-Nov-16
32 * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.
34 Also FP comparison "unordered" .. is implemented as normal FP
35 comparison.
37 Both should be fixed. They behave incorrectly in the presence of
38 NaNs.
40 FMULX is treated the same as FMUL. That's also not correct.
42    * Floating multiply-add (etc) insns are split into a multiply and
43 an add, and so suffer double rounding and hence sometimes the
44 least significant mantissa bit is incorrect. Fix: use the IR
45 multiply-add IROps instead.
47 * FRINTA, FRINTN are kludged .. they just round to nearest. No special
48 handling for the "ties" case. FRINTX might be dubious too.
50    * Ditto FCVTXN.  Its "round to odd" mode (the result's LSB is forced
51      to 1 when the result is inexact) is not honoured; this implementation
      just rounds to nearest.
54 /* "Special" instructions.
56 This instruction decoder can decode four special instructions
57 which mean nothing natively (are no-ops as far as regs/mem are
58 concerned) but have meaning for supporting Valgrind. A special
59 instruction is flagged by a 16-byte preamble:
61 93CC0D8C 93CC358C 93CCCD8C 93CCF58C
62 (ror x12, x12, #3; ror x12, x12, #13
63 ror x12, x12, #51; ror x12, x12, #61)
65    Following that, exactly one of the following 4 is allowed
66 (standard interpretation in parentheses):
68 AA0A014A (orr x10,x10,x10) X3 = client_request ( X4 )
69 AA0B016B (orr x11,x11,x11) X3 = guest_NRADDR
70 AA0C018C (orr x12,x12,x12) branch-and-link-to-noredir X8
71 AA090129 (orr x9,x9,x9) IR injection
73 Any other bytes following the 16-byte preamble are illegal and
74 constitute a failure in instruction decoding. This all assumes
75 that the preamble will never occur except in specific code
76 fragments designed for Valgrind to catch.
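/* Editorial worked example (not in the original source): a complete
   client-request sequence is therefore the five instruction words

      93CC0D8C 93CC358C 93CCCD8C 93CCF58C   (the 16-byte preamble)
      AA0A014A                              (orr x10,x10,x10)

   which the decoder below interprets as "X3 = client_request ( X4 )".
   Any other word following the preamble causes a decode failure, as
   noted above. */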
79 /* Translates ARM64 code to IR. */
81 #include "libvex_basictypes.h"
82 #include "libvex_ir.h"
83 #include "libvex.h"
84 #include "libvex_guest_arm64.h"
86 #include "main_util.h"
87 #include "main_globals.h"
88 #include "guest_generic_bb_to_IR.h"
89 #include "guest_arm64_defs.h"
92 /*------------------------------------------------------------*/
93 /*--- Globals ---*/
94 /*------------------------------------------------------------*/
96 /* These are set at the start of the translation of an instruction, so
97 that we don't have to pass them around endlessly. CONST means does
98 not change during translation of the instruction.
101 /* CONST: what is the host's endianness? We need to know this in
102 order to do sub-register accesses to the SIMD/FP registers
103 correctly. */
104 static VexEndness host_endness;
106 /* CONST: The guest address for the instruction currently being
107 translated. */
108 static Addr64 guest_PC_curr_instr;
110 /* MOD: The IRSB* into which we're generating code. */
111 static IRSB* irsb;
114 /*------------------------------------------------------------*/
115 /*--- Debugging output ---*/
116 /*------------------------------------------------------------*/
118 #define DIP(format, args...) \
119 if (vex_traceflags & VEX_TRACE_FE) \
120 vex_printf(format, ## args)
122 #define DIS(buf, format, args...) \
123 if (vex_traceflags & VEX_TRACE_FE) \
124 vex_sprintf(buf, format, ## args)
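/* Illustrative use (editorial note): instruction decoders call these
   printf-style, gated on the front-end trace flag, e.g.

      DIP("add %s, %s, %s\n", nameIReg64orZR(dd),
                              nameIReg64orZR(nn), nameIReg64orZR(mm));

   The operands shown here are hypothetical; DIP only prints when
   VEX_TRACE_FE is set in vex_traceflags. */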
127 /*------------------------------------------------------------*/
128 /*--- Helper bits and pieces for deconstructing the ---*/
129 /*--- arm insn stream. ---*/
130 /*------------------------------------------------------------*/
132 /* Do a little-endian load of a 32-bit word, regardless of the
133 endianness of the underlying host. */
134 static inline UInt getUIntLittleEndianly ( const UChar* p )
136 UInt w = 0;
137 w = (w << 8) | p[3];
138 w = (w << 8) | p[2];
139 w = (w << 8) | p[1];
140 w = (w << 8) | p[0];
141 return w;
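/* Editorial example: for the byte sequence p[0..3] = { 0xCD, 0xAB, 0x34,
   0x12 } this returns 0x1234ABCD, irrespective of host endianness. */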
144 /* Sign extend an N-bit value up to 64 bits, by copying
145 bit N-1 into all higher positions. */
146 static ULong sx_to_64 ( ULong x, UInt n )
148 vassert(n > 1 && n < 64);
149 x <<= (64-n);
150 Long r = (Long)x;
151 r >>= (64-n);
152 return (ULong)r;
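/* Editorial examples: sx_to_64(0x1FF, 9) == 0xFFFFFFFFFFFFFFFFULL, since
   bit 8 of the 9-bit value is set and is copied into bits 63:9, whereas
   sx_to_64(0x0FF, 9) == 0xFFULL. */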
155 //ZZ /* Do a little-endian load of a 16-bit word, regardless of the
156 //ZZ endianness of the underlying host. */
157 //ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
158 //ZZ {
159 //ZZ UShort w = 0;
160 //ZZ w = (w << 8) | p[1];
161 //ZZ w = (w << 8) | p[0];
162 //ZZ return w;
163 //ZZ }
164 //ZZ
165 //ZZ static UInt ROR32 ( UInt x, UInt sh ) {
166 //ZZ vassert(sh >= 0 && sh < 32);
167 //ZZ if (sh == 0)
168 //ZZ return x;
169 //ZZ else
170 //ZZ return (x << (32-sh)) | (x >> sh);
171 //ZZ }
172 //ZZ
173 //ZZ static Int popcount32 ( UInt x )
174 //ZZ {
175 //ZZ Int res = 0, i;
176 //ZZ for (i = 0; i < 32; i++) {
177 //ZZ res += (x & 1);
178 //ZZ x >>= 1;
179 //ZZ }
180 //ZZ return res;
181 //ZZ }
182 //ZZ
183 //ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
184 //ZZ {
185 //ZZ UInt mask = 1 << ix;
186 //ZZ x &= ~mask;
187 //ZZ x |= ((b << ix) & mask);
188 //ZZ return x;
189 //ZZ }
191 #define BITS2(_b1,_b0) \
192 (((_b1) << 1) | (_b0))
194 #define BITS3(_b2,_b1,_b0) \
195 (((_b2) << 2) | ((_b1) << 1) | (_b0))
197 #define BITS4(_b3,_b2,_b1,_b0) \
198 (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))
200 #define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
201 ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4) \
202 | BITS4((_b3),(_b2),(_b1),(_b0)))
204 #define BITS5(_b4,_b3,_b2,_b1,_b0) \
205 (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
206 #define BITS6(_b5,_b4,_b3,_b2,_b1,_b0) \
207 (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
208 #define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
209 (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
211 #define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
212 (((_b8) << 8) \
213 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
215 #define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
216 (((_b9) << 9) | ((_b8) << 8) \
217 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
219 #define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
220 (((_b10) << 10) \
221 | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
223 #define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
224 (((_b11) << 11) \
225 | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
227 #define X00 BITS2(0,0)
228 #define X01 BITS2(0,1)
229 #define X10 BITS2(1,0)
230 #define X11 BITS2(1,1)
232 // produces _uint[_bMax:_bMin]
233 #define SLICE_UInt(_uint,_bMax,_bMin) \
234 (( ((UInt)(_uint)) >> (_bMin)) \
235 & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
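// Editorial examples: BITS4(1,0,1,1) == 0xB, and for a 32-bit instruction
// word SLICE_UInt(insn,31,24) yields its top byte; for instance
// SLICE_UInt(0x1234ABCD,15,8) == 0xAB.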
238 /*------------------------------------------------------------*/
239 /*--- Helper bits and pieces for creating IR fragments. ---*/
240 /*------------------------------------------------------------*/
242 static IRExpr* mkV128 ( UShort w )
244 return IRExpr_Const(IRConst_V128(w));
247 static IRExpr* mkU64 ( ULong i )
249 return IRExpr_Const(IRConst_U64(i));
252 static IRExpr* mkU32 ( UInt i )
254 return IRExpr_Const(IRConst_U32(i));
257 static IRExpr* mkU16 ( UInt i )
259 vassert(i < 65536);
260 return IRExpr_Const(IRConst_U16(i));
263 static IRExpr* mkU8 ( UInt i )
265 vassert(i < 256);
266 return IRExpr_Const(IRConst_U8( (UChar)i ));
269 static IRExpr* mkexpr ( IRTemp tmp )
271 return IRExpr_RdTmp(tmp);
274 static IRExpr* unop ( IROp op, IRExpr* a )
276 return IRExpr_Unop(op, a);
279 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
281 return IRExpr_Binop(op, a1, a2);
284 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
286 return IRExpr_Triop(op, a1, a2, a3);
289 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
291 return IRExpr_Load(Iend_LE, ty, addr);
294 /* Add a statement to the list held by "irsb". */
295 static void stmt ( IRStmt* st )
297 addStmtToIRSB( irsb, st );
300 static void assign ( IRTemp dst, IRExpr* e )
302 stmt( IRStmt_WrTmp(dst, e) );
305 static void storeLE ( IRExpr* addr, IRExpr* data )
307 stmt( IRStmt_Store(Iend_LE, addr, data) );
310 //ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
311 //ZZ {
312 //ZZ if (guardT == IRTemp_INVALID) {
313 //ZZ /* unconditional */
314 //ZZ storeLE(addr, data);
315 //ZZ } else {
316 //ZZ stmt( IRStmt_StoreG(Iend_LE, addr, data,
317 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
318 //ZZ }
319 //ZZ }
320 //ZZ
321 //ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
322 //ZZ IRExpr* addr, IRExpr* alt,
323 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
324 //ZZ {
325 //ZZ if (guardT == IRTemp_INVALID) {
326 //ZZ /* unconditional */
327 //ZZ IRExpr* loaded = NULL;
328 //ZZ switch (cvt) {
329 //ZZ case ILGop_Ident32:
330 //ZZ loaded = loadLE(Ity_I32, addr); break;
331 //ZZ case ILGop_8Uto32:
332 //ZZ loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
333 //ZZ case ILGop_8Sto32:
334 //ZZ loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
335 //ZZ case ILGop_16Uto32:
336 //ZZ loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
337 //ZZ case ILGop_16Sto32:
338 //ZZ loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
339 //ZZ default:
340 //ZZ vassert(0);
341 //ZZ }
342 //ZZ vassert(loaded != NULL);
343 //ZZ assign(dst, loaded);
344 //ZZ } else {
345 //ZZ /* Generate a guarded load into 'dst', but apply 'cvt' to the
346 //ZZ loaded data before putting the data in 'dst'. If the load
347 //ZZ does not take place, 'alt' is placed directly in 'dst'. */
348 //ZZ stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
349 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
350 //ZZ }
351 //ZZ }
353 /* Generate a new temporary of the given type. */
354 static IRTemp newTemp ( IRType ty )
356 vassert(isPlausibleIRType(ty));
357 return newIRTemp( irsb->tyenv, ty );
360 /* This is used in many places, so the brevity is an advantage. */
361 static IRTemp newTempV128(void)
363 return newTemp(Ity_V128);
366 /* Initialise V128 temporaries en masse. */
367 static
368 void newTempsV128_2(IRTemp* t1, IRTemp* t2)
370 vassert(t1 && *t1 == IRTemp_INVALID);
371 vassert(t2 && *t2 == IRTemp_INVALID);
372 *t1 = newTempV128();
373 *t2 = newTempV128();
376 static
377 void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
379 vassert(t1 && *t1 == IRTemp_INVALID);
380 vassert(t2 && *t2 == IRTemp_INVALID);
381 vassert(t3 && *t3 == IRTemp_INVALID);
382 *t1 = newTempV128();
383 *t2 = newTempV128();
384 *t3 = newTempV128();
387 static
388 void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
390 vassert(t1 && *t1 == IRTemp_INVALID);
391 vassert(t2 && *t2 == IRTemp_INVALID);
392 vassert(t3 && *t3 == IRTemp_INVALID);
393 vassert(t4 && *t4 == IRTemp_INVALID);
394 *t1 = newTempV128();
395 *t2 = newTempV128();
396 *t3 = newTempV128();
397 *t4 = newTempV128();
400 static
401 void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
402 IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
404 vassert(t1 && *t1 == IRTemp_INVALID);
405 vassert(t2 && *t2 == IRTemp_INVALID);
406 vassert(t3 && *t3 == IRTemp_INVALID);
407 vassert(t4 && *t4 == IRTemp_INVALID);
408 vassert(t5 && *t5 == IRTemp_INVALID);
409 vassert(t6 && *t6 == IRTemp_INVALID);
410 vassert(t7 && *t7 == IRTemp_INVALID);
411 *t1 = newTempV128();
412 *t2 = newTempV128();
413 *t3 = newTempV128();
414 *t4 = newTempV128();
415 *t5 = newTempV128();
416 *t6 = newTempV128();
417 *t7 = newTempV128();
420 //ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
421 //ZZ IRRoundingMode. */
422 //ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
423 //ZZ {
424 //ZZ return mkU32(Irrm_NEAREST);
425 //ZZ }
426 //ZZ
427 //ZZ /* Generate an expression for SRC rotated right by ROT. */
428 //ZZ static IRExpr* genROR32( IRTemp src, Int rot )
429 //ZZ {
430 //ZZ vassert(rot >= 0 && rot < 32);
431 //ZZ if (rot == 0)
432 //ZZ return mkexpr(src);
433 //ZZ return
434 //ZZ binop(Iop_Or32,
435 //ZZ binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
436 //ZZ binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
437 //ZZ }
438 //ZZ
439 //ZZ static IRExpr* mkU128 ( ULong i )
440 //ZZ {
441 //ZZ return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
442 //ZZ }
443 //ZZ
444 //ZZ /* Generate a 4-aligned version of the given expression if
445 //ZZ the given condition is true. Else return it unchanged. */
446 //ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
447 //ZZ {
448 //ZZ if (b)
449 //ZZ return binop(Iop_And32, e, mkU32(~3));
450 //ZZ else
451 //ZZ return e;
452 //ZZ }
454 /* Other IR construction helpers. */
455 static IROp mkAND ( IRType ty ) {
456 switch (ty) {
457 case Ity_I32: return Iop_And32;
458 case Ity_I64: return Iop_And64;
459 default: vpanic("mkAND");
463 static IROp mkOR ( IRType ty ) {
464 switch (ty) {
465 case Ity_I32: return Iop_Or32;
466 case Ity_I64: return Iop_Or64;
467 default: vpanic("mkOR");
471 static IROp mkXOR ( IRType ty ) {
472 switch (ty) {
473 case Ity_I32: return Iop_Xor32;
474 case Ity_I64: return Iop_Xor64;
475 default: vpanic("mkXOR");
479 static IROp mkSHL ( IRType ty ) {
480 switch (ty) {
481 case Ity_I32: return Iop_Shl32;
482 case Ity_I64: return Iop_Shl64;
483 default: vpanic("mkSHL");
487 static IROp mkSHR ( IRType ty ) {
488 switch (ty) {
489 case Ity_I32: return Iop_Shr32;
490 case Ity_I64: return Iop_Shr64;
491 default: vpanic("mkSHR");
495 static IROp mkSAR ( IRType ty ) {
496 switch (ty) {
497 case Ity_I32: return Iop_Sar32;
498 case Ity_I64: return Iop_Sar64;
499 default: vpanic("mkSAR");
503 static IROp mkNOT ( IRType ty ) {
504 switch (ty) {
505 case Ity_I32: return Iop_Not32;
506 case Ity_I64: return Iop_Not64;
507 default: vpanic("mkNOT");
511 static IROp mkADD ( IRType ty ) {
512 switch (ty) {
513 case Ity_I32: return Iop_Add32;
514 case Ity_I64: return Iop_Add64;
515 default: vpanic("mkADD");
519 static IROp mkSUB ( IRType ty ) {
520 switch (ty) {
521 case Ity_I32: return Iop_Sub32;
522 case Ity_I64: return Iop_Sub64;
523 default: vpanic("mkSUB");
527 static IROp mkADDF ( IRType ty ) {
528 switch (ty) {
529 case Ity_F32: return Iop_AddF32;
530 case Ity_F64: return Iop_AddF64;
531 default: vpanic("mkADDF");
535 static IROp mkSUBF ( IRType ty ) {
536 switch (ty) {
537 case Ity_F32: return Iop_SubF32;
538 case Ity_F64: return Iop_SubF64;
539 default: vpanic("mkSUBF");
543 static IROp mkMULF ( IRType ty ) {
544 switch (ty) {
545 case Ity_F32: return Iop_MulF32;
546 case Ity_F64: return Iop_MulF64;
547 default: vpanic("mkMULF");
551 static IROp mkDIVF ( IRType ty ) {
552 switch (ty) {
553 case Ity_F32: return Iop_DivF32;
554 case Ity_F64: return Iop_DivF64;
555       default: vpanic("mkDIVF");
559 static IROp mkNEGF ( IRType ty ) {
560 switch (ty) {
561 case Ity_F32: return Iop_NegF32;
562 case Ity_F64: return Iop_NegF64;
563 default: vpanic("mkNEGF");
567 static IROp mkABSF ( IRType ty ) {
568 switch (ty) {
569 case Ity_F32: return Iop_AbsF32;
570 case Ity_F64: return Iop_AbsF64;
571       default: vpanic("mkABSF");
575 static IROp mkSQRTF ( IRType ty ) {
576 switch (ty) {
577 case Ity_F32: return Iop_SqrtF32;
578 case Ity_F64: return Iop_SqrtF64;
579       default: vpanic("mkSQRTF");
583 static IROp mkVecADD ( UInt size ) {
584 const IROp ops[4]
585 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
586 vassert(size < 4);
587 return ops[size];
590 static IROp mkVecQADDU ( UInt size ) {
591 const IROp ops[4]
592 = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
593 vassert(size < 4);
594 return ops[size];
597 static IROp mkVecQADDS ( UInt size ) {
598 const IROp ops[4]
599 = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
600 vassert(size < 4);
601 return ops[size];
604 static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
605 const IROp ops[4]
606 = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
607 Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
608 vassert(size < 4);
609 return ops[size];
612 static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
613 const IROp ops[4]
614 = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
615 Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
616 vassert(size < 4);
617 return ops[size];
620 static IROp mkVecSUB ( UInt size ) {
621 const IROp ops[4]
622 = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
623 vassert(size < 4);
624 return ops[size];
627 static IROp mkVecQSUBU ( UInt size ) {
628 const IROp ops[4]
629 = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
630 vassert(size < 4);
631 return ops[size];
634 static IROp mkVecQSUBS ( UInt size ) {
635 const IROp ops[4]
636 = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
637 vassert(size < 4);
638 return ops[size];
641 static IROp mkVecSARN ( UInt size ) {
642 const IROp ops[4]
643 = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
644 vassert(size < 4);
645 return ops[size];
648 static IROp mkVecSHRN ( UInt size ) {
649 const IROp ops[4]
650 = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
651 vassert(size < 4);
652 return ops[size];
655 static IROp mkVecSHLN ( UInt size ) {
656 const IROp ops[4]
657 = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
658 vassert(size < 4);
659 return ops[size];
662 static IROp mkVecCATEVENLANES ( UInt size ) {
663 const IROp ops[4]
664 = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
665 Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
666 vassert(size < 4);
667 return ops[size];
670 static IROp mkVecCATODDLANES ( UInt size ) {
671 const IROp ops[4]
672 = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
673 Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
674 vassert(size < 4);
675 return ops[size];
678 static IROp mkVecINTERLEAVELO ( UInt size ) {
679 const IROp ops[4]
680 = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
681 Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
682 vassert(size < 4);
683 return ops[size];
686 static IROp mkVecINTERLEAVEHI ( UInt size ) {
687 const IROp ops[4]
688 = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
689 Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
690 vassert(size < 4);
691 return ops[size];
694 static IROp mkVecMAXU ( UInt size ) {
695 const IROp ops[4]
696 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
697 vassert(size < 4);
698 return ops[size];
701 static IROp mkVecMAXS ( UInt size ) {
702 const IROp ops[4]
703 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
704 vassert(size < 4);
705 return ops[size];
708 static IROp mkVecMINU ( UInt size ) {
709 const IROp ops[4]
710 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
711 vassert(size < 4);
712 return ops[size];
715 static IROp mkVecMINS ( UInt size ) {
716 const IROp ops[4]
717 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
718 vassert(size < 4);
719 return ops[size];
722 static IROp mkVecMUL ( UInt size ) {
723 const IROp ops[4]
724 = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
725 vassert(size < 3);
726 return ops[size];
729 static IROp mkVecMULLU ( UInt sizeNarrow ) {
730 const IROp ops[4]
731 = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
732 vassert(sizeNarrow < 3);
733 return ops[sizeNarrow];
736 static IROp mkVecMULLS ( UInt sizeNarrow ) {
737 const IROp ops[4]
738 = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
739 vassert(sizeNarrow < 3);
740 return ops[sizeNarrow];
743 static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
744 const IROp ops[4]
745 = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
746 vassert(sizeNarrow < 3);
747 return ops[sizeNarrow];
750 static IROp mkVecCMPEQ ( UInt size ) {
751 const IROp ops[4]
752 = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
753 vassert(size < 4);
754 return ops[size];
757 static IROp mkVecCMPGTU ( UInt size ) {
758 const IROp ops[4]
759 = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
760 vassert(size < 4);
761 return ops[size];
764 static IROp mkVecCMPGTS ( UInt size ) {
765 const IROp ops[4]
766 = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
767 vassert(size < 4);
768 return ops[size];
771 static IROp mkVecABS ( UInt size ) {
772 const IROp ops[4]
773 = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
774 vassert(size < 4);
775 return ops[size];
778 static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
779 const IROp ops[4]
780 = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
781 Iop_ZeroHI96ofV128, Iop_ZeroHI64ofV128 };
782 vassert(size < 4);
783 return ops[size];
786 static IRExpr* mkU ( IRType ty, ULong imm ) {
787 switch (ty) {
788 case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
789 case Ity_I64: return mkU64(imm);
790 default: vpanic("mkU");
794 static IROp mkVecQDMULHIS ( UInt size ) {
795 const IROp ops[4]
796 = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
797 vassert(size < 4);
798 return ops[size];
801 static IROp mkVecQRDMULHIS ( UInt size ) {
802 const IROp ops[4]
803 = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
804 vassert(size < 4);
805 return ops[size];
808 static IROp mkVecQANDUQSH ( UInt size ) {
809 const IROp ops[4]
810 = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
811 Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
812 vassert(size < 4);
813 return ops[size];
816 static IROp mkVecQANDSQSH ( UInt size ) {
817 const IROp ops[4]
818 = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
819 Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
820 vassert(size < 4);
821 return ops[size];
824 static IROp mkVecQANDUQRSH ( UInt size ) {
825 const IROp ops[4]
826 = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
827 Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
828 vassert(size < 4);
829 return ops[size];
832 static IROp mkVecQANDSQRSH ( UInt size ) {
833 const IROp ops[4]
834 = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
835 Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
836 vassert(size < 4);
837 return ops[size];
840 static IROp mkVecSHU ( UInt size ) {
841 const IROp ops[4]
842 = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
843 vassert(size < 4);
844 return ops[size];
847 static IROp mkVecSHS ( UInt size ) {
848 const IROp ops[4]
849 = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
850 vassert(size < 4);
851 return ops[size];
854 static IROp mkVecRSHU ( UInt size ) {
855 const IROp ops[4]
856 = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
857 vassert(size < 4);
858 return ops[size];
861 static IROp mkVecRSHS ( UInt size ) {
862 const IROp ops[4]
863 = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
864 vassert(size < 4);
865 return ops[size];
868 static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
869 const IROp ops[4]
870 = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
871 Iop_NarrowUn64to32x2, Iop_INVALID };
872 vassert(sizeNarrow < 4);
873 return ops[sizeNarrow];
876 static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
877 const IROp ops[4]
878 = { Iop_QNarrowUn16Sto8Ux8, Iop_QNarrowUn32Sto16Ux4,
879 Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
880 vassert(sizeNarrow < 4);
881 return ops[sizeNarrow];
884 static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
885 const IROp ops[4]
886 = { Iop_QNarrowUn16Sto8Sx8, Iop_QNarrowUn32Sto16Sx4,
887 Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
888 vassert(sizeNarrow < 4);
889 return ops[sizeNarrow];
892 static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
893 const IROp ops[4]
894 = { Iop_QNarrowUn16Uto8Ux8, Iop_QNarrowUn32Uto16Ux4,
895 Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
896 vassert(sizeNarrow < 4);
897 return ops[sizeNarrow];
900 static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
901 const IROp ops[4]
902 = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4,
903 Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
904 vassert(sizeNarrow < 4);
905 return ops[sizeNarrow];
908 static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
909 const IROp ops[4]
910 = { Iop_QandQSarNnarrow16Sto8Sx8, Iop_QandQSarNnarrow32Sto16Sx4,
911 Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
912 vassert(sizeNarrow < 4);
913 return ops[sizeNarrow];
916 static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
917 const IROp ops[4]
918 = { Iop_QandQSarNnarrow16Sto8Ux8, Iop_QandQSarNnarrow32Sto16Ux4,
919 Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
920 vassert(sizeNarrow < 4);
921 return ops[sizeNarrow];
924 static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
925 const IROp ops[4]
926 = { Iop_QandQRShrNnarrow16Uto8Ux8, Iop_QandQRShrNnarrow32Uto16Ux4,
927 Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
928 vassert(sizeNarrow < 4);
929 return ops[sizeNarrow];
932 static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
933 const IROp ops[4]
934 = { Iop_QandQRSarNnarrow16Sto8Sx8, Iop_QandQRSarNnarrow32Sto16Sx4,
935 Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
936 vassert(sizeNarrow < 4);
937 return ops[sizeNarrow];
940 static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
941 const IROp ops[4]
942 = { Iop_QandQRSarNnarrow16Sto8Ux8, Iop_QandQRSarNnarrow32Sto16Ux4,
943 Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
944 vassert(sizeNarrow < 4);
945 return ops[sizeNarrow];
948 static IROp mkVecQSHLNSATUU ( UInt size ) {
949 const IROp ops[4]
950 = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
951 Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
952 vassert(size < 4);
953 return ops[size];
956 static IROp mkVecQSHLNSATSS ( UInt size ) {
957 const IROp ops[4]
958 = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
959 Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
960 vassert(size < 4);
961 return ops[size];
964 static IROp mkVecQSHLNSATSU ( UInt size ) {
965 const IROp ops[4]
966 = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
967 Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
968 vassert(size < 4);
969 return ops[size];
972 static IROp mkVecADDF ( UInt size ) {
973 const IROp ops[4]
974 = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 };
975 vassert(size < 4);
976 return ops[size];
979 static IROp mkVecMAXF ( UInt size ) {
980 const IROp ops[4]
981 = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
982 vassert(size < 4);
983 return ops[size];
986 static IROp mkVecMINF ( UInt size ) {
987 const IROp ops[4]
988 = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
989 vassert(size < 4);
990 return ops[size];
993 /* Generate IR to create 'arg rotated right by imm', for sane values
994 of 'ty' and 'imm'. */
995 static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
997 UInt w = 0;
998 if (ty == Ity_I64) {
999 w = 64;
1000 } else {
1001 vassert(ty == Ity_I32);
1002 w = 32;
1004 vassert(w != 0);
1005 vassert(imm < w);
1006 if (imm == 0) {
1007 return arg;
1009 IRTemp res = newTemp(ty);
1010 assign(res, binop(mkOR(ty),
1011 binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
1012 binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
1013 return res;
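/* Editorial example: for ty == Ity_I32 and imm == 8, the result temp is
   bound to (arg << 24) | (arg >> 8), i.e. a right-rotation by 8 bits. */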
1016 /* Generate IR to set the returned temp to either all-zeroes or
1017 all ones, as a copy of arg<imm>. */
1018 static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
1020 UInt w = 0;
1021 if (ty == Ity_I64) {
1022 w = 64;
1023 } else {
1024 vassert(ty == Ity_I32);
1025 w = 32;
1027 vassert(w != 0);
1028 vassert(imm < w);
1029 IRTemp res = newTemp(ty);
1030 assign(res, binop(mkSAR(ty),
1031 binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
1032 mkU8(w - 1)));
1033 return res;
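/* Editorial example: for ty == Ity_I32 and imm == 22, the result is
   (arg << 9) arithmetically shifted right by 31: all-ones if arg[22] is
   set, all-zeroes otherwise. */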
1036 /* S-widen 8/16/32/64 bit int expr to 64. */
1037 static IRExpr* widenSto64 ( IRType srcTy, IRExpr* e )
1039 switch (srcTy) {
1040 case Ity_I64: return e;
1041 case Ity_I32: return unop(Iop_32Sto64, e);
1042 case Ity_I16: return unop(Iop_16Sto64, e);
1043 case Ity_I8: return unop(Iop_8Sto64, e);
1044 default: vpanic("widenSto64(arm64)");
1048 /* U-widen 8/16/32/64 bit int expr to 64. */
1049 static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
1051 switch (srcTy) {
1052 case Ity_I64: return e;
1053 case Ity_I32: return unop(Iop_32Uto64, e);
1054 case Ity_I16: return unop(Iop_16Uto64, e);
1055 case Ity_I8: return unop(Iop_8Uto64, e);
1056 default: vpanic("widenUto64(arm64)");
1060 /* Narrow 64 bit int expr to 8/16/32/64. Clearly only some
1061 of these combinations make sense. */
1062 static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
1064 switch (dstTy) {
1065 case Ity_I64: return e;
1066 case Ity_I32: return unop(Iop_64to32, e);
1067 case Ity_I16: return unop(Iop_64to16, e);
1068 case Ity_I8: return unop(Iop_64to8, e);
1069 default: vpanic("narrowFrom64(arm64)");
1074 /*------------------------------------------------------------*/
1075 /*--- Helpers for accessing guest registers. ---*/
1076 /*------------------------------------------------------------*/
1078 #define OFFB_X0 offsetof(VexGuestARM64State,guest_X0)
1079 #define OFFB_X1 offsetof(VexGuestARM64State,guest_X1)
1080 #define OFFB_X2 offsetof(VexGuestARM64State,guest_X2)
1081 #define OFFB_X3 offsetof(VexGuestARM64State,guest_X3)
1082 #define OFFB_X4 offsetof(VexGuestARM64State,guest_X4)
1083 #define OFFB_X5 offsetof(VexGuestARM64State,guest_X5)
1084 #define OFFB_X6 offsetof(VexGuestARM64State,guest_X6)
1085 #define OFFB_X7 offsetof(VexGuestARM64State,guest_X7)
1086 #define OFFB_X8 offsetof(VexGuestARM64State,guest_X8)
1087 #define OFFB_X9 offsetof(VexGuestARM64State,guest_X9)
1088 #define OFFB_X10 offsetof(VexGuestARM64State,guest_X10)
1089 #define OFFB_X11 offsetof(VexGuestARM64State,guest_X11)
1090 #define OFFB_X12 offsetof(VexGuestARM64State,guest_X12)
1091 #define OFFB_X13 offsetof(VexGuestARM64State,guest_X13)
1092 #define OFFB_X14 offsetof(VexGuestARM64State,guest_X14)
1093 #define OFFB_X15 offsetof(VexGuestARM64State,guest_X15)
1094 #define OFFB_X16 offsetof(VexGuestARM64State,guest_X16)
1095 #define OFFB_X17 offsetof(VexGuestARM64State,guest_X17)
1096 #define OFFB_X18 offsetof(VexGuestARM64State,guest_X18)
1097 #define OFFB_X19 offsetof(VexGuestARM64State,guest_X19)
1098 #define OFFB_X20 offsetof(VexGuestARM64State,guest_X20)
1099 #define OFFB_X21 offsetof(VexGuestARM64State,guest_X21)
1100 #define OFFB_X22 offsetof(VexGuestARM64State,guest_X22)
1101 #define OFFB_X23 offsetof(VexGuestARM64State,guest_X23)
1102 #define OFFB_X24 offsetof(VexGuestARM64State,guest_X24)
1103 #define OFFB_X25 offsetof(VexGuestARM64State,guest_X25)
1104 #define OFFB_X26 offsetof(VexGuestARM64State,guest_X26)
1105 #define OFFB_X27 offsetof(VexGuestARM64State,guest_X27)
1106 #define OFFB_X28 offsetof(VexGuestARM64State,guest_X28)
1107 #define OFFB_X29 offsetof(VexGuestARM64State,guest_X29)
1108 #define OFFB_X30 offsetof(VexGuestARM64State,guest_X30)
1110 #define OFFB_XSP offsetof(VexGuestARM64State,guest_XSP)
1111 #define OFFB_PC offsetof(VexGuestARM64State,guest_PC)
1113 #define OFFB_CC_OP offsetof(VexGuestARM64State,guest_CC_OP)
1114 #define OFFB_CC_DEP1 offsetof(VexGuestARM64State,guest_CC_DEP1)
1115 #define OFFB_CC_DEP2 offsetof(VexGuestARM64State,guest_CC_DEP2)
1116 #define OFFB_CC_NDEP offsetof(VexGuestARM64State,guest_CC_NDEP)
1118 #define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
1119 #define OFFB_NRADDR offsetof(VexGuestARM64State,guest_NRADDR)
1121 #define OFFB_Q0 offsetof(VexGuestARM64State,guest_Q0)
1122 #define OFFB_Q1 offsetof(VexGuestARM64State,guest_Q1)
1123 #define OFFB_Q2 offsetof(VexGuestARM64State,guest_Q2)
1124 #define OFFB_Q3 offsetof(VexGuestARM64State,guest_Q3)
1125 #define OFFB_Q4 offsetof(VexGuestARM64State,guest_Q4)
1126 #define OFFB_Q5 offsetof(VexGuestARM64State,guest_Q5)
1127 #define OFFB_Q6 offsetof(VexGuestARM64State,guest_Q6)
1128 #define OFFB_Q7 offsetof(VexGuestARM64State,guest_Q7)
1129 #define OFFB_Q8 offsetof(VexGuestARM64State,guest_Q8)
1130 #define OFFB_Q9 offsetof(VexGuestARM64State,guest_Q9)
1131 #define OFFB_Q10 offsetof(VexGuestARM64State,guest_Q10)
1132 #define OFFB_Q11 offsetof(VexGuestARM64State,guest_Q11)
1133 #define OFFB_Q12 offsetof(VexGuestARM64State,guest_Q12)
1134 #define OFFB_Q13 offsetof(VexGuestARM64State,guest_Q13)
1135 #define OFFB_Q14 offsetof(VexGuestARM64State,guest_Q14)
1136 #define OFFB_Q15 offsetof(VexGuestARM64State,guest_Q15)
1137 #define OFFB_Q16 offsetof(VexGuestARM64State,guest_Q16)
1138 #define OFFB_Q17 offsetof(VexGuestARM64State,guest_Q17)
1139 #define OFFB_Q18 offsetof(VexGuestARM64State,guest_Q18)
1140 #define OFFB_Q19 offsetof(VexGuestARM64State,guest_Q19)
1141 #define OFFB_Q20 offsetof(VexGuestARM64State,guest_Q20)
1142 #define OFFB_Q21 offsetof(VexGuestARM64State,guest_Q21)
1143 #define OFFB_Q22 offsetof(VexGuestARM64State,guest_Q22)
1144 #define OFFB_Q23 offsetof(VexGuestARM64State,guest_Q23)
1145 #define OFFB_Q24 offsetof(VexGuestARM64State,guest_Q24)
1146 #define OFFB_Q25 offsetof(VexGuestARM64State,guest_Q25)
1147 #define OFFB_Q26 offsetof(VexGuestARM64State,guest_Q26)
1148 #define OFFB_Q27 offsetof(VexGuestARM64State,guest_Q27)
1149 #define OFFB_Q28 offsetof(VexGuestARM64State,guest_Q28)
1150 #define OFFB_Q29 offsetof(VexGuestARM64State,guest_Q29)
1151 #define OFFB_Q30 offsetof(VexGuestARM64State,guest_Q30)
1152 #define OFFB_Q31 offsetof(VexGuestARM64State,guest_Q31)
1154 #define OFFB_FPCR offsetof(VexGuestARM64State,guest_FPCR)
1155 #define OFFB_QCFLAG offsetof(VexGuestARM64State,guest_QCFLAG)
1157 #define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART)
1158 #define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN)
1160 #define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
1161 #define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
1162 #define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
1165 /* ---------------- Integer registers ---------------- */
1167 static Int offsetIReg64 ( UInt iregNo )
1169 /* Do we care about endianness here? We do if sub-parts of integer
1170 registers are accessed. */
1171 switch (iregNo) {
1172 case 0: return OFFB_X0;
1173 case 1: return OFFB_X1;
1174 case 2: return OFFB_X2;
1175 case 3: return OFFB_X3;
1176 case 4: return OFFB_X4;
1177 case 5: return OFFB_X5;
1178 case 6: return OFFB_X6;
1179 case 7: return OFFB_X7;
1180 case 8: return OFFB_X8;
1181 case 9: return OFFB_X9;
1182 case 10: return OFFB_X10;
1183 case 11: return OFFB_X11;
1184 case 12: return OFFB_X12;
1185 case 13: return OFFB_X13;
1186 case 14: return OFFB_X14;
1187 case 15: return OFFB_X15;
1188 case 16: return OFFB_X16;
1189 case 17: return OFFB_X17;
1190 case 18: return OFFB_X18;
1191 case 19: return OFFB_X19;
1192 case 20: return OFFB_X20;
1193 case 21: return OFFB_X21;
1194 case 22: return OFFB_X22;
1195 case 23: return OFFB_X23;
1196 case 24: return OFFB_X24;
1197 case 25: return OFFB_X25;
1198 case 26: return OFFB_X26;
1199 case 27: return OFFB_X27;
1200 case 28: return OFFB_X28;
1201 case 29: return OFFB_X29;
1202 case 30: return OFFB_X30;
1203 /* but not 31 */
1204 default: vassert(0);
1208 static Int offsetIReg64orSP ( UInt iregNo )
1210 return iregNo == 31 ? OFFB_XSP : offsetIReg64(iregNo);
1213 static const HChar* nameIReg64orZR ( UInt iregNo )
1215 vassert(iregNo < 32);
1216 static const HChar* names[32]
1217 = { "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
1218 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
1219 "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
1220 "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
1221 return names[iregNo];
1224 static const HChar* nameIReg64orSP ( UInt iregNo )
1226 if (iregNo == 31) {
1227 return "sp";
1229 vassert(iregNo < 31);
1230 return nameIReg64orZR(iregNo);
1233 static IRExpr* getIReg64orSP ( UInt iregNo )
1235 vassert(iregNo < 32);
1236 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1239 static IRExpr* getIReg64orZR ( UInt iregNo )
1241 if (iregNo == 31) {
1242 return mkU64(0);
1244 vassert(iregNo < 31);
1245 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1248 static void putIReg64orSP ( UInt iregNo, IRExpr* e )
1250 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1251 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1254 static void putIReg64orZR ( UInt iregNo, IRExpr* e )
1256 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1257 if (iregNo == 31) {
1258 return;
1260 vassert(iregNo < 31);
1261 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1264 static const HChar* nameIReg32orZR ( UInt iregNo )
1266 vassert(iregNo < 32);
1267 static const HChar* names[32]
1268 = { "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7",
1269 "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15",
1270 "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
1271 "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
1272 return names[iregNo];
1275 static const HChar* nameIReg32orSP ( UInt iregNo )
1277 if (iregNo == 31) {
1278 return "wsp";
1280 vassert(iregNo < 31);
1281 return nameIReg32orZR(iregNo);
1284 static IRExpr* getIReg32orSP ( UInt iregNo )
1286 vassert(iregNo < 32);
1287 return unop(Iop_64to32,
1288 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1291 static IRExpr* getIReg32orZR ( UInt iregNo )
1293 if (iregNo == 31) {
1294 return mkU32(0);
1296 vassert(iregNo < 31);
1297 return unop(Iop_64to32,
1298 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1301 static void putIReg32orSP ( UInt iregNo, IRExpr* e )
1303 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1304 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1307 static void putIReg32orZR ( UInt iregNo, IRExpr* e )
1309 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1310 if (iregNo == 31) {
1311 return;
1313 vassert(iregNo < 31);
1314 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1317 static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
1319 vassert(is64 == True || is64 == False);
1320 return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
1323 static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
1325 vassert(is64 == True || is64 == False);
1326 return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
1329 static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
1331 vassert(is64 == True || is64 == False);
1332 return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
1335 static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
1337 vassert(is64 == True || is64 == False);
1338 if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
1341 static void putPC ( IRExpr* e )
1343 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1344 stmt( IRStmt_Put(OFFB_PC, e) );
1348 /* ---------------- Vector (Q) registers ---------------- */
1350 static Int offsetQReg128 ( UInt qregNo )
1352 /* We don't care about endianness at this point. It only becomes
1353 relevant when dealing with sections of these registers.*/
1354 switch (qregNo) {
1355 case 0: return OFFB_Q0;
1356 case 1: return OFFB_Q1;
1357 case 2: return OFFB_Q2;
1358 case 3: return OFFB_Q3;
1359 case 4: return OFFB_Q4;
1360 case 5: return OFFB_Q5;
1361 case 6: return OFFB_Q6;
1362 case 7: return OFFB_Q7;
1363 case 8: return OFFB_Q8;
1364 case 9: return OFFB_Q9;
1365 case 10: return OFFB_Q10;
1366 case 11: return OFFB_Q11;
1367 case 12: return OFFB_Q12;
1368 case 13: return OFFB_Q13;
1369 case 14: return OFFB_Q14;
1370 case 15: return OFFB_Q15;
1371 case 16: return OFFB_Q16;
1372 case 17: return OFFB_Q17;
1373 case 18: return OFFB_Q18;
1374 case 19: return OFFB_Q19;
1375 case 20: return OFFB_Q20;
1376 case 21: return OFFB_Q21;
1377 case 22: return OFFB_Q22;
1378 case 23: return OFFB_Q23;
1379 case 24: return OFFB_Q24;
1380 case 25: return OFFB_Q25;
1381 case 26: return OFFB_Q26;
1382 case 27: return OFFB_Q27;
1383 case 28: return OFFB_Q28;
1384 case 29: return OFFB_Q29;
1385 case 30: return OFFB_Q30;
1386 case 31: return OFFB_Q31;
1387 default: vassert(0);
1391 /* Write to a complete Qreg. */
1392 static void putQReg128 ( UInt qregNo, IRExpr* e )
1394 vassert(qregNo < 32);
1395 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
1396 stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
1399 /* Read a complete Qreg. */
1400 static IRExpr* getQReg128 ( UInt qregNo )
1402 vassert(qregNo < 32);
1403 return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
1406 /* Produce the IR type for some sub-part of a vector. For 32- and 64-
1407 bit sub-parts we can choose either integer or float types, and
1408 choose float on the basis that that is the common use case and so
1409 will give least interference with Put-to-Get forwarding later
1410 on. */
1411 static IRType preferredVectorSubTypeFromSize ( UInt szB )
1413 switch (szB) {
1414 case 1: return Ity_I8;
1415 case 2: return Ity_I16;
1416 case 4: return Ity_I32; //Ity_F32;
1417 case 8: return Ity_F64;
1418 case 16: return Ity_V128;
1419 default: vassert(0);
1423 /* Find the offset of the laneNo'th lane of type laneTy in the given
1424 Qreg. Since the host is little-endian, the least significant lane
1425 has the lowest offset. */
1426 static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
1428 vassert(host_endness == VexEndnessLE);
1429 Int base = offsetQReg128(qregNo);
1430 /* Since the host is little-endian, the least significant lane
1431 will be at the lowest address. */
1432 /* Restrict this to known types, so as to avoid silently accepting
1433 stupid types. */
1434 UInt laneSzB = 0;
1435 switch (laneTy) {
1436 case Ity_I8: laneSzB = 1; break;
1437 case Ity_F16: case Ity_I16: laneSzB = 2; break;
1438 case Ity_F32: case Ity_I32: laneSzB = 4; break;
1439 case Ity_F64: case Ity_I64: laneSzB = 8; break;
1440 case Ity_V128: laneSzB = 16; break;
1441 default: break;
1443 vassert(laneSzB > 0);
1444 UInt minOff = laneNo * laneSzB;
1445 UInt maxOff = minOff + laneSzB - 1;
1446 vassert(maxOff < 16);
1447 return base + minOff;
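/* Editorial example: offsetQRegLane(5, Ity_I32, 2) is offsetQReg128(5) + 8,
   i.e. the third 32-bit lane of Q5, since lanes are laid out from the low
   offset upwards on a little-endian host. */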
1450 /* Put to the least significant lane of a Qreg. */
1451 static void putQRegLO ( UInt qregNo, IRExpr* e )
1453 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1454 Int off = offsetQRegLane(qregNo, ty, 0);
1455 switch (ty) {
1456 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
1457 case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
1458 break;
1459 default:
1460 vassert(0); // Other cases are probably invalid
1462 stmt(IRStmt_Put(off, e));
1465 /* Get from the least significant lane of a Qreg. */
1466 static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
1468 Int off = offsetQRegLane(qregNo, ty, 0);
1469 switch (ty) {
1470 case Ity_I8:
1471 case Ity_F16: case Ity_I16:
1472 case Ity_I32: case Ity_I64:
1473 case Ity_F32: case Ity_F64: case Ity_V128:
1474 break;
1475 default:
1476 vassert(0); // Other cases are ATC
1478 return IRExpr_Get(off, ty);
1481 static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
1483 static const HChar* namesQ[32]
1484 = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1485 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
1486 "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
1487 "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
1488 static const HChar* namesD[32]
1489 = { "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1490 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
1491 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1492 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
1493 static const HChar* namesS[32]
1494 = { "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
1495 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
1496 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
1497 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
1498 static const HChar* namesH[32]
1499 = { "h0", "h1", "h2", "h3", "h4", "h5", "h6", "h7",
1500 "h8", "h9", "h10", "h11", "h12", "h13", "h14", "h15",
1501 "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
1502 "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
1503 static const HChar* namesB[32]
1504 = { "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7",
1505 "b8", "b9", "b10", "b11", "b12", "b13", "b14", "b15",
1506 "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
1507 "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
1508 vassert(qregNo < 32);
1509 switch (sizeofIRType(laneTy)) {
1510 case 1: return namesB[qregNo];
1511 case 2: return namesH[qregNo];
1512 case 4: return namesS[qregNo];
1513 case 8: return namesD[qregNo];
1514 case 16: return namesQ[qregNo];
1515 default: vassert(0);
1517 /*NOTREACHED*/
1520 static const HChar* nameQReg128 ( UInt qregNo )
1522 return nameQRegLO(qregNo, Ity_V128);
1525 /* Find the offset of the most significant half (8 bytes) of the given
1526 Qreg. This requires knowing the endianness of the host. */
1527 static Int offsetQRegHI64 ( UInt qregNo )
1529 return offsetQRegLane(qregNo, Ity_I64, 1);
1532 static IRExpr* getQRegHI64 ( UInt qregNo )
1534 return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
1537 static void putQRegHI64 ( UInt qregNo, IRExpr* e )
1539 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1540 Int off = offsetQRegHI64(qregNo);
1541 switch (ty) {
1542 case Ity_I64: case Ity_F64:
1543 break;
1544 default:
1545 vassert(0); // Other cases are plain wrong
1547 stmt(IRStmt_Put(off, e));
1550 /* Put to a specified lane of a Qreg. */
1551 static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
1553 IRType laneTy = typeOfIRExpr(irsb->tyenv, e);
1554 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1555 switch (laneTy) {
1556 case Ity_F64: case Ity_I64:
1557 case Ity_I32: case Ity_F32:
1558 case Ity_I16: case Ity_F16:
1559 case Ity_I8:
1560 break;
1561 default:
1562 vassert(0); // Other cases are ATC
1564 stmt(IRStmt_Put(off, e));
1567 /* Get from a specified lane of a Qreg. */
1568 static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
1570 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1571 switch (laneTy) {
1572 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
1573 case Ity_F64: case Ity_F32: case Ity_F16:
1574 break;
1575 default:
1576 vassert(0); // Other cases are ATC
1578 return IRExpr_Get(off, laneTy);
1582 //ZZ /* ---------------- Misc registers ---------------- */
1583 //ZZ
1584 //ZZ static void putMiscReg32 ( UInt gsoffset,
1585 //ZZ IRExpr* e, /* :: Ity_I32 */
1586 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
1587 //ZZ {
1588 //ZZ switch (gsoffset) {
1589 //ZZ case OFFB_FPSCR: break;
1590 //ZZ case OFFB_QFLAG32: break;
1591 //ZZ case OFFB_GEFLAG0: break;
1592 //ZZ case OFFB_GEFLAG1: break;
1593 //ZZ case OFFB_GEFLAG2: break;
1594 //ZZ case OFFB_GEFLAG3: break;
1595 //ZZ default: vassert(0); /* awaiting more cases */
1596 //ZZ }
1597 //ZZ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1598 //ZZ
1599 //ZZ if (guardT == IRTemp_INVALID) {
1600 //ZZ /* unconditional write */
1601 //ZZ stmt(IRStmt_Put(gsoffset, e));
1602 //ZZ } else {
1603 //ZZ stmt(IRStmt_Put(
1604 //ZZ gsoffset,
1605 //ZZ IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
1606 //ZZ e, IRExpr_Get(gsoffset, Ity_I32) )
1607 //ZZ ));
1608 //ZZ }
1609 //ZZ }
1610 //ZZ
1611 //ZZ static IRTemp get_ITSTATE ( void )
1612 //ZZ {
1613 //ZZ ASSERT_IS_THUMB;
1614 //ZZ IRTemp t = newTemp(Ity_I32);
1615 //ZZ assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
1616 //ZZ return t;
1617 //ZZ }
1618 //ZZ
1619 //ZZ static void put_ITSTATE ( IRTemp t )
1620 //ZZ {
1621 //ZZ ASSERT_IS_THUMB;
1622 //ZZ stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
1623 //ZZ }
1624 //ZZ
1625 //ZZ static IRTemp get_QFLAG32 ( void )
1626 //ZZ {
1627 //ZZ IRTemp t = newTemp(Ity_I32);
1628 //ZZ assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
1629 //ZZ return t;
1630 //ZZ }
1631 //ZZ
1632 //ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
1633 //ZZ {
1634 //ZZ putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
1635 //ZZ }
1636 //ZZ
1637 //ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
1638 //ZZ Status Register) to indicate that overflow or saturation occurred.
1639 //ZZ Nb: t must be zero to denote no saturation, and any nonzero
1640 //ZZ value to indicate saturation. */
1641 //ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
1642 //ZZ {
1643 //ZZ IRTemp old = get_QFLAG32();
1644 //ZZ IRTemp nyu = newTemp(Ity_I32);
1645 //ZZ assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
1646 //ZZ put_QFLAG32(nyu, condT);
1647 //ZZ }
1650 /* ---------------- FPCR stuff ---------------- */
1652 /* Generate IR to get hold of the rounding mode bits in FPCR, and
1653 convert them to IR format. Bind the final result to the
1654 returned temp. */
1655 static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
1657 /* The ARMvfp encoding for rounding mode bits is:
1658 00 to nearest
1659 01 to +infinity
1660 10 to -infinity
1661 11 to zero
1662 We need to convert that to the IR encoding:
1663 00 to nearest (the default)
1664 10 to +infinity
1665 01 to -infinity
1666 11 to zero
1667 Which can be done by swapping bits 0 and 1.
1668       The rmode bits are at 23:22 in FPCR.
1670 IRTemp armEncd = newTemp(Ity_I32);
1671 IRTemp swapped = newTemp(Ity_I32);
1672 /* Fish FPCR[23:22] out, and slide to bottom. Doesn't matter that
1673 we don't zero out bits 24 and above, since the assignment to
1674 'swapped' will mask them out anyway. */
1675 assign(armEncd,
1676 binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
1677 /* Now swap them. */
1678 assign(swapped,
1679 binop(Iop_Or32,
1680 binop(Iop_And32,
1681 binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
1682 mkU32(2)),
1683 binop(Iop_And32,
1684 binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
1685 mkU32(1))
1687 return swapped;
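/* Editorial example, following the table above: if FPCR[23:22] == 01
   (ARM "to +infinity"), then armEncd's low two bits are 01 and 'swapped'
   becomes 10, which is the IR encoding for rounding to +infinity. */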
1691 /*------------------------------------------------------------*/
1692 /*--- Helpers for flag handling and conditional insns ---*/
1693 /*------------------------------------------------------------*/
1695 static const HChar* nameARM64Condcode ( ARM64Condcode cond )
1697 switch (cond) {
1698 case ARM64CondEQ: return "eq";
1699 case ARM64CondNE: return "ne";
1700 case ARM64CondCS: return "cs"; // or 'hs'
1701 case ARM64CondCC: return "cc"; // or 'lo'
1702 case ARM64CondMI: return "mi";
1703 case ARM64CondPL: return "pl";
1704 case ARM64CondVS: return "vs";
1705 case ARM64CondVC: return "vc";
1706 case ARM64CondHI: return "hi";
1707 case ARM64CondLS: return "ls";
1708 case ARM64CondGE: return "ge";
1709 case ARM64CondLT: return "lt";
1710 case ARM64CondGT: return "gt";
1711 case ARM64CondLE: return "le";
1712 case ARM64CondAL: return "al";
1713 case ARM64CondNV: return "nv";
1714 default: vpanic("name_ARM64Condcode");
1718 /* and a handy shorthand for it */
1719 static const HChar* nameCC ( ARM64Condcode cond ) {
1720 return nameARM64Condcode(cond);
1724 /* Build IR to calculate some particular condition from stored
1725 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1726 Ity_I64, suitable for narrowing. Although the return type is
1727 Ity_I64, the returned value is either 0 or 1. 'cond' must be
1728 :: Ity_I64 and must denote the condition to compute in
1729 bits 7:4, and be zero everywhere else.
1731 static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
1733 vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
1734 /* And 'cond' had better produce a value in which only bits 7:4 are
1735 nonzero. However, obviously we can't assert for that. */
1737 /* So what we're constructing for the first argument is
1738 "(cond << 4) | stored-operation".
1739 However, as per comments above, 'cond' must be supplied
1740 pre-shifted to this function.
1742 This pairing scheme requires that the ARM64_CC_OP_ values all fit
1743 in 4 bits. Hence we are passing a (COND, OP) pair in the lowest
1744 8 bits of the first argument. */
1745 IRExpr** args
1746 = mkIRExprVec_4(
1747 binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
1748 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1749 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1750 IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
1752 IRExpr* call
1753 = mkIRExprCCall(
1754 Ity_I64,
1755 0/*regparm*/,
1756 "arm64g_calculate_condition", &arm64g_calculate_condition,
1757 args
1760 /* Exclude the requested condition, OP and NDEP from definedness
1761 checking. We're only interested in DEP1 and DEP2. */
1762 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1763 return call;
1767 /* Build IR to calculate some particular condition from stored
1768 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1769 Ity_I64, suitable for narrowing. Although the return type is
1770 Ity_I64, the returned value is either 0 or 1.
1772 static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
1774    /* First arg is "(cond << 4) | stored cc_op".  This requires that the
1775 ARM64_CC_OP_ values all fit in 4 bits. Hence we are passing a
1776 (COND, OP) pair in the lowest 8 bits of the first argument. */
1777 vassert(cond >= 0 && cond <= 15);
1778 return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
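/* Editorial example: mk_arm64g_calculate_condition(ARM64CondNE) passes
   mkU64(ARM64CondNE << 4) to the _dyn variant above, so the helper call
   receives (NE, current CC_OP) packed into the low 8 bits of its first
   argument and returns 0 or 1 accordingly. */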
1782 /* Build IR to calculate just the carry flag from stored
1783 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1784 Ity_I64. */
1785 static IRExpr* mk_arm64g_calculate_flag_c ( void )
1787 IRExpr** args
1788 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1789 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1790 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1791 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1792 IRExpr* call
1793 = mkIRExprCCall(
1794 Ity_I64,
1795 0/*regparm*/,
1796 "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
1797 args
1799 /* Exclude OP and NDEP from definedness checking. We're only
1800 interested in DEP1 and DEP2. */
1801 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1802 return call;
1806 //ZZ /* Build IR to calculate just the overflow flag from stored
1807 //ZZ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1808 //ZZ Ity_I32. */
1809 //ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
1810 //ZZ {
1811 //ZZ IRExpr** args
1812 //ZZ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
1813 //ZZ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
1814 //ZZ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
1815 //ZZ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
1816 //ZZ IRExpr* call
1817 //ZZ = mkIRExprCCall(
1818 //ZZ Ity_I32,
1819 //ZZ 0/*regparm*/,
1820 //ZZ "armg_calculate_flag_v", &armg_calculate_flag_v,
1821 //ZZ args
1822 //ZZ );
1823 //ZZ /* Exclude OP and NDEP from definedness checking. We're only
1824 //ZZ interested in DEP1 and DEP2. */
1825 //ZZ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1826 //ZZ return call;
1827 //ZZ }
1830 /* Build IR to calculate N Z C V in bits 31:28 of the
1831 returned word. */
1832 static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
1834 IRExpr** args
1835 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1836 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1837 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1838 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1839 IRExpr* call
1840 = mkIRExprCCall(
1841 Ity_I64,
1842 0/*regparm*/,
1843 "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
1844 args
1846 /* Exclude OP and NDEP from definedness checking. We're only
1847 interested in DEP1 and DEP2. */
1848 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1849 return call;
1853 /* Build IR to set the flags thunk, in the most general case. */
1854 static
1855 void setFlags_D1_D2_ND ( UInt cc_op,
1856 IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
1858    vassert(typeOfIRTemp(irsb->tyenv, t_dep1) == Ity_I64);
1859    vassert(typeOfIRTemp(irsb->tyenv, t_dep2) == Ity_I64);
1860    vassert(typeOfIRTemp(irsb->tyenv, t_ndep) == Ity_I64);
1861 vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
1862 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(cc_op) ));
1863 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
1864 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
1865 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
1868 /* Build IR to set the flags thunk after ADD or SUB. */
1869 static
1870 void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
1872 IRTemp argL64 = IRTemp_INVALID;
1873 IRTemp argR64 = IRTemp_INVALID;
1874 IRTemp z64 = newTemp(Ity_I64);
1875 if (is64) {
1876 argL64 = argL;
1877 argR64 = argR;
1878 } else {
1879 argL64 = newTemp(Ity_I64);
1880 argR64 = newTemp(Ity_I64);
1881 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1882 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1884 assign(z64, mkU64(0));
1885 UInt cc_op = ARM64G_CC_OP_NUMBER;
1886 /**/ if ( isSUB && is64) { cc_op = ARM64G_CC_OP_SUB64; }
1887 else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
1888 else if (!isSUB && is64) { cc_op = ARM64G_CC_OP_ADD64; }
1889 else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
1890 else { vassert(0); }
1891 setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
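/* Illustrative (hypothetical) caller, not part of the original file: a
   decoder for a 32-bit SUBS, having bound the operands to Ity_I32 temps
   argL and argR, would set the thunk with
      setFlags_ADD_SUB(False, True, argL, argR);   (is64=False, isSUB=True)
   which zero-extends both operands to 64 bits and selects
   ARM64G_CC_OP_SUB32. */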
1894 /* Build IR to set the flags thunk after ADC or SBC. */
1895 static
1896 void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
1897                         IRTemp argL, IRTemp argR, IRTemp oldC )
1898 {
1899 IRTemp argL64 = IRTemp_INVALID;
1900 IRTemp argR64 = IRTemp_INVALID;
1901 IRTemp oldC64 = IRTemp_INVALID;
1902 if (is64) {
1903 argL64 = argL;
1904 argR64 = argR;
1905 oldC64 = oldC;
1906 } else {
1907 argL64 = newTemp(Ity_I64);
1908 argR64 = newTemp(Ity_I64);
1909 oldC64 = newTemp(Ity_I64);
1910 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1911 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1912       assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
1913    }
1914 UInt cc_op = ARM64G_CC_OP_NUMBER;
1915 /**/ if ( isSBC && is64) { cc_op = ARM64G_CC_OP_SBC64; }
1916 else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
1917 else if (!isSBC && is64) { cc_op = ARM64G_CC_OP_ADC64; }
1918 else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
1919 else { vassert(0); }
1920    setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
1921 }
1923 /* Build IR to set the flags thunk after ADD or SUB, if the given
1924 condition evaluates to True at run time. If not, the flags are set
1925 to the specified NZCV value. */
1926 static
1927 void setFlags_ADD_SUB_conditionally (
1928 Bool is64, Bool isSUB,
1929                IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
1930             )
1931 {
1932 /* Generate IR as follows:
1933 CC_OP = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
1934 CC_DEP1 = ITE(cond, argL64, nzcv << 28)
1935 CC_DEP2 = ITE(cond, argR64, 0)
1936         CC_NDEP = 0
1937    */
1939 IRTemp z64 = newTemp(Ity_I64);
1940 assign(z64, mkU64(0));
1942 /* Establish the operation and operands for the True case. */
1943 IRTemp t_dep1 = IRTemp_INVALID;
1944 IRTemp t_dep2 = IRTemp_INVALID;
1945 UInt t_op = ARM64G_CC_OP_NUMBER;
1946 /**/ if ( isSUB && is64) { t_op = ARM64G_CC_OP_SUB64; }
1947 else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
1948 else if (!isSUB && is64) { t_op = ARM64G_CC_OP_ADD64; }
1949 else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
1950 else { vassert(0); }
1951 /* */
1952 if (is64) {
1953 t_dep1 = argL;
1954 t_dep2 = argR;
1955 } else {
1956 t_dep1 = newTemp(Ity_I64);
1957 t_dep2 = newTemp(Ity_I64);
1958 assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
1959       assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
1960    }
1962 /* Establish the operation and operands for the False case. */
1963 IRTemp f_dep1 = newTemp(Ity_I64);
1964 IRTemp f_dep2 = z64;
1965 UInt f_op = ARM64G_CC_OP_COPY;
1966 assign(f_dep1, mkU64(nzcv << 28));
1968 /* Final thunk values */
1969 IRTemp dep1 = newTemp(Ity_I64);
1970 IRTemp dep2 = newTemp(Ity_I64);
1971 IRTemp op = newTemp(Ity_I64);
1973 assign(op, IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
1974 assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
1975 assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));
1977 /* finally .. */
1978 stmt( IRStmt_Put( OFFB_CC_OP, mkexpr(op) ));
1979 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
1980 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
1981    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) ));
1982 }
1984 /* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
1985 static
1986 void setFlags_LOGIC ( Bool is64, IRTemp res )
1987 {
1988 IRTemp res64 = IRTemp_INVALID;
1989 IRTemp z64 = newTemp(Ity_I64);
1990 UInt cc_op = ARM64G_CC_OP_NUMBER;
1991 if (is64) {
1992 res64 = res;
1993 cc_op = ARM64G_CC_OP_LOGIC64;
1994 } else {
1995 res64 = newTemp(Ity_I64);
1996 assign(res64, unop(Iop_32Uto64, mkexpr(res)));
1997       cc_op = ARM64G_CC_OP_LOGIC32;
1998    }
1999 assign(z64, mkU64(0));
2000    setFlags_D1_D2_ND(cc_op, res64, z64, z64);
2001 }
2003 /* Build IR to set the flags thunk to a given NZCV value. NZCV is
2004 located in bits 31:28 of the supplied value. */
2005 static
2006 void setFlags_COPY ( IRTemp nzcv_28x0 )
2007 {
2008 IRTemp z64 = newTemp(Ity_I64);
2009 assign(z64, mkU64(0));
2010    setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
2011 }
2014 //ZZ /* Minor variant of the above that sets NDEP to zero (if it
2015 //ZZ sets it at all) */
2016 //ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
2017 //ZZ IRTemp t_dep2,
2018 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2019 //ZZ {
2020 //ZZ IRTemp z32 = newTemp(Ity_I32);
2021 //ZZ assign( z32, mkU32(0) );
2022 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
2023 //ZZ }
2024 //ZZ
2025 //ZZ
2026 //ZZ /* Minor variant of the above that sets DEP2 to zero (if it
2027 //ZZ sets it at all) */
2028 //ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
2029 //ZZ IRTemp t_ndep,
2030 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2031 //ZZ {
2032 //ZZ IRTemp z32 = newTemp(Ity_I32);
2033 //ZZ assign( z32, mkU32(0) );
2034 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
2035 //ZZ }
2036 //ZZ
2037 //ZZ
2038 //ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
2039 //ZZ sets them at all) */
2040 //ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
2041 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2042 //ZZ {
2043 //ZZ IRTemp z32 = newTemp(Ity_I32);
2044 //ZZ assign( z32, mkU32(0) );
2045 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
2046 //ZZ }
2049 /*------------------------------------------------------------*/
2050 /*--- Misc math helpers ---*/
2051 /*------------------------------------------------------------*/
2053 /* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
2054 static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
2055 {
2056 IRTemp maskT = newTemp(Ity_I64);
2057 IRTemp res = newTemp(Ity_I64);
2058 vassert(sh >= 1 && sh <= 63);
2059 assign(maskT, mkU64(mask));
2060 assign( res,
2061 binop(Iop_Or64,
2062 binop(Iop_Shr64,
2063 binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
2064 mkU8(sh)),
2065 binop(Iop_And64,
2066 binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
2067                        mkexpr(maskT))
2068                )
2069          );
2070    return res;
2071 }
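/* For example, with x holding bytes b7 b6 b5 b4 b3 b2 b1 b0 (b7 most
   significant), math_SWAPHELPER(x, 0xFF00FF00FF00FF00ULL, 8) produces
   b6 b7 b4 b5 b2 b3 b0 b1, that is, each adjacent byte pair swapped.
   The swap generators below are built by composing such steps. */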
2073 /* Generates byte swaps within 32-bit lanes. */
2074 static IRTemp math_UINTSWAP64 ( IRTemp src )
2076 IRTemp res;
2077 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2078 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2079 return res;
2082 /* Generates byte swaps within 16-bit lanes. */
2083 static IRTemp math_USHORTSWAP64 ( IRTemp src )
2085 IRTemp res;
2086 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2087 return res;
2090 /* Generates a 64-bit byte swap. */
2091 static IRTemp math_BYTESWAP64 ( IRTemp src )
2093 IRTemp res;
2094 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2095 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2096 res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
2097 return res;
2100 /* Generates a 64-bit bit swap. */
2101 static IRTemp math_BITSWAP64 ( IRTemp src )
2103 IRTemp res;
2104 res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
2105 res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
2106 res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
2107 return math_BYTESWAP64(res);
2110 /* Duplicates the bits at the bottom of the given word to fill the
2111 whole word. src :: Ity_I64 is assumed to have zeroes everywhere
2112 except for the bottom bits. */
2113 static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
2115 if (srcTy == Ity_I8) {
2116 IRTemp t16 = newTemp(Ity_I64);
2117 assign(t16, binop(Iop_Or64, mkexpr(src),
2118 binop(Iop_Shl64, mkexpr(src), mkU8(8))));
2119 IRTemp t32 = newTemp(Ity_I64);
2120 assign(t32, binop(Iop_Or64, mkexpr(t16),
2121 binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
2122 IRTemp t64 = newTemp(Ity_I64);
2123 assign(t64, binop(Iop_Or64, mkexpr(t32),
2124 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2125 return t64;
2127 if (srcTy == Ity_I16) {
2128 IRTemp t32 = newTemp(Ity_I64);
2129 assign(t32, binop(Iop_Or64, mkexpr(src),
2130 binop(Iop_Shl64, mkexpr(src), mkU8(16))));
2131 IRTemp t64 = newTemp(Ity_I64);
2132 assign(t64, binop(Iop_Or64, mkexpr(t32),
2133 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2134 return t64;
2136 if (srcTy == Ity_I32) {
2137 IRTemp t64 = newTemp(Ity_I64);
2138 assign(t64, binop(Iop_Or64, mkexpr(src),
2139 binop(Iop_Shl64, mkexpr(src), mkU8(32))));
2140 return t64;
2142 if (srcTy == Ity_I64) {
2143       return src;
2144    }
2145    vassert(0);
2146 }
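/* For example, with srcTy == Ity_I8 and src == 0x00000000000000AB, the
   Or/Shl chain above gives t16 = 0x000000000000ABAB,
   t32 = 0x00000000ABABABAB and t64 = 0xABABABABABABABAB, as required,
   provided the bits above the bottom lane really are zero. */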
2149 /* Duplicates the src element exactly so as to fill a V128 value. */
2150 static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
2152 IRTemp res = newTempV128();
2153 if (srcTy == Ity_F64) {
2154 IRTemp i64 = newTemp(Ity_I64);
2155 assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
2156 assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
2157 return res;
2159 if (srcTy == Ity_F32) {
2160 IRTemp i64a = newTemp(Ity_I64);
2161 assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
2162 IRTemp i64b = newTemp(Ity_I64);
2163 assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
2164 mkexpr(i64a)));
2165 assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
2166 return res;
2168 if (srcTy == Ity_I64) {
2169 assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
2170 return res;
2172 if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
2173 IRTemp t1 = newTemp(Ity_I64);
2174 assign(t1, widenUto64(srcTy, mkexpr(src)));
2175 IRTemp t2 = math_DUP_TO_64(t1, srcTy);
2176 assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
2177 return res;
2179 vassert(0);
2183 /* |fullWidth| is a full V128 width result. Depending on bitQ,
2184 zero out the upper half. */
2185 static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
2187 if (bitQ == 1) return mkexpr(fullWidth);
2188 if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
2189 vassert(0);
2192 /* The same, but from an expression instead. */
2193 static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
2195 IRTemp fullWidthT = newTempV128();
2196 assign(fullWidthT, fullWidth);
2197 return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
2201 /*------------------------------------------------------------*/
2202 /*--- FP comparison helpers ---*/
2203 /*------------------------------------------------------------*/
2205 /* irRes :: Ity_I32 holds a floating point comparison result encoded
2206 as an IRCmpF64Result. Generate code to convert it to an
2207 ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
2208 Assign a new temp to hold that value, and return the temp. */
2209 static
2210 IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
2212 IRTemp ix = newTemp(Ity_I64);
2213 IRTemp termL = newTemp(Ity_I64);
2214 IRTemp termR = newTemp(Ity_I64);
2215 IRTemp nzcv = newTemp(Ity_I64);
2216 IRTemp irRes = newTemp(Ity_I64);
2218 /* This is where the fun starts. We have to convert 'irRes' from
2219 an IR-convention return result (IRCmpF64Result) to an
2220 ARM-encoded (N,Z,C,V) group. The final result is in the bottom
2221 4 bits of 'nzcv'. */
2222    /* Map compare result from IR to ARM(nzcv) */
2223    /*
2224       FP cmp result | IR   | ARM(nzcv)
2225       --------------------------------
2226          UN           0x45   0011
2227          LT           0x01   1000
2228          GT           0x00   0010
2229          EQ           0x40   0110
2230    */
2231 /* Now since you're probably wondering WTF ..
2233 ix fishes the useful bits out of the IR value, bits 6 and 0, and
2234 places them side by side, giving a number which is 0, 1, 2 or 3.
2236 termL is a sequence cooked up by GNU superopt. It converts ix
2237 into an almost correct value NZCV value (incredibly), except
2238 for the case of UN, where it produces 0100 instead of the
2239 required 0011.
2241 termR is therefore a correction term, also computed from ix. It
2242      is 1 in the UN case and 0 for LT, GT and EQ.  Hence, to get
2243 the final correct value, we subtract termR from termL.
2245 Don't take my word for it. There's a test program at the bottom
2246      of guest_arm_toIR.c, to try this out with.
2247    */
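   /* Worked example, following the expressions below (ix, termL and
      the final value shown in binary):

         FP cmp result | irRes | ix | termL | termR | termL - termR
         -----------------------------------------------------------
             UN           0x45    3   0100     1       0011
             LT           0x01    1   1000     0       1000
             GT           0x00    0   0010     0       0010
             EQ           0x40    2   0110     0       0110
   */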
2248 assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));
2250    assign(
2251       ix,
2252 binop(Iop_Or64,
2253 binop(Iop_And64,
2254 binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
2255 mkU64(3)),
2256 binop(Iop_And64, mkexpr(irRes), mkU64(1))));
2258 assign(
2259 termL,
2260 binop(Iop_Add64,
2261 binop(Iop_Shr64,
2262 binop(Iop_Sub64,
2263 binop(Iop_Shl64,
2264 binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
2265 mkU8(62)),
2266 mkU64(1)),
2267 mkU8(61)),
2268 mkU64(1)));
2270 assign(
2271 termR,
2272 binop(Iop_And64,
2273 binop(Iop_And64,
2274 mkexpr(ix),
2275 binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
2276 mkU64(1)));
2278 assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
2279 return nzcv;
2283 /*------------------------------------------------------------*/
2284 /*--- Data processing (immediate) ---*/
2285 /*------------------------------------------------------------*/
2287 /* Helper functions for supporting "DecodeBitMasks" */
2289 static ULong dbm_ROR ( Int width, ULong x, Int rot )
2291 vassert(width > 0 && width <= 64);
2292 vassert(rot >= 0 && rot < width);
2293 if (rot == 0) return x;
2294 ULong res = x >> rot;
2295 res |= (x << (width - rot));
2296 if (width < 64)
2297 res &= ((1ULL << width) - 1);
2298 return res;
2301 static ULong dbm_RepTo64( Int esize, ULong x )
2303 switch (esize) {
2304 case 64:
2305 return x;
2306 case 32:
2307 x &= 0xFFFFFFFF; x |= (x << 32);
2308 return x;
2309 case 16:
2310 x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
2311 return x;
2312 case 8:
2313 x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
2314 return x;
2315 case 4:
2316 x &= 0xF; x |= (x << 4); x |= (x << 8);
2317 x |= (x << 16); x |= (x << 32);
2318 return x;
2319 case 2:
2320 x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
2321 x |= (x << 16); x |= (x << 32);
2322 return x;
2323 default:
2324 break;
2326 vpanic("dbm_RepTo64");
2327 /*NOTREACHED*/
2328 return 0;
2331 static Int dbm_highestSetBit ( ULong x )
2333 Int i;
2334 for (i = 63; i >= 0; i--) {
2335 if (x & (1ULL << i))
2336 return i;
2338 vassert(x == 0);
2339 return -1;
2342 static
2343 Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
2344 ULong immN, ULong imms, ULong immr, Bool immediate,
2345 UInt M /*32 or 64*/)
2347 vassert(immN < (1ULL << 1));
2348 vassert(imms < (1ULL << 6));
2349 vassert(immr < (1ULL << 6));
2350 vassert(immediate == False || immediate == True);
2351 vassert(M == 32 || M == 64);
2353 Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
2354 if (len < 1) { /* printf("fail1\n"); */ return False; }
2355 vassert(len <= 6);
2356 vassert(M >= (1 << len));
2358 vassert(len >= 1 && len <= 6);
2359 ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
2360 (1 << len) - 1;
2361 vassert(levels >= 1 && levels <= 63);
2363 if (immediate && ((imms & levels) == levels)) {
2364 /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
2365 return False;
2368 ULong S = imms & levels;
2369 ULong R = immr & levels;
2370 Int diff = S - R;
2371 diff &= 63;
2372 Int esize = 1 << len;
2373 vassert(2 <= esize && esize <= 64);
2375 /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
2376 same below with d. S can be 63 in which case we have an out of
2377 range and hence undefined shift. */
2378 vassert(S >= 0 && S <= 63);
2379 vassert(esize >= (S+1));
2380 ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
2381 //(1ULL << (S+1)) - 1;
2382 ((1ULL << S) - 1) + (1ULL << S);
2384 Int d = // diff<len-1:0>
2385 diff & ((1 << len)-1);
2386 vassert(esize >= (d+1));
2387 vassert(d >= 0 && d <= 63);
2389 ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
2390 //(1ULL << (d+1)) - 1;
2391 ((1ULL << d) - 1) + (1ULL << d);
2393 if (esize != 64) vassert(elem_s < (1ULL << esize));
2394 if (esize != 64) vassert(elem_d < (1ULL << esize));
2396 if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
2397 if (tmask) *tmask = dbm_RepTo64(esize, elem_d);
2399    return True;
2400 }
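/* Two worked examples, checked against the code above:

   * N=1, immr=0, imms=0b000111, M=64 (the encoding of the 64-bit
     logical immediate 0xFF): len = 6, esize = 64, levels = 63,
     S = 7, R = 0, d = 7, elem_s = elem_d = 0xFF, hence
     wmask = tmask = 0x00000000000000FF.

   * N=0, immr=0, imms=0b111100, M=64 (the encoding of
     0x5555555555555555): len = 1, esize = 2, levels = 1,
     S = R = d = 0, elem_s = elem_d = 1, and replication at esize 2
     gives wmask = tmask = 0x5555555555555555.
*/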
2403 static
2404 Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
2405 UInt insn)
2407 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2409 /* insn[28:23]
2410 10000x PC-rel addressing
2411 10001x Add/subtract (immediate)
2412 100100 Logical (immediate)
2413 100101 Move Wide (immediate)
2414 100110 Bitfield
2415 100111 Extract
2418 /* ------------------ ADD/SUB{,S} imm12 ------------------ */
2419 if (INSN(28,24) == BITS5(1,0,0,0,1)) {
2420 Bool is64 = INSN(31,31) == 1;
2421 Bool isSub = INSN(30,30) == 1;
2422 Bool setCC = INSN(29,29) == 1;
2423 UInt sh = INSN(23,22);
2424 UInt uimm12 = INSN(21,10);
2425 UInt nn = INSN(9,5);
2426 UInt dd = INSN(4,0);
2427 const HChar* nm = isSub ? "sub" : "add";
2428 if (sh >= 2) {
2429 /* Invalid; fall through */
2430 } else {
2431 vassert(sh <= 1);
2432 uimm12 <<= (12 * sh);
2433 if (is64) {
2434 IRTemp argL = newTemp(Ity_I64);
2435 IRTemp argR = newTemp(Ity_I64);
2436 IRTemp res = newTemp(Ity_I64);
2437 assign(argL, getIReg64orSP(nn));
2438 assign(argR, mkU64(uimm12));
2439 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
2440 mkexpr(argL), mkexpr(argR)));
2441 if (setCC) {
2442 putIReg64orZR(dd, mkexpr(res));
2443 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
2444 DIP("%ss %s, %s, 0x%x\n",
2445 nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
2446 } else {
2447 putIReg64orSP(dd, mkexpr(res));
2448 DIP("%s %s, %s, 0x%x\n",
2449 nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
2451 } else {
2452 IRTemp argL = newTemp(Ity_I32);
2453 IRTemp argR = newTemp(Ity_I32);
2454 IRTemp res = newTemp(Ity_I32);
2455 assign(argL, getIReg32orSP(nn));
2456 assign(argR, mkU32(uimm12));
2457 assign(res, binop(isSub ? Iop_Sub32 : Iop_Add32,
2458 mkexpr(argL), mkexpr(argR)));
2459 if (setCC) {
2460 putIReg32orZR(dd, mkexpr(res));
2461 setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
2462 DIP("%ss %s, %s, 0x%x\n",
2463 nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
2464 } else {
2465 putIReg32orSP(dd, mkexpr(res));
2466 DIP("%s %s, %s, 0x%x\n",
2467 nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
2470 return True;
2474 /* -------------------- ADR/ADRP -------------------- */
2475 if (INSN(28,24) == BITS5(1,0,0,0,0)) {
2476 UInt bP = INSN(31,31);
2477 UInt immLo = INSN(30,29);
2478 UInt immHi = INSN(23,5);
2479 UInt rD = INSN(4,0);
2480 ULong uimm = (immHi << 2) | immLo;
2481 ULong simm = sx_to_64(uimm, 21);
2482 ULong val;
2483 if (bP) {
2484 val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
2485 } else {
2486 val = guest_PC_curr_instr + simm;
2488 putIReg64orZR(rD, mkU64(val));
2489 DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
2490 return True;
2493 /* -------------------- LOGIC(imm) -------------------- */
2494 if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
2495 /* 31 30 28 22 21 15 9 4
2496 sf op 100100 N immr imms Rn Rd
2497 op=00: AND Rd|SP, Rn, #imm
2498 op=01: ORR Rd|SP, Rn, #imm
2499 op=10: EOR Rd|SP, Rn, #imm
2500 op=11: ANDS Rd|ZR, Rn, #imm
2502 Bool is64 = INSN(31,31) == 1;
2503 UInt op = INSN(30,29);
2504 UInt N = INSN(22,22);
2505 UInt immR = INSN(21,16);
2506 UInt immS = INSN(15,10);
2507 UInt nn = INSN(9,5);
2508 UInt dd = INSN(4,0);
2509 ULong imm = 0;
2510 Bool ok;
2511 if (N == 1 && !is64)
2512 goto after_logic_imm; /* not allowed; fall through */
2513 ok = dbm_DecodeBitMasks(&imm, NULL,
2514 N, immS, immR, True, is64 ? 64 : 32);
2515 if (!ok)
2516 goto after_logic_imm;
2518 const HChar* names[4] = { "and", "orr", "eor", "ands" };
2519 const IROp ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
2520 const IROp ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };
2522 vassert(op < 4);
2523 if (is64) {
2524 IRExpr* argL = getIReg64orZR(nn);
2525 IRExpr* argR = mkU64(imm);
2526 IRTemp res = newTemp(Ity_I64);
2527 assign(res, binop(ops64[op], argL, argR));
2528 if (op < 3) {
2529 putIReg64orSP(dd, mkexpr(res));
2530 DIP("%s %s, %s, 0x%llx\n", names[op],
2531 nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
2532 } else {
2533 putIReg64orZR(dd, mkexpr(res));
2534 setFlags_LOGIC(True/*is64*/, res);
2535 DIP("%s %s, %s, 0x%llx\n", names[op],
2536 nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
2538 } else {
2539 IRExpr* argL = getIReg32orZR(nn);
2540 IRExpr* argR = mkU32((UInt)imm);
2541 IRTemp res = newTemp(Ity_I32);
2542 assign(res, binop(ops32[op], argL, argR));
2543 if (op < 3) {
2544 putIReg32orSP(dd, mkexpr(res));
2545 DIP("%s %s, %s, 0x%x\n", names[op],
2546 nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
2547 } else {
2548 putIReg32orZR(dd, mkexpr(res));
2549 setFlags_LOGIC(False/*!is64*/, res);
2550 DIP("%s %s, %s, 0x%x\n", names[op],
2551 nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
2554 return True;
2556 after_logic_imm:
2558 /* -------------------- MOV{Z,N,K} -------------------- */
2559 if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
2560 /* 31 30 28 22 20 4
2561 | | | | | |
2562 sf 10 100 101 hw imm16 Rd MOV(Z) Rd, (imm16 << (16*hw))
2563 sf 00 100 101 hw imm16 Rd MOV(N) Rd, ~(imm16 << (16*hw))
2564 sf 11 100 101 hw imm16 Rd MOV(K) Rd, (imm16 << (16*hw))
2566 Bool is64 = INSN(31,31) == 1;
2567 UInt subopc = INSN(30,29);
2568 UInt hw = INSN(22,21);
2569 UInt imm16 = INSN(20,5);
2570 UInt dd = INSN(4,0);
2571 if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
2572 /* invalid; fall through */
2573 } else {
2574 ULong imm64 = ((ULong)imm16) << (16 * hw);
2575 if (!is64)
2576 vassert(imm64 < 0x100000000ULL);
2577 switch (subopc) {
2578 case BITS2(1,0): // MOVZ
2579 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2580 DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2581 break;
2582 case BITS2(0,0): // MOVN
2583 imm64 = ~imm64;
2584 if (!is64)
2585 imm64 &= 0xFFFFFFFFULL;
2586 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2587 DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2588 break;
2589 case BITS2(1,1): // MOVK
2590 /* This is more complex. We are inserting a slice into
2591 the destination register, so we need to have the old
2592 value of it. */
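               /* For example, "movk x0, #0xbeef, lsl 16" has hw=1, so
                  imm64 = 0xBEEF0000 and mask = 0xFFFF0000; the new X0
                  is (old X0 & ~0xFFFF0000) | 0xBEEF0000, that is, only
                  bits 31:16 change. */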
2593 if (is64) {
2594 IRTemp old = newTemp(Ity_I64);
2595 assign(old, getIReg64orZR(dd));
2596 ULong mask = 0xFFFFULL << (16 * hw);
2597 IRExpr* res
2598 = binop(Iop_Or64,
2599 binop(Iop_And64, mkexpr(old), mkU64(~mask)),
2600 mkU64(imm64));
2601 putIReg64orZR(dd, res);
2602 DIP("movk %s, 0x%x, lsl %u\n",
2603 nameIReg64orZR(dd), imm16, 16*hw);
2604 } else {
2605 IRTemp old = newTemp(Ity_I32);
2606 assign(old, getIReg32orZR(dd));
2607 vassert(hw <= 1);
2608 UInt mask = ((UInt)0xFFFF) << (16 * hw);
2609 IRExpr* res
2610 = binop(Iop_Or32,
2611 binop(Iop_And32, mkexpr(old), mkU32(~mask)),
2612 mkU32((UInt)imm64));
2613 putIReg32orZR(dd, res);
2614 DIP("movk %s, 0x%x, lsl %u\n",
2615 nameIReg32orZR(dd), imm16, 16*hw);
2617 break;
2618 default:
2619 vassert(0);
2621 return True;
2625 /* -------------------- {U,S,}BFM -------------------- */
2626 /* 30 28 22 21 15 9 4
2628 sf 10 100110 N immr imms nn dd
2629 UBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2630 UBFM Xd, Xn, #immr, #imms when sf=1, N=1
2632 sf 00 100110 N immr imms nn dd
2633 SBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2634 SBFM Xd, Xn, #immr, #imms when sf=1, N=1
2636 sf 01 100110 N immr imms nn dd
2637 BFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2638      BFM Xd, Xn, #immr, #imms   when sf=1, N=1
2639   */
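   /* Worked example: "ubfm x0, x1, #8, #15" (preferred alias
      "ubfx x0, x1, #8, #8") has opc=10, N=1, immr=8, imms=15.
      DecodeBitMasks then gives wmask = 0xFF000000000000FF and
      tmask = 0xFF.  With dst = 0 (inZero) and no extension, the code
      below computes res = (ROR64(X1, 8) & wmask) & tmask, which puts
      bits 15:8 of X1 into bits 7:0 of X0 and zeroes everything else. */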
2640 if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
2641 UInt sf = INSN(31,31);
2642 UInt opc = INSN(30,29);
2643 UInt N = INSN(22,22);
2644 UInt immR = INSN(21,16);
2645 UInt immS = INSN(15,10);
2646 UInt nn = INSN(9,5);
2647 UInt dd = INSN(4,0);
2648 Bool inZero = False;
2649 Bool extend = False;
2650 const HChar* nm = "???";
2651 /* skip invalid combinations */
2652 switch (opc) {
2653 case BITS2(0,0):
2654 inZero = True; extend = True; nm = "sbfm"; break;
2655 case BITS2(0,1):
2656 inZero = False; extend = False; nm = "bfm"; break;
2657 case BITS2(1,0):
2658 inZero = True; extend = False; nm = "ubfm"; break;
2659 case BITS2(1,1):
2660 goto after_bfm; /* invalid */
2661 default:
2662 vassert(0);
2664 if (sf == 1 && N != 1) goto after_bfm;
2665 if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
2666 || ((immS >> 5) & 1) != 0)) goto after_bfm;
2667 ULong wmask = 0, tmask = 0;
2668 Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
2669 N, immS, immR, False, sf == 1 ? 64 : 32);
2670 if (!ok) goto after_bfm; /* hmmm */
2672 Bool is64 = sf == 1;
2673 IRType ty = is64 ? Ity_I64 : Ity_I32;
2675 IRTemp dst = newTemp(ty);
2676 IRTemp src = newTemp(ty);
2677 IRTemp bot = newTemp(ty);
2678 IRTemp top = newTemp(ty);
2679 IRTemp res = newTemp(ty);
2680 assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
2681 assign(src, getIRegOrZR(is64, nn));
2682 /* perform bitfield move on low bits */
2683 assign(bot, binop(mkOR(ty),
2684 binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
2685 binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
2686 mkU(ty, wmask))));
2687 /* determine extension bits (sign, zero or dest register) */
2688 assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
2689 /* combine extension bits and result bits */
2690 assign(res, binop(mkOR(ty),
2691 binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
2692 binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
2693 putIRegOrZR(is64, dd, mkexpr(res));
2694 DIP("%s %s, %s, immR=%u, immS=%u\n",
2695 nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
2696 return True;
2698 after_bfm:
2700 /* ---------------------- EXTR ---------------------- */
2701 /* 30 28 22 20 15 9 4
2702 1 00 100111 10 m imm6 n d EXTR Xd, Xn, Xm, #imm6
2703 0 00 100111 00 m imm6 n d EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
2705 if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
2706 Bool is64 = INSN(31,31) == 1;
2707 UInt mm = INSN(20,16);
2708 UInt imm6 = INSN(15,10);
2709 UInt nn = INSN(9,5);
2710 UInt dd = INSN(4,0);
2711 Bool valid = True;
2712 if (INSN(31,31) != INSN(22,22))
2713 valid = False;
2714 if (!is64 && imm6 >= 32)
2715 valid = False;
2716 if (!valid) goto after_extr;
2717 IRType ty = is64 ? Ity_I64 : Ity_I32;
2718 IRTemp srcHi = newTemp(ty);
2719 IRTemp srcLo = newTemp(ty);
2720 IRTemp res = newTemp(ty);
2721 assign(srcHi, getIRegOrZR(is64, nn));
2722 assign(srcLo, getIRegOrZR(is64, mm));
2723 if (imm6 == 0) {
2724 assign(res, mkexpr(srcLo));
2725 } else {
2726 UInt szBits = 8 * sizeofIRType(ty);
2727 vassert(imm6 > 0 && imm6 < szBits);
2728 assign(res, binop(mkOR(ty),
2729 binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
2730 binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
2732 putIRegOrZR(is64, dd, mkexpr(res));
2733 DIP("extr %s, %s, %s, #%u\n",
2734 nameIRegOrZR(is64,dd),
2735 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
2736 return True;
2738 after_extr:
2740 vex_printf("ARM64 front end: data_processing_immediate\n");
2741 return False;
2742 # undef INSN
2746 /*------------------------------------------------------------*/
2747 /*--- Data processing (register) instructions ---*/
2748 /*------------------------------------------------------------*/
2750 static const HChar* nameSH ( UInt sh ) {
2751 switch (sh) {
2752 case 0: return "lsl";
2753 case 1: return "lsr";
2754 case 2: return "asr";
2755 case 3: return "ror";
2756 default: vassert(0);
2760 /* Generate IR to get a register value, possibly shifted by an
2761 immediate. Returns either a 32- or 64-bit temporary holding the
2762 result. After the shift, the value can optionally be NOT-ed
2763 too.
2765 sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR. sh_amt may only be
2766 in the range 0 to (is64 ? 64 : 32)-1. For some instructions, ROR
2767      isn't allowed, but it's the job of the caller to check that.
2768   */
2769 static IRTemp getShiftedIRegOrZR ( Bool is64,
2770 UInt sh_how, UInt sh_amt, UInt regNo,
2771 Bool invert )
2773 vassert(sh_how < 4);
2774 vassert(sh_amt < (is64 ? 64 : 32));
2775 IRType ty = is64 ? Ity_I64 : Ity_I32;
2776 IRTemp t0 = newTemp(ty);
2777 assign(t0, getIRegOrZR(is64, regNo));
2778 IRTemp t1 = newTemp(ty);
2779 switch (sh_how) {
2780 case BITS2(0,0):
2781 assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
2782 break;
2783 case BITS2(0,1):
2784 assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
2785 break;
2786 case BITS2(1,0):
2787 assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
2788 break;
2789 case BITS2(1,1):
2790 assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
2791 break;
2792 default:
2793 vassert(0);
2795 if (invert) {
2796 IRTemp t2 = newTemp(ty);
2797 assign(t2, unop(mkNOT(ty), mkexpr(t1)));
2798 return t2;
2799 } else {
2800 return t1;
2805 static
2806 Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
2807 UInt insn)
2809 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2811 /* ------------------- ADD/SUB(reg) ------------------- */
2812 /* x==0 => 32 bit op x==1 => 64 bit op
2813 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)
2815 31 30 29 28 23 21 20 15 9 4
2816 | | | | | | | | | |
2817 x 0 0 01011 sh 0 Rm imm6 Rn Rd ADD Rd,Rn, sh(Rm,imm6)
2818 x 0 1 01011 sh 0 Rm imm6 Rn Rd ADDS Rd,Rn, sh(Rm,imm6)
2819 x 1 0 01011 sh 0 Rm imm6 Rn Rd SUB Rd,Rn, sh(Rm,imm6)
2820 x 1 1 01011 sh 0 Rm imm6 Rn Rd SUBS Rd,Rn, sh(Rm,imm6)
2822 if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
2823 UInt bX = INSN(31,31);
2824 UInt bOP = INSN(30,30); /* 0: ADD, 1: SUB */
2825 UInt bS = INSN(29, 29); /* set flags? */
2826 UInt sh = INSN(23,22);
2827 UInt rM = INSN(20,16);
2828 UInt imm6 = INSN(15,10);
2829 UInt rN = INSN(9,5);
2830 UInt rD = INSN(4,0);
2831 Bool isSUB = bOP == 1;
2832 Bool is64 = bX == 1;
2833 IRType ty = is64 ? Ity_I64 : Ity_I32;
2834 if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
2835 /* invalid; fall through */
2836 } else {
2837 IRTemp argL = newTemp(ty);
2838 assign(argL, getIRegOrZR(is64, rN));
2839 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
2840 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
2841 IRTemp res = newTemp(ty);
2842 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2843 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2844 if (bS) {
2845 setFlags_ADD_SUB(is64, isSUB, argL, argR);
2847 DIP("%s%s %s, %s, %s, %s #%u\n",
2848 bOP ? "sub" : "add", bS ? "s" : "",
2849 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2850 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2851 return True;
2855 /* ------------------- ADC/SBC(reg) ------------------- */
2856 /* x==0 => 32 bit op x==1 => 64 bit op
2858 31 30 29 28 23 21 20 15 9 4
2859 | | | | | | | | | |
2860 x 0 0 11010 00 0 Rm 000000 Rn Rd ADC Rd,Rn,Rm
2861 x 0 1 11010 00 0 Rm 000000 Rn Rd ADCS Rd,Rn,Rm
2862 x 1 0 11010 00 0 Rm 000000 Rn Rd SBC Rd,Rn,Rm
2863 x 1 1 11010 00 0 Rm 000000 Rn Rd SBCS Rd,Rn,Rm
2866 if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
2867 UInt bX = INSN(31,31);
2868 UInt bOP = INSN(30,30); /* 0: ADC, 1: SBC */
2869 UInt bS = INSN(29,29); /* set flags */
2870 UInt rM = INSN(20,16);
2871 UInt rN = INSN(9,5);
2872 UInt rD = INSN(4,0);
2874 Bool isSUB = bOP == 1;
2875 Bool is64 = bX == 1;
2876 IRType ty = is64 ? Ity_I64 : Ity_I32;
2878 IRTemp oldC = newTemp(ty);
2879 assign(oldC,
2880 is64 ? mk_arm64g_calculate_flag_c()
2881 : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );
2883 IRTemp argL = newTemp(ty);
2884 assign(argL, getIRegOrZR(is64, rN));
2885 IRTemp argR = newTemp(ty);
2886 assign(argR, getIRegOrZR(is64, rM));
2888 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
2889 IRTemp res = newTemp(ty);
2890 if (isSUB) {
2891 IRExpr* one = is64 ? mkU64(1) : mkU32(1);
2892 IROp xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
2893 assign(res,
2894 binop(op,
2895 binop(op, mkexpr(argL), mkexpr(argR)),
2896 binop(xorOp, mkexpr(oldC), one)));
2897 } else {
2898 assign(res,
2899 binop(op,
2900 binop(op, mkexpr(argL), mkexpr(argR)),
2901 mkexpr(oldC)));
2904 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2906 if (bS) {
2907 setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
2910 DIP("%s%s %s, %s, %s\n",
2911 bOP ? "sbc" : "adc", bS ? "s" : "",
2912 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2913 nameIRegOrZR(is64, rM));
2914 return True;
2917 /* -------------------- LOGIC(reg) -------------------- */
2918 /* x==0 => 32 bit op x==1 => 64 bit op
2919 N==0 => inv? is no-op (no inversion)
2920 N==1 => inv? is NOT
2921 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR
2923 31 30 28 23 21 20 15 9 4
2924 | | | | | | | | |
2925 x 00 01010 sh N Rm imm6 Rn Rd AND Rd,Rn, inv?(sh(Rm,imm6))
2926 x 01 01010 sh N Rm imm6 Rn Rd ORR Rd,Rn, inv?(sh(Rm,imm6))
2927 x 10 01010 sh N Rm imm6 Rn Rd EOR Rd,Rn, inv?(sh(Rm,imm6))
2928 x 11 01010 sh N Rm imm6 Rn Rd ANDS Rd,Rn, inv?(sh(Rm,imm6))
2929 With N=1, the names are: BIC ORN EON BICS
2931 if (INSN(28,24) == BITS5(0,1,0,1,0)) {
2932 UInt bX = INSN(31,31);
2933 UInt sh = INSN(23,22);
2934 UInt bN = INSN(21,21);
2935 UInt rM = INSN(20,16);
2936 UInt imm6 = INSN(15,10);
2937 UInt rN = INSN(9,5);
2938 UInt rD = INSN(4,0);
2939 Bool is64 = bX == 1;
2940 IRType ty = is64 ? Ity_I64 : Ity_I32;
2941 if (!is64 && imm6 > 31) {
2942          /* invalid; fall through */
2943 } else {
2944 IRTemp argL = newTemp(ty);
2945 assign(argL, getIRegOrZR(is64, rN));
2946 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
2947 IROp op = Iop_INVALID;
2948 switch (INSN(30,29)) {
2949 case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
2950 case BITS2(0,1): op = mkOR(ty); break;
2951 case BITS2(1,0): op = mkXOR(ty); break;
2952 default: vassert(0);
2954 IRTemp res = newTemp(ty);
2955 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2956 if (INSN(30,29) == BITS2(1,1)) {
2957 setFlags_LOGIC(is64, res);
2959 putIRegOrZR(is64, rD, mkexpr(res));
2961 static const HChar* names_op[8]
2962 = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
2963 vassert(((bN << 2) | INSN(30,29)) < 8);
2964 const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
2965 /* Special-case the printing of "MOV" */
2966 if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
2967 DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
2968 nameIRegOrZR(is64, rM));
2969 } else {
2970 DIP("%s %s, %s, %s, %s #%u\n", nm_op,
2971 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2972 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2974 return True;
2978 /* -------------------- {U,S}MULH -------------------- */
2979 /* 31 23 22 20 15 9 4
2980 10011011 1 10 Rm 011111 Rn Rd UMULH Xd,Xn,Xm
2981 10011011 0 10 Rm 011111 Rn Rd SMULH Xd,Xn,Xm
2983 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
2984 && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
2985 Bool isU = INSN(23,23) == 1;
2986 UInt mm = INSN(20,16);
2987 UInt nn = INSN(9,5);
2988 UInt dd = INSN(4,0);
2989 putIReg64orZR(dd, unop(Iop_128HIto64,
2990 binop(isU ? Iop_MullU64 : Iop_MullS64,
2991 getIReg64orZR(nn), getIReg64orZR(mm))));
2992 DIP("%cmulh %s, %s, %s\n",
2993 isU ? 'u' : 's',
2994 nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
2995 return True;
2998 /* -------------------- M{ADD,SUB} -------------------- */
2999 /* 31 30 20 15 14 9 4
3000 sf 00 11011 000 m 0 a n r MADD Rd,Rn,Rm,Ra d = a+m*n
3001       sf 00 11011 000 m 1 a n r MSUB Rd,Rn,Rm,Ra d = a-m*n
3003 if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
3004 Bool is64 = INSN(31,31) == 1;
3005 UInt mm = INSN(20,16);
3006 Bool isAdd = INSN(15,15) == 0;
3007 UInt aa = INSN(14,10);
3008 UInt nn = INSN(9,5);
3009 UInt dd = INSN(4,0);
3010 if (is64) {
3011          putIReg64orZR(
3012             dd,
3013 binop(isAdd ? Iop_Add64 : Iop_Sub64,
3014 getIReg64orZR(aa),
3015 binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
3016 } else {
3017          putIReg32orZR(
3018             dd,
3019 binop(isAdd ? Iop_Add32 : Iop_Sub32,
3020 getIReg32orZR(aa),
3021 binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
3023 DIP("%s %s, %s, %s, %s\n",
3024 isAdd ? "madd" : "msub",
3025 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3026 nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
3027 return True;
3030 /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
3031 /* 31 30 28 20 15 11 9 4
3032 sf 00 1101 0100 mm cond 00 nn dd CSEL Rd,Rn,Rm
3033 sf 00 1101 0100 mm cond 01 nn dd CSINC Rd,Rn,Rm
3034 sf 10 1101 0100 mm cond 00 nn dd CSINV Rd,Rn,Rm
3035 sf 10 1101 0100 mm cond 01 nn dd CSNEG Rd,Rn,Rm
3036 In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
3038 if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
3039 Bool is64 = INSN(31,31) == 1;
3040 UInt b30 = INSN(30,30);
3041 UInt mm = INSN(20,16);
3042 UInt cond = INSN(15,12);
3043 UInt b10 = INSN(10,10);
3044 UInt nn = INSN(9,5);
3045 UInt dd = INSN(4,0);
3046 UInt op = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
3047 IRType ty = is64 ? Ity_I64 : Ity_I32;
3048 IRExpr* argL = getIRegOrZR(is64, nn);
3049 IRExpr* argR = getIRegOrZR(is64, mm);
3050 switch (op) {
3051 case BITS2(0,0):
3052 break;
3053 case BITS2(0,1):
3054 argR = binop(mkADD(ty), argR, mkU(ty,1));
3055 break;
3056 case BITS2(1,0):
3057 argR = unop(mkNOT(ty), argR);
3058 break;
3059 case BITS2(1,1):
3060 argR = binop(mkSUB(ty), mkU(ty,0), argR);
3061 break;
3062 default:
3063 vassert(0);
3065 putIRegOrZR(
3066 is64, dd,
3067 IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
3068 argL, argR)
3070 const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
3071 DIP("%s %s, %s, %s, %s\n", op_nm[op],
3072 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3073 nameIRegOrZR(is64, mm), nameCC(cond));
3074 return True;
3077 /* -------------- ADD/SUB(extended reg) -------------- */
3078 /* 28 20 15 12 9 4
3079 000 01011 00 1 m opt imm3 n d ADD Wd|SP, Wn|SP, Wm ext&lsld
3080 100 01011 00 1 m opt imm3 n d ADD Xd|SP, Xn|SP, Rm ext&lsld
3082 001 01011 00 1 m opt imm3 n d ADDS Wd, Wn|SP, Wm ext&lsld
3083 101 01011 00 1 m opt imm3 n d ADDS Xd, Xn|SP, Rm ext&lsld
3085 010 01011 00 1 m opt imm3 n d SUB Wd|SP, Wn|SP, Wm ext&lsld
3086 110 01011 00 1 m opt imm3 n d SUB Xd|SP, Xn|SP, Rm ext&lsld
3088 011 01011 00 1 m opt imm3 n d SUBS Wd, Wn|SP, Wm ext&lsld
3089 111 01011 00 1 m opt imm3 n d SUBS Xd, Xn|SP, Rm ext&lsld
3091 The 'm' operand is extended per opt, thusly:
3093 000 Xm & 0xFF UXTB
3094 001 Xm & 0xFFFF UXTH
3095 010 Xm & (2^32)-1 UXTW
3096 011 Xm UXTX
3098 100 Xm sx from bit 7 SXTB
3099 101 Xm sx from bit 15 SXTH
3100 110 Xm sx from bit 31 SXTW
3101 111 Xm SXTX
3103 In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
3104 operation on Xm. In the 32 bit case, UXTW, UXTX, SXTW and SXTX
3105 are the identity operation on Wm.
3107 After extension, the value is shifted left by imm3 bits, which
3108      may only be in the range 0 .. 4 inclusive.
3109   */
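   /* For example, "add x3, x1, w2, uxtb #2" has opt=000 and imm3=2, so
      the IR below computes X3 = X1 + ((X2 & 0xFF) << 2) with no flag
      update; the "adds" form would additionally call setFlags_ADD_SUB
      on the 64-bit operands. */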
3110 if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
3111 Bool is64 = INSN(31,31) == 1;
3112 Bool isSub = INSN(30,30) == 1;
3113 Bool setCC = INSN(29,29) == 1;
3114 UInt mm = INSN(20,16);
3115 UInt opt = INSN(15,13);
3116 UInt imm3 = INSN(12,10);
3117 UInt nn = INSN(9,5);
3118 UInt dd = INSN(4,0);
3119 const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
3120 "sxtb", "sxth", "sxtw", "sxtx" };
3121 /* Do almost the same thing in the 32- and 64-bit cases. */
3122 IRTemp xN = newTemp(Ity_I64);
3123 IRTemp xM = newTemp(Ity_I64);
3124 assign(xN, getIReg64orSP(nn));
3125 assign(xM, getIReg64orZR(mm));
3126 IRExpr* xMw = mkexpr(xM); /* "xM widened" */
3127 Int shSX = 0;
3128 /* widen Xm .. */
3129 switch (opt) {
3130 case BITS3(0,0,0): // UXTB
3131 xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
3132 case BITS3(0,0,1): // UXTH
3133 xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
3134 case BITS3(0,1,0): // UXTW -- noop for the 32bit case
3135 if (is64) {
3136 xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
3138 break;
3139 case BITS3(0,1,1): // UXTX -- always a noop
3140 break;
3141 case BITS3(1,0,0): // SXTB
3142 shSX = 56; goto sxTo64;
3143 case BITS3(1,0,1): // SXTH
3144 shSX = 48; goto sxTo64;
3145 case BITS3(1,1,0): // SXTW -- noop for the 32bit case
3146 if (is64) {
3147 shSX = 32; goto sxTo64;
3149 break;
3150 case BITS3(1,1,1): // SXTX -- always a noop
3151 break;
3152 sxTo64:
3153 vassert(shSX >= 32);
3154 xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
3155 mkU8(shSX));
3156 break;
3157 default:
3158 vassert(0);
3160 /* and now shift */
3161 IRTemp argL = xN;
3162 IRTemp argR = newTemp(Ity_I64);
3163 assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
3164 IRTemp res = newTemp(Ity_I64);
3165 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
3166 mkexpr(argL), mkexpr(argR)));
3167 if (is64) {
3168 if (setCC) {
3169 putIReg64orZR(dd, mkexpr(res));
3170 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
3171 } else {
3172 putIReg64orSP(dd, mkexpr(res));
3174 } else {
3175 if (setCC) {
3176 IRTemp argL32 = newTemp(Ity_I32);
3177 IRTemp argR32 = newTemp(Ity_I32);
3178 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
3179 assign(argL32, unop(Iop_64to32, mkexpr(argL)));
3180 assign(argR32, unop(Iop_64to32, mkexpr(argR)));
3181 setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
3182 } else {
3183 putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
3186 DIP("%s%s %s, %s, %s %s lsl %u\n",
3187 isSub ? "sub" : "add", setCC ? "s" : "",
3188 setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
3189 nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm),
3190 nameExt[opt], imm3);
3191 return True;
3194 /* ---------------- CCMP/CCMN(imm) ---------------- */
3195 /* Bizarrely, these appear in the "data processing register"
3196 category, even though they are operations against an
3197 immediate. */
3198 /* 31 29 20 15 11 9 3
3199 sf 1 111010010 imm5 cond 10 Rn 0 nzcv CCMP Rn, #imm5, #nzcv, cond
3200 sf 0 111010010 imm5 cond 10 Rn 0 nzcv CCMN Rn, #imm5, #nzcv, cond
3202 Operation is:
3203 (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
3204      (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
3205   */
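   /* For example, "ccmp x1, #3, #8, eq" sets the flags as if from
      SUB64(X1, 3) when the EQ condition currently holds, and otherwise
      sets NZCV to 1000 (just N set); that is exactly what
      setFlags_ADD_SUB_conditionally generates below. */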
3206 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3207 && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
3208 Bool is64 = INSN(31,31) == 1;
3209 Bool isSUB = INSN(30,30) == 1;
3210 UInt imm5 = INSN(20,16);
3211 UInt cond = INSN(15,12);
3212 UInt nn = INSN(9,5);
3213 UInt nzcv = INSN(3,0);
3215 IRTemp condT = newTemp(Ity_I1);
3216 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3218 IRType ty = is64 ? Ity_I64 : Ity_I32;
3219 IRTemp argL = newTemp(ty);
3220 IRTemp argR = newTemp(ty);
3222 if (is64) {
3223 assign(argL, getIReg64orZR(nn));
3224 assign(argR, mkU64(imm5));
3225 } else {
3226 assign(argL, getIReg32orZR(nn));
3227 assign(argR, mkU32(imm5));
3229 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3231 DIP("ccm%c %s, #%u, #%u, %s\n",
3232 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3233 imm5, nzcv, nameCC(cond));
3234 return True;
3237 /* ---------------- CCMP/CCMN(reg) ---------------- */
3238 /* 31 29 20 15 11 9 3
3239 sf 1 111010010 Rm cond 00 Rn 0 nzcv CCMP Rn, Rm, #nzcv, cond
3240 sf 0 111010010 Rm cond 00 Rn 0 nzcv CCMN Rn, Rm, #nzcv, cond
3241 Operation is:
3242 (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
3243 (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
3245 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3246 && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
3247 Bool is64 = INSN(31,31) == 1;
3248 Bool isSUB = INSN(30,30) == 1;
3249 UInt mm = INSN(20,16);
3250 UInt cond = INSN(15,12);
3251 UInt nn = INSN(9,5);
3252 UInt nzcv = INSN(3,0);
3254 IRTemp condT = newTemp(Ity_I1);
3255 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3257 IRType ty = is64 ? Ity_I64 : Ity_I32;
3258 IRTemp argL = newTemp(ty);
3259 IRTemp argR = newTemp(ty);
3261 if (is64) {
3262 assign(argL, getIReg64orZR(nn));
3263 assign(argR, getIReg64orZR(mm));
3264 } else {
3265 assign(argL, getIReg32orZR(nn));
3266 assign(argR, getIReg32orZR(mm));
3268 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3270 DIP("ccm%c %s, %s, #%u, %s\n",
3271 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3272 nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
3273 return True;
3277 /* -------------- REV/REV16/REV32/RBIT -------------- */
3278 /* 31 30 28 20 15 11 9 4
3280 1 10 11010110 00000 0000 11 n d (1) REV Xd, Xn
3281 0 10 11010110 00000 0000 10 n d (2) REV Wd, Wn
3283 1 10 11010110 00000 0000 00 n d (3) RBIT Xd, Xn
3284 0 10 11010110 00000 0000 00 n d (4) RBIT Wd, Wn
3286 1 10 11010110 00000 0000 01 n d (5) REV16 Xd, Xn
3287 0 10 11010110 00000 0000 01 n d (6) REV16 Wd, Wn
3289 1 10 11010110 00000 0000 10 n d (7) REV32 Xd, Xn
3291 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3292 && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
3293 UInt b31 = INSN(31,31);
3294 UInt opc = INSN(11,10);
3296 UInt ix = 0;
3297 /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1;
3298 else if (b31 == 0 && opc == BITS2(1,0)) ix = 2;
3299 else if (b31 == 1 && opc == BITS2(0,0)) ix = 3;
3300 else if (b31 == 0 && opc == BITS2(0,0)) ix = 4;
3301 else if (b31 == 1 && opc == BITS2(0,1)) ix = 5;
3302 else if (b31 == 0 && opc == BITS2(0,1)) ix = 6;
3303 else if (b31 == 1 && opc == BITS2(1,0)) ix = 7;
3304 if (ix >= 1 && ix <= 7) {
3305 Bool is64 = ix == 1 || ix == 3 || ix == 5 || ix == 7;
3306 UInt nn = INSN(9,5);
3307 UInt dd = INSN(4,0);
3308 IRTemp src = newTemp(Ity_I64);
3309 IRTemp dst = IRTemp_INVALID;
3310 IRTemp (*math)(IRTemp) = NULL;
3311 switch (ix) {
3312 case 1: case 2: math = math_BYTESWAP64; break;
3313 case 3: case 4: math = math_BITSWAP64; break;
3314 case 5: case 6: math = math_USHORTSWAP64; break;
3315 case 7: math = math_UINTSWAP64; break;
3316 default: vassert(0);
3318 const HChar* names[7]
3319 = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
3320 const HChar* nm = names[ix-1];
3321 vassert(math);
3322 if (ix == 6) {
3323 /* This has to be special cased, since the logic below doesn't
3324 handle it correctly. */
3325 assign(src, getIReg64orZR(nn));
3326 dst = math(src);
3327 putIReg64orZR(dd,
3328 unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
3329 } else if (is64) {
3330 assign(src, getIReg64orZR(nn));
3331 dst = math(src);
3332 putIReg64orZR(dd, mkexpr(dst));
3333 } else {
3334 assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
3335 dst = math(src);
3336 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3338 DIP("%s %s, %s\n", nm,
3339 nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
3340 return True;
3342 /* else fall through */
3345 /* -------------------- CLZ/CLS -------------------- */
3346 /* 30 28 24 20 15 9 4
3347 sf 10 1101 0110 00000 00010 0 n d CLZ Rd, Rn
3348 sf 10 1101 0110 00000 00010 1 n d CLS Rd, Rn
3350 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3351 && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
3352 Bool is64 = INSN(31,31) == 1;
3353 Bool isCLS = INSN(10,10) == 1;
3354 UInt nn = INSN(9,5);
3355 UInt dd = INSN(4,0);
3356 IRTemp src = newTemp(Ity_I64);
3357 IRTemp srcZ = newTemp(Ity_I64);
3358 IRTemp dst = newTemp(Ity_I64);
3359 /* Get the argument, widened out to 64 bit */
3360 if (is64) {
3361 assign(src, getIReg64orZR(nn));
3362 } else {
3363 assign(src, binop(Iop_Shl64,
3364 unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
3366 /* If this is CLS, mash the arg around accordingly */
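      /* The XOR below computes srcZ = (src << 1) ^ ((src >>u 1) << 1),
         so, for i >= 1, bit i of srcZ is src[i] ^ src[i-1], and bit 0
         is 0.  The number of leading zeroes of srcZ is therefore the
         number of bits immediately below the top bit that are copies
         of it, which is what CLS counts; the all-bits-equal case
         (srcZ == 0) is handled by the ITE further down. */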
3367 if (isCLS) {
3368 IRExpr* one = mkU8(1);
3369 assign(srcZ,
3370 binop(Iop_Xor64,
3371 binop(Iop_Shl64, mkexpr(src), one),
3372 binop(Iop_Shl64, binop(Iop_Shr64, mkexpr(src), one), one)));
3373 } else {
3374 assign(srcZ, mkexpr(src));
3376 /* And compute CLZ. */
3377 if (is64) {
3378 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3379 mkU64(isCLS ? 63 : 64),
3380 unop(Iop_Clz64, mkexpr(srcZ))));
3381 putIReg64orZR(dd, mkexpr(dst));
3382 } else {
3383 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3384 mkU64(isCLS ? 31 : 32),
3385 unop(Iop_Clz64, mkexpr(srcZ))));
3386 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3388 DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
3389 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
3390 return True;
3393 /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
3394 /* 30 28 20 15 11 9 4
3395 sf 00 1101 0110 m 0010 00 n d LSLV Rd,Rn,Rm
3396 sf 00 1101 0110 m 0010 01 n d LSRV Rd,Rn,Rm
3397 sf 00 1101 0110 m 0010 10 n d ASRV Rd,Rn,Rm
3398 sf 00 1101 0110 m 0010 11 n d RORV Rd,Rn,Rm
3400 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3401 && INSN(15,12) == BITS4(0,0,1,0)) {
3402 Bool is64 = INSN(31,31) == 1;
3403 UInt mm = INSN(20,16);
3404 UInt op = INSN(11,10);
3405 UInt nn = INSN(9,5);
3406 UInt dd = INSN(4,0);
3407 IRType ty = is64 ? Ity_I64 : Ity_I32;
3408 IRTemp srcL = newTemp(ty);
3409 IRTemp srcR = newTemp(Ity_I64);
3410 IRTemp res = newTemp(ty);
3411 IROp iop = Iop_INVALID;
3412 assign(srcL, getIRegOrZR(is64, nn));
3413 assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
3414 mkU64(is64 ? 63 : 31)));
3415 if (op < 3) {
3416 // LSLV, LSRV, ASRV
3417 switch (op) {
3418 case BITS2(0,0): iop = mkSHL(ty); break;
3419 case BITS2(0,1): iop = mkSHR(ty); break;
3420 case BITS2(1,0): iop = mkSAR(ty); break;
3421 default: vassert(0);
3423 assign(res, binop(iop, mkexpr(srcL),
3424 unop(Iop_64to8, mkexpr(srcR))));
3425 } else {
3426 // RORV
3427 IROp opSHL = mkSHL(ty);
3428 IROp opSHR = mkSHR(ty);
3429 IROp opOR = mkOR(ty);
3430 IRExpr* width = mkU64(is64 ? 64: 32);
3431 assign(
3432 res,
3433 IRExpr_ITE(
3434 binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
3435 mkexpr(srcL),
3436 binop(opOR,
3437 binop(opSHL,
3438 mkexpr(srcL),
3439 unop(Iop_64to8, binop(Iop_Sub64, width,
3440 mkexpr(srcR)))),
3441 binop(opSHR,
3442 mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
3445 putIRegOrZR(is64, dd, mkexpr(res));
3446 vassert(op < 4);
3447 const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
3448 DIP("%s %s, %s, %s\n",
3449 names[op], nameIRegOrZR(is64,dd),
3450 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
3451 return True;
3454 /* -------------------- SDIV/UDIV -------------------- */
3455 /* 30 28 20 15 10 9 4
3456 sf 00 1101 0110 m 00001 1 n d SDIV Rd,Rn,Rm
3457 sf 00 1101 0110 m 00001 0 n d UDIV Rd,Rn,Rm
3459 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3460 && INSN(15,11) == BITS5(0,0,0,0,1)) {
3461 Bool is64 = INSN(31,31) == 1;
3462 UInt mm = INSN(20,16);
3463 Bool isS = INSN(10,10) == 1;
3464 UInt nn = INSN(9,5);
3465 UInt dd = INSN(4,0);
3466 if (isS) {
3467 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
3468 getIRegOrZR(is64, nn),
3469 getIRegOrZR(is64, mm)));
3470 } else {
3471 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
3472 getIRegOrZR(is64, nn),
3473 getIRegOrZR(is64, mm)));
3475 DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
3476 nameIRegOrZR(is64, dd),
3477 nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
3478 return True;
3481 /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
3482 /* 31 23 20 15 14 9 4
3483 1001 1011 101 m 0 a n d UMADDL Xd,Wn,Wm,Xa
3484 1001 1011 001 m 0 a n d SMADDL Xd,Wn,Wm,Xa
3485 1001 1011 101 m 1 a n d UMSUBL Xd,Wn,Wm,Xa
3486 1001 1011 001 m 1 a n d SMSUBL Xd,Wn,Wm,Xa
3487 with operation
3488 Xd = Xa +/- (Wn *u/s Wm)
3490 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
3491 Bool isU = INSN(23,23) == 1;
3492 UInt mm = INSN(20,16);
3493 Bool isAdd = INSN(15,15) == 0;
3494 UInt aa = INSN(14,10);
3495 UInt nn = INSN(9,5);
3496 UInt dd = INSN(4,0);
3497 IRTemp wN = newTemp(Ity_I32);
3498 IRTemp wM = newTemp(Ity_I32);
3499 IRTemp xA = newTemp(Ity_I64);
3500 IRTemp muld = newTemp(Ity_I64);
3501 IRTemp res = newTemp(Ity_I64);
3502 assign(wN, getIReg32orZR(nn));
3503 assign(wM, getIReg32orZR(mm));
3504 assign(xA, getIReg64orZR(aa));
3505 assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
3506 mkexpr(wN), mkexpr(wM)));
3507 assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
3508 mkexpr(xA), mkexpr(muld)));
3509 putIReg64orZR(dd, mkexpr(res));
3510 DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
3511 nameIReg64orZR(dd), nameIReg32orZR(nn),
3512 nameIReg32orZR(mm), nameIReg64orZR(aa));
3513 return True;
3516 /* -------------------- CRC32/CRC32C -------------------- */
3517 /* 31 30 20 15 11 9 4
3518 sf 00 1101 0110 m 0100 sz n d CRC32<sz> Wd, Wn, Wm|Xm
3519 sf 00 1101 0110 m 0101 sz n d CRC32C<sz> Wd, Wn, Wm|Xm
3521 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3522 && INSN(15,13) == BITS3(0,1,0)) {
3523 UInt bitSF = INSN(31,31);
3524 UInt mm = INSN(20,16);
3525 UInt bitC = INSN(12,12);
3526 UInt sz = INSN(11,10);
3527 UInt nn = INSN(9,5);
3528 UInt dd = INSN(4,0);
3529 vassert(sz >= 0 && sz <= 3);
3530 if ((bitSF == 0 && sz <= BITS2(1,0))
3531 || (bitSF == 1 && sz == BITS2(1,1))) {
3532 UInt ix = (bitC == 1 ? 4 : 0) | sz;
3533 void* helpers[8]
3534 = { &arm64g_calc_crc32b, &arm64g_calc_crc32h,
3535 &arm64g_calc_crc32w, &arm64g_calc_crc32x,
3536 &arm64g_calc_crc32cb, &arm64g_calc_crc32ch,
3537 &arm64g_calc_crc32cw, &arm64g_calc_crc32cx };
3538 const HChar* hNames[8]
3539 = { "arm64g_calc_crc32b", "arm64g_calc_crc32h",
3540 "arm64g_calc_crc32w", "arm64g_calc_crc32x",
3541 "arm64g_calc_crc32cb", "arm64g_calc_crc32ch",
3542 "arm64g_calc_crc32cw", "arm64g_calc_crc32cx" };
3543 const HChar* iNames[8]
3544 = { "crc32b", "crc32h", "crc32w", "crc32x",
3545 "crc32cb", "crc32ch", "crc32cw", "crc32cx" };
3547 IRTemp srcN = newTemp(Ity_I64);
3548 assign(srcN, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
3550 IRTemp srcM = newTemp(Ity_I64);
3551 IRExpr* at64 = getIReg64orZR(mm);
3552 switch (sz) {
3553 case BITS2(0,0):
3554 assign(srcM, binop(Iop_And64, at64, mkU64(0xFF))); break;
3555 case BITS2(0,1):
3556 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFF))); break;
3557 case BITS2(1,0):
3558 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFFFFFF))); break;
3559 case BITS2(1,1):
3560 assign(srcM, at64); break;
3561 default:
3562 vassert(0);
3565 vassert(ix >= 0 && ix <= 7);
3567          putIReg64orZR(
3568             dd,
3569 unop(Iop_32Uto64,
3570 unop(Iop_64to32,
3571 mkIRExprCCall(Ity_I64, 0/*regparm*/,
3572 hNames[ix], helpers[ix],
3573 mkIRExprVec_2(mkexpr(srcN),
3574 mkexpr(srcM))))));
3576 DIP("%s %s, %s, %s\n", iNames[ix],
3577 nameIReg32orZR(dd),
3578 nameIReg32orZR(nn), nameIRegOrZR(bitSF == 1, mm));
3579 return True;
3581 /* fall through */
3584 vex_printf("ARM64 front end: data_processing_register\n");
3585 return False;
3586 # undef INSN
3590 /*------------------------------------------------------------*/
3591 /*--- Math helpers for vector interleave/deinterleave ---*/
3592 /*------------------------------------------------------------*/
3594 #define EX(_tmp) \
3595 mkexpr(_tmp)
3596 #define SL(_hi128,_lo128,_nbytes) \
3597 ( (_nbytes) == 0 \
3598 ? (_lo128) \
3599 : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
3600 #define ROR(_v128,_nbytes) \
3601 SL((_v128),(_v128),(_nbytes))
3602 #define ROL(_v128,_nbytes) \
3603 SL((_v128),(_v128),16-(_nbytes))
3604 #define SHR(_v128,_nbytes) \
3605 binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
3606 #define SHL(_v128,_nbytes) \
3607 binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
3608 #define ILO64x2(_argL,_argR) \
3609 binop(Iop_InterleaveLO64x2,(_argL),(_argR))
3610 #define IHI64x2(_argL,_argR) \
3611 binop(Iop_InterleaveHI64x2,(_argL),(_argR))
3612 #define ILO32x4(_argL,_argR) \
3613 binop(Iop_InterleaveLO32x4,(_argL),(_argR))
3614 #define IHI32x4(_argL,_argR) \
3615 binop(Iop_InterleaveHI32x4,(_argL),(_argR))
3616 #define ILO16x8(_argL,_argR) \
3617 binop(Iop_InterleaveLO16x8,(_argL),(_argR))
3618 #define IHI16x8(_argL,_argR) \
3619 binop(Iop_InterleaveHI16x8,(_argL),(_argR))
3620 #define ILO8x16(_argL,_argR) \
3621 binop(Iop_InterleaveLO8x16,(_argL),(_argR))
3622 #define IHI8x16(_argL,_argR) \
3623 binop(Iop_InterleaveHI8x16,(_argL),(_argR))
3624 #define CEV32x4(_argL,_argR) \
3625 binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
3626 #define COD32x4(_argL,_argR) \
3627 binop(Iop_CatOddLanes32x4,(_argL),(_argR))
3628 #define COD16x8(_argL,_argR) \
3629 binop(Iop_CatOddLanes16x8,(_argL),(_argR))
3630 #define COD8x16(_argL,_argR) \
3631 binop(Iop_CatOddLanes8x16,(_argL),(_argR))
3632 #define CEV8x16(_argL,_argR) \
3633 binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
3634 #define AND(_arg1,_arg2) \
3635 binop(Iop_AndV128,(_arg1),(_arg2))
3636 #define OR2(_arg1,_arg2) \
3637 binop(Iop_OrV128,(_arg1),(_arg2))
3638 #define OR3(_arg1,_arg2,_arg3) \
3639 binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
3640 #define OR4(_arg1,_arg2,_arg3,_arg4) \
3641 binop(Iop_OrV128, \
3642 binop(Iop_OrV128,(_arg1),(_arg2)), \
3643 binop(Iop_OrV128,(_arg3),(_arg4)))
3646 /* Do interleaving for 1 128 bit vector, for ST1 insns. */
3647 static
3648 void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
3649 UInt laneSzBlg2, IRTemp u0 )
3651 assign(*i0, mkexpr(u0));
3655 /* Do interleaving for 2 128 bit vectors, for ST2 insns. */
3656 static
3657 void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
3658 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
3660 /* This is pretty easy, since we have primitives directly to
3661 hand. */
3662 if (laneSzBlg2 == 3) {
3663 // 64x2
3664 // u1 == B1 B0, u0 == A1 A0
3665 // i1 == B1 A1, i0 == B0 A0
3666 assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
3667 assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
3668 return;
3670 if (laneSzBlg2 == 2) {
3671 // 32x4
3672 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
3673 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
3674 assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
3675 assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
3676 return;
3678 if (laneSzBlg2 == 1) {
3679 // 16x8
3680 // u1 == B{7..0}, u0 == A{7..0}
3681 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
3682 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
3683 assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
3684 assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
3685 return;
3687 if (laneSzBlg2 == 0) {
3688 // 8x16
3689 // u1 == B{f..0}, u0 == A{f..0}
3690 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
3691 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
3692 assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
3693 assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
3694 return;
3696 /*NOTREACHED*/
3697 vassert(0);
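/* Note (added for exposition): for an STn of n vectors, the lanes
   must end up in memory in the order
      A0 B0 .. A1 B1 ..  and so on
   that is, lane k of every source register, then lane k+1 of every
   source register, etc.  The INTERLEAVEn helpers compute i0, i1, ..
   as the consecutive 128-bit chunks of that sequence, with the lowest
   addressed chunk in i0.  The per-case layout comments show exactly
   that, written with the highest lane on the left. */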
3701 /* Do interleaving for 3 128 bit vectors, for ST3 insns. */
3702 static
3703 void math_INTERLEAVE3_128(
3704 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
3705 UInt laneSzBlg2,
3706 IRTemp u0, IRTemp u1, IRTemp u2 )
3708 if (laneSzBlg2 == 3) {
3709 // 64x2
3710 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
3711 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
3712 assign(*i2, IHI64x2( EX(u2), EX(u1) ));
3713 assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
3714 assign(*i0, ILO64x2( EX(u1), EX(u0) ));
3715 return;
3718 if (laneSzBlg2 == 2) {
3719 // 32x4
3720 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
3721 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
3722 // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
3723 IRTemp p0 = newTempV128();
3724 IRTemp p1 = newTempV128();
3725 IRTemp p2 = newTempV128();
3726 IRTemp c1100 = newTempV128();
3727 IRTemp c0011 = newTempV128();
3728 IRTemp c0110 = newTempV128();
3729 assign(c1100, mkV128(0xFF00));
3730 assign(c0011, mkV128(0x00FF));
3731 assign(c0110, mkV128(0x0FF0));
3732 // First interleave them at 64x2 granularity,
3733 // generating partial ("p") values.
3734 math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
3735 // And more shuffling around for the final answer
3736 assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
3737 AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
3738 assign(*i1, OR3( SHL(EX(p2),12),
3739 AND(EX(p1),EX(c0110)),
3740 SHR(EX(p0),12) ));
3741 assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
3742 AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
3743 return;
3746 if (laneSzBlg2 == 1) {
3747 // 16x8
3748 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
3749 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
3750 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
3752 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
3753 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
3754 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
3756 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
3757 // i1 == A5 C4 B4 A4 C3 B3 A3 C2
3758 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
3759 IRTemp p0 = newTempV128();
3760 IRTemp p1 = newTempV128();
3761 IRTemp p2 = newTempV128();
3762 IRTemp c1000 = newTempV128();
3763 IRTemp c0100 = newTempV128();
3764 IRTemp c0010 = newTempV128();
3765 IRTemp c0001 = newTempV128();
3766 assign(c1000, mkV128(0xF000));
3767 assign(c0100, mkV128(0x0F00));
3768 assign(c0010, mkV128(0x00F0));
3769 assign(c0001, mkV128(0x000F));
3770 // First interleave them at 32x4 granularity,
3771 // generating partial ("p") values.
3772 math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
3773 // And more shuffling around for the final answer
3774 assign(*i2,
3775 OR4( AND( IHI16x8( EX(p2), ROL(EX(p2),4) ), EX(c1000) ),
3776 AND( IHI16x8( ROL(EX(p2),6), EX(p2) ), EX(c0100) ),
3777 AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
3778 AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
3780 assign(*i1,
3781 OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
3782 AND( IHI16x8( EX(p1), ROL(EX(p1),4) ), EX(c0100) ),
3783 AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
3784 AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
3786 assign(*i0,
3787 OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
3788 AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
3789 AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
3790 AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
3792 return;
3795 if (laneSzBlg2 == 0) {
3796 // 8x16. It doesn't seem worth the hassle of first doing a
3797 // 16x8 interleave, so just generate all 24 partial results
3798 // directly :-(
3799 // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
3800 // i2 == Cf Bf Af Ce .. Bb Ab Ca
3801 // i1 == Ba Aa C9 B9 .. A6 C5 B5
3802 // i0 == A5 C4 B4 A4 .. C0 B0 A0
3804 IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
3805 IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
3806 IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
3807 IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
3808 IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
3809 IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
3810 IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
3811 IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
3812 IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();
3814 // eg XXXX(qqq, CC, 0xF, BB, 0xA) sets qqq to be a vector
3815 // of the form 14 bytes junk : CC[0xF] : BB[0xA]
3817 # define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
3818 IRTemp t_##_tempName = newTempV128(); \
3819 assign(t_##_tempName, \
3820 ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
3821 ROR(EX(_srcVec2),(_srcShift2)) ) )
3823 // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
3824 IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;
3826 // The slicing and reassembly are interleaved as much as possible,
3827 // so as to minimise the demand for registers in the back end, which
3828 // was observed to be a problem in testing.
3830 XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
3831 XXXX(AfCe, AA, 0xf, CC, 0xe);
3832 assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));
3834 XXXX(BeAe, BB, 0xe, AA, 0xe);
3835 XXXX(CdBd, CC, 0xd, BB, 0xd);
3836 assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
3837 assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));
3839 XXXX(AdCc, AA, 0xd, CC, 0xc);
3840 XXXX(BcAc, BB, 0xc, AA, 0xc);
3841 assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));
3843 XXXX(CbBb, CC, 0xb, BB, 0xb);
3844 XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
3845 assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
3846 assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
3847 assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));
3849 XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
3850 XXXX(C9B9, CC, 0x9, BB, 0x9);
3851 assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));
3853 XXXX(A9C8, AA, 0x9, CC, 0x8);
3854 XXXX(B8A8, BB, 0x8, AA, 0x8);
3855 assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
3856 assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));
3858 XXXX(C7B7, CC, 0x7, BB, 0x7);
3859 XXXX(A7C6, AA, 0x7, CC, 0x6);
3860 assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));
3862 XXXX(B6A6, BB, 0x6, AA, 0x6);
3863 XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
3864 assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
3865 assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
3866 assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));
3868 XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
3869 XXXX(B4A4, BB, 0x4, AA, 0x4);
3870 assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));
3872 XXXX(C3B3, CC, 0x3, BB, 0x3);
3873 XXXX(A3C2, AA, 0x3, CC, 0x2);
3874 assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
3875 assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));
3877 XXXX(B2A2, BB, 0x2, AA, 0x2);
3878 XXXX(C1B1, CC, 0x1, BB, 0x1);
3879 assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));
3881 XXXX(A1C0, AA, 0x1, CC, 0x0);
3882 XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
3883 assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
3884 assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
3885 assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));
3887 # undef XXXX
3888 return;
3891 /*NOTREACHED*/
3892 vassert(0);
3896 /* Do interleaving for 4 128 bit vectors, for ST4 insns. */
3897 static
3898 void math_INTERLEAVE4_128(
3899 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
3900 UInt laneSzBlg2,
3901 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
3903 if (laneSzBlg2 == 3) {
3904 // 64x2
3905 assign(*i0, ILO64x2(EX(u1), EX(u0)));
3906 assign(*i1, ILO64x2(EX(u3), EX(u2)));
3907 assign(*i2, IHI64x2(EX(u1), EX(u0)));
3908 assign(*i3, IHI64x2(EX(u3), EX(u2)));
3909 return;
3911 if (laneSzBlg2 == 2) {
3912 // 32x4
3913 // First, interleave at the 64-bit lane size.
3914 IRTemp p0 = newTempV128();
3915 IRTemp p1 = newTempV128();
3916 IRTemp p2 = newTempV128();
3917 IRTemp p3 = newTempV128();
3918 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
3919 // And interleave (cat) at the 32 bit size.
3920 assign(*i0, CEV32x4(EX(p1), EX(p0)));
3921 assign(*i1, COD32x4(EX(p1), EX(p0)));
3922 assign(*i2, CEV32x4(EX(p3), EX(p2)));
3923 assign(*i3, COD32x4(EX(p3), EX(p2)));
3924 return;
3926 if (laneSzBlg2 == 1) {
3927 // 16x8
3928 // First, interleave at the 32-bit lane size.
3929 IRTemp p0 = newTempV128();
3930 IRTemp p1 = newTempV128();
3931 IRTemp p2 = newTempV128();
3932 IRTemp p3 = newTempV128();
3933 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
3934 // And rearrange within each vector, to get the right 16 bit lanes.
3935 assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
3936 assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
3937 assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
3938 assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
3939 return;
3941 if (laneSzBlg2 == 0) {
3942 // 8x16
3943 // First, interleave at the 16-bit lane size.
3944 IRTemp p0 = newTempV128();
3945 IRTemp p1 = newTempV128();
3946 IRTemp p2 = newTempV128();
3947 IRTemp p3 = newTempV128();
3948 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
3949 // And rearrange within each vector, to get the right 8 bit lanes.
3950 assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
3951 assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
3952 assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
3953 assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
3954 return;
3956 /*NOTREACHED*/
3957 vassert(0);
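/* Rough usage sketch (for illustration only -- the real caller is in
   the load/store decoder further down, and the temp names here are
   made up):

      IRTemp u0 = newTempV128(), u1 = newTempV128();
      IRTemp u2 = newTempV128(), u3 = newTempV128();
      // .. assign u0..u3 from the four source Q registers ..
      IRTemp i0 = newTempV128(), i1 = newTempV128();
      IRTemp i2 = newTempV128(), i3 = newTempV128();
      math_INTERLEAVE4_128(&i0, &i1, &i2, &i3, laneSzBlg2,
                           u0, u1, u2, u3);
      // .. then store i0..i3 to addr+0, +16, +32, +48 respectively

   for an ST4 of four Q registers. */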
3961 /* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
3962 static
3963 void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
3964 UInt laneSzBlg2, IRTemp i0 )
3966 assign(*u0, mkexpr(i0));
3970 /* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
3971 static
3972 void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
3973 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
3975 /* This is pretty easy, since we have primitives directly to
3976 hand. */
3977 if (laneSzBlg2 == 3) {
3978 // 64x2
3979 // i1 == B1 A1, i0 == B0 A0
3980 // u1 == B1 B0, u0 == A1 A0
3981 assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
3982 assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
3983 return;
3985 if (laneSzBlg2 == 2) {
3986 // 32x4
3987 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
3988 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
3989 assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
3990 assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
3991 return;
3993 if (laneSzBlg2 == 1) {
3994 // 16x8
3995 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
3996 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
3997 // u1 == B{7..0}, u0 == A{7..0}
3998 assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
3999 assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0)));
4000 return;
4002 if (laneSzBlg2 == 0) {
4003 // 8x16
4004 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
4005 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
4006 // u1 == B{f..0}, u0 == A{f..0}
4007 assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
4008 assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0)));
4009 return;
4011 /*NOTREACHED*/
4012 vassert(0);
4016 /* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
4017 static
4018 void math_DEINTERLEAVE3_128(
4019 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4020 UInt laneSzBlg2,
4021 IRTemp i0, IRTemp i1, IRTemp i2 )
4023 if (laneSzBlg2 == 3) {
4024 // 64x2
4025 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
4026 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
4027 assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1) ));
4028 assign(*u1, ILO64x2( EX(i2), ROL(EX(i0),8) ));
4029 assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0) ));
4030 return;
4033 if (laneSzBlg2 == 2) {
4034 // 32x4
4035 // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
4036 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
4037 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
4038 IRTemp t_a1c0b0a0 = newTempV128();
4039 IRTemp t_a2c1b1a1 = newTempV128();
4040 IRTemp t_a3c2b2a2 = newTempV128();
4041 IRTemp t_a0c3b3a3 = newTempV128();
4042 IRTemp p0 = newTempV128();
4043 IRTemp p1 = newTempV128();
4044 IRTemp p2 = newTempV128();
4045 // Compute some intermediate values.
4046 assign(t_a1c0b0a0, EX(i0));
4047 assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
4048 assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
4049 assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
4050 // First deinterleave into lane-pairs
4051 assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
4052 assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
4053 IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
4054 assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
4055 // Then deinterleave at 64x2 granularity.
4056 math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
4057 return;
4060 if (laneSzBlg2 == 1) {
4061 // 16x8
4062 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
4063 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
4064 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
4066 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
4067 // i1 == A5 C4 B4 A4 C3 B3 A3 C2
4068 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
4070 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
4071 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
4072 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
4074 IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
4075 s0 = s1 = s2 = s3
4076 = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
4077 newTempsV128_4(&s0, &s1, &s2, &s3);
4078 newTempsV128_4(&t0, &t1, &t2, &t3);
4079 newTempsV128_4(&p0, &p1, &p2, &c00111111);
4081 // s0 == b2a2 c1b1a1 c0b0a0
4082 // s1 == b4a4 c3b3a3 c2b2a2
4083 // s2 == b6a6 c5b5a5 c4b4a4
4084 // s3 == b0a0 c7b7a7 c6b6a6
4085 assign(s0, EX(i0));
4086 assign(s1, SL(EX(i1),EX(i0),6*2));
4087 assign(s2, SL(EX(i2),EX(i1),4*2));
4088 assign(s3, SL(EX(i0),EX(i2),2*2));
4090 // t0 == 0 0 c1c0 b1b0 a1a0
4091 // t1 == 0 0 c3c2 b3b2 a3a2
4092 // t2 == 0 0 c5c4 b5b4 a5a4
4093 // t3 == 0 0 c7c6 b7b6 a7a6
4094 assign(c00111111, mkV128(0x0FFF));
4095 assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
4096 assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
4097 assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
4098 assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));
4100 assign(p0, OR2(EX(t0), SHL(EX(t1),6*2)));
4101 assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
4102 assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));
4104 // Then deinterleave at 32x4 granularity.
4105 math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
4106 return;
4109 if (laneSzBlg2 == 0) {
4110 // 8x16. This is the same scheme as for 16x8, with twice the
4111 // number of intermediate values.
4113 // u2 == C{f..0}
4114 // u1 == B{f..0}
4115 // u0 == A{f..0}
4117 // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
4118 // i1 == BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
4119 // i0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4121 // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
4122 // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
4123 // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
4125 IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
4126 t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
4127 s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
4128 = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
4129 = IRTemp_INVALID;
4130 newTempsV128_4(&s0, &s1, &s2, &s3);
4131 newTempsV128_4(&s4, &s5, &s6, &s7);
4132 newTempsV128_4(&t0, &t1, &t2, &t3);
4133 newTempsV128_4(&t4, &t5, &t6, &t7);
4134 newTempsV128_4(&p0, &p1, &p2, &cMASK);
4136 // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4137 // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
4138 // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
4139 // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
4140 // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
4141 // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
4142 // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
4143 // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
4144 assign(s0, SL(EX(i1),EX(i0), 0));
4145 assign(s1, SL(EX(i1),EX(i0), 6));
4146 assign(s2, SL(EX(i1),EX(i0),12));
4147 assign(s3, SL(EX(i2),EX(i1), 2));
4148 assign(s4, SL(EX(i2),EX(i1), 8));
4149 assign(s5, SL(EX(i2),EX(i1),14));
4150 assign(s6, SL(EX(i0),EX(i2), 4));
4151 assign(s7, SL(EX(i0),EX(i2),10));
4153 // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
4154 // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
4155 // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
4156 // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
4157 // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
4158 // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
4159 // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
4160 // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
4161 assign(cMASK, mkV128(0x003F));
4162 assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
4163 assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
4164 assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
4165 assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
4166 assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
4167 assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
4168 assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
4169 assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));
4171 assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
4172 assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
4173 SHL(EX(t3),2), SHR(EX(t2),4) ));
4174 assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));
4176 // Then deinterleave at 16x8 granularity.
4177 math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
4178 return;
4181 /*NOTREACHED*/
4182 vassert(0);
4186 /* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
4187 static
4188 void math_DEINTERLEAVE4_128(
4189 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4190 UInt laneSzBlg2,
4191 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4193 if (laneSzBlg2 == 3) {
4194 // 64x2
4195 assign(*u0, ILO64x2(EX(i2), EX(i0)));
4196 assign(*u1, IHI64x2(EX(i2), EX(i0)));
4197 assign(*u2, ILO64x2(EX(i3), EX(i1)));
4198 assign(*u3, IHI64x2(EX(i3), EX(i1)));
4199 return;
4201 if (laneSzBlg2 == 2) {
4202 // 32x4
4203 IRTemp p0 = newTempV128();
4204 IRTemp p2 = newTempV128();
4205 IRTemp p1 = newTempV128();
4206 IRTemp p3 = newTempV128();
4207 assign(p0, ILO32x4(EX(i1), EX(i0)));
4208 assign(p1, IHI32x4(EX(i1), EX(i0)));
4209 assign(p2, ILO32x4(EX(i3), EX(i2)));
4210 assign(p3, IHI32x4(EX(i3), EX(i2)));
4211 // And now do what we did for the 64-bit case.
4212 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
4213 return;
4215 if (laneSzBlg2 == 1) {
4216 // 16x8
4217 // Deinterleave into 32-bit chunks, then do as the 32-bit case.
4218 IRTemp p0 = newTempV128();
4219 IRTemp p1 = newTempV128();
4220 IRTemp p2 = newTempV128();
4221 IRTemp p3 = newTempV128();
4222 assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
4223 assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
4224 assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
4225 assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
4226 // From here on is like the 32 bit case.
4227 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
4228 return;
4230 if (laneSzBlg2 == 0) {
4231 // 8x16
4232 // Deinterleave into 16-bit chunks, then do as the 16-bit case.
4233 IRTemp p0 = newTempV128();
4234 IRTemp p1 = newTempV128();
4235 IRTemp p2 = newTempV128();
4236 IRTemp p3 = newTempV128();
4237 assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
4238 ILO8x16(EX(i0),ROL(EX(i0),4)) ));
4239 assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
4240 ILO8x16(EX(i1),ROL(EX(i1),4)) ));
4241 assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
4242 ILO8x16(EX(i2),ROL(EX(i2),4)) ));
4243 assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
4244 ILO8x16(EX(i3),ROL(EX(i3),4)) ));
4245 // From here on is like the 16 bit case.
4246 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
4247 return;
4249 /*NOTREACHED*/
4250 vassert(0);
4254 /* Wrappers that use the full-width (de)interleavers to do half-width
4255 (de)interleaving. The scheme is to clone each input lane in the
4256 lower half of each incoming value, do a full width (de)interleave
4257 at the next lane size up, and remove every other lane of the
4258 result. The returned values may have any old junk in the upper
4259 64 bits -- the caller must ignore that. */
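/* Worked example (added for exposition): consider ST2 of two D
   registers with 16-bit lanes (laneSzBlg2 == 1), with inputs
   u0 == .. A3 A2 A1 A0 and u1 == .. B3 B2 B1 B0 in their low halves.
   The doubler (Iop_InterleaveLO16x8) clones each lane:
      du0 == A3 A3 A2 A2 A1 A1 A0 A0
      du1 == B3 B3 B2 B2 B1 B1 B0 B0
   A full-width interleave at the next lane size up (32x4) then moves
   each cloned pair around as a single unit, so the low half of the
   first result is
      di0 == .. B1 B1 A1 A1 B0 B0 A0 A0
   and the halver (Iop_CatEvenLanes16x8) drops one copy of each pair,
   leaving .. B1 A1 B0 A0 in the low 64 bits, with junk above, which
   the caller ignores. */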
4261 /* Helper function -- get doubling and narrowing operations. */
4262 static
4263 void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
4264 /*OUT*/IROp* halver,
4265 UInt laneSzBlg2 )
4267 switch (laneSzBlg2) {
4268 case 2:
4269 *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
4270 break;
4271 case 1:
4272 *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
4273 break;
4274 case 0:
4275 *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
4276 break;
4277 default:
4278 vassert(0);
4282 /* Do interleaving for 1 64 bit vector, for ST1 insns. */
4283 static
4284 void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
4285 UInt laneSzBlg2, IRTemp u0 )
4287 assign(*i0, mkexpr(u0));
4291 /* Do interleaving for 2 64 bit vectors, for ST2 insns. */
4292 static
4293 void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
4294 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
4296 if (laneSzBlg2 == 3) {
4297 // 1x64, degenerate case
4298 assign(*i0, EX(u0));
4299 assign(*i1, EX(u1));
4300 return;
4303 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4304 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4305 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4307 IRTemp du0 = newTempV128();
4308 IRTemp du1 = newTempV128();
4309 assign(du0, binop(doubler, EX(u0), EX(u0)));
4310 assign(du1, binop(doubler, EX(u1), EX(u1)));
4311 IRTemp di0 = newTempV128();
4312 IRTemp di1 = newTempV128();
4313 math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
4314 assign(*i0, binop(halver, EX(di0), EX(di0)));
4315 assign(*i1, binop(halver, EX(di1), EX(di1)));
4319 /* Do interleaving for 3 64 bit vectors, for ST3 insns. */
4320 static
4321 void math_INTERLEAVE3_64(
4322 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
4323 UInt laneSzBlg2,
4324 IRTemp u0, IRTemp u1, IRTemp u2 )
4326 if (laneSzBlg2 == 3) {
4327 // 1x64, degenerate case
4328 assign(*i0, EX(u0));
4329 assign(*i1, EX(u1));
4330 assign(*i2, EX(u2));
4331 return;
4334 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4335 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4336 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4338 IRTemp du0 = newTempV128();
4339 IRTemp du1 = newTempV128();
4340 IRTemp du2 = newTempV128();
4341 assign(du0, binop(doubler, EX(u0), EX(u0)));
4342 assign(du1, binop(doubler, EX(u1), EX(u1)));
4343 assign(du2, binop(doubler, EX(u2), EX(u2)));
4344 IRTemp di0 = newTempV128();
4345 IRTemp di1 = newTempV128();
4346 IRTemp di2 = newTempV128();
4347 math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
4348 assign(*i0, binop(halver, EX(di0), EX(di0)));
4349 assign(*i1, binop(halver, EX(di1), EX(di1)));
4350 assign(*i2, binop(halver, EX(di2), EX(di2)));
4354 /* Do interleaving for 4 64 bit vectors, for ST4 insns. */
4355 static
4356 void math_INTERLEAVE4_64(
4357 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
4358 UInt laneSzBlg2,
4359 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
4361 if (laneSzBlg2 == 3) {
4362 // 1x64, degenerate case
4363 assign(*i0, EX(u0));
4364 assign(*i1, EX(u1));
4365 assign(*i2, EX(u2));
4366 assign(*i3, EX(u3));
4367 return;
4370 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4371 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4372 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4374 IRTemp du0 = newTempV128();
4375 IRTemp du1 = newTempV128();
4376 IRTemp du2 = newTempV128();
4377 IRTemp du3 = newTempV128();
4378 assign(du0, binop(doubler, EX(u0), EX(u0)));
4379 assign(du1, binop(doubler, EX(u1), EX(u1)));
4380 assign(du2, binop(doubler, EX(u2), EX(u2)));
4381 assign(du3, binop(doubler, EX(u3), EX(u3)));
4382 IRTemp di0 = newTempV128();
4383 IRTemp di1 = newTempV128();
4384 IRTemp di2 = newTempV128();
4385 IRTemp di3 = newTempV128();
4386 math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
4387 laneSzBlg2 + 1, du0, du1, du2, du3);
4388 assign(*i0, binop(halver, EX(di0), EX(di0)));
4389 assign(*i1, binop(halver, EX(di1), EX(di1)));
4390 assign(*i2, binop(halver, EX(di2), EX(di2)));
4391 assign(*i3, binop(halver, EX(di3), EX(di3)));
4395 /* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
4396 static
4397 void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
4398 UInt laneSzBlg2, IRTemp i0 )
4400 assign(*u0, mkexpr(i0));
4404 /* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
4405 static
4406 void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
4407 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
4409 if (laneSzBlg2 == 3) {
4410 // 1x64, degenerate case
4411 assign(*u0, EX(i0));
4412 assign(*u1, EX(i1));
4413 return;
4416 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4417 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4418 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4420 IRTemp di0 = newTempV128();
4421 IRTemp di1 = newTempV128();
4422 assign(di0, binop(doubler, EX(i0), EX(i0)));
4423 assign(di1, binop(doubler, EX(i1), EX(i1)));
4425 IRTemp du0 = newTempV128();
4426 IRTemp du1 = newTempV128();
4427 math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
4428 assign(*u0, binop(halver, EX(du0), EX(du0)));
4429 assign(*u1, binop(halver, EX(du1), EX(du1)));
4433 /* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
4434 static
4435 void math_DEINTERLEAVE3_64(
4436 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4437 UInt laneSzBlg2,
4438 IRTemp i0, IRTemp i1, IRTemp i2 )
4440 if (laneSzBlg2 == 3) {
4441 // 1x64, degenerate case
4442 assign(*u0, EX(i0));
4443 assign(*u1, EX(i1));
4444 assign(*u2, EX(i2));
4445 return;
4448 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4449 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4450 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4452 IRTemp di0 = newTempV128();
4453 IRTemp di1 = newTempV128();
4454 IRTemp di2 = newTempV128();
4455 assign(di0, binop(doubler, EX(i0), EX(i0)));
4456 assign(di1, binop(doubler, EX(i1), EX(i1)));
4457 assign(di2, binop(doubler, EX(i2), EX(i2)));
4458 IRTemp du0 = newTempV128();
4459 IRTemp du1 = newTempV128();
4460 IRTemp du2 = newTempV128();
4461 math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
4462 assign(*u0, binop(halver, EX(du0), EX(du0)));
4463 assign(*u1, binop(halver, EX(du1), EX(du1)));
4464 assign(*u2, binop(halver, EX(du2), EX(du2)));
4468 /* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
4469 static
4470 void math_DEINTERLEAVE4_64(
4471 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4472 UInt laneSzBlg2,
4473 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4475 if (laneSzBlg2 == 3) {
4476 // 1x64, degenerate case
4477 assign(*u0, EX(i0));
4478 assign(*u1, EX(i1));
4479 assign(*u2, EX(i2));
4480 assign(*u3, EX(i3));
4481 return;
4484 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4485 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4486 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4488 IRTemp di0 = newTempV128();
4489 IRTemp di1 = newTempV128();
4490 IRTemp di2 = newTempV128();
4491 IRTemp di3 = newTempV128();
4492 assign(di0, binop(doubler, EX(i0), EX(i0)));
4493 assign(di1, binop(doubler, EX(i1), EX(i1)));
4494 assign(di2, binop(doubler, EX(i2), EX(i2)));
4495 assign(di3, binop(doubler, EX(i3), EX(i3)));
4496 IRTemp du0 = newTempV128();
4497 IRTemp du1 = newTempV128();
4498 IRTemp du2 = newTempV128();
4499 IRTemp du3 = newTempV128();
4500 math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
4501 laneSzBlg2 + 1, di0, di1, di2, di3);
4502 assign(*u0, binop(halver, EX(du0), EX(du0)));
4503 assign(*u1, binop(halver, EX(du1), EX(du1)));
4504 assign(*u2, binop(halver, EX(du2), EX(du2)));
4505 assign(*u3, binop(halver, EX(du3), EX(du3)));
4509 #undef EX
4510 #undef SL
4511 #undef ROR
4512 #undef ROL
4513 #undef SHR
4514 #undef SHL
4515 #undef ILO64x2
4516 #undef IHI64x2
4517 #undef ILO32x4
4518 #undef IHI32x4
4519 #undef ILO16x8
4520 #undef IHI16x8
4521 #undef ILO8x16
4522 #undef IHI8x16
4523 #undef CEV32x4
4524 #undef COD32x4
4525 #undef COD16x8
4526 #undef COD8x16
4527 #undef CEV8x16
4528 #undef AND
4529 #undef OR2
4530 #undef OR3
4531 #undef OR4
4534 /*------------------------------------------------------------*/
4535 /*--- Load and Store instructions ---*/
4536 /*------------------------------------------------------------*/
4538 /* Generate the EA for a "reg + reg" style amode. This is done from
4539 parts of the insn, but for sanity checking's sake it takes the whole
4540 insn. This appears to depend on insn[15:12], with opt=insn[15:13]
4541 and S=insn[12]:
4543 The possible forms, along with their opt:S values, are:
4544 011:0 Xn|SP + Xm
4545 111:0 Xn|SP + Xm
4546 011:1 Xn|SP + Xm * transfer_szB
4547 111:1 Xn|SP + Xm * transfer_szB
4548 010:0 Xn|SP + 32Uto64(Wm)
4549 010:1 Xn|SP + 32Uto64(Wm) * transfer_szB
4550 110:0 Xn|SP + 32Sto64(Wm)
4551 110:1 Xn|SP + 32Sto64(Wm) * transfer_szB
4553 Rm is insn[20:16]. Rn is insn[9:5]. Rt is insn[4:0]. Log2 of
4554 the transfer size is insn[23,31,30]. For integer loads/stores,
4555 insn[23] is zero, hence szLg2 can be at most 3 in such cases.
4557 If the decoding fails, it returns IRTemp_INVALID.
4559 isInt is True iff this decoding is for transfers to/from integer
4560 registers. If False it is for transfers to/from vector registers.
4562 static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
4564 UInt optS = SLICE_UInt(insn, 15, 12);
4565 UInt mm = SLICE_UInt(insn, 20, 16);
4566 UInt nn = SLICE_UInt(insn, 9, 5);
4567 UInt szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
4568 | SLICE_UInt(insn, 31, 30); // Log2 of the size
4570 buf[0] = 0;
4572 /* Sanity checks, that this really is a load/store insn. */
4573 if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
4574 goto fail;
4576 if (isInt
4577 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
4578 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
4579 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
4580 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
4581 goto fail;
4583 if (!isInt
4584 && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
4585 goto fail;
4587 /* Throw out non-verified but possibly valid cases. */
4588 switch (szLg2) {
4589 case BITS3(0,0,0): break; // 8 bit, valid for both int and vec
4590 case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
4591 case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
4592 case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
4593 case BITS3(1,0,0): // can only ever be valid for the vector case
4594 if (isInt) goto fail; else break;
4595 case BITS3(1,0,1): // these sizes are never valid
4596 case BITS3(1,1,0):
4597 case BITS3(1,1,1): goto fail;
4599 default: vassert(0);
4602 IRExpr* rhs = NULL;
4603 switch (optS) {
4604 case BITS4(1,1,1,0): goto fail; //ATC
4605 case BITS4(0,1,1,0):
4606 rhs = getIReg64orZR(mm);
4607 vex_sprintf(buf, "[%s, %s]",
4608 nameIReg64orZR(nn), nameIReg64orZR(mm));
4609 break;
4610 case BITS4(1,1,1,1): goto fail; //ATC
4611 case BITS4(0,1,1,1):
4612 rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
4613 vex_sprintf(buf, "[%s, %s lsl %u]",
4614 nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
4615 break;
4616 case BITS4(0,1,0,0):
4617 rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
4618 vex_sprintf(buf, "[%s, %s uxtx]",
4619 nameIReg64orZR(nn), nameIReg32orZR(mm));
4620 break;
4621 case BITS4(0,1,0,1):
4622 rhs = binop(Iop_Shl64,
4623 unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
4624 vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
4625 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4626 break;
4627 case BITS4(1,1,0,0):
4628 rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
4629 vex_sprintf(buf, "[%s, %s sxtx]",
4630 nameIReg64orZR(nn), nameIReg32orZR(mm));
4631 break;
4632 case BITS4(1,1,0,1):
4633 rhs = binop(Iop_Shl64,
4634 unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
4635 vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
4636 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4637 break;
4638 default:
4639 /* The rest appear to be genuinely invalid */
4640 goto fail;
4643 vassert(rhs);
4644 IRTemp res = newTemp(Ity_I64);
4645 assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
4646 return res;
4648 fail:
4649 vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
4650 return IRTemp_INVALID;
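/* Worked example (added for exposition): a 64-bit integer
   LDR Xt, [Xn, Xm, lsl #3] arrives here with optS == 0111 and
   szLg2 == 3, so the BITS4(0,1,1,1) case computes
      EA = Xn|SP + (Xm << 3)
   The uxtx/sxtx cases do the same, except that Wm is first zero- or
   sign-extended to 64 bits. */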
4654 /* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
4655 bits of DATAE :: Ity_I64. */
4656 static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
4658 IRExpr* addrE = mkexpr(addr);
4659 switch (szB) {
4660 case 8:
4661 storeLE(addrE, dataE);
4662 break;
4663 case 4:
4664 storeLE(addrE, unop(Iop_64to32, dataE));
4665 break;
4666 case 2:
4667 storeLE(addrE, unop(Iop_64to16, dataE));
4668 break;
4669 case 1:
4670 storeLE(addrE, unop(Iop_64to8, dataE));
4671 break;
4672 default:
4673 vassert(0);
4678 /* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
4679 placing the result in an Ity_I64 temporary. */
4680 static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
4682 IRTemp res = newTemp(Ity_I64);
4683 IRExpr* addrE = mkexpr(addr);
4684 switch (szB) {
4685 case 8:
4686 assign(res, loadLE(Ity_I64,addrE));
4687 break;
4688 case 4:
4689 assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
4690 break;
4691 case 2:
4692 assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
4693 break;
4694 case 1:
4695 assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
4696 break;
4697 default:
4698 vassert(0);
4700 return res;
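/* Usage sketch (for illustration; this mirrors how the immediate-
   offset LDR/STR case below uses these two helpers):

      IRTemp ta = newTemp(Ity_I64);
      assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
      if (isLD)
         putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
      else
         gen_narrowing_store(szB, ta, getIReg64orZR(tt));

   so a 2-byte store takes the low 16 bits of Xt|XZR, and a 2-byte
   load zero-extends into the full 64-bit register. */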
4704 /* Generate a "standard 7" name, from bitQ and size. But also
4705 allow ".1d" since that's occasionally useful. */
4706 static
4707 const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
4709 vassert(bitQ <= 1 && size <= 3);
4710 const HChar* nms[8]
4711 = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
4712 UInt ix = (bitQ << 2) | size;
4713 vassert(ix < 8);
4714 return nms[ix];
4718 static
4719 Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4720 const VexAbiInfo* abiinfo
4723 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
4725 /* ------------ LDR,STR (immediate, uimm12) ----------- */
4726 /* uimm12 is scaled by the transfer size
4728 31 29 26 21 9 4
4729 | | | | | |
4730 11 111 00100 imm12 nn tt STR Xt, [Xn|SP, #imm12 * 8]
4731 11 111 00101 imm12 nn tt LDR Xt, [Xn|SP, #imm12 * 8]
4733 10 111 00100 imm12 nn tt STR Wt, [Xn|SP, #imm12 * 4]
4734 10 111 00101 imm12 nn tt LDR Wt, [Xn|SP, #imm12 * 4]
4736 01 111 00100 imm12 nn tt STRH Wt, [Xn|SP, #imm12 * 2]
4737 01 111 00101 imm12 nn tt LDRH Wt, [Xn|SP, #imm12 * 2]
4739 00 111 00100 imm12 nn tt STRB Wt, [Xn|SP, #imm12 * 1]
4740 00 111 00101 imm12 nn tt LDRB Wt, [Xn|SP, #imm12 * 1]
4742 if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
4743 UInt szLg2 = INSN(31,30);
4744 UInt szB = 1 << szLg2;
4745 Bool isLD = INSN(22,22) == 1;
4746 UInt offs = INSN(21,10) * szB;
4747 UInt nn = INSN(9,5);
4748 UInt tt = INSN(4,0);
4749 IRTemp ta = newTemp(Ity_I64);
4750 assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
4751 if (nn == 31) { /* FIXME generate stack alignment check */ }
4752 vassert(szLg2 < 4);
4753 if (isLD) {
4754 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
4755 } else {
4756 gen_narrowing_store(szB, ta, getIReg64orZR(tt));
4758 const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
4759 const HChar* st_name[4] = { "strb", "strh", "str", "str" };
4760 DIP("%s %s, [%s, #%u]\n",
4761 (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
4762 nameIReg64orSP(nn), offs);
4763 return True;
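/* For example (added for exposition), with szLg2 == 3 (an X-register
   transfer) an imm12 field of 2 gives offs == 16, so the insn is
   shown as "ldr x<t>, [x<n>, #16]" / "str x<t>, [x<n>, #16]"; the
   unsigned immediate always scales by the transfer size. */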
4766 /* ------------ LDUR,STUR (immediate, simm9) ----------- */
4768 31 29 26 20 11 9 4
4769 | | | | | | |
4770 (at-Rn-then-Rn=EA) | | |
4771 sz 111 00000 0 imm9 01 Rn Rt STR Rt, [Xn|SP], #simm9
4772 sz 111 00001 0 imm9 01 Rn Rt LDR Rt, [Xn|SP], #simm9
4774 (at-EA-then-Rn=EA)
4775 sz 111 00000 0 imm9 11 Rn Rt STR Rt, [Xn|SP, #simm9]!
4776 sz 111 00001 0 imm9 11 Rn Rt LDR Rt, [Xn|SP, #simm9]!
4778 (at-EA)
4779 sz 111 00000 0 imm9 00 Rn Rt STR Rt, [Xn|SP, #simm9]
4780 sz 111 00001 0 imm9 00 Rn Rt LDR Rt, [Xn|SP, #simm9]
4782 simm9 is unscaled.
4784 The case 'wback && Rn == Rt && Rt != 31' is disallowed. In the
4785 load case this is because it would create two competing values for
4786 Rt. In the store case the reason is unclear, but the spec
4787 disallows it anyway.
4789 Stores are narrowing, loads are unsigned widening. sz encodes
4790 the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
4792 if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
4793 == BITS9(1,1,1, 0,0,0,0,0, 0)) {
4794 UInt szLg2 = INSN(31,30);
4795 UInt szB = 1 << szLg2;
4796 Bool isLoad = INSN(22,22) == 1;
4797 UInt imm9 = INSN(20,12);
4798 UInt nn = INSN(9,5);
4799 UInt tt = INSN(4,0);
4800 Bool wBack = INSN(10,10) == 1;
4801 UInt how = INSN(11,10);
4802 if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
4803 /* undecodable; fall through */
4804 } else {
4805 if (nn == 31) { /* FIXME generate stack alignment check */ }
4807 // Compute the transfer address TA and the writeback address WA.
4808 IRTemp tRN = newTemp(Ity_I64);
4809 assign(tRN, getIReg64orSP(nn));
4810 IRTemp tEA = newTemp(Ity_I64);
4811 Long simm9 = (Long)sx_to_64(imm9, 9);
4812 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
4814 IRTemp tTA = newTemp(Ity_I64);
4815 IRTemp tWA = newTemp(Ity_I64);
4816 switch (how) {
4817 case BITS2(0,1):
4818 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4819 case BITS2(1,1):
4820 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4821 case BITS2(0,0):
4822 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4823 default:
4824 vassert(0); /* NOTREACHED */
4827 /* Normally rN would be updated after the transfer. However, in
4828 the special cases typified by
4829 str x30, [sp,#-16]!
4830 str w1, [sp,#-32]!
4831 it is necessary to update SP before the transfer, (1)
4832 because Memcheck will otherwise complain about a write
4833 below the stack pointer, and (2) because the segfault
4834 stack extension mechanism will otherwise extend the stack
4835 only down to SP before the instruction, which might not be
4836 far enough, if the -16/-32 offset takes the actual access
4837 address to the next page.
4839 Bool earlyWBack
4840 = wBack && simm9 < 0 && (szB == 8 || szB == 4)
4841 && how == BITS2(1,1) && nn == 31 && !isLoad;
4843 if (wBack && earlyWBack)
4844 putIReg64orSP(nn, mkexpr(tEA));
4846 if (isLoad) {
4847 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
4848 } else {
4849 gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
4852 if (wBack && !earlyWBack)
4853 putIReg64orSP(nn, mkexpr(tEA));
4855 const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
4856 const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
4857 const HChar* fmt_str = NULL;
4858 switch (how) {
4859 case BITS2(0,1):
4860 fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
4861 break;
4862 case BITS2(1,1):
4863 fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
4864 break;
4865 case BITS2(0,0):
4866 fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
4867 break;
4868 default:
4869 vassert(0);
4871 DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
4872 nameIRegOrZR(szB == 8, tt),
4873 nameIReg64orSP(nn), simm9);
4874 return True;
4878 /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
4879 /* L==1 => mm==LD
4880 L==0 => mm==ST
4881 x==0 => 32 bit transfers, and zero extended loads
4882 x==1 => 64 bit transfers
4883 simm7 is scaled by the (single-register) transfer size
4885 (at-Rn-then-Rn=EA)
4886 x0 101 0001 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP], #imm
4888 (at-EA-then-Rn=EA)
4889 x0 101 0011 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]!
4891 (at-EA)
4892 x0 101 0010 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]
4894 UInt insn_30_23 = INSN(30,23);
4895 if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1)
4896 || insn_30_23 == BITS8(0,1,0,1,0,0,1,1)
4897 || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) {
4898 UInt bL = INSN(22,22);
4899 UInt bX = INSN(31,31);
4900 UInt bWBack = INSN(23,23);
4901 UInt rT1 = INSN(4,0);
4902 UInt rN = INSN(9,5);
4903 UInt rT2 = INSN(14,10);
4904 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
4905 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
4906 || (bL && rT1 == rT2)) {
4907 /* undecodable; fall through */
4908 } else {
4909 if (rN == 31) { /* FIXME generate stack alignment check */ }
4911 // Compute the transfer address TA and the writeback address WA.
4912 IRTemp tRN = newTemp(Ity_I64);
4913 assign(tRN, getIReg64orSP(rN));
4914 IRTemp tEA = newTemp(Ity_I64);
4915 simm7 = (bX ? 8 : 4) * simm7;
4916 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
4918 IRTemp tTA = newTemp(Ity_I64);
4919 IRTemp tWA = newTemp(Ity_I64);
4920 switch (INSN(24,23)) {
4921 case BITS2(0,1):
4922 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4923 case BITS2(1,1):
4924 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4925 case BITS2(1,0):
4926 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4927 default:
4928 vassert(0); /* NOTREACHED */
4931 /* Normally rN would be updated after the transfer. However, in
4932 the special case typified by
4933 stp x29, x30, [sp,#-112]!
4934 it is necessary to update SP before the transfer, (1)
4935 because Memcheck will otherwise complain about a write
4936 below the stack pointer, and (2) because the segfault
4937 stack extension mechanism will otherwise extend the stack
4938 only down to SP before the instruction, which might not be
4939 far enough, if the -112 offset takes the actual access
4940 address to the next page.
4942 Bool earlyWBack
4943 = bWBack && simm7 < 0
4944 && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0;
4946 if (bWBack && earlyWBack)
4947 putIReg64orSP(rN, mkexpr(tEA));
4949 /**/ if (bL == 1 && bX == 1) {
4950 // 64 bit load
4951 putIReg64orZR(rT1, loadLE(Ity_I64,
4952 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
4953 putIReg64orZR(rT2, loadLE(Ity_I64,
4954 binop(Iop_Add64,mkexpr(tTA),mkU64(8))));
4955 } else if (bL == 1 && bX == 0) {
4956 // 32 bit load
4957 putIReg32orZR(rT1, loadLE(Ity_I32,
4958 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
4959 putIReg32orZR(rT2, loadLE(Ity_I32,
4960 binop(Iop_Add64,mkexpr(tTA),mkU64(4))));
4961 } else if (bL == 0 && bX == 1) {
4962 // 64 bit store
4963 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
4964 getIReg64orZR(rT1));
4965 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)),
4966 getIReg64orZR(rT2));
4967 } else {
4968 vassert(bL == 0 && bX == 0);
4969 // 32 bit store
4970 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
4971 getIReg32orZR(rT1));
4972 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)),
4973 getIReg32orZR(rT2));
4976 if (bWBack && !earlyWBack)
4977 putIReg64orSP(rN, mkexpr(tEA));
4979 const HChar* fmt_str = NULL;
4980 switch (INSN(24,23)) {
4981 case BITS2(0,1):
4982 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
4983 break;
4984 case BITS2(1,1):
4985 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
4986 break;
4987 case BITS2(1,0):
4988 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
4989 break;
4990 default:
4991 vassert(0);
4993 DIP(fmt_str, bL == 0 ? "st" : "ld",
4994 nameIRegOrZR(bX == 1, rT1),
4995 nameIRegOrZR(bX == 1, rT2),
4996 nameIReg64orSP(rN), simm7);
4997 return True;
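/* For example (added for exposition), the pre-index store
      stp x29, x30, [sp, #-112]!
   quoted in the comment above has bX == 1; its imm7 field
   sign-extends to -14 and is then scaled by 8, giving simm7 == -112.
   Because it is a store with a negative offset, pre-indexed, and
   based on SP, it also takes the early-writeback path described
   above. */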
5001 /* -------- LDPSW (immediate, simm7) (INT REGS) -------- */
5002 /* Does 32 bit transfers which are sign extended to 64 bits.
5003 simm7 is scaled by the (single-register) transfer size
5005 (at-Rn-then-Rn=EA)
5006 01 101 0001 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP], #imm
5008 (at-EA-then-Rn=EA)
5009 01 101 0011 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]!
5011 (at-EA)
5012 01 101 0010 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]
5014 UInt insn_31_22 = INSN(31,22);
5015 if (insn_31_22 == BITS10(0,1,1,0,1,0,0,0,1,1)
5016 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,1,1)
5017 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,0,1)) {
5018 UInt bWBack = INSN(23,23);
5019 UInt rT1 = INSN(4,0);
5020 UInt rN = INSN(9,5);
5021 UInt rT2 = INSN(14,10);
5022 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5023 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
5024 || (rT1 == rT2)) {
5025 /* undecodable; fall through */
5026 } else {
5027 if (rN == 31) { /* FIXME generate stack alignment check */ }
5029 // Compute the transfer address TA and the writeback address WA.
5030 IRTemp tRN = newTemp(Ity_I64);
5031 assign(tRN, getIReg64orSP(rN));
5032 IRTemp tEA = newTemp(Ity_I64);
5033 simm7 = 4 * simm7;
5034 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5036 IRTemp tTA = newTemp(Ity_I64);
5037 IRTemp tWA = newTemp(Ity_I64);
5038 switch (INSN(24,23)) {
5039 case BITS2(0,1):
5040 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5041 case BITS2(1,1):
5042 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5043 case BITS2(1,0):
5044 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5045 default:
5046 vassert(0); /* NOTREACHED */
5049 // 32 bit load, sign extended to 64 bits
5050 putIReg64orZR(rT1, unop(Iop_32Sto64,
5051 loadLE(Ity_I32, binop(Iop_Add64,
5052 mkexpr(tTA),
5053 mkU64(0)))));
5054 putIReg64orZR(rT2, unop(Iop_32Sto64,
5055 loadLE(Ity_I32, binop(Iop_Add64,
5056 mkexpr(tTA),
5057 mkU64(4)))));
5058 if (bWBack)
5059 putIReg64orSP(rN, mkexpr(tEA));
5061 const HChar* fmt_str = NULL;
5062 switch (INSN(24,23)) {
5063 case BITS2(0,1):
5064 fmt_str = "ldpsw %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5065 break;
5066 case BITS2(1,1):
5067 fmt_str = "ldpsw %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5068 break;
5069 case BITS2(1,0):
5070 fmt_str = "ldpsw %s, %s, [%s, #%lld] (at-Rn)\n";
5071 break;
5072 default:
5073 vassert(0);
5075 DIP(fmt_str, nameIReg64orZR(rT1),
5076 nameIReg64orZR(rT2),
5077 nameIReg64orSP(rN), simm7);
5078 return True;
5082 /* ---------------- LDR (literal, int reg) ---------------- */
5083 /* 31 29 23 4
5084 00 011 000 imm19 Rt LDR Wt, [PC + sxTo64(imm19 << 2)]
5085 01 011 000 imm19 Rt LDR Xt, [PC + sxTo64(imm19 << 2)]
5086 10 011 000 imm19 Rt LDRSW Xt, [PC + sxTo64(imm19 << 2)]
5087 11 011 000 imm19 Rt prefetch [PC + sxTo64(imm19 << 2)]
5088 Just handles the first two cases for now.
5090 if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
5091 UInt imm19 = INSN(23,5);
5092 UInt rT = INSN(4,0);
5093 UInt bX = INSN(30,30);
5094 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5095 if (bX) {
5096 putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea)));
5097 } else {
5098 putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea)));
5100 DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea);
5101 return True;
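/* Note (added for exposition): imm19 is a word offset, so the
   effective address is PC + sign-extend(imm19 << 2), where the sign
   extension is from 21 bits; e.g. imm19 == 0x7FFFF (all ones) gives
   an offset of -4 bytes. */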
5104 /* -------------- {LD,ST}R (integer register) --------------- */
5105 /* 31 29 20 15 12 11 9 4
5106 | | | | | | | |
5107 11 111000011 Rm option S 10 Rn Rt LDR Xt, [Xn|SP, R<m>{ext/sh}]
5108 10 111000011 Rm option S 10 Rn Rt LDR Wt, [Xn|SP, R<m>{ext/sh}]
5109 01 111000011 Rm option S 10 Rn Rt LDRH Wt, [Xn|SP, R<m>{ext/sh}]
5110 00 111000011 Rm option S 10 Rn Rt LDRB Wt, [Xn|SP, R<m>{ext/sh}]
5112 11 111000001 Rm option S 10 Rn Rt STR Xt, [Xn|SP, R<m>{ext/sh}]
5113 10 111000001 Rm option S 10 Rn Rt STR Wt, [Xn|SP, R<m>{ext/sh}]
5114 01 111000001 Rm option S 10 Rn Rt STRH Wt, [Xn|SP, R<m>{ext/sh}]
5115 00 111000001 Rm option S 10 Rn Rt STRB Wt, [Xn|SP, R<m>{ext/sh}]
5117 if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
5118 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5119 HChar dis_buf[64];
5120 UInt szLg2 = INSN(31,30);
5121 Bool isLD = INSN(22,22) == 1;
5122 UInt tt = INSN(4,0);
5123 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5124 if (ea != IRTemp_INVALID) {
5125 switch (szLg2) {
5126 case 3: /* 64 bit */
5127 if (isLD) {
5128 putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea)));
5129 DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf);
5130 } else {
5131 storeLE(mkexpr(ea), getIReg64orZR(tt));
5132 DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf);
5134 break;
5135 case 2: /* 32 bit */
5136 if (isLD) {
5137 putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea)));
5138 DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf);
5139 } else {
5140 storeLE(mkexpr(ea), getIReg32orZR(tt));
5141 DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf);
5143 break;
5144 case 1: /* 16 bit */
5145 if (isLD) {
5146 putIReg64orZR(tt, unop(Iop_16Uto64,
5147 loadLE(Ity_I16, mkexpr(ea))));
5148 DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5149 } else {
5150 storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt)));
5151 DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5153 break;
5154 case 0: /* 8 bit */
5155 if (isLD) {
5156 putIReg64orZR(tt, unop(Iop_8Uto64,
5157 loadLE(Ity_I8, mkexpr(ea))));
5158 DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf);
5159 } else {
5160 storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt)));
5161 DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5163 break;
5164 default:
5165 vassert(0);
5167 return True;
5171 /* -------------- LDRS{B,H,W} (uimm12) -------------- */
5172 /* 31 29 26 23 21 9 4
5173 10 111 001 10 imm12 n t LDRSW Xt, [Xn|SP, #pimm12 * 4]
5174 01 111 001 1x imm12 n t LDRSH Rt, [Xn|SP, #pimm12 * 2]
5175 00 111 001 1x imm12 n t LDRSB Rt, [Xn|SP, #pimm12 * 1]
5176 where
5177 Rt is Wt when x==1, Xt when x==0
5179 if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
5180 /* Further checks on bits 31:30 and 22 */
5181 Bool valid = False;
5182 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5183 case BITS3(1,0,0):
5184 case BITS3(0,1,0): case BITS3(0,1,1):
5185 case BITS3(0,0,0): case BITS3(0,0,1):
5186 valid = True;
5187 break;
5189 if (valid) {
5190 UInt szLg2 = INSN(31,30);
5191 UInt bitX = INSN(22,22);
5192 UInt imm12 = INSN(21,10);
5193 UInt nn = INSN(9,5);
5194 UInt tt = INSN(4,0);
5195 UInt szB = 1 << szLg2;
5196 IRExpr* ea = binop(Iop_Add64,
5197 getIReg64orSP(nn), mkU64(imm12 * szB));
5198 switch (szB) {
5199 case 4:
5200 vassert(bitX == 0);
5201 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea)));
5202 DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt),
5203 nameIReg64orSP(nn), imm12 * szB);
5204 break;
5205 case 2:
5206 if (bitX == 1) {
5207 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea)));
5208 } else {
5209 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea)));
5211 DIP("ldrsh %s, [%s, #%u]\n",
5212 nameIRegOrZR(bitX == 0, tt),
5213 nameIReg64orSP(nn), imm12 * szB);
5214 break;
5215 case 1:
5216 if (bitX == 1) {
5217 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea)));
5218 } else {
5219 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea)));
5221 DIP("ldrsb %s, [%s, #%u]\n",
5222 nameIRegOrZR(bitX == 0, tt),
5223 nameIReg64orSP(nn), imm12 * szB);
5224 break;
5225 default:
5226 vassert(0);
5228 return True;
5230 /* else fall through */
5233 /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
5234 /* (at-Rn-then-Rn=EA)
5235 31 29 23 21 20 11 9 4
5236 00 111 000 1x 0 imm9 01 n t LDRSB Rt, [Xn|SP], #simm9
5237 01 111 000 1x 0 imm9 01 n t LDRSH Rt, [Xn|SP], #simm9
5238 10 111 000 10 0 imm9 01 n t LDRSW Xt, [Xn|SP], #simm9
5240 (at-EA-then-Rn=EA)
5241 00 111 000 1x 0 imm9 11 n t LDRSB Rt, [Xn|SP, #simm9]!
5242 01 111 000 1x 0 imm9 11 n t LDRSH Rt, [Xn|SP, #simm9]!
5243 10 111 000 10 0 imm9 11 n t LDRSW Xt, [Xn|SP, #simm9]!
5244 where
5245 Rt is Wt when x==1, Xt when x==0
5246 transfer-at-Rn when [11]==0, at EA when [11]==1
5248 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5249 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5250 /* Further checks on bits 31:30 and 22 */
5251 Bool valid = False;
5252 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5253 case BITS3(1,0,0): // LDRSW Xt
5254 case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
5255 case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
5256 valid = True;
5257 break;
5259 if (valid) {
5260 UInt szLg2 = INSN(31,30);
5261 UInt imm9 = INSN(20,12);
5262 Bool atRN = INSN(11,11) == 0;
5263 UInt nn = INSN(9,5);
5264 UInt tt = INSN(4,0);
5265 IRTemp tRN = newTemp(Ity_I64);
5266 IRTemp tEA = newTemp(Ity_I64);
5267 IRTemp tTA = IRTemp_INVALID;
5268 ULong simm9 = sx_to_64(imm9, 9);
5269 Bool is64 = INSN(22,22) == 0;
5270 assign(tRN, getIReg64orSP(nn));
5271 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5272 tTA = atRN ? tRN : tEA;
5273 HChar ch = '?';
5274 /* There are 5 cases:
5275 byte load, SX to 64
5276 byte load, SX to 32, ZX to 64
5277 halfword load, SX to 64
5278 halfword load, SX to 32, ZX to 64
5279 word load, SX to 64
5280 The ifs below handle them in the listed order.
5282 if (szLg2 == 0) {
5283 ch = 'b';
5284 if (is64) {
5285 putIReg64orZR(tt, unop(Iop_8Sto64,
5286 loadLE(Ity_I8, mkexpr(tTA))));
5287 } else {
5288 putIReg32orZR(tt, unop(Iop_8Sto32,
5289 loadLE(Ity_I8, mkexpr(tTA))));
5292 else if (szLg2 == 1) {
5293 ch = 'h';
5294 if (is64) {
5295 putIReg64orZR(tt, unop(Iop_16Sto64,
5296 loadLE(Ity_I16, mkexpr(tTA))));
5297 } else {
5298 putIReg32orZR(tt, unop(Iop_16Sto32,
5299 loadLE(Ity_I16, mkexpr(tTA))));
5302 else if (szLg2 == 2 && is64) {
5303 ch = 'w';
5304 putIReg64orZR(tt, unop(Iop_32Sto64,
5305 loadLE(Ity_I32, mkexpr(tTA))));
5307 else {
5308 vassert(0);
5310 putIReg64orSP(nn, mkexpr(tEA));
5311 DIP(atRN ? "ldrs%c %s, [%s], #%llu\n" : "ldrs%c %s, [%s, #%llu]!\n",
5312 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9);
5313 return True;
5315 /* else fall through */
5318 /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */
5319 /* 31 29 23 21 20 11 9 4
5320 00 111 000 1x 0 imm9 00 n t LDURSB Rt, [Xn|SP, #simm9]
5321 01 111 000 1x 0 imm9 00 n t LDURSH Rt, [Xn|SP, #simm9]
5322 10 111 000 10 0 imm9 00 n t LDURSW Xt, [Xn|SP, #simm9]
5323 where
5324 Rt is Wt when x==1, Xt when x==0
5326 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5327 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5328 /* Further checks on bits 31:30 and 22 */
5329 Bool valid = False;
5330 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5331 case BITS3(1,0,0): // LDURSW Xt
5332 case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
5333 case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
5334 valid = True;
5335 break;
5337 if (valid) {
5338 UInt szLg2 = INSN(31,30);
5339 UInt imm9 = INSN(20,12);
5340 UInt nn = INSN(9,5);
5341 UInt tt = INSN(4,0);
5342 IRTemp tRN = newTemp(Ity_I64);
5343 IRTemp tEA = newTemp(Ity_I64);
5344 ULong simm9 = sx_to_64(imm9, 9);
5345 Bool is64 = INSN(22,22) == 0;
5346 assign(tRN, getIReg64orSP(nn));
5347 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5348 HChar ch = '?';
5349 /* There are 5 cases:
5350 byte load, SX to 64
5351 byte load, SX to 32, ZX to 64
5352 halfword load, SX to 64
5353 halfword load, SX to 32, ZX to 64
5354 word load, SX to 64
5355 The ifs below handle them in the listed order.
5357 if (szLg2 == 0) {
5358 ch = 'b';
5359 if (is64) {
5360 putIReg64orZR(tt, unop(Iop_8Sto64,
5361 loadLE(Ity_I8, mkexpr(tEA))));
5362 } else {
5363 putIReg32orZR(tt, unop(Iop_8Sto32,
5364 loadLE(Ity_I8, mkexpr(tEA))));
5367 else if (szLg2 == 1) {
5368 ch = 'h';
5369 if (is64) {
5370 putIReg64orZR(tt, unop(Iop_16Sto64,
5371 loadLE(Ity_I16, mkexpr(tEA))));
5372 } else {
5373 putIReg32orZR(tt, unop(Iop_16Sto32,
5374 loadLE(Ity_I16, mkexpr(tEA))));
5377 else if (szLg2 == 2 && is64) {
5378 ch = 'w';
5379 putIReg64orZR(tt, unop(Iop_32Sto64,
5380 loadLE(Ity_I32, mkexpr(tEA))));
5382 else {
5383 vassert(0);
5385 DIP("ldurs%c %s, [%s, #%lld]",
5386 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
5387 return True;
5389 /* else fall through */
5392 /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
5393 /* L==1 => mm==LD
5394 L==0 => mm==ST
5395 sz==00 => 32 bit (S) transfers
5396 sz==01 => 64 bit (D) transfers
5397 sz==10 => 128 bit (Q) transfers
5398 sz==11 isn't allowed
5399 simm7 is scaled by the (single-register) transfer size
5401 31 29 26 22 21 14 9 4
5403 sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm]
5404 (at-EA, with nontemporal hint)
5406 sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
5407 (at-Rn-then-Rn=EA)
5409 sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]
5410 (at-EA)
5412 sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
5413 (at-EA-then-Rn=EA)
5415 if (INSN(29,25) == BITS5(1,0,1,1,0)) {
5416 UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
5417 Bool isLD = INSN(22,22) == 1;
5418 Bool wBack = INSN(23,23) == 1;
5419 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5420 UInt tt2 = INSN(14,10);
5421 UInt nn = INSN(9,5);
5422 UInt tt1 = INSN(4,0);
5423 if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) {
5424 /* undecodable; fall through */
5425 } else {
5426 if (nn == 31) { /* FIXME generate stack alignment check */ }
5428 // Compute the transfer address TA and the writeback address WA.
5429 UInt szB = 4 << szSlg2; /* szB is the per-register size */
5430 IRTemp tRN = newTemp(Ity_I64);
5431 assign(tRN, getIReg64orSP(nn));
5432 IRTemp tEA = newTemp(Ity_I64);
5433 simm7 = szB * simm7;
5434 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5436 IRTemp tTA = newTemp(Ity_I64);
5437 IRTemp tWA = newTemp(Ity_I64);
5438 switch (INSN(24,23)) {
5439 case BITS2(0,1):
5440 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5441 case BITS2(1,1):
5442 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5443 case BITS2(1,0):
5444 case BITS2(0,0):
5445 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5446 default:
5447 vassert(0); /* NOTREACHED */
5450 IRType ty = Ity_INVALID;
5451 switch (szB) {
5452 case 4: ty = Ity_F32; break;
5453 case 8: ty = Ity_F64; break;
5454 case 16: ty = Ity_V128; break;
5455 default: vassert(0);
5458 /* Normally rN would be updated after the transfer. However, in
5459 the special cases typified by
5460 stp q0, q1, [sp,#-512]!
5461 stp d0, d1, [sp,#-512]!
5462 stp s0, s1, [sp,#-512]!
5463 it is necessary to update SP before the transfer, (1)
5464 because Memcheck will otherwise complain about a write
5465 below the stack pointer, and (2) because the segfault
5466 stack extension mechanism will otherwise extend the stack
5467 only down to SP before the instruction, which might not be
5468 far enough, if the -512 offset takes the actual access
5469 address to the next page.
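// Worked example (illustrative only): for "stp q0, q1, [sp, #-512]!"
// with SP initially 0x4000, the two stores go to 0x3E00 and 0x3E10,
// which may lie on a page below the one SP currently points into.
// Writing SP = 0x3E00 before emitting the stores keeps both Memcheck
// and the stack-extension logic in step with the access addresses.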
5471 Bool earlyWBack
5472 = wBack && simm7 < 0
5473 && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD;
5475 if (wBack && earlyWBack)
5476 putIReg64orSP(nn, mkexpr(tEA));
5478 if (isLD) {
5479 if (szB < 16) {
5480 putQReg128(tt1, mkV128(0x0000));
5482 putQRegLO(tt1,
5483 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0))));
5484 if (szB < 16) {
5485 putQReg128(tt2, mkV128(0x0000));
5487 putQRegLO(tt2,
5488 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB))));
5489 } else {
5490 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)),
5491 getQRegLO(tt1, ty));
5492 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)),
5493 getQRegLO(tt2, ty));
5496 if (wBack && !earlyWBack)
5497 putIReg64orSP(nn, mkexpr(tEA));
5499 const HChar* fmt_str = NULL;
5500 switch (INSN(24,23)) {
5501 case BITS2(0,1):
5502 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5503 break;
5504 case BITS2(1,1):
5505 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5506 break;
5507 case BITS2(1,0):
5508 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5509 break;
5510 case BITS2(0,0):
5511 fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
5512 break;
5513 default:
5514 vassert(0);
5516 DIP(fmt_str, isLD ? "ld" : "st",
5517 nameQRegLO(tt1, ty), nameQRegLO(tt2, ty),
5518 nameIReg64orSP(nn), simm7);
5519 return True;
5523 /* -------------- {LD,ST}R (vector register) --------------- */
5524 /* 31 29 23 20 15 12 11 9 4
5525 | | | | | | | | |
5526 00 111100 011 Rm option S 10 Rn Rt LDR Bt, [Xn|SP, R<m>{ext/sh}]
5527 01 111100 011 Rm option S 10 Rn Rt LDR Ht, [Xn|SP, R<m>{ext/sh}]
5528 10 111100 011 Rm option S 10 Rn Rt LDR St, [Xn|SP, R<m>{ext/sh}]
5529 11 111100 011 Rm option S 10 Rn Rt LDR Dt, [Xn|SP, R<m>{ext/sh}]
5530 00 111100 111 Rm option S 10 Rn Rt LDR Qt, [Xn|SP, R<m>{ext/sh}]
5532 00 111100 001 Rm option S 10 Rn Rt STR Bt, [Xn|SP, R<m>{ext/sh}]
5533 01 111100 001 Rm option S 10 Rn Rt STR Ht, [Xn|SP, R<m>{ext/sh}]
5534 10 111100 001 Rm option S 10 Rn Rt STR St, [Xn|SP, R<m>{ext/sh}]
5535 11 111100 001 Rm option S 10 Rn Rt STR Dt, [Xn|SP, R<m>{ext/sh}]
5536 00 111100 101 Rm option S 10 Rn Rt STR Qt, [Xn|SP, R<m>{ext/sh}]
5538 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5539 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5540 HChar dis_buf[64];
5541 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5542 Bool isLD = INSN(22,22) == 1;
5543 UInt tt = INSN(4,0);
5544 if (szLg2 > 4) goto after_LDR_STR_vector_register;
5545 IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
5546 if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
5547 switch (szLg2) {
5548 case 0: /* 8 bit */
5549 if (isLD) {
5550 putQReg128(tt, mkV128(0x0000));
5551 putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea)));
5552 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5553 } else {
5554 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8));
5555 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5557 break;
5558 case 1:
5559 if (isLD) {
5560 putQReg128(tt, mkV128(0x0000));
5561 putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea)));
5562 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5563 } else {
5564 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16));
5565 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5567 break;
5568 case 2: /* 32 bit */
5569 if (isLD) {
5570 putQReg128(tt, mkV128(0x0000));
5571 putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea)));
5572 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5573 } else {
5574 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32));
5575 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5577 break;
5578 case 3: /* 64 bit */
5579 if (isLD) {
5580 putQReg128(tt, mkV128(0x0000));
5581 putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea)));
5582 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5583 } else {
5584 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64));
5585 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5587 break;
5588 case 4:
5589 if (isLD) {
5590 putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
5591 DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
5592 } else {
5593 storeLE(mkexpr(ea), getQReg128(tt));
5594 DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
5596 break;
5597 default:
5598 vassert(0);
5600 return True;
5602 after_LDR_STR_vector_register:
5604 /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
5605 /* 31 29 22 20 15 12 11 9 4
5606 | | | | | | | | |
5607 10 1110001 01 Rm opt S 10 Rn Rt LDRSW Xt, [Xn|SP, R<m>{ext/sh}]
5609 01 1110001 01 Rm opt S 10 Rn Rt LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
5610 01 1110001 11 Rm opt S 10 Rn Rt LDRSH Wt, [Xn|SP, R<m>{ext/sh}]
5612 00 1110001 01 Rm opt S 10 Rn Rt LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
5613 00 1110001 11 Rm opt S 10 Rn Rt LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
5615 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5616 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5617 HChar dis_buf[64];
5618 UInt szLg2 = INSN(31,30);
5619 Bool sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64
5620 UInt tt = INSN(4,0);
5621 if (szLg2 == 3) goto after_LDRS_integer_register;
5622 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5623 if (ea == IRTemp_INVALID) goto after_LDRS_integer_register;
5624 /* Enumerate the 5 variants explicitly. */
5625 if (szLg2 == 2/*32 bit*/ && sxTo64) {
5626 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea))));
5627 DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf);
5628 return True;
5630 else
5631 if (szLg2 == 1/*16 bit*/) {
5632 if (sxTo64) {
5633 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea))));
5634 DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf);
5635 } else {
5636 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea))));
5637 DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5639 return True;
5641 else
5642 if (szLg2 == 0/*8 bit*/) {
5643 if (sxTo64) {
5644 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea))));
5645 DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf);
5646 } else {
5647 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea))));
5648 DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5650 return True;
5652 /* else it's an invalid combination */
5654 after_LDRS_integer_register:
5656 /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
5657 /* This is the Unsigned offset variant only. The Post-Index and
5658 Pre-Index variants are below.
5660 31 29 23 21 9 4
5661 00 111 101 01 imm12 n t LDR Bt, [Xn|SP + imm12 * 1]
5662 01 111 101 01 imm12 n t LDR Ht, [Xn|SP + imm12 * 2]
5663 10 111 101 01 imm12 n t LDR St, [Xn|SP + imm12 * 4]
5664 11 111 101 01 imm12 n t LDR Dt, [Xn|SP + imm12 * 8]
5665 00 111 101 11 imm12 n t LDR Qt, [Xn|SP + imm12 * 16]
5667 00 111 101 00 imm12 n t STR Bt, [Xn|SP + imm12 * 1]
5668 01 111 101 00 imm12 n t STR Ht, [Xn|SP + imm12 * 2]
5669 10 111 101 00 imm12 n t STR St, [Xn|SP + imm12 * 4]
5670 11 111 101 00 imm12 n t STR Dt, [Xn|SP + imm12 * 8]
5671 00 111 101 10 imm12 n t STR Qt, [Xn|SP + imm12 * 16]
5673 if (INSN(29,24) == BITS6(1,1,1,1,0,1)
5674 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
5675 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5676 Bool isLD = INSN(22,22) == 1;
5677 UInt pimm12 = INSN(21,10) << szLg2;
5678 UInt nn = INSN(9,5);
5679 UInt tt = INSN(4,0);
5680 IRTemp tEA = newTemp(Ity_I64);
5681 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5682 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12)));
5683 if (isLD) {
5684 if (szLg2 < 4) {
5685 putQReg128(tt, mkV128(0x0000));
5687 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5688 } else {
5689 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5691 DIP("%s %s, [%s, #%u]\n",
5692 isLD ? "ldr" : "str",
5693 nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12);
5694 return True;
5697 /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
5698 /* These are the Post-Index and Pre-Index variants.
5700 31 29 23 20 11 9 4
5701 (at-Rn-then-Rn=EA)
5702 00 111 100 01 0 imm9 01 n t LDR Bt, [Xn|SP], #simm
5703 01 111 100 01 0 imm9 01 n t LDR Ht, [Xn|SP], #simm
5704 10 111 100 01 0 imm9 01 n t LDR St, [Xn|SP], #simm
5705 11 111 100 01 0 imm9 01 n t LDR Dt, [Xn|SP], #simm
5706 00 111 100 11 0 imm9 01 n t LDR Qt, [Xn|SP], #simm
5708 (at-EA-then-Rn=EA)
5709 00 111 100 01 0 imm9 11 n t LDR Bt, [Xn|SP, #simm]!
5710 01 111 100 01 0 imm9 11 n t LDR Ht, [Xn|SP, #simm]!
5711 10 111 100 01 0 imm9 11 n t LDR St, [Xn|SP, #simm]!
5712 11 111 100 01 0 imm9 11 n t LDR Dt, [Xn|SP, #simm]!
5713 00 111 100 11 0 imm9 11 n t LDR Qt, [Xn|SP, #simm]!
5715 Stores are the same except with bit 22 set to 0.
5717 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5718 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5719 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5720 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5721 Bool isLD = INSN(22,22) == 1;
5722 UInt imm9 = INSN(20,12);
5723 Bool atRN = INSN(11,11) == 0;
5724 UInt nn = INSN(9,5);
5725 UInt tt = INSN(4,0);
5726 IRTemp tRN = newTemp(Ity_I64);
5727 IRTemp tEA = newTemp(Ity_I64);
5728 IRTemp tTA = IRTemp_INVALID;
5729 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5730 ULong simm9 = sx_to_64(imm9, 9);
5731 assign(tRN, getIReg64orSP(nn));
5732 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5733 tTA = atRN ? tRN : tEA;
5735 /* Do early writeback for the cases typified by
5736 str d8, [sp, #-32]!
5737 str d10, [sp, #-128]!
5738 str q1, [sp, #-32]!
5739 for the same reasons as described in a similar comment in the
5740 "LDP,STP (immediate, simm7) (FP&VEC)" case just above.
5742 Bool earlyWBack
5743 = !atRN && !isLD && (ty == Ity_F64 || ty == Ity_V128)
5744 && nn == 31 && ((Long)simm9) < 0;
5746 if (earlyWBack)
5747 putIReg64orSP(nn, mkexpr(tEA));
5749 if (isLD) {
5750 if (szLg2 < 4) {
5751 putQReg128(tt, mkV128(0x0000));
5753 putQRegLO(tt, loadLE(ty, mkexpr(tTA)));
5754 } else {
5755 storeLE(mkexpr(tTA), getQRegLO(tt, ty));
5758 if (!earlyWBack)
5759 putIReg64orSP(nn, mkexpr(tEA));
5761 DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
5762 isLD ? "ldr" : "str",
5763 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5764 return True;
5767 /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
5768 /* 31 29 23 20 11 9 4
5769 00 111 100 01 0 imm9 00 n t LDR Bt, [Xn|SP, #simm]
5770 01 111 100 01 0 imm9 00 n t LDR Ht, [Xn|SP, #simm]
5771 10 111 100 01 0 imm9 00 n t LDR St, [Xn|SP, #simm]
5772 11 111 100 01 0 imm9 00 n t LDR Dt, [Xn|SP, #simm]
5773 00 111 100 11 0 imm9 00 n t LDR Qt, [Xn|SP, #simm]
5775 00 111 100 00 0 imm9 00 n t STR Bt, [Xn|SP, #simm]
5776 01 111 100 00 0 imm9 00 n t STR Ht, [Xn|SP, #simm]
5777 10 111 100 00 0 imm9 00 n t STR St, [Xn|SP, #simm]
5778 11 111 100 00 0 imm9 00 n t STR Dt, [Xn|SP, #simm]
5779 00 111 100 10 0 imm9 00 n t STR Qt, [Xn|SP, #simm]
5781 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5782 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5783 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5784 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5785 Bool isLD = INSN(22,22) == 1;
5786 UInt imm9 = INSN(20,12);
5787 UInt nn = INSN(9,5);
5788 UInt tt = INSN(4,0);
5789 ULong simm9 = sx_to_64(imm9, 9);
5790 IRTemp tEA = newTemp(Ity_I64);
5791 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5792 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
5793 if (isLD) {
5794 if (szLg2 < 4) {
5795 putQReg128(tt, mkV128(0x0000));
5797 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5798 } else {
5799 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5801 DIP("%s %s, [%s, #%lld]\n",
5802 isLD ? "ldur" : "stur",
5803 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5804 return True;
5807 /* ---------------- LDR (literal, SIMD&FP) ---------------- */
5808 /* 31 29 23 4
5809 00 011 100 imm19 t LDR St, [PC + sxTo64(imm19 << 2)]
5810 01 011 100 imm19 t LDR Dt, [PC + sxTo64(imm19 << 2)]
5811 10 011 100 imm19 t LDR Qt, [PC + sxTo64(imm19 << 2)]
5813 if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
5814 UInt szB = 4 << INSN(31,30);
5815 UInt imm19 = INSN(23,5);
5816 UInt tt = INSN(4,0);
5817 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5818 IRType ty = preferredVectorSubTypeFromSize(szB);
5819 putQReg128(tt, mkV128(0x0000));
5820 putQRegLO(tt, loadLE(ty, mkU64(ea)));
5821 DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
5822 return True;
5825 /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg ------ */
5826 /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */
5827 /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */
5828 /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
5829 /* 31 29 26 22 21 20 15 11 9 4
5831 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
5832 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
5834 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
5835 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
5837 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
5838 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
5840 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
5841 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
5843 T = defined by Q and sz in the normal way
5844 step = if m == 11111 then transfer-size else Xm
5845 xx = case L of 1 -> LD ; 0 -> ST
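// Post-index example (illustrative): "ld4 {v0.16b-v3.16b}, [x1], #64"
// has Q=1, nRegs=4 and m=11111, so after the transfer X1 is advanced
// by the transfer size (isQ ? 16 : 8) * nRegs == 64; with a register
// form "..., x2" it would be advanced by X2 instead.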
5847 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
5848 && INSN(21,21) == 0) {
5849 Bool bitQ = INSN(30,30);
5850 Bool isPX = INSN(23,23) == 1;
5851 Bool isLD = INSN(22,22) == 1;
5852 UInt mm = INSN(20,16);
5853 UInt opc = INSN(15,12);
5854 UInt sz = INSN(11,10);
5855 UInt nn = INSN(9,5);
5856 UInt tt = INSN(4,0);
5857 Bool isQ = bitQ == 1;
5858 Bool is1d = sz == BITS2(1,1) && !isQ;
5859 UInt nRegs = 0;
5860 switch (opc) {
5861 case BITS4(0,0,0,0): nRegs = 4; break;
5862 case BITS4(0,1,0,0): nRegs = 3; break;
5863 case BITS4(1,0,0,0): nRegs = 2; break;
5864 case BITS4(0,1,1,1): nRegs = 1; break;
5865 default: break;
5868 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
5869 If we see it, set nRegs to 0 so as to cause the next conditional
5870 to fail. */
5871 if (!isPX && mm != 0)
5872 nRegs = 0;
5874 if (nRegs == 1 /* .1d is allowed */
5875 || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
5877 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
5879 /* Generate the transfer address (TA) and if necessary the
5880 writeback address (WB) */
5881 IRTemp tTA = newTemp(Ity_I64);
5882 assign(tTA, getIReg64orSP(nn));
5883 if (nn == 31) { /* FIXME generate stack alignment check */ }
5884 IRTemp tWB = IRTemp_INVALID;
5885 if (isPX) {
5886 tWB = newTemp(Ity_I64);
5887 assign(tWB, binop(Iop_Add64,
5888 mkexpr(tTA),
5889 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
5890 : getIReg64orZR(mm)));
5893 /* -- BEGIN generate the transfers -- */
5895 IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
5896 u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
5897 switch (nRegs) {
5898 case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
5899 case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
5900 case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
5901 case 1: u0 = newTempV128(); i0 = newTempV128(); break;
5902 default: vassert(0);
5905 /* -- Multiple 128 or 64 bit stores -- */
5906 if (!isLD) {
5907 switch (nRegs) {
5908 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
5909 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
5910 case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
5911 case 1: assign(u0, getQReg128((tt+0) % 32)); break;
5912 default: vassert(0);
5914 switch (nRegs) {
5915 case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
5916 (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
5917 break;
5918 case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
5919 (&i0, &i1, &i2, sz, u0, u1, u2);
5920 break;
5921 case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
5922 (&i0, &i1, sz, u0, u1);
5923 break;
5924 case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
5925 (&i0, sz, u0);
5926 break;
5927 default: vassert(0);
5929 # define MAYBE_NARROW_TO_64(_expr) \
5930 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
5931 UInt step = isQ ? 16 : 8;
5932 switch (nRegs) {
5933 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
5934 MAYBE_NARROW_TO_64(mkexpr(i3)) );
5935 /* fallthru */
5936 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
5937 MAYBE_NARROW_TO_64(mkexpr(i2)) );
5938 /* fallthru */
5939 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
5940 MAYBE_NARROW_TO_64(mkexpr(i1)) );
5941 /* fallthru */
5942 case 1: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
5943 MAYBE_NARROW_TO_64(mkexpr(i0)) );
5944 break;
5945 default: vassert(0);
5947 # undef MAYBE_NARROW_TO_64
5950 /* -- Multiple 128 or 64 bit loads -- */
5951 else /* isLD */ {
5952 UInt step = isQ ? 16 : 8;
5953 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
5954 # define MAYBE_WIDEN_FROM_64(_expr) \
5955 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
5956 switch (nRegs) {
5957 case 4:
5958 assign(i3, MAYBE_WIDEN_FROM_64(
5959 loadLE(loadTy,
5960 binop(Iop_Add64, mkexpr(tTA),
5961 mkU64(3 * step)))));
5962 /* fallthru */
5963 case 3:
5964 assign(i2, MAYBE_WIDEN_FROM_64(
5965 loadLE(loadTy,
5966 binop(Iop_Add64, mkexpr(tTA),
5967 mkU64(2 * step)))));
5968 /* fallthru */
5969 case 2:
5970 assign(i1, MAYBE_WIDEN_FROM_64(
5971 loadLE(loadTy,
5972 binop(Iop_Add64, mkexpr(tTA),
5973 mkU64(1 * step)))));
5974 /* fallthru */
5975 case 1:
5976 assign(i0, MAYBE_WIDEN_FROM_64(
5977 loadLE(loadTy,
5978 binop(Iop_Add64, mkexpr(tTA),
5979 mkU64(0 * step)))));
5980 break;
5981 default:
5982 vassert(0);
5984 # undef MAYBE_WIDEN_FROM_64
5985 switch (nRegs) {
5986 case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
5987 (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
5988 break;
5989 case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
5990 (&u0, &u1, &u2, sz, i0, i1, i2);
5991 break;
5992 case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
5993 (&u0, &u1, sz, i0, i1);
5994 break;
5995 case 1: (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
5996 (&u0, sz, i0);
5997 break;
5998 default: vassert(0);
6000 switch (nRegs) {
6001 case 4: putQReg128( (tt+3) % 32,
6002 math_MAYBE_ZERO_HI64(bitQ, u3));
6003 /* fallthru */
6004 case 3: putQReg128( (tt+2) % 32,
6005 math_MAYBE_ZERO_HI64(bitQ, u2));
6006 /* fallthru */
6007 case 2: putQReg128( (tt+1) % 32,
6008 math_MAYBE_ZERO_HI64(bitQ, u1));
6009 /* fallthru */
6010 case 1: putQReg128( (tt+0) % 32,
6011 math_MAYBE_ZERO_HI64(bitQ, u0));
6012 break;
6013 default: vassert(0);
6017 /* -- END generate the transfers -- */
6019 /* Do the writeback, if necessary */
6020 if (isPX) {
6021 putIReg64orSP(nn, mkexpr(tWB));
6024 HChar pxStr[20];
6025 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6026 if (isPX) {
6027 if (mm == BITS5(1,1,1,1,1))
6028 vex_sprintf(pxStr, ", #%u", xferSzB);
6029 else
6030 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6032 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6033 DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
6034 isLD ? "ld" : "st", nRegs,
6035 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6036 pxStr);
6038 return True;
6040 /* else fall through */
6043 /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs ------ */
6044 /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs ------ */
6045 /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs ------ */
6046 /* 31 29 26 22 21 20 15 11 9 4
6048 0q 001 1000 L 0 00000 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP]
6049 0q 001 1001 L 0 m 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP], step
6051 0q 001 1000 L 0 00000 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP]
6052 0q 001 1001 L 0 m 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP], step
6054 0q 001 1000 L 0 00000 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP]
6055 0q 001 1001 L 0 m 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP], step
6057 T = defined by Q and sz in the normal way
6058 step = if m == 11111 then transfer-size else Xm
6059 xx = case L of 1 -> LD ; 0 -> ST
6061 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6062 && INSN(21,21) == 0) {
6063 Bool bitQ = INSN(30,30);
6064 Bool isPX = INSN(23,23) == 1;
6065 Bool isLD = INSN(22,22) == 1;
6066 UInt mm = INSN(20,16);
6067 UInt opc = INSN(15,12);
6068 UInt sz = INSN(11,10);
6069 UInt nn = INSN(9,5);
6070 UInt tt = INSN(4,0);
6071 Bool isQ = bitQ == 1;
6072 UInt nRegs = 0;
6073 switch (opc) {
6074 case BITS4(0,0,1,0): nRegs = 4; break;
6075 case BITS4(0,1,1,0): nRegs = 3; break;
6076 case BITS4(1,0,1,0): nRegs = 2; break;
6077 default: break;
6080 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6081 If we see it, set nRegs to 0 so as to cause the next conditional
6082 to fail. */
6083 if (!isPX && mm != 0)
6084 nRegs = 0;
6086 if (nRegs >= 2 && nRegs <= 4) {
6088 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
6090 /* Generate the transfer address (TA) and if necessary the
6091 writeback address (WB) */
6092 IRTemp tTA = newTemp(Ity_I64);
6093 assign(tTA, getIReg64orSP(nn));
6094 if (nn == 31) { /* FIXME generate stack alignment check */ }
6095 IRTemp tWB = IRTemp_INVALID;
6096 if (isPX) {
6097 tWB = newTemp(Ity_I64);
6098 assign(tWB, binop(Iop_Add64,
6099 mkexpr(tTA),
6100 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6101 : getIReg64orZR(mm)));
6104 /* -- BEGIN generate the transfers -- */
6106 IRTemp u0, u1, u2, u3;
6107 u0 = u1 = u2 = u3 = IRTemp_INVALID;
6108 switch (nRegs) {
6109 case 4: u3 = newTempV128(); /* fallthru */
6110 case 3: u2 = newTempV128(); /* fallthru */
6111 case 2: u1 = newTempV128();
6112 u0 = newTempV128(); break;
6113 default: vassert(0);
6116 /* -- Multiple 128 or 64 bit stores -- */
6117 if (!isLD) {
6118 switch (nRegs) {
6119 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
6120 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
6121 case 2: assign(u1, getQReg128((tt+1) % 32));
6122 assign(u0, getQReg128((tt+0) % 32)); break;
6123 default: vassert(0);
6125 # define MAYBE_NARROW_TO_64(_expr) \
6126 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6127 UInt step = isQ ? 16 : 8;
6128 switch (nRegs) {
6129 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
6130 MAYBE_NARROW_TO_64(mkexpr(u3)) );
6131 /* fallthru */
6132 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
6133 MAYBE_NARROW_TO_64(mkexpr(u2)) );
6134 /* fallthru */
6135 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
6136 MAYBE_NARROW_TO_64(mkexpr(u1)) );
6137 storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
6138 MAYBE_NARROW_TO_64(mkexpr(u0)) );
6139 break;
6140 default: vassert(0);
6142 # undef MAYBE_NARROW_TO_64
6145 /* -- Multiple 128 or 64 bit loads -- */
6146 else /* isLD */ {
6147 UInt step = isQ ? 16 : 8;
6148 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
6149 # define MAYBE_WIDEN_FROM_64(_expr) \
6150 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6151 switch (nRegs) {
6152 case 4:
6153 assign(u3, MAYBE_WIDEN_FROM_64(
6154 loadLE(loadTy,
6155 binop(Iop_Add64, mkexpr(tTA),
6156 mkU64(3 * step)))));
6157 /* fallthru */
6158 case 3:
6159 assign(u2, MAYBE_WIDEN_FROM_64(
6160 loadLE(loadTy,
6161 binop(Iop_Add64, mkexpr(tTA),
6162 mkU64(2 * step)))));
6163 /* fallthru */
6164 case 2:
6165 assign(u1, MAYBE_WIDEN_FROM_64(
6166 loadLE(loadTy,
6167 binop(Iop_Add64, mkexpr(tTA),
6168 mkU64(1 * step)))));
6169 assign(u0, MAYBE_WIDEN_FROM_64(
6170 loadLE(loadTy,
6171 binop(Iop_Add64, mkexpr(tTA),
6172 mkU64(0 * step)))));
6173 break;
6174 default:
6175 vassert(0);
6177 # undef MAYBE_WIDEN_FROM_64
6178 switch (nRegs) {
6179 case 4: putQReg128( (tt+3) % 32,
6180 math_MAYBE_ZERO_HI64(bitQ, u3));
6181 /* fallthru */
6182 case 3: putQReg128( (tt+2) % 32,
6183 math_MAYBE_ZERO_HI64(bitQ, u2));
6184 /* fallthru */
6185 case 2: putQReg128( (tt+1) % 32,
6186 math_MAYBE_ZERO_HI64(bitQ, u1));
6187 putQReg128( (tt+0) % 32,
6188 math_MAYBE_ZERO_HI64(bitQ, u0));
6189 break;
6190 default: vassert(0);
6194 /* -- END generate the transfers -- */
6196 /* Do the writeback, if necessary */
6197 if (isPX) {
6198 putIReg64orSP(nn, mkexpr(tWB));
6201 HChar pxStr[20];
6202 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6203 if (isPX) {
6204 if (mm == BITS5(1,1,1,1,1))
6205 vex_sprintf(pxStr, ", #%u", xferSzB);
6206 else
6207 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6209 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6210 DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
6211 isLD ? "ld" : "st",
6212 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6213 pxStr);
6215 return True;
6217 /* else fall through */
6220 /* ---------- LD1R (single structure, replicate) ---------- */
6221 /* ---------- LD2R (single structure, replicate) ---------- */
6222 /* ---------- LD3R (single structure, replicate) ---------- */
6223 /* ---------- LD4R (single structure, replicate) ---------- */
6224 /* 31 29 22 20 15 11 9 4
6225 0q 001 1010 10 00000 110 0 sz n t LD1R {Vt.T}, [Xn|SP]
6226 0q 001 1011 10 m 110 0 sz n t LD1R {Vt.T}, [Xn|SP], step
6228 0q 001 1010 11 00000 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP]
6229 0q 001 1011 11 m 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP], step
6231 0q 001 1010 10 00000 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP]
6232 0q 001 1011 10 m 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP], step
6234 0q 001 1010 11 00000 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP]
6235 0q 001 1011 11 m 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP], step
6237 step = if m == 11111 then transfer-size else Xm
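// Illustrative example: "ld2r {v4.8h, v5.8h}, [x0]" loads the 16-bit
// elements at [x0] and [x0+2], replicates each across every lane of
// v4 and v5 respectively, and (since Q=1 here) keeps all 128 bits;
// the .4h form would instead zero the upper 64 bits of each register.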
6239 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
6240 && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
6241 && INSN(12,12) == 0) {
6242 UInt bitQ = INSN(30,30);
6243 Bool isPX = INSN(23,23) == 1;
6244 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6245 UInt mm = INSN(20,16);
6246 UInt sz = INSN(11,10);
6247 UInt nn = INSN(9,5);
6248 UInt tt = INSN(4,0);
6250 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6251 if (isPX || mm == 0) {
6253 IRType ty = integerIRTypeOfSize(1 << sz);
6255 UInt laneSzB = 1 << sz;
6256 UInt xferSzB = laneSzB * nRegs;
6258 /* Generate the transfer address (TA) and if necessary the
6259 writeback address (WB) */
6260 IRTemp tTA = newTemp(Ity_I64);
6261 assign(tTA, getIReg64orSP(nn));
6262 if (nn == 31) { /* FIXME generate stack alignment check */ }
6263 IRTemp tWB = IRTemp_INVALID;
6264 if (isPX) {
6265 tWB = newTemp(Ity_I64);
6266 assign(tWB, binop(Iop_Add64,
6267 mkexpr(tTA),
6268 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6269 : getIReg64orZR(mm)));
6272 /* Do the writeback, if necessary */
6273 if (isPX) {
6274 putIReg64orSP(nn, mkexpr(tWB));
6277 IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
6278 e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
6279 switch (nRegs) {
6280 case 4:
6281 e3 = newTemp(ty);
6282 assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6283 mkU64(3 * laneSzB))));
6284 v3 = math_DUP_TO_V128(e3, ty);
6285 putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
6286 /* fallthrough */
6287 case 3:
6288 e2 = newTemp(ty);
6289 assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6290 mkU64(2 * laneSzB))));
6291 v2 = math_DUP_TO_V128(e2, ty);
6292 putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
6293 /* fallthrough */
6294 case 2:
6295 e1 = newTemp(ty);
6296 assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6297 mkU64(1 * laneSzB))));
6298 v1 = math_DUP_TO_V128(e1, ty);
6299 putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
6300 /* fallthrough */
6301 case 1:
6302 e0 = newTemp(ty);
6303 assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6304 mkU64(0 * laneSzB))));
6305 v0 = math_DUP_TO_V128(e0, ty);
6306 putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
6307 break;
6308 default:
6309 vassert(0);
6312 HChar pxStr[20];
6313 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6314 if (isPX) {
6315 if (mm == BITS5(1,1,1,1,1))
6316 vex_sprintf(pxStr, ", #%u", xferSzB);
6317 else
6318 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6320 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6321 DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
6322 nRegs,
6323 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6324 pxStr);
6326 return True;
6328 /* else fall through */
6331 /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
6332 /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
6333 /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
6334 /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
6335 /* 31 29 22 21 20 15 11 9 4
6336 0q 001 1010 L 0 00000 xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP]
6337 0q 001 1011 L 0 m xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP], step
6339 0q 001 1010 L 1 00000 xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP]
6340 0q 001 1011 L 1 m xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP], step
6342 0q 001 1010 L 0 00000 xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP]
6343 0q 001 1011 L 0 m xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP], step
6345 0q 001 1010 L 1 00000 xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP]
6346 0q 001 1011 L 1 m xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP], step
6348 step = if m == 11111 then transfer-size else Xm
6349 op = case L of 1 -> LD ; 0 -> ST
6351 laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
6352 01:b:b:b0 -> 2, bbb
6353 10:b:b:00 -> 4, bb
6354 10:b:0:01 -> 8, b
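// Decode example (illustrative): for a byte lane (xx == 00) the index
// is simply q:S:sz read as a 4-bit number, so "ld1 {v2.b}[13], [x0]"
// has q=1, S=1, sz=01 and hence ix == 13 with laneSzB == 1.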
6356 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
6357 UInt bitQ = INSN(30,30);
6358 Bool isPX = INSN(23,23) == 1;
6359 Bool isLD = INSN(22,22) == 1;
6360 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6361 UInt mm = INSN(20,16);
6362 UInt xx = INSN(15,14);
6363 UInt bitS = INSN(12,12);
6364 UInt sz = INSN(11,10);
6365 UInt nn = INSN(9,5);
6366 UInt tt = INSN(4,0);
6368 Bool valid = True;
6370 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6371 if (!isPX && mm != 0)
6372 valid = False;
6374 UInt laneSzB = 0; /* invalid */
6375 UInt ix = 16; /* invalid */
6377 UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
6378 switch (xx_q_S_sz) {
6379 case 0x00: case 0x01: case 0x02: case 0x03:
6380 case 0x04: case 0x05: case 0x06: case 0x07:
6381 case 0x08: case 0x09: case 0x0A: case 0x0B:
6382 case 0x0C: case 0x0D: case 0x0E: case 0x0F:
6383 laneSzB = 1; ix = xx_q_S_sz & 0xF;
6384 break;
6385 case 0x10: case 0x12: case 0x14: case 0x16:
6386 case 0x18: case 0x1A: case 0x1C: case 0x1E:
6387 laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
6388 break;
6389 case 0x20: case 0x24: case 0x28: case 0x2C:
6390 laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
6391 break;
6392 case 0x21: case 0x29:
6393 laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
6394 break;
6395 default:
6396 break;
6399 if (valid && laneSzB != 0) {
6401 IRType ty = integerIRTypeOfSize(laneSzB);
6402 UInt xferSzB = laneSzB * nRegs;
6404 /* Generate the transfer address (TA) and if necessary the
6405 writeback address (WB) */
6406 IRTemp tTA = newTemp(Ity_I64);
6407 assign(tTA, getIReg64orSP(nn));
6408 if (nn == 31) { /* FIXME generate stack alignment check */ }
6409 IRTemp tWB = IRTemp_INVALID;
6410 if (isPX) {
6411 tWB = newTemp(Ity_I64);
6412 assign(tWB, binop(Iop_Add64,
6413 mkexpr(tTA),
6414 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6415 : getIReg64orZR(mm)));
6418 /* Do the writeback, if necessary */
6419 if (isPX) {
6420 putIReg64orSP(nn, mkexpr(tWB));
6423 switch (nRegs) {
6424 case 4: {
6425 IRExpr* addr
6426 = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
6427 if (isLD) {
6428 putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
6429 } else {
6430 storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
6433 /* fallthrough */
6434 case 3: {
6435 IRExpr* addr
6436 = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
6437 if (isLD) {
6438 putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
6439 } else {
6440 storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
6443 /* fallthrough */
6444 case 2: {
6445 IRExpr* addr
6446 = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
6447 if (isLD) {
6448 putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
6449 } else {
6450 storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
6453 /* fallthrough */
6454 case 1: {
6455 IRExpr* addr
6456 = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
6457 if (isLD) {
6458 putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
6459 } else {
6460 storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
6462 break;
6464 default:
6465 vassert(0);
6468 HChar pxStr[20];
6469 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6470 if (isPX) {
6471 if (mm == BITS5(1,1,1,1,1))
6472 vex_sprintf(pxStr, ", #%u", xferSzB);
6473 else
6474 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6476 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6477 DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
6478 isLD ? "ld" : "st", nRegs,
6479 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
6480 ix, nameIReg64orSP(nn), pxStr);
6482 return True;
6484 /* else fall through */
6487 /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
6488 /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
6489 /* 31 29 23 20 14 9 4
6490 sz 001000 010 11111 0 11111 n t LDX{R,RH,RB} Rt, [Xn|SP]
6491 sz 001000 010 11111 1 11111 n t LDAX{R,RH,RB} Rt, [Xn|SP]
6492 sz 001000 000 s 0 11111 n t STX{R,RH,RB} Ws, Rt, [Xn|SP]
6493 sz 001000 000 s 1 11111 n t STLX{R,RH,RB} Ws, Rt, [Xn|SP]
6495 /* For the "standard" implementation we pass through the LL and SC to
6496 the host. For the "fallback" implementation, for details see
6497 https://bugs.kde.org/show_bug.cgi?id=344524 and
6498 https://bugs.kde.org/show_bug.cgi?id=369459,
6499 but in short:
6501 LoadLinked(addr)
6502 gs.LLsize = load_size // 1, 2, 4 or 8
6503 gs.LLaddr = addr
6504 gs.LLdata = zeroExtend(*addr)
6506 StoreCond(addr, data)
6507 tmp_LLsize = gs.LLsize
6508 gs.LLsize = 0 // "no transaction"
6509 if tmp_LLsize != store_size -> fail
6510 if addr != gs.LLaddr -> fail
6511 if zeroExtend(*addr) != gs.LLdata -> fail
6512 cas_ok = CAS(store_size, addr, gs.LLdata -> data)
6513 if !cas_ok -> fail
6514 succeed
6516 When thread scheduled
6517 gs.LLsize = 0 // "no transaction"
6518 (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
6519 has to do this bit)
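// As a concrete sketch of the fallback path implemented below, an
// "stxr w0, w1, [x2]" is lowered roughly as:
//    W0 = 1                                   // assume failure
//    exit-to-next-insn if LLsize != 4
//    exit-to-next-insn if X2 != LLaddr
//    exit-to-next-insn if 32-bit load at X2 != LLdata
//    CAS(X2: LLdata -> W1); exit-to-next-insn if the CAS failed
//    W0 = 0                                   // success
// with LLsize cleared to 0 ("no transaction") before the checks.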
6521 if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
6522 && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
6523 && INSN(14,10) == BITS5(1,1,1,1,1)) {
6524 UInt szBlg2 = INSN(31,30);
6525 Bool isLD = INSN(22,22) == 1;
6526 Bool isAcqOrRel = INSN(15,15) == 1;
6527 UInt ss = INSN(20,16);
6528 UInt nn = INSN(9,5);
6529 UInt tt = INSN(4,0);
6531 vassert(szBlg2 < 4);
6532 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6533 IRType ty = integerIRTypeOfSize(szB);
6534 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6536 IRTemp ea = newTemp(Ity_I64);
6537 assign(ea, getIReg64orSP(nn));
6538 /* FIXME generate check that ea is szB-aligned */
6540 if (isLD && ss == BITS5(1,1,1,1,1)) {
6541 IRTemp res = newTemp(ty);
6542 if (abiinfo->guest__use_fallback_LLSC) {
6543 // Do the load first so we don't update any guest state
6544 // if it faults.
6545 IRTemp loaded_data64 = newTemp(Ity_I64);
6546 assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
6547 stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
6548 stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6549 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
6550 putIReg64orZR(tt, mkexpr(loaded_data64));
6551 } else {
6552 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
6553 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6555 if (isAcqOrRel) {
6556 stmt(IRStmt_MBE(Imbe_Fence));
6558 DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6559 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6560 abiinfo->guest__use_fallback_LLSC
6561 ? "(fallback implementation)" : "");
6562 return True;
6564 if (!isLD) {
6565 if (isAcqOrRel) {
6566 stmt(IRStmt_MBE(Imbe_Fence));
6568 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6569 if (abiinfo->guest__use_fallback_LLSC) {
6570 // This is really ugly, since we don't have any way to do
6571 // proper if-then-else. First, set up as if the SC failed,
6572 // and jump forwards if it really has failed.
6574 // Continuation address
6575 IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
6577 // "the SC failed". Any non-zero value means failure.
6578 putIReg64orZR(ss, mkU64(1));
6580 IRTemp tmp_LLsize = newTemp(Ity_I64);
6581 assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
6582 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
6584 // Fail if no or wrong-size transaction
6585 vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
6586 stmt( IRStmt_Exit(
6587 binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
6588 Ijk_Boring, nia, OFFB_PC
6590 // Fail if the address doesn't match the LL address
6591 stmt( IRStmt_Exit(
6592 binop(Iop_CmpNE64, mkexpr(ea),
6593 IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
6594 Ijk_Boring, nia, OFFB_PC
6596 // Fail if the data doesn't match the LL data
6597 IRTemp llsc_data64 = newTemp(Ity_I64);
6598 assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
6599 stmt( IRStmt_Exit(
6600 binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
6601 mkexpr(llsc_data64)),
6602 Ijk_Boring, nia, OFFB_PC
6604 // Try to CAS the new value in.
6605 IRTemp old = newTemp(ty);
6606 IRTemp expd = newTemp(ty);
6607 assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
6608 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6609 Iend_LE, mkexpr(ea),
6610 /*expdHi*/NULL, mkexpr(expd),
6611 /*dataHi*/NULL, data
6612 )));
6613 // Fail if the CAS failed (viz, old != expd)
6614 stmt( IRStmt_Exit(
6615 binop(Iop_CmpNE64,
6616 widenUto64(ty, mkexpr(old)),
6617 widenUto64(ty, mkexpr(expd))),
6618 Ijk_Boring, nia, OFFB_PC
6620 // Otherwise we succeeded (!)
6621 putIReg64orZR(ss, mkU64(0));
6622 } else {
6623 IRTemp res = newTemp(Ity_I1);
6624 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
6625 /* IR semantics: res is 1 if store succeeds, 0 if it fails.
6626 Need to set rS to 1 on failure, 0 on success. */
6627 putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
6628 mkU64(1)));
6630 DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6631 nameIRegOrZR(False, ss),
6632 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6633 abiinfo->guest__use_fallback_LLSC
6634 ? "(fallback implementation)" : "");
6635 return True;
6637 /* else fall through */
6640 /* ------------------ LDA{R,RH,RB} ------------------ */
6641 /* ------------------ STL{R,RH,RB} ------------------ */
6642 /* 31 29 23 20 14 9 4
6643 sz 001000 110 11111 1 11111 n t LDAR<sz> Rt, [Xn|SP]
6644 sz 001000 100 11111 1 11111 n t STLR<sz> Rt, [Xn|SP]
6646 if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
6647 && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
6648 UInt szBlg2 = INSN(31,30);
6649 Bool isLD = INSN(22,22) == 1;
6650 UInt nn = INSN(9,5);
6651 UInt tt = INSN(4,0);
6653 vassert(szBlg2 < 4);
6654 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6655 IRType ty = integerIRTypeOfSize(szB);
6656 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6658 IRTemp ea = newTemp(Ity_I64);
6659 assign(ea, getIReg64orSP(nn));
6660 /* FIXME generate check that ea is szB-aligned */
6662 if (isLD) {
6663 IRTemp res = newTemp(ty);
6664 assign(res, loadLE(ty, mkexpr(ea)));
6665 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6666 stmt(IRStmt_MBE(Imbe_Fence));
6667 DIP("lda%s %s, [%s]\n", suffix[szBlg2],
6668 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6669 } else {
6670 stmt(IRStmt_MBE(Imbe_Fence));
6671 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6672 storeLE(mkexpr(ea), data);
6673 DIP("stl%s %s, [%s]\n", suffix[szBlg2],
6674 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6676 return True;
6679 /* The PRFM cases that follow may allow Rt values (the
6680 prefetch operation) which are not allowed by the documentation.
6681 This should be looked into. */
6682 /* ------------------ PRFM (immediate) ------------------ */
6683 /* 31 21 9 4
6684 11 111 00110 imm12 n t PRFM prfop=Rt, [Xn|SP, #pimm]
6686 if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
6687 UInt imm12 = INSN(21,10);
6688 UInt nn = INSN(9,5);
6689 UInt tt = INSN(4,0);
6690 /* Generating any IR here is pointless, except for documentation
6691 purposes, as it will get optimised away later. */
6692 IRTemp ea = newTemp(Ity_I64);
6693 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
6694 DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
6695 return True;
6698 /* ------------------ PRFM (register) ------------------ */
6699 /* 31 29 22 20 15 12 11 9 4
6700 11 1110001 01 Rm opt S 10 Rn Rt PRFM prfop=Rt, [Xn|SP, R<m>{ext/sh}]
6702 if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
6703 && INSN(11,10) == BITS2(1,0)) {
6704 HChar dis_buf[64];
6705 UInt tt = INSN(4,0);
6706 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
6707 if (ea != IRTemp_INVALID) {
6708 /* No actual code to generate. */
6709 DIP("prfm prfop=%u, %s\n", tt, dis_buf);
6710 return True;
6714 /* ------------------ PRFM (unscaled offset) ------------------ */
6715 /* 31 29 22 20 11 9 4
6716 11 1110001 00 imm9 00 Rn Rt PRFM prfop=Rt, [Xn|SP, #simm]
6718 if (INSN(31,21) == BITS11(1,1, 1,1,1,0,0,0,1, 0,0)
6719 && INSN(11,10) == BITS2(0,0)) {
6720 ULong imm9 = INSN(20,12);
6721 UInt nn = INSN(9,5);
6722 UInt tt = INSN(4,0);
6723 ULong offset = sx_to_64(imm9, 9);
6724 IRTemp ea = newTemp(Ity_I64);
6725 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offset)));
6726 /* No actual code to generate. */
6727 DIP("prfum prfop=%u, [%s, #0x%llx]\n", tt, nameIReg64orSP(nn), offset);
6728 return True;
6731 /* ---------------- ARMv8.1-LSE: Atomic Memory Operations ---------------- */
6732 /* 31 29 23 22 21 20 15 11 9 4
6733 sz 111000 A R 1 s 0000 00 n t LDADD{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6734 sz 111000 A R 1 s 0001 00 n t LDCLR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6735 sz 111000 A R 1 s 0010 00 n t LDEOR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6736 sz 111000 A R 1 s 0011 00 n t LDSET{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6737 sz 111000 A R 1 s 0100 00 n t LDSMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6738 sz 111000 A R 1 s 0101 00 n t LDSMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6739 sz 111000 A R 1 s 0110 00 n t LDUMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6740 sz 111000 A R 1 s 0111 00 n t LDUMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6741 sz 111000 A R 1 s 1000 00 n t SWP{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6743 if (INSN(29,24) == BITS6(1,1,1,0,0,0)
6744 && INSN(21,21) == 1
6745 && (INSN(15,12) <= BITS4(1,0,0,0))
6746 && INSN(11,10) == BITS2(0,0)) {
6747 UInt szBlg2 = INSN(31,30);
6748 Bool isAcq = INSN(23,23) == 1;
6749 Bool isRel = INSN(22,22) == 1;
6750 UInt ss = INSN(20,16);
6751 UInt opc = INSN(15,12);
6752 UInt nn = INSN(9,5);
6753 UInt tt = INSN(4,0);
6755 const HChar* nm = NULL;
6756 const HChar* suffix[4] = { "b", "h", "", "" };
6758 vassert(szBlg2 < 4);
6759 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 bytes*/
6760 IRType ty = integerIRTypeOfSize(szB);
6761 Bool is64 = szB == 8;
6762 Bool isSigned = (opc == 4) || (opc == 5) /*smax || smin*/;
6764 // IR used to emulate these atomic memory ops:
6765 // 1) barrier
6766 // 2) load
6767 // 3) widen operands and do arithmetic/logic op
6768 // 4) cas to see if target memory updated
6769 // 5) barrier
6770 // 6) repeat from 1) if cas says target memory not updated
6771 // 7) update register
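// Illustrative sketch of the scheme above: "ldadd w5, w7, [x2]" loads
// the old 32-bit value at [x2], computes old + W5, tries to CAS the
// sum back in, branches back to this same instruction if the CAS saw
// interference, and finally writes the old value to W7.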
6773 IRTemp ea = newTemp(Ity_I64);
6774 assign(ea, getIReg64orSP(nn));
6776 // Insert barrier before loading for acquire and acquire-release variants:
6777 // A and AL.
6778 if (isAcq && (tt != 31))
6779 stmt(IRStmt_MBE(Imbe_Fence));
6781 // Load LHS from memory, RHS from register.
6782 IRTemp orig = newTemp(ty);
6783 assign(orig, loadLE(ty, mkexpr(ea)));
6784 IRExpr *lhs = mkexpr(orig);
6785 IRExpr *rhs = narrowFrom64(ty, getIReg64orZR(ss));
6786 IRExpr *res = NULL;
6788 lhs = isSigned ? widenSto64(ty, lhs) : widenUto64(ty, lhs);
6789 rhs = isSigned ? widenSto64(ty, rhs) : widenUto64(ty, rhs);
6791 // Perform the operation.
6792 switch (opc) {
6793 case 0:
6794 nm = "ldadd";
6795 res = binop(Iop_Add64, lhs, rhs);
6796 break;
6797 case 1:
6798 nm = "ldclr";
6799 res = binop(Iop_And64, lhs, unop(mkNOT(Ity_I64), rhs));
6800 break;
6801 case 2:
6802 nm = "ldeor";
6803 res = binop(Iop_Xor64, lhs, rhs);
6804 break;
6805 case 3:
6806 nm = "ldset";
6807 res = binop(Iop_Or64, lhs, rhs);
6808 break;
6809 case 4:
6810 nm = "ldsmax";
6811 res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), rhs, lhs);
6812 break;
6813 case 5:
6814 nm = "ldsmin";
6815 res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), lhs, rhs);
6816 break;
6817 case 6:
6818 nm = "ldumax";
6819 res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), rhs, lhs);
6820 break;
6821 case 7:
6822 nm = "ldumin";
6823 res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), lhs, rhs);
6824 break;
6825 case 8:
6826 nm = "swp";
6827 res = lhs;
6828 break;
6829 default:
6830 vassert(0);
6831 break;
6834 // Store the result back if LHS remains unchanged in memory.
6835 IRTemp old = newTemp(ty);
6836 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6837 Iend_LE, mkexpr(ea),
6838 /*expdHi*/NULL, mkexpr(orig),
6839 /*dataHi*/NULL, narrowFrom64(ty, res))) );
6841 // Insert barrier after storing for release and acquire-release variants:
6842 // L and AL.
6843 if (isRel)
6844 stmt(IRStmt_MBE(Imbe_Fence));
6846 // Retry if the CAS failed (i.e. when old != orig).
6847 IRConst* nia = IRConst_U64(guest_PC_curr_instr);
6848 stmt( IRStmt_Exit(
6849 binop(Iop_CasCmpNE64,
6850 widenUto64(ty, mkexpr(old)),
6851 widenUto64(ty, mkexpr(orig))),
6852 Ijk_Boring, nia, OFFB_PC ));
6853 // Otherwise we succeeded.
6854 putIReg64orZR(tt, widenUto64(ty, mkexpr(old)));
6856 DIP("%s%s%s%s %s, %s, [%s]\n", nm, isAcq ? "a" : "", isRel ? "l" : "",
6857 suffix[szBlg2], nameIRegOrZR(is64, ss), nameIRegOrZR(is64, tt),
6858 nameIReg64orSP(nn));
6859 return True;
6862 vex_printf("ARM64 front end: load_store\n");
6863 return False;
6864 # undef INSN
6868 /*------------------------------------------------------------*/
6869 /*--- Control flow and misc instructions ---*/
6870 /*------------------------------------------------------------*/
6872 static
6873 Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
6874 const VexArchInfo* archinfo,
6875 const VexAbiInfo* abiinfo)
6877 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
6879 /* ---------------------- B cond ----------------------- */
6880 /* 31 24 4 3
6881 0101010 0 imm19 0 cond */
6882 if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
6883 UInt cond = INSN(3,0);
6884 ULong uimm64 = INSN(23,5) << 2;
6885 Long simm64 = (Long)sx_to_64(uimm64, 21);
6886 vassert(dres->whatNext == Dis_Continue);
6887 vassert(dres->len == 4);
6888 vassert(dres->jk_StopHere == Ijk_INVALID);
6889 stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
6890 Ijk_Boring,
6891 IRConst_U64(guest_PC_curr_instr + simm64),
6892 OFFB_PC) );
6893 putPC(mkU64(guest_PC_curr_instr + 4));
6894 dres->whatNext = Dis_StopHere;
6895 dres->jk_StopHere = Ijk_Boring;
6896 DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
6897 return True;
6900 /* -------------------- B{L} uncond -------------------- */
6901 if (INSN(30,26) == BITS5(0,0,1,0,1)) {
6902 /* 000101 imm26 B (PC + sxTo64(imm26 << 2))
6903 100101 imm26 BL (PC + sxTo64(imm26 << 2))
6905 UInt bLink = INSN(31,31);
6906 ULong uimm64 = INSN(25,0) << 2;
6907 Long simm64 = (Long)sx_to_64(uimm64, 28);
6908 if (bLink) {
6909 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
6911 putPC(mkU64(guest_PC_curr_instr + simm64));
6912 dres->whatNext = Dis_StopHere;
6913 dres->jk_StopHere = Ijk_Call;
6914 DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
6915 guest_PC_curr_instr + simm64);
6916 return True;
6919 /* --------------------- B{L} reg --------------------- */
6920 /* 31 24 22 20 15 9 4
6921 1101011 00 10 11111 000000 nn 00000 RET Rn
6922 1101011 00 01 11111 000000 nn 00000 CALL Rn
6923 1101011 00 00 11111 000000 nn 00000 JMP Rn
6925 if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
6926 && INSN(20,16) == BITS5(1,1,1,1,1)
6927 && INSN(15,10) == BITS6(0,0,0,0,0,0)
6928 && INSN(4,0) == BITS5(0,0,0,0,0)) {
6929 UInt branch_type = INSN(22,21);
6930 UInt nn = INSN(9,5);
6931 if (branch_type == BITS2(1,0) /* RET */) {
6932 putPC(getIReg64orZR(nn));
6933 dres->whatNext = Dis_StopHere;
6934 dres->jk_StopHere = Ijk_Ret;
6935 DIP("ret %s\n", nameIReg64orZR(nn));
6936 return True;
6938 if (branch_type == BITS2(0,1) /* CALL */) {
6939 IRTemp dst = newTemp(Ity_I64);
6940 assign(dst, getIReg64orZR(nn));
6941 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
6942 putPC(mkexpr(dst));
6943 dres->whatNext = Dis_StopHere;
6944 dres->jk_StopHere = Ijk_Call;
6945 DIP("blr %s\n", nameIReg64orZR(nn));
6946 return True;
6948 if (branch_type == BITS2(0,0) /* JMP */) {
6949 putPC(getIReg64orZR(nn));
6950 dres->whatNext = Dis_StopHere;
6951 dres->jk_StopHere = Ijk_Boring;
6952 DIP("jmp %s\n", nameIReg64orZR(nn));
6953 return True;
6957 /* -------------------- CB{N}Z -------------------- */
6958 /* sf 011 010 1 imm19 Rt CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
6959 sf 011 010 0 imm19 Rt CBZ Xt|Wt, (PC + sxTo64(imm19 << 2))
6961 if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
6962 Bool is64 = INSN(31,31) == 1;
6963 Bool bIfZ = INSN(24,24) == 0;
6964 ULong uimm64 = INSN(23,5) << 2;
6965 UInt rT = INSN(4,0);
6966 Long simm64 = (Long)sx_to_64(uimm64, 21);
6967 IRExpr* cond = NULL;
6968 if (is64) {
6969 cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
6970 getIReg64orZR(rT), mkU64(0));
6971 } else {
6972 cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
6973 getIReg32orZR(rT), mkU32(0));
6975 stmt( IRStmt_Exit(cond,
6976 Ijk_Boring,
6977 IRConst_U64(guest_PC_curr_instr + simm64),
6978 OFFB_PC) );
6979 putPC(mkU64(guest_PC_curr_instr + 4));
6980 dres->whatNext = Dis_StopHere;
6981 dres->jk_StopHere = Ijk_Boring;
6982 DIP("cb%sz %s, 0x%llx\n",
6983 bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
6984 guest_PC_curr_instr + simm64);
6985 return True;
6988 /* -------------------- TB{N}Z -------------------- */
6989 /* 31 30 24 23 18 5 4
6990 b5 011 011 1 b40 imm14 t TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
6991 b5 011 011 0 b40 imm14 t TBZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
6993 if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
6994 UInt b5 = INSN(31,31);
6995 Bool bIfZ = INSN(24,24) == 0;
6996 UInt b40 = INSN(23,19);
6997 UInt imm14 = INSN(18,5);
6998 UInt tt = INSN(4,0);
6999 UInt bitNo = (b5 << 5) | b40;
7000 ULong uimm64 = imm14 << 2;
7001 Long simm64 = sx_to_64(uimm64, 16);
7002 IRExpr* cond
7003 = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
7004 binop(Iop_And64,
7005 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
7006 mkU64(1)),
7007 mkU64(0));
7008 stmt( IRStmt_Exit(cond,
7009 Ijk_Boring,
7010 IRConst_U64(guest_PC_curr_instr + simm64),
7011 OFFB_PC) );
7012 putPC(mkU64(guest_PC_curr_instr + 4));
7013 dres->whatNext = Dis_StopHere;
7014 dres->jk_StopHere = Ijk_Boring;
7015 DIP("tb%sz %s, #%u, 0x%llx\n",
7016 bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
7017 guest_PC_curr_instr + simm64);
7018 return True;
7021 /* -------------------- SVC -------------------- */
7022 /* 11010100 000 imm16 000 01
7023 Don't bother with anything except the imm16==0 case.
7025 if (INSN(31,0) == 0xD4000001) {
7026 putPC(mkU64(guest_PC_curr_instr + 4));
7027 dres->whatNext = Dis_StopHere;
7028 dres->jk_StopHere = Ijk_Sys_syscall;
7029 DIP("svc #0\n");
7030 return True;
7033 /* ------------------ M{SR,RS} ------------------ */
7034 /* ---- Cases for TPIDR_EL0 ----
7035 0xD51BD0 010 Rt MSR tpidr_el0, rT
7036 0xD53BD0 010 Rt MRS rT, tpidr_el0
7038 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
7039 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
7040 Bool toSys = INSN(21,21) == 0;
7041 UInt tt = INSN(4,0);
7042 if (toSys) {
7043 stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) );
7044 DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt));
7045 } else {
7046 putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 ));
7047 DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt));
7049 return True;
7051 /* ---- Cases for FPCR ----
7052 0xD51B44 000 Rt MSR fpcr, rT
7053 0xD53B44 000 Rt MRS rT, fpcr
7055 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
7056 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
7057 Bool toSys = INSN(21,21) == 0;
7058 UInt tt = INSN(4,0);
7059 if (toSys) {
7060 stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) );
7061 DIP("msr fpcr, %s\n", nameIReg64orZR(tt));
7062 } else {
7063 putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32));
7064 DIP("mrs %s, fpcr\n", nameIReg64orZR(tt));
7066 return True;
7068 /* ---- Cases for FPSR ----
7069 0xD51B44 001 Rt MSR fpsr, rT
7070 0xD53B44 001 Rt MRS rT, fpsr
7071 The only part of this we model is FPSR.QC. All other bits
7072 are ignored when writing to it and RAZ when reading from it.
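// In other words (sketch): "msr fpsr, x0" looks only at bit 27 (QC)
// of X0, and "mrs x0, fpsr" yields either 0 or 1 << 27, depending on
// whether the vector-saturation flag has been set since it was last
// cleared.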
7074 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
7075 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
7076 Bool toSys = INSN(21,21) == 0;
7077 UInt tt = INSN(4,0);
7078 if (toSys) {
7079 /* Just deal with FPSR.QC. Make up a V128 value which is
7080 zero if Xt[27] is zero and any other value if Xt[27] is
7081 nonzero. */
7082 IRTemp qc64 = newTemp(Ity_I64);
7083 assign(qc64, binop(Iop_And64,
7084 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(27)),
7085 mkU64(1)));
7086 IRExpr* qcV128 = binop(Iop_64HLtoV128, mkexpr(qc64), mkexpr(qc64));
7087 stmt( IRStmt_Put( OFFB_QCFLAG, qcV128 ) );
7088 DIP("msr fpsr, %s\n", nameIReg64orZR(tt));
7089 } else {
7090 /* Generate a value which is all zeroes except for bit 27,
7091 which must be zero if QCFLAG is all zeroes and one otherwise. */
7092 IRTemp qcV128 = newTempV128();
7093 assign(qcV128, IRExpr_Get( OFFB_QCFLAG, Ity_V128 ));
7094 IRTemp qc64 = newTemp(Ity_I64);
7095 assign(qc64, binop(Iop_Or64, unop(Iop_V128HIto64, mkexpr(qcV128)),
7096 unop(Iop_V128to64, mkexpr(qcV128))));
7097 IRExpr* res = binop(Iop_Shl64,
7098 unop(Iop_1Uto64,
7099 binop(Iop_CmpNE64, mkexpr(qc64), mkU64(0))),
7100 mkU8(27));
7101 putIReg64orZR(tt, res);
7102 DIP("mrs %s, fpsr\n", nameIReg64orZR(tt));
7104 return True;
7106 /* ---- Cases for NZCV ----
7107 D51B42 000 Rt MSR nzcv, rT
7108 D53B42 000 Rt MRS rT, nzcv
7109 The only parts of NZCV that actually exist are bits 31:28, which
7110 are the N Z C and V bits themselves. Hence the flags thunk provides
7111 all the state we need.
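// So, roughly: "msr nzcv, x5" keeps only bits 31:28 of X5 (mask
// 0xF0000000) and copies them into the flags thunk, while
// "mrs x5, nzcv" recomputes N,Z,C,V and delivers them in bits 31:28
// with all other bits zero.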
7113 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
7114 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
7115 Bool toSys = INSN(21,21) == 0;
7116 UInt tt = INSN(4,0);
7117 if (toSys) {
7118 IRTemp t = newTemp(Ity_I64);
7119 assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL)));
7120 setFlags_COPY(t);
7121 DIP("msr %s, nzcv\n", nameIReg32orZR(tt));
7122 } else {
7123 IRTemp res = newTemp(Ity_I64);
7124 assign(res, mk_arm64g_calculate_flags_nzcv());
7125 putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res)));
7126 DIP("mrs %s, nzcv\n", nameIReg64orZR(tt));
7128 return True;
7130 /* ---- Cases for DCZID_EL0 ----
7131 Don't support arbitrary reads and writes to this register. Just
7132 return the value 16, which indicates that the DC ZVA instruction
7133 is not permitted, so we don't have to emulate it.
7134 D5 3B 00 111 Rt MRS rT, dczid_el0
7136 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
7137 UInt tt = INSN(4,0);
7138 putIReg64orZR(tt, mkU64(1<<4));
7139 DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt));
7140 return True;
7142 /* ---- Cases for CTR_EL0 ----
7143 We just handle reads, and make up a value from the D and I line
7144 sizes in the VexArchInfo we are given, and patch in the following
7145 fields that the Foundation model gives ("natively"):
7146 CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
7147 D5 3B 00 001 Rt MRS rT, ctr_el0
7149 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
7150 UInt tt = INSN(4,0);
7151 /* Need to generate a value from dMinLine_lg2_szB and
7152 iMinLine_lg2_szB. The value in the register is in 32-bit
7153 units, so need to subtract 2 from the values in the
7154 VexArchInfo. We can assume that the values here are valid --
7155 disInstr_ARM64 checks them -- so there's no need to deal with
7156 out-of-range cases. */
7157 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7158 && archinfo->arm64_dMinLine_lg2_szB <= 17
7159 && archinfo->arm64_iMinLine_lg2_szB >= 2
7160 && archinfo->arm64_iMinLine_lg2_szB <= 17);
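/* Worked example (illustrative): with 64-byte D- and I-cache lines,
   dMinLine_lg2_szB == iMinLine_lg2_szB == 6, so both 4-bit fields get
   the value 6-2 == 4 (the register counts 4-byte words), giving
   val == 0x8440c000 | (4 << 16) | 4 == 0x8444c004. */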
7161 UInt val
7162 = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16)
7163 | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0);
7164 putIReg64orZR(tt, mkU64(val));
7165 DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt));
7166 return True;
7168 /* ---- Cases for CNTVCT_EL0 ----
7169 This is a timestamp counter of some sort. Support reads of it only
7170 by passing through to the host.
7171 D5 3B E0 010 Rt MRS Xt, cntvct_el0
7173 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
7174 UInt tt = INSN(4,0);
7175 IRTemp val = newTemp(Ity_I64);
7176 IRExpr** args = mkIRExprVec_0();
7177 IRDirty* d = unsafeIRDirty_1_N (
7178 val,
7179 0/*regparms*/,
7180 "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
7181 &arm64g_dirtyhelper_MRS_CNTVCT_EL0,
7182 args
7184 /* execute the dirty call, dumping the result in val. */
7185 stmt( IRStmt_Dirty(d) );
7186 putIReg64orZR(tt, mkexpr(val));
7187 DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt));
7188 return True;
7190 /* ---- Cases for CNTFRQ_EL0 ----
7191 This is always RO at EL0, so it's safe to pass through to the host.
7192 D5 3B E0 000 Rt MRS Xt, cntfrq_el0
7194 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE000) {
7195 UInt tt = INSN(4,0);
7196 IRTemp val = newTemp(Ity_I64);
7197 IRExpr** args = mkIRExprVec_0();
7198 IRDirty* d = unsafeIRDirty_1_N (
7199 val,
7200 0/*regparms*/,
7201 "arm64g_dirtyhelper_MRS_CNTFRQ_EL0",
7202 &arm64g_dirtyhelper_MRS_CNTFRQ_EL0,
7203 args
7205 /* execute the dirty call, dumping the result in val. */
7206 stmt( IRStmt_Dirty(d) );
7207 putIReg64orZR(tt, mkexpr(val));
7208 DIP("mrs %s, cntfrq_el0\n", nameIReg64orZR(tt));
7209 return True;
7212 /* ------------------ IC_IVAU ------------------ */
7213 /* D5 0B 75 001 Rt ic ivau, rT
7215 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
7216 /* We will always be provided with a valid iMinLine value. */
7217 vassert(archinfo->arm64_iMinLine_lg2_szB >= 2
7218 && archinfo->arm64_iMinLine_lg2_szB <= 17);
7219 /* Round the requested address, in rT, down to the start of the
7220 containing block. */
7221 UInt tt = INSN(4,0);
7222 ULong lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB;
7223 IRTemp addr = newTemp(Ity_I64);
7224 assign( addr, binop( Iop_And64,
7225 getIReg64orZR(tt),
7226 mkU64(~(lineszB - 1))) );
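/* For example, with arm64_iMinLine_lg2_szB == 6, lineszB == 64 and the
   mask is ~63, so |addr| is Xt rounded down to a 64-byte boundary. */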
7227 /* Set the invalidation range, request exit-and-invalidate, with
7228 continuation at the next instruction. */
7229 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7230 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
7231 /* be paranoid ... */
7232 stmt( IRStmt_MBE(Imbe_Fence) );
7233 putPC(mkU64( guest_PC_curr_instr + 4 ));
7234 dres->whatNext = Dis_StopHere;
7235 dres->jk_StopHere = Ijk_InvalICache;
7236 DIP("ic ivau, %s\n", nameIReg64orZR(tt));
7237 return True;
7240 /* ------------------ DC_CVAU ------------------ */
7241 /* D5 0B 7B 001 Rt dc cvau, rT
7243 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20) {
7244 /* Exactly the same scheme as for IC IVAU, except we observe the
7245 dMinLine size, and request an Ijk_FlushDCache instead of
7246 Ijk_InvalICache. */
7247 /* We will always be provided with a valid dMinLine value. */
7248 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7249 && archinfo->arm64_dMinLine_lg2_szB <= 17);
7250 /* Round the requested address, in rT, down to the start of the
7251 containing block. */
7252 UInt tt = INSN(4,0);
7253 ULong lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB;
7254 IRTemp addr = newTemp(Ity_I64);
7255 assign( addr, binop( Iop_And64,
7256 getIReg64orZR(tt),
7257 mkU64(~(lineszB - 1))) );
7258 /* Set the flush range, request exit-and-flush, with
7259 continuation at the next instruction. */
7260 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7261 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
7262 /* be paranoid ... */
7263 stmt( IRStmt_MBE(Imbe_Fence) );
7264 putPC(mkU64( guest_PC_curr_instr + 4 ));
7265 dres->whatNext = Dis_StopHere;
7266 dres->jk_StopHere = Ijk_FlushDCache;
7267 DIP("dc cvau, %s\n", nameIReg64orZR(tt));
7268 return True;
7271 /* ------------------ ISB, DMB, DSB ------------------ */
7272 /* 31 21 11 7 6 4
7273 11010 10100 0 00 011 0011 CRm 1 01 11111 DMB opt
7274 11010 10100 0 00 011 0011 CRm 1 00 11111 DSB opt
7275 11010 10100 0 00 011 0011 CRm 1 10 11111 ISB opt
7277 if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
7278 && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
7279 && INSN(7,7) == 1
7280 && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
7281 UInt opc = INSN(6,5);
7282 UInt CRm = INSN(11,8);
7283 vassert(opc <= 2 && CRm <= 15);
7284 stmt(IRStmt_MBE(Imbe_Fence));
7285 const HChar* opNames[3]
7286 = { "dsb", "dmb", "isb" };
7287 const HChar* howNames[16]
7288 = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
7289 "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
7290 DIP("%s %s\n", opNames[opc], howNames[CRm]);
7291 return True;
7294 /* -------------------- NOP -------------------- */
7295 if (INSN(31,0) == 0xD503201F) {
7296 DIP("nop\n");
7297 return True;
7300 /* -------------------- BRK -------------------- */
7301 /* 31 23 20 4
7302 1101 0100 001 imm16 00000 BRK #imm16
7304 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
7305 && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
7306 UInt imm16 = INSN(20,5);
7307 /* Request SIGTRAP and then restart of this insn. */
7308 putPC(mkU64(guest_PC_curr_instr + 0));
7309 dres->whatNext = Dis_StopHere;
7310 dres->jk_StopHere = Ijk_SigTRAP;
7311 DIP("brk #%u\n", imm16);
7312 return True;
7315 /* ------------------- YIELD ------------------- */
7316 /* 31 23 15 7
7317 1101 0101 0000 0011 0010 0000 0011 1111
7319 if (INSN(31,0) == 0xD503203F) {
7320 /* Request yield followed by continuation at the next insn. */
7321 putPC(mkU64(guest_PC_curr_instr + 4));
7322 dres->whatNext = Dis_StopHere;
7323 dres->jk_StopHere = Ijk_Yield;
7324 DIP("yield\n");
7325 return True;
7328 /* -------------------- HINT ------------------- */
7329 /* 31 23 15 11 4 3
7330 1101 0101 0000 0011 0010 imm7 1 1111
7331 Catch otherwise unhandled HINT instructions - any
7332 like YIELD which are explicitly handled should go
7333 above this case.
7335 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,1)
7336 && INSN(23,16) == BITS8(0,0,0,0,0,0,1,1)
7337 && INSN(15,12) == BITS4(0,0,1,0)
7338 && INSN(4,0) == BITS5(1,1,1,1,1)) {
7339 UInt imm7 = INSN(11,5);
7340 DIP("hint #%u\n", imm7);
7341 return True;
7344 /* ------------------- CLREX ------------------ */
7345 /* 31 23 15 11 7
7346 1101 0101 0000 0011 0011 m 0101 1111 CLREX CRm
7347 CRm is apparently ignored.
7349 if ((INSN(31,0) & 0xFFFFF0FF) == 0xD503305F) {
7350 UInt mm = INSN(11,8);
7351 /* AFAICS, this simply cancels a (all?) reservations made by a
7352 (any?) preceding LDREX(es). Arrange to hand it through to
7353 the back end. */
7354 if (abiinfo->guest__use_fallback_LLSC) {
7355 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
7356 } else {
7357 stmt( IRStmt_MBE(Imbe_CancelReservation) );
7359 DIP("clrex #%u\n", mm);
7360 return True;
7363 vex_printf("ARM64 front end: branch_etc\n");
7364 return False;
7365 # undef INSN
7369 /*------------------------------------------------------------*/
7370 /*--- SIMD and FP instructions: helper functions ---*/
7371 /*------------------------------------------------------------*/
7373 /* Some constructors for interleave/deinterleave expressions. */
7375 static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7376 // returns a0 b0
7377 return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
7380 static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7381 // returns a1 b1
7382 return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
7385 static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7386 // returns a2 a0 b2 b0
7387 return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
7390 static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7391 // returns a3 a1 b3 b1
7392 return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
7395 static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
7396 // returns a1 b1 a0 b0
7397 return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
7400 static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
7401 // returns a3 b3 a2 b2
7402 return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
7405 static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7406 // returns a6 a4 a2 a0 b6 b4 b2 b0
7407 return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7410 static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7411 // returns a7 a5 a3 a1 b7 b5 b3 b1
7412 return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7415 static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7416 // returns a3 b3 a2 b2 a1 b1 a0 b0
7417 return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
7420 static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7421 // returns a7 b7 a6 b6 a5 b5 a4 b4
7422 return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
7425 static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
7426 IRTemp bFEDCBA9876543210 ) {
7427 // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
7428 return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
7429 mkexpr(bFEDCBA9876543210));
7432 static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
7433 IRTemp bFEDCBA9876543210 ) {
7434 // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
7435 return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
7436 mkexpr(bFEDCBA9876543210));
7439 static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
7440 IRTemp bFEDCBA9876543210 ) {
7441 // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
7442 return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
7443 mkexpr(bFEDCBA9876543210));
7446 static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
7447 IRTemp bFEDCBA9876543210 ) {
7448 // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
7449 return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
7450 mkexpr(bFEDCBA9876543210));
7453 /* Generate N copies of |bit| in the bottom of a ULong. */
7454 static ULong Replicate ( ULong bit, Int N )
7456 vassert(bit <= 1 && N >= 1 && N < 64);
7457 if (bit == 0) {
7458 return 0;
7459 } else {
7460 /* Careful. This won't work for N == 64. */
7461 return (1ULL << N) - 1;
7465 static ULong Replicate32x2 ( ULong bits32 )
7467 vassert(0 == (bits32 & ~0xFFFFFFFFULL));
7468 return (bits32 << 32) | bits32;
7471 static ULong Replicate16x4 ( ULong bits16 )
7473 vassert(0 == (bits16 & ~0xFFFFULL));
7474 return Replicate32x2((bits16 << 16) | bits16);
7477 static ULong Replicate8x8 ( ULong bits8 )
7479 vassert(0 == (bits8 & ~0xFFULL));
7480 return Replicate16x4((bits8 << 8) | bits8);
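/* Illustrative examples of the replication helpers above:
      Replicate(1, 3)           == 0x7
      Replicate32x2(0x89ABCDEF) == 0x89ABCDEF89ABCDEFULL
      Replicate8x8(0xAB)        == 0xABABABABABABABABULL
*/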
7483 /* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
7484 |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
7485 is 64. In the former case, the upper 32 bits of the returned value
7486 are guaranteed to be zero. */
7487 static ULong VFPExpandImm ( ULong imm8, Int N )
7489 vassert(imm8 <= 0xFF);
7490 vassert(N == 32 || N == 64);
7491 Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2.
7492 Int F = N - E - 1;
7493 ULong imm8_6 = (imm8 >> 6) & 1;
7494 /* sign: 1 bit */
7495 /* exp: E bits */
7496 /* frac: F bits */
7497 ULong sign = (imm8 >> 7) & 1;
7498 ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
7499 ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
7500 vassert(sign < (1ULL << 1));
7501 vassert(exp < (1ULL << E));
7502 vassert(frac < (1ULL << F));
7503 vassert(1 + E + F == N);
7504 ULong res = (sign << (E+F)) | (exp << F) | frac;
7505 return res;
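/* Worked example: VFPExpandImm(0x70, 64).  Here sign == 0, imm8<6> == 1,
   so exp == 0b011111111 (9 bits) and frac == 0b110000 << 48, giving
   0x3FF0000000000000, i.e. the double 1.0 (imm8 == 0x70 is the FMOV
   immediate encoding of +1.0). */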
7508 /* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
7509 This might fail, as indicated by the returned Bool. Page 2530 of
7510 the manual. */
7511 static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
7512 UInt op, UInt cmode, UInt imm8 )
7514 vassert(op <= 1);
7515 vassert(cmode <= 15);
7516 vassert(imm8 <= 255);
7518 *res = 0; /* will overwrite iff returning True */
7520 ULong imm64 = 0;
7521 Bool testimm8 = False;
7523 switch (cmode >> 1) {
7524 case 0:
7525 testimm8 = False; imm64 = Replicate32x2(imm8); break;
7526 case 1:
7527 testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break;
7528 case 2:
7529 testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break;
7530 case 3:
7531 testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break;
7532 case 4:
7533 testimm8 = False; imm64 = Replicate16x4(imm8); break;
7534 case 5:
7535 testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
7536 case 6:
7537 testimm8 = True;
7538 if ((cmode & 1) == 0)
7539 imm64 = Replicate32x2((imm8 << 8) | 0xFF);
7540 else
7541 imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
7542 break;
7543 case 7:
7544 testimm8 = False;
7545 if ((cmode & 1) == 0 && op == 0)
7546 imm64 = Replicate8x8(imm8);
7547 if ((cmode & 1) == 0 && op == 1) {
7548 imm64 = 0; imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
7549 imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
7550 imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
7551 imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
7552 imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
7553 imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
7554 imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
7555 imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
7557 if ((cmode & 1) == 1 && op == 0) {
7558 ULong imm8_7 = (imm8 >> 7) & 1;
7559 ULong imm8_6 = (imm8 >> 6) & 1;
7560 ULong imm8_50 = imm8 & 63;
7561 ULong imm32 = (imm8_7 << (1 + 5 + 6 + 19))
7562 | ((imm8_6 ^ 1) << (5 + 6 + 19))
7563 | (Replicate(imm8_6, 5) << (6 + 19))
7564 | (imm8_50 << 19);
7565 imm64 = Replicate32x2(imm32);
7567 if ((cmode & 1) == 1 && op == 1) {
7568 // imm64 = imm8<7>:NOT(imm8<6>)
7569 // :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
7570 ULong imm8_7 = (imm8 >> 7) & 1;
7571 ULong imm8_6 = (imm8 >> 6) & 1;
7572 ULong imm8_50 = imm8 & 63;
7573 imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
7574 | (Replicate(imm8_6, 8) << 54)
7575 | (imm8_50 << 48);
7577 break;
7578 default:
7579 vassert(0);
7582 if (testimm8 && imm8 == 0)
7583 return False;
7585 *res = imm64;
7586 return True;
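/* Worked example (illustrative): op == 1, cmode == 0b1110, imm8 == 0xAA.
   This is the per-bit byte-mask case above: each bit of imm8 expands to a
   whole byte, so imm64 == 0xFF00FF00FF00FF00ULL. */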
7589 /* Help a bit for decoding laneage for vector operations that can be
7590 of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
7591 and SZ bits, typically for vector floating point. */
7592 static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI, /*OUT*/IRType* tyF,
7593 /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
7594 /*OUT*/const HChar** arrSpec,
7595 Bool bitQ, Bool bitSZ )
7597 vassert(bitQ == True || bitQ == False);
7598 vassert(bitSZ == True || bitSZ == False);
7599 if (bitQ && bitSZ) { // 2x64
7600 if (tyI) *tyI = Ity_I64;
7601 if (tyF) *tyF = Ity_F64;
7602 if (nLanes) *nLanes = 2;
7603 if (zeroUpper) *zeroUpper = False;
7604 if (arrSpec) *arrSpec = "2d";
7605 return True;
7607 if (bitQ && !bitSZ) { // 4x32
7608 if (tyI) *tyI = Ity_I32;
7609 if (tyF) *tyF = Ity_F32;
7610 if (nLanes) *nLanes = 4;
7611 if (zeroUpper) *zeroUpper = False;
7612 if (arrSpec) *arrSpec = "4s";
7613 return True;
7615 if (!bitQ && !bitSZ) { // 2x32
7616 if (tyI) *tyI = Ity_I32;
7617 if (tyF) *tyF = Ity_F32;
7618 if (nLanes) *nLanes = 2;
7619 if (zeroUpper) *zeroUpper = True;
7620 if (arrSpec) *arrSpec = "2s";
7621 return True;
7623 // Else impliedly 1x64, which isn't allowed.
7624 return False;
7627 /* Helper for decoding laneage for shift-style vector operations
7628 that involve an immediate shift amount. */
7629 static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
7630 UInt immh, UInt immb )
7632 vassert(immh < (1<<4));
7633 vassert(immb < (1<<3));
7634 UInt immhb = (immh << 3) | immb;
7635 if (immh & 8) {
7636 if (shift) *shift = 128 - immhb;
7637 if (szBlg2) *szBlg2 = 3;
7638 return True;
7640 if (immh & 4) {
7641 if (shift) *shift = 64 - immhb;
7642 if (szBlg2) *szBlg2 = 2;
7643 return True;
7645 if (immh & 2) {
7646 if (shift) *shift = 32 - immhb;
7647 if (szBlg2) *szBlg2 = 1;
7648 return True;
7650 if (immh & 1) {
7651 if (shift) *shift = 16 - immhb;
7652 if (szBlg2) *szBlg2 = 0;
7653 return True;
7655 return False;
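/* For example, immh:immb == 0b0001:0b001 (immhb == 9) selects 8-bit lanes
   (szBlg2 == 0) with shift == 16 - 9 == 7, whereas immh:immb == 0b1000:0b000
   (immhb == 64) selects 64-bit lanes with shift == 128 - 64 == 64. */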
7658 /* Generate IR to fold all lanes of the V128 value in 'src' as
7659 characterised by the operator 'op', and return the result in the
7660 bottom bits of a V128, with all other bits set to zero. */
7661 static IRTemp math_FOLDV ( IRTemp src, IROp op )
7663 /* The basic idea is to use repeated applications of Iop_CatEven*
7664 and Iop_CatOdd* operators to 'src' so as to clone each lane into
7665 a complete vector. Then fold all those vectors with 'op' and
7666 zero out all but the least significant lane. */
7667 switch (op) {
7668 case Iop_Min8Sx16: case Iop_Min8Ux16:
7669 case Iop_Max8Sx16: case Iop_Max8Ux16: case Iop_Add8x16: {
7670 /* NB: temp naming here is misleading -- the naming is for 8
7671 lanes of 16 bit, whereas what is being operated on is 16
7672 lanes of 8 bits. */
7673 IRTemp x76543210 = src;
7674 IRTemp x76547654 = newTempV128();
7675 IRTemp x32103210 = newTempV128();
7676 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
7677 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
7678 IRTemp x76767676 = newTempV128();
7679 IRTemp x54545454 = newTempV128();
7680 IRTemp x32323232 = newTempV128();
7681 IRTemp x10101010 = newTempV128();
7682 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
7683 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
7684 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
7685 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
7686 IRTemp x77777777 = newTempV128();
7687 IRTemp x66666666 = newTempV128();
7688 IRTemp x55555555 = newTempV128();
7689 IRTemp x44444444 = newTempV128();
7690 IRTemp x33333333 = newTempV128();
7691 IRTemp x22222222 = newTempV128();
7692 IRTemp x11111111 = newTempV128();
7693 IRTemp x00000000 = newTempV128();
7694 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
7695 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
7696 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
7697 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
7698 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
7699 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
7700 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
7701 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
7702 /* Naming not misleading after here. */
7703 IRTemp xAllF = newTempV128();
7704 IRTemp xAllE = newTempV128();
7705 IRTemp xAllD = newTempV128();
7706 IRTemp xAllC = newTempV128();
7707 IRTemp xAllB = newTempV128();
7708 IRTemp xAllA = newTempV128();
7709 IRTemp xAll9 = newTempV128();
7710 IRTemp xAll8 = newTempV128();
7711 IRTemp xAll7 = newTempV128();
7712 IRTemp xAll6 = newTempV128();
7713 IRTemp xAll5 = newTempV128();
7714 IRTemp xAll4 = newTempV128();
7715 IRTemp xAll3 = newTempV128();
7716 IRTemp xAll2 = newTempV128();
7717 IRTemp xAll1 = newTempV128();
7718 IRTemp xAll0 = newTempV128();
7719 assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777));
7720 assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777));
7721 assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666));
7722 assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666));
7723 assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555));
7724 assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555));
7725 assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444));
7726 assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444));
7727 assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333));
7728 assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333));
7729 assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222));
7730 assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222));
7731 assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111));
7732 assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111));
7733 assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000));
7734 assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000));
7735 IRTemp maxFE = newTempV128();
7736 IRTemp maxDC = newTempV128();
7737 IRTemp maxBA = newTempV128();
7738 IRTemp max98 = newTempV128();
7739 IRTemp max76 = newTempV128();
7740 IRTemp max54 = newTempV128();
7741 IRTemp max32 = newTempV128();
7742 IRTemp max10 = newTempV128();
7743 assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE)));
7744 assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC)));
7745 assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA)));
7746 assign(max98, binop(op, mkexpr(xAll9), mkexpr(xAll8)));
7747 assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6)));
7748 assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4)));
7749 assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2)));
7750 assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0)));
7751 IRTemp maxFEDC = newTempV128();
7752 IRTemp maxBA98 = newTempV128();
7753 IRTemp max7654 = newTempV128();
7754 IRTemp max3210 = newTempV128();
7755 assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC)));
7756 assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98)));
7757 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
7758 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7759 IRTemp maxFEDCBA98 = newTempV128();
7760 IRTemp max76543210 = newTempV128();
7761 assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98)));
7762 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
7763 IRTemp maxAllLanes = newTempV128();
7764 assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98),
7765 mkexpr(max76543210)));
7766 IRTemp res = newTempV128();
7767 assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes)));
7768 return res;
7770 case Iop_Min16Sx8: case Iop_Min16Ux8:
7771 case Iop_Max16Sx8: case Iop_Max16Ux8: case Iop_Add16x8: {
7772 IRTemp x76543210 = src;
7773 IRTemp x76547654 = newTempV128();
7774 IRTemp x32103210 = newTempV128();
7775 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
7776 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
7777 IRTemp x76767676 = newTempV128();
7778 IRTemp x54545454 = newTempV128();
7779 IRTemp x32323232 = newTempV128();
7780 IRTemp x10101010 = newTempV128();
7781 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
7782 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
7783 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
7784 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
7785 IRTemp x77777777 = newTempV128();
7786 IRTemp x66666666 = newTempV128();
7787 IRTemp x55555555 = newTempV128();
7788 IRTemp x44444444 = newTempV128();
7789 IRTemp x33333333 = newTempV128();
7790 IRTemp x22222222 = newTempV128();
7791 IRTemp x11111111 = newTempV128();
7792 IRTemp x00000000 = newTempV128();
7793 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
7794 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
7795 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
7796 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
7797 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
7798 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
7799 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
7800 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
7801 IRTemp max76 = newTempV128();
7802 IRTemp max54 = newTempV128();
7803 IRTemp max32 = newTempV128();
7804 IRTemp max10 = newTempV128();
7805 assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666)));
7806 assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444)));
7807 assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222)));
7808 assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000)));
7809 IRTemp max7654 = newTempV128();
7810 IRTemp max3210 = newTempV128();
7811 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
7812 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7813 IRTemp max76543210 = newTempV128();
7814 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
7815 IRTemp res = newTempV128();
7816 assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210)));
7817 return res;
7819 case Iop_Max32Fx4: case Iop_Min32Fx4:
7820 case Iop_Min32Sx4: case Iop_Min32Ux4:
7821 case Iop_Max32Sx4: case Iop_Max32Ux4: case Iop_Add32x4: {
7822 IRTemp x3210 = src;
7823 IRTemp x3232 = newTempV128();
7824 IRTemp x1010 = newTempV128();
7825 assign(x3232, mk_CatOddLanes64x2 (x3210, x3210));
7826 assign(x1010, mk_CatEvenLanes64x2(x3210, x3210));
7827 IRTemp x3333 = newTempV128();
7828 IRTemp x2222 = newTempV128();
7829 IRTemp x1111 = newTempV128();
7830 IRTemp x0000 = newTempV128();
7831 assign(x3333, mk_CatOddLanes32x4 (x3232, x3232));
7832 assign(x2222, mk_CatEvenLanes32x4(x3232, x3232));
7833 assign(x1111, mk_CatOddLanes32x4 (x1010, x1010));
7834 assign(x0000, mk_CatEvenLanes32x4(x1010, x1010));
7835 IRTemp max32 = newTempV128();
7836 IRTemp max10 = newTempV128();
7837 assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222)));
7838 assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000)));
7839 IRTemp max3210 = newTempV128();
7840 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7841 IRTemp res = newTempV128();
7842 assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
7843 return res;
7845 case Iop_Add64x2: {
7846 IRTemp x10 = src;
7847 IRTemp x00 = newTempV128();
7848 IRTemp x11 = newTempV128();
7849 assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
7850 assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
7851 IRTemp max10 = newTempV128();
7852 assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
7853 IRTemp res = newTempV128();
7854 assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
7855 return res;
7857 default:
7858 vassert(0);
7863 /* Generate IR for TBL and TBX. This deals with the 128 bit case
7864 only. */
7865 static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
7866 IRTemp oor_values )
7868 vassert(len >= 0 && len <= 3);
7870 /* Generate some useful constants as concisely as possible. */
7871 IRTemp half15 = newTemp(Ity_I64);
7872 assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
7873 IRTemp half16 = newTemp(Ity_I64);
7874 assign(half16, mkU64(0x1010101010101010ULL));
7876 /* A zero vector */
7877 IRTemp allZero = newTempV128();
7878 assign(allZero, mkV128(0x0000));
7879 /* A vector containing 15 in each 8-bit lane */
7880 IRTemp all15 = newTempV128();
7881 assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
7882 /* A vector containing 16 in each 8-bit lane */
7883 IRTemp all16 = newTempV128();
7884 assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
7885 /* A vector containing 32 in each 8-bit lane */
7886 IRTemp all32 = newTempV128();
7887 assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
7888 /* A vector containing 48 in each 8-bit lane */
7889 IRTemp all48 = newTempV128();
7890 assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
7891 /* A vector containing 64 in each 8-bit lane */
7892 IRTemp all64 = newTempV128();
7893 assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));
7895 /* Group the 16/32/48/64 vectors so as to be indexable. */
7896 IRTemp allXX[4] = { all16, all32, all48, all64 };
7898 /* Compute the result for each table vector, with zeroes in places
7899 where the index values are out of range, and OR them into the
7900 running vector. */
7901 IRTemp running_result = newTempV128();
7902 assign(running_result, mkV128(0));
7904 UInt tabent;
7905 for (tabent = 0; tabent <= len; tabent++) {
7906 vassert(tabent >= 0 && tabent < 4);
7907 IRTemp bias = newTempV128();
7908 assign(bias,
7909 mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
7910 IRTemp biased_indices = newTempV128();
7911 assign(biased_indices,
7912 binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
7913 IRTemp valid_mask = newTempV128();
7914 assign(valid_mask,
7915 binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
7916 IRTemp safe_biased_indices = newTempV128();
7917 assign(safe_biased_indices,
7918 binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
7919 IRTemp results_or_junk = newTempV128();
7920 assign(results_or_junk,
7921 binop(Iop_Perm8x16, mkexpr(tab[tabent]),
7922 mkexpr(safe_biased_indices)));
7923 IRTemp results_or_zero = newTempV128();
7924 assign(results_or_zero,
7925 binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
7926 /* And OR that into the running result. */
7927 IRTemp tmp = newTempV128();
7928 assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
7929 mkexpr(running_result)));
7930 running_result = tmp;
7933 /* So now running_result holds the overall result where the indices
7934 are in range, and zero in out-of-range lanes. Now we need to
7935 compute an overall validity mask and use this to copy in the
7936 lanes in the oor_values for out of range indices. This is
7937 unnecessary for TBL but will get folded out by iropt, so we lean
7938 on that and generate the same code for TBL and TBX here. */
7939 IRTemp overall_valid_mask = newTempV128();
7940 assign(overall_valid_mask,
7941 binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
7942 IRTemp result = newTempV128();
7943 assign(result,
7944 binop(Iop_OrV128,
7945 mkexpr(running_result),
7946 binop(Iop_AndV128,
7947 mkexpr(oor_values),
7948 unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
7949 return result;
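/* Behavioural sketch (illustrative): with len == 0 there is one table
   register, tab[0].  A source byte of 0x03 selects byte 3 of tab[0]; a
   source byte of 0x10 or greater is out of range, so the result byte
   comes from |oor_values| instead (which the caller sets to zero for TBL
   and to the old Vd for TBX). */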
7953 /* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
7954 an op which takes two I64s and produces a V128. That is, a widening
7955 operator. Generate IR which applies |opI64x2toV128| to either the
7956 lower (if |is2| is False) or upper (if |is2| is True) halves of
7957 |argL| and |argR|, and return the value in a new IRTemp.
7959 static
7960 IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
7961 IRExpr* argL, IRExpr* argR )
7963 IRTemp res = newTempV128();
7964 IROp slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
7965 assign(res, binop(opI64x2toV128, unop(slice, argL),
7966 unop(slice, argR)));
7967 return res;
7971 /* Generate signed/unsigned absolute difference vector IR. */
7972 static
7973 IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
7975 vassert(size <= 3);
7976 IRTemp argL = newTempV128();
7977 IRTemp argR = newTempV128();
7978 IRTemp msk = newTempV128();
7979 IRTemp res = newTempV128();
7980 assign(argL, argLE);
7981 assign(argR, argRE);
7982 assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
7983 mkexpr(argL), mkexpr(argR)));
7984 assign(res,
7985 binop(Iop_OrV128,
7986 binop(Iop_AndV128,
7987 binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
7988 mkexpr(msk)),
7989 binop(Iop_AndV128,
7990 binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
7991 unop(Iop_NotV128, mkexpr(msk)))));
7992 return res;
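/* Per-lane scalar analogue of the above (illustrative sketch):
      msk = (isU ? (a >u b) : (a >s b)) ? ~0 : 0;  // all-ones or all-zeroes
      res = ((a - b) & msk) | ((b - a) & ~msk);    // == |a - b|
   e.g. signed a == 3, b == 10: msk == 0, so res == b - a == 7. */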
7996 /* Generate IR that takes a V128 and sign- or zero-widens
7997 either the lower or upper set of lanes to twice-as-wide,
7998 resulting in a new V128 value. */
7999 static
8000 IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
8001 UInt sizeNarrow, IRExpr* srcE )
8003 IRTemp src = newTempV128();
8004 IRTemp res = newTempV128();
8005 assign(src, srcE);
8006 switch (sizeNarrow) {
8007 case X10:
8008 assign(res,
8009 binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
8010 binop(fromUpperHalf ? Iop_InterleaveHI32x4
8011 : Iop_InterleaveLO32x4,
8012 mkexpr(src),
8013 mkexpr(src)),
8014 mkU8(32)));
8015 break;
8016 case X01:
8017 assign(res,
8018 binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
8019 binop(fromUpperHalf ? Iop_InterleaveHI16x8
8020 : Iop_InterleaveLO16x8,
8021 mkexpr(src),
8022 mkexpr(src)),
8023 mkU8(16)));
8024 break;
8025 case X00:
8026 assign(res,
8027 binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
8028 binop(fromUpperHalf ? Iop_InterleaveHI8x16
8029 : Iop_InterleaveLO8x16,
8030 mkexpr(src),
8031 mkexpr(src)),
8032 mkU8(8)));
8033 break;
8034 default:
8035 vassert(0);
8037 return res;
8041 /* Generate IR that takes a V128 and sign- or zero-widens
8042 either the even or odd lanes to twice-as-wide,
8043 resulting in a new V128 value. */
8044 static
8045 IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
8046 UInt sizeNarrow, IRExpr* srcE )
8048 IRTemp src = newTempV128();
8049 IRTemp res = newTempV128();
8050 IROp opSAR = mkVecSARN(sizeNarrow+1);
8051 IROp opSHR = mkVecSHRN(sizeNarrow+1);
8052 IROp opSHL = mkVecSHLN(sizeNarrow+1);
8053 IROp opSxR = zWiden ? opSHR : opSAR;
8054 UInt amt = 0;
8055 switch (sizeNarrow) {
8056 case X10: amt = 32; break;
8057 case X01: amt = 16; break;
8058 case X00: amt = 8; break;
8059 default: vassert(0);
8061 assign(src, srcE);
8062 if (fromOdd) {
8063 assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
8064 } else {
8065 assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
8066 mkU8(amt)));
8068 return res;
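/* For example, sizeNarrow == X01 widens 16-bit lanes inside 32-bit
   containers: odd lanes are obtained as src >> 16 (logical or arithmetic
   per |zWiden|), even lanes as (src << 16) >> 16. */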
8072 /* Generate IR that takes two V128s and narrows (takes lower half)
8073 of each lane, producing a single V128 value. */
8074 static
8075 IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
8077 IRTemp res = newTempV128();
8078 assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
8079 mkexpr(argHi), mkexpr(argLo)));
8080 return res;
8084 /* Return a temp which holds the vector dup of the lane of width
8085 (1 << size) obtained from src[laneNo]. */
8086 static
8087 IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
8089 vassert(size <= 3);
8090 /* Normalise |laneNo| so it is of the form
8091 x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
8092 This puts the bits we want to inspect at constant offsets
8093 regardless of the value of |size|.
8095 UInt ix = laneNo << size;
8096 vassert(ix <= 15);
8097 IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
8098 switch (size) {
8099 case 0: /* B */
8100 ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
8101 /* fallthrough */
8102 case 1: /* H */
8103 ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
8104 /* fallthrough */
8105 case 2: /* S */
8106 ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
8107 /* fallthrough */
8108 case 3: /* D */
8109 ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
8110 break;
8111 default:
8112 vassert(0);
8114 IRTemp res = newTempV128();
8115 assign(res, src);
8116 Int i;
8117 for (i = 3; i >= 0; i--) {
8118 if (ops[i] == Iop_INVALID)
8119 break;
8120 IRTemp tmp = newTempV128();
8121 assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
8122 res = tmp;
8124 return res;
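/* Worked example: size == 2 (32-bit lanes), laneNo == 3 gives ix == 12
   (0b1100).  ops[3] == InterleaveHI64x2 copies lanes {3,2} into both
   halves, then ops[2] == CatOddLanes32x4 keeps the odd 32-bit lanes,
   leaving lane 3 duplicated in all four positions. */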
8128 /* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
8129 selector encoded as shown below. Return a new V128 holding the
8130 selected lane from |srcV| dup'd out to V128, and also return the
8131 lane number, log2 of the lane size in bytes, and width-character via
8132 *laneNo, *laneSzLg2 and *laneCh respectively. It may be that imm5
8133 is an invalid selector, in which case return
8134 IRTemp_INVALID, 0, 0 and '?' respectively.
8136 imm5 = xxxx1 signifies .b[xxxx]
8137 = xxx10 .h[xxx]
8138 = xx100 .s[xx]
8139 = x1000 .d[x]
8140 otherwise invalid
8142 static
8143 IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
8144 /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
8145 IRExpr* srcV, UInt imm5 )
8147 *laneNo = 0;
8148 *laneSzLg2 = 0;
8149 *laneCh = '?';
8151 if (imm5 & 1) {
8152 *laneNo = (imm5 >> 1) & 15;
8153 *laneSzLg2 = 0;
8154 *laneCh = 'b';
8156 else if (imm5 & 2) {
8157 *laneNo = (imm5 >> 2) & 7;
8158 *laneSzLg2 = 1;
8159 *laneCh = 'h';
8161 else if (imm5 & 4) {
8162 *laneNo = (imm5 >> 3) & 3;
8163 *laneSzLg2 = 2;
8164 *laneCh = 's';
8166 else if (imm5 & 8) {
8167 *laneNo = (imm5 >> 4) & 1;
8168 *laneSzLg2 = 3;
8169 *laneCh = 'd';
8171 else {
8172 /* invalid */
8173 return IRTemp_INVALID;
8176 return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
8180 /* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
8181 static
8182 IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
8184 IRType ty = Ity_INVALID;
8185 IRTemp rcS = IRTemp_INVALID;
8186 switch (size) {
8187 case X01:
8188 vassert(imm <= 0xFFFFULL);
8189 ty = Ity_I16;
8190 rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
8191 break;
8192 case X10:
8193 vassert(imm <= 0xFFFFFFFFULL);
8194 ty = Ity_I32;
8195 rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
8196 break;
8197 case X11:
8198 ty = Ity_I64;
8199 rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
8200 default:
8201 vassert(0);
8203 IRTemp rcV = math_DUP_TO_V128(rcS, ty);
8204 return rcV;
8208 /* Let |new64| be a V128 in which only the lower 64 bits are interesting,
8209 and the upper can contain any value -- it is ignored. If |is2| is False,
8210 generate IR to put |new64| in the lower half of vector reg |dd| and zero
8211 the upper half. If |is2| is True, generate IR to put |new64| in the upper
8212 half of vector reg |dd| and leave the lower half unchanged. This
8213 simulates the behaviour of the "foo/foo2" instructions in which the
8214 destination is half the width of sources, for example addhn/addhn2.
8216 static
8217 void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
8219 if (is2) {
8220 /* Get the old contents of Vdd, zero the upper half, and replace
8221 it with |new64|. */
8222 IRTemp t_zero_oldLO = newTempV128();
8223 assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
8224 IRTemp t_newHI_zero = newTempV128();
8225 assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
8226 mkV128(0x0000)));
8227 IRTemp res = newTempV128();
8228 assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
8229 mkexpr(t_newHI_zero)));
8230 putQReg128(dd, mkexpr(res));
8231 } else {
8232 /* This is simple. */
8233 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
8238 /* Compute vector SQABS at lane size |size| for |srcE|, returning
8239 the q result in |*qabs| and the normal result in |*nabs|. */
8240 static
8241 void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
8242 IRExpr* srcE, UInt size )
8244 IRTemp src, mask, maskn, nsub, qsub;
8245 src = mask = maskn = nsub = qsub = IRTemp_INVALID;
8246 newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
8247 assign(src, srcE);
8248 assign(mask, binop(mkVecCMPGTS(size), mkV128(0x0000), mkexpr(src)));
8249 assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
8250 assign(nsub, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
8251 assign(qsub, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
8252 assign(*nabs, binop(Iop_OrV128,
8253 binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
8254 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
8255 assign(*qabs, binop(Iop_OrV128,
8256 binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
8257 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
8261 /* Compute vector SQNEG at lane size |size| for |srcE|, returning
8262 the q result in |*qneg| and the normal result in |*nneg|. */
8263 static
8264 void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
8265 IRExpr* srcE, UInt size )
8267 IRTemp src = IRTemp_INVALID;
8268 newTempsV128_3(&src, nneg, qneg);
8269 assign(src, srcE);
8270 assign(*nneg, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
8271 assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
8275 /* Zero all except the least significant lane of |srcE|, where |size|
8276 indicates the lane size in the usual way. */
8277 static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
8279 vassert(size < 4);
8280 IRTemp t = newTempV128();
8281 assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
8282 return t;
8286 /* Generate IR to compute vector widening MULL from either the lower
8287 (is2==False) or upper (is2==True) halves of vecN and vecM. The
8288 widening multiplies are unsigned when isU==True and signed when
8289 isU==False. |size| is the narrow lane size indication. Optionally,
8290 the product may be added to or subtracted from vecD, at the wide lane
8291 size. This happens when |mas| is 'a' (add) or 's' (sub). When |mas|
8292 is 'm' (only multiply) then the accumulate part does not happen, and
8293 |vecD| is expected to == IRTemp_INVALID.
8295 Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
8296 are allowed. The result is returned in a new IRTemp, which is
8297 returned in *res. */
8298 static
8299 void math_MULL_ACC ( /*OUT*/IRTemp* res,
8300 Bool is2, Bool isU, UInt size, HChar mas,
8301 IRTemp vecN, IRTemp vecM, IRTemp vecD )
8303 vassert(res && *res == IRTemp_INVALID);
8304 vassert(size <= 2);
8305 vassert(mas == 'm' || mas == 'a' || mas == 's');
8306 if (mas == 'm') vassert(vecD == IRTemp_INVALID);
8307 IROp mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
8308 IROp accOp = (mas == 'a') ? mkVecADD(size+1)
8309 : (mas == 's' ? mkVecSUB(size+1)
8310 : Iop_INVALID);
8311 IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
8312 mkexpr(vecN), mkexpr(vecM));
8313 *res = newTempV128();
8314 assign(*res, mas == 'm' ? mkexpr(mul)
8315 : binop(accOp, mkexpr(vecD), mkexpr(mul)));
8319 /* Same as math_MULL_ACC, except the multiply is signed widening,
8320 the multiplied value is then doubled, before being added to or
8321 subtracted from the accumulated value. And everything is
8322 saturated. In all cases, saturation residuals are returned
8323 via (sat1q, sat1n), and in the accumulate cases,
8324 via (sat2q, sat2n) too. All results are returned in new temporaries.
8325 In the no-accumulate case, *sat2q and *sat2n are never instantiated,
8326 so the caller can tell this has happened. */
8327 static
8328 void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
8329 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8330 /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
8331 Bool is2, UInt size, HChar mas,
8332 IRTemp vecN, IRTemp vecM, IRTemp vecD )
8334 vassert(size <= 2);
8335 vassert(mas == 'm' || mas == 'a' || mas == 's');
8336 /* Compute
8337 sat1q = vecN.D[is2] *sq vecM.D[is2] *q 2
8338 sat1n = vecN.D[is2] *s vecM.D[is2] * 2
8339 IOW take either the low or high halves of vecN and vecM, signed widen,
8340 multiply, double that, and signedly saturate. Also compute the same
8341 but without saturation.
8343 vassert(sat2q && *sat2q == IRTemp_INVALID);
8344 vassert(sat2n && *sat2n == IRTemp_INVALID);
8345 newTempsV128_3(sat1q, sat1n, res);
8346 IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
8347 mkexpr(vecN), mkexpr(vecM));
8348 IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
8349 mkexpr(vecN), mkexpr(vecM));
8350 assign(*sat1q, mkexpr(tq));
8351 assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
8353 /* If there is no accumulation, the final result is sat1q,
8354 and there's no assignment to sat2q or sat2n. */
8355 if (mas == 'm') {
8356 assign(*res, mkexpr(*sat1q));
8357 return;
8360 /* Compute
8361 sat2q = vecD +sq/-sq sat1q
8362 sat2n = vecD +/- sat1n
8363 result = sat2q
8365 newTempsV128_2(sat2q, sat2n);
8366 assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
8367 mkexpr(vecD), mkexpr(*sat1q)));
8368 assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
8369 mkexpr(vecD), mkexpr(*sat1n)));
8370 assign(*res, mkexpr(*sat2q));
8374 /* Generate IR for widening signed vector multiplies. The operands
8375 have their lane width signedly widened, and they are then multiplied
8376 at the wider width, returning results in two new IRTemps. */
8377 static
8378 void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
8379 UInt sizeNarrow, IRTemp argL, IRTemp argR )
8381 vassert(sizeNarrow <= 2);
8382 newTempsV128_2(resHI, resLO);
8383 IRTemp argLhi = newTemp(Ity_I64);
8384 IRTemp argLlo = newTemp(Ity_I64);
8385 IRTemp argRhi = newTemp(Ity_I64);
8386 IRTemp argRlo = newTemp(Ity_I64);
8387 assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
8388 assign(argLlo, unop(Iop_V128to64, mkexpr(argL)));
8389 assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
8390 assign(argRlo, unop(Iop_V128to64, mkexpr(argR)));
8391 IROp opMulls = mkVecMULLS(sizeNarrow);
8392 assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
8393 assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
8397 /* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
8398 double that, possibly add a rounding constant (R variants), and take
8399 the high half. */
8400 static
8401 void math_SQDMULH ( /*OUT*/IRTemp* res,
8402 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8403 Bool isR, UInt size, IRTemp vN, IRTemp vM )
8405 vassert(size == X01 || size == X10); /* s or h only */
8407 newTempsV128_3(res, sat1q, sat1n);
8409 IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
8410 math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
8412 IRTemp addWide = mkVecADD(size+1);
8414 if (isR) {
8415 assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8417 Int rcShift = size == X01 ? 15 : 31;
8418 IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
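/* For example, for size == X01 (16-bit lanes) the rounding constant is
   1 << 15 in each 32-bit wide lane, so each lane of sat1n is the top half
   of 2*sx(a)*sx(b) + 0x8000; with a == b == 0x4000 (Q15 value +0.5) that
   is the top half of 0x20008000, i.e. 0x2000 (+0.25). */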
8419 assign(*sat1n,
8420 binop(mkVecCATODDLANES(size),
8421 binop(addWide,
8422 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8423 mkexpr(roundConst)),
8424 binop(addWide,
8425 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
8426 mkexpr(roundConst))));
8427 } else {
8428 assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8430 assign(*sat1n,
8431 binop(mkVecCATODDLANES(size),
8432 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8433 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
8436 assign(*res, mkexpr(*sat1q));
8439 /* Generate IR for SQRDMLAH and SQRDMLSH: signedly wideningly multiply,
8440 double, add a rounding constant, take the high half and accumulate. */
8441 static
8442 void math_SQRDMLAH ( /*OUT*/IRTemp* res, /*OUT*/IRTemp* res_nosat, Bool isAdd,
8443 UInt size, IRTemp vD, IRTemp vN, IRTemp vM )
8445 vassert(size == X01 || size == X10); /* s or h only */
8447 /* SQRDMLAH = SQADD(A, SQRDMULH(B, C)) */
8449 IRTemp mul, mul_nosat, dummy;
8450 mul = mul_nosat = dummy = IRTemp_INVALID;
8451 math_SQDMULH(&mul, &dummy, &mul_nosat, True/*R*/, size, vN, vM);
8453 IROp op = isAdd ? mkVecADD(size) : mkVecSUB(size);
8454 IROp qop = isAdd ? mkVecQADDS(size) : mkVecQSUBS(size);
8455 newTempsV128_2(res, res_nosat);
8456 assign(*res, binop(qop, mkexpr(vD), mkexpr(mul)));
8457 assign(*res_nosat, binop(op, mkexpr(vD), mkexpr(mul_nosat)));
8461 /* Generate IR for SQSHL, UQSHL, SQSHLU by imm. Put the result in
8462 a new temp in *res, and the Q difference pair in new temps in
8463 *qDiff1 and *qDiff2 respectively. |nm| denotes which of the
8464 three operations it is. */
8465 static
8466 void math_QSHL_IMM ( /*OUT*/IRTemp* res,
8467 /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2,
8468 IRTemp src, UInt size, UInt shift, const HChar* nm )
8470 vassert(size <= 3);
8471 UInt laneBits = 8 << size;
8472 vassert(shift < laneBits);
8473 newTempsV128_3(res, qDiff1, qDiff2);
8474 IRTemp z128 = newTempV128();
8475 assign(z128, mkV128(0x0000));
8477 /* UQSHL */
8478 if (vex_streq(nm, "uqshl")) {
8479 IROp qop = mkVecQSHLNSATUU(size);
8480 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8481 if (shift == 0) {
8482 /* No shift means no saturation. */
8483 assign(*qDiff1, mkexpr(z128));
8484 assign(*qDiff2, mkexpr(z128));
8485 } else {
8486 /* Saturation has occurred if any of the shifted-out bits are
8487 nonzero. We get the shifted-out bits by right-shifting the
8488 original value. */
8489 UInt rshift = laneBits - shift;
8490 vassert(rshift >= 1 && rshift < laneBits);
8491 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8492 assign(*qDiff2, mkexpr(z128));
8494 return;
8497 /* SQSHL */
8498 if (vex_streq(nm, "sqshl")) {
8499 IROp qop = mkVecQSHLNSATSS(size);
8500 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8501 if (shift == 0) {
8502 /* No shift means no saturation. */
8503 assign(*qDiff1, mkexpr(z128));
8504 assign(*qDiff2, mkexpr(z128));
8505 } else {
8506 /* Saturation has occurred if any of the shifted-out bits are
8507 different from the top bit of the original value. */
8508 UInt rshift = laneBits - 1 - shift;
8509 vassert(rshift >= 0 && rshift < laneBits-1);
8510 /* qDiff1 is the shifted out bits, and the top bit of the original
8511 value, preceded by zeroes. */
8512 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8513 /* qDiff2 is the top bit of the original value, cloned the
8514 correct number of times. */
8515 assign(*qDiff2, binop(mkVecSHRN(size),
8516 binop(mkVecSARN(size), mkexpr(src),
8517 mkU8(laneBits-1)),
8518 mkU8(rshift)));
8519 /* This also succeeds in comparing the top bit of the original
8520 value to itself, which is a bit stupid, but not wrong. */
8522 return;
8525 /* SQSHLU */
8526 if (vex_streq(nm, "sqshlu")) {
8527 IROp qop = mkVecQSHLNSATSU(size);
8528 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8529 if (shift == 0) {
8530 /* If there's no shift, saturation depends on the top bit
8531 of the source. */
8532 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(laneBits-1)));
8533 assign(*qDiff2, mkexpr(z128));
8534 } else {
8535 /* Saturation has occurred if any of the shifted-out bits are
8536 nonzero. We get the shifted-out bits by right-shifting the
8537 original value. */
8538 UInt rshift = laneBits - shift;
8539 vassert(rshift >= 1 && rshift < laneBits);
8540 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8541 assign(*qDiff2, mkexpr(z128));
8543 return;
8546 vassert(0);
8550 /* Generate IR to do SRHADD and URHADD. */
8551 static
8552 IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
8554 /* Generate this:
8555 (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
8557 vassert(size <= 3);
8558 IROp opSHR = isU ? mkVecSHRN(size) : mkVecSARN(size);
8559 IROp opADD = mkVecADD(size);
8560 /* The only tricky bit is to generate the correct vector 1 constant. */
8561 const ULong ones64[4]
8562 = { 0x0101010101010101ULL, 0x0001000100010001ULL,
8563 0x0000000100000001ULL, 0x0000000000000001ULL };
8564 IRTemp imm64 = newTemp(Ity_I64);
8565 assign(imm64, mkU64(ones64[size]));
8566 IRTemp vecOne = newTempV128();
8567 assign(vecOne, binop(Iop_64HLtoV128, mkexpr(imm64), mkexpr(imm64)));
8568 IRTemp scaOne = newTemp(Ity_I8);
8569 assign(scaOne, mkU8(1));
8570 IRTemp res = newTempV128();
8571 assign(res,
8572 binop(opADD,
8573 binop(opSHR, mkexpr(aa), mkexpr(scaOne)),
8574 binop(opADD,
8575 binop(opSHR, mkexpr(bb), mkexpr(scaOne)),
8576 binop(opSHR,
8577 binop(opADD,
8578 binop(opADD,
8579 binop(Iop_AndV128, mkexpr(aa),
8580 mkexpr(vecOne)),
8581 binop(Iop_AndV128, mkexpr(bb),
8582 mkexpr(vecOne))
8584 mkexpr(vecOne)
8586 mkexpr(scaOne)
8591 return res;
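/* This is the overflow-free form of the rounding halving add
   (A + B + 1) >> 1.  For example, with unsigned bytes A == 255, B == 254:
   (A>>1) + (B>>1) + (((A&1)+(B&1)+1)>>1) == 127 + 127 + 1 == 255, whereas
   adding first would have wrapped at 8 bits. */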
8595 /* QCFLAG tracks the SIMD sticky saturation status. Update the status
8596 thusly: if, after application of |opZHI| to both |qres| and |nres|,
8597 they have the same value, leave QCFLAG unchanged. Otherwise, set it
8598 (implicitly) to 1. |opZHI| may only be one of the Iop_ZeroHIxxofV128
8599 operators, or Iop_INVALID, in which case |qres| and |nres| are used
8600 unmodified. The presence of |opZHI| means this function can be used to
8601 generate QCFLAG update code for both scalar and vector SIMD operations.
8603 static
8604 void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
8606 IRTemp diff = newTempV128();
8607 IRTemp oldQCFLAG = newTempV128();
8608 IRTemp newQCFLAG = newTempV128();
8609 if (opZHI == Iop_INVALID) {
8610 assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
8611 } else {
8612 vassert(opZHI == Iop_ZeroHI64ofV128
8613 || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
8614 assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
8616 assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
8617 assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
8618 stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
8622 /* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
8623 are used unmodified, hence suitable for QCFLAG updates for whole-vector
8624 operations. */
8625 static
8626 void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
8628 updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
8632 /* Generate IR to rearrange two vector values in a way which is useful
8633 for doing S/D add-pair etc operations. There are 3 cases:
8635 2d: [m1 m0] [n1 n0] --> [m1 n1] [m0 n0]
8637 4s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [m3 m1 n3 n1] [m2 m0 n2 n0]
8639 2s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [0 0 m1 n1] [0 0 m0 n0]
8641 The cases are distinguished as follows:
8642 isD == True, bitQ == 1 => 2d
8643 isD == False, bitQ == 1 => 4s
8644 isD == False, bitQ == 0 => 2s
8646 static
8647 void math_REARRANGE_FOR_FLOATING_PAIRWISE (
8648 /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
8649 IRTemp vecM, IRTemp vecN, Bool isD, UInt bitQ
8652 vassert(rearrL && *rearrL == IRTemp_INVALID);
8653 vassert(rearrR && *rearrR == IRTemp_INVALID);
8654 *rearrL = newTempV128();
8655 *rearrR = newTempV128();
8656 if (isD) {
8657 // 2d case
8658 vassert(bitQ == 1);
8659 assign(*rearrL, binop(Iop_InterleaveHI64x2, mkexpr(vecM), mkexpr(vecN)));
8660 assign(*rearrR, binop(Iop_InterleaveLO64x2, mkexpr(vecM), mkexpr(vecN)));
8662 else if (!isD && bitQ == 1) {
8663 // 4s case
8664 assign(*rearrL, binop(Iop_CatOddLanes32x4, mkexpr(vecM), mkexpr(vecN)));
8665 assign(*rearrR, binop(Iop_CatEvenLanes32x4, mkexpr(vecM), mkexpr(vecN)));
8666 } else {
8667 // 2s case
8668 vassert(!isD && bitQ == 0);
8669 IRTemp m1n1m0n0 = newTempV128();
8670 IRTemp m0n0m1n1 = newTempV128();
8671 assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
8672 mkexpr(vecM), mkexpr(vecN)));
8673 assign(m0n0m1n1, triop(Iop_SliceV128,
8674 mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8)));
8675 assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0)));
8676 assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1)));
8681 /* Returns 2.0 ^ (-n) for n in 1 .. 64 */
8682 static Double two_to_the_minus ( Int n )
8684 if (n == 1) return 0.5;
8685 vassert(n >= 2 && n <= 64);
8686 Int half = n / 2;
8687 return two_to_the_minus(half) * two_to_the_minus(n - half);
8691 /* Returns 2.0 ^ n for n in 1 .. 64 */
8692 static Double two_to_the_plus ( Int n )
8694 if (n == 1) return 2.0;
8695 vassert(n >= 2 && n <= 64);
8696 Int half = n / 2;
8697 return two_to_the_plus(half) * two_to_the_plus(n - half);
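/* Both helpers split |n| roughly in half, so e.g.
   two_to_the_minus(5) == two_to_the_minus(2) * two_to_the_minus(3)
                       == 0.25 * 0.125 == 0.03125 == 2^-5.
   Every intermediate value is an exact power of two within double range,
   so the results are exact for all n in 1 .. 64. */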
8701 /*------------------------------------------------------------*/
8702 /*--- SIMD and FP instructions ---*/
8703 /*------------------------------------------------------------*/
8705 static
8706 Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
8708 /* 31 29 23 21 20 15 14 10 9 4
8709 0 q 101110 op2 0 m 0 imm4 0 n d
8710 Decode fields: op2
8712 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8713 if (INSN(31,31) != 0
8714 || INSN(29,24) != BITS6(1,0,1,1,1,0)
8715 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
8716 return False;
8718 UInt bitQ = INSN(30,30);
8719 UInt op2 = INSN(23,22);
8720 UInt mm = INSN(20,16);
8721 UInt imm4 = INSN(14,11);
8722 UInt nn = INSN(9,5);
8723 UInt dd = INSN(4,0);
8725 if (op2 == BITS2(0,0)) {
8726 /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
8727 IRTemp sHi = newTempV128();
8728 IRTemp sLo = newTempV128();
8729 IRTemp res = newTempV128();
8730 assign(sHi, getQReg128(mm));
8731 assign(sLo, getQReg128(nn));
8732 if (bitQ == 1) {
8733 if (imm4 == 0) {
8734 assign(res, mkexpr(sLo));
8735 } else {
8736 vassert(imm4 >= 1 && imm4 <= 15);
8737 assign(res, triop(Iop_SliceV128,
8738 mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
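/* Illustrative: Iop_SliceV128(sHi, sLo, imm4) yields the 16-byte window
   starting at byte |imm4| of the 32-byte pair sHi:sLo; e.g. imm4 == 3
   gives bytes 15..3 of Vn with bytes 2..0 of Vm above them, matching the
   EXT extraction from the concatenation Vm:Vn. */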
8740 putQReg128(dd, mkexpr(res));
8741 DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
8742 } else {
8743 if (imm4 >= 8) return False;
8744 if (imm4 == 0) {
8745 assign(res, mkexpr(sLo));
8746 } else {
8747 vassert(imm4 >= 1 && imm4 <= 7);
8748 IRTemp hi64lo64 = newTempV128();
8749 assign(hi64lo64, binop(Iop_InterleaveLO64x2,
8750 mkexpr(sHi), mkexpr(sLo)));
8751 assign(res, triop(Iop_SliceV128,
8752 mkexpr(hi64lo64), mkexpr(hi64lo64), mkU8(imm4)));
8754 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
8755 DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
8757 return True;
8760 return False;
8761 # undef INSN
8765 static
8766 Bool dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult* dres, UInt insn)
8768 /* 31 29 23 21 20 15 14 12 11 9 4
8769 0 q 001110 op2 0 m 0 len op 00 n d
8770 Decode fields: op2,len,op
8772 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8773 if (INSN(31,31) != 0
8774 || INSN(29,24) != BITS6(0,0,1,1,1,0)
8775 || INSN(21,21) != 0
8776 || INSN(15,15) != 0
8777 || INSN(11,10) != BITS2(0,0)) {
8778 return False;
8780 UInt bitQ = INSN(30,30);
8781 UInt op2 = INSN(23,22);
8782 UInt mm = INSN(20,16);
8783 UInt len = INSN(14,13);
8784 UInt bitOP = INSN(12,12);
8785 UInt nn = INSN(9,5);
8786 UInt dd = INSN(4,0);
8788 if (op2 == X00) {
8789 /* -------- 00,xx,0 TBL, xx register table -------- */
8790 /* -------- 00,xx,1 TBX, xx register table -------- */
8791 /* 31 28 20 15 14 12 9 4
8792 0q0 01110 000 m 0 len 000 n d TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
8793 0q0 01110 000 m 0 len 100 n d TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
8794 where Ta = 16b(q=1) or 8b(q=0)
8796 Bool isTBX = bitOP == 1;
8797 /* The out-of-range values to use. */
8798 IRTemp oor_values = newTempV128();
8799 assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
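/* For each index byte of Vm that points past the end of the table, TBL
   writes zero to the corresponding result byte, whereas TBX leaves the
   existing byte of Vd unchanged.  Hence the out-of-range source is
   either all zeroes or the current value of Vd. */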
8800 /* src value */
8801 IRTemp src = newTempV128();
8802 assign(src, getQReg128(mm));
8803 /* The table values */
8804 IRTemp tab[4];
8805 UInt i;
8806 for (i = 0; i <= len; i++) {
8807 vassert(i < 4);
8808 tab[i] = newTempV128();
8809 assign(tab[i], getQReg128((nn + i) % 32));
8811 IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
8812 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8813 const HChar* Ta = bitQ ==1 ? "16b" : "8b";
8814 const HChar* nm = isTBX ? "tbx" : "tbl";
8815 DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
8816 nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
8817 return True;
8820 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8821 return False;
8822 # undef INSN
8826 static
8827 Bool dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult* dres, UInt insn)
8829 /* 31 29 23 21 20 15 14 11 9 4
8830 0 q 001110 size 0 m 0 opcode 10 n d
8831 Decode fields: opcode
8833 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8834 if (INSN(31,31) != 0
8835 || INSN(29,24) != BITS6(0,0,1,1,1,0)
8836 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(11,10) != BITS2(1,0)) {
8837 return False;
8839 UInt bitQ = INSN(30,30);
8840 UInt size = INSN(23,22);
8841 UInt mm = INSN(20,16);
8842 UInt opcode = INSN(14,12);
8843 UInt nn = INSN(9,5);
8844 UInt dd = INSN(4,0);
8846 if (opcode == BITS3(0,0,1) || opcode == BITS3(1,0,1)) {
8847 /* -------- 001 UZP1 std7_std7_std7 -------- */
8848 /* -------- 101 UZP2 std7_std7_std7 -------- */
8849 if (bitQ == 0 && size == X11) return False; // implied 1d case
8850 Bool isUZP1 = opcode == BITS3(0,0,1);
8851 IROp op = isUZP1 ? mkVecCATEVENLANES(size)
8852 : mkVecCATODDLANES(size);
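/* E.g. for the 4s case: UZP1 Vd.4s, Vn.4s, Vm.4s takes the even-numbered
   lanes of the Vn:Vm pair, i.e. [m2 m0 n2 n0] (high to low), which is
   what CatEvenLanes32x4(Vm, Vn) computes; UZP2 takes the odd-numbered
   lanes via CatOddLanes32x4.  The Q == 0 path below first packs the two
   64-bit halves side by side so the same ops can be reused. */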
8853 IRTemp preL = newTempV128();
8854 IRTemp preR = newTempV128();
8855 IRTemp res = newTempV128();
8856 if (bitQ == 0) {
8857 assign(preL, binop(Iop_InterleaveLO64x2, getQReg128(mm),
8858 getQReg128(nn)));
8859 assign(preR, mkexpr(preL));
8860 } else {
8861 assign(preL, getQReg128(mm));
8862 assign(preR, getQReg128(nn));
8864 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
8865 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8866 const HChar* nm = isUZP1 ? "uzp1" : "uzp2";
8867 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8868 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8869 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8870 return True;
8873 if (opcode == BITS3(0,1,0) || opcode == BITS3(1,1,0)) {
8874 /* -------- 010 TRN1 std7_std7_std7 -------- */
8875 /* -------- 110 TRN2 std7_std7_std7 -------- */
8876 if (bitQ == 0 && size == X11) return False; // implied 1d case
8877 Bool isTRN1 = opcode == BITS3(0,1,0);
8878 IROp op1 = isTRN1 ? mkVecCATEVENLANES(size)
8879 : mkVecCATODDLANES(size);
8880 IROp op2 = mkVecINTERLEAVEHI(size);
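/* The identity used here, sketched for the 4s TRN1 case: CatEvenLanes
   of (M,M) gives [m2 m0 m2 m0] and of (N,N) gives [n2 n0 n2 n0];
   InterleaveHI32x4 of those two is [m2 n2 m0 n0], which is precisely
   TRN1's result.  TRN2 works the same way using the odd lanes. */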
8881 IRTemp srcM = newTempV128();
8882 IRTemp srcN = newTempV128();
8883 IRTemp res = newTempV128();
8884 assign(srcM, getQReg128(mm));
8885 assign(srcN, getQReg128(nn));
8886 assign(res, binop(op2, binop(op1, mkexpr(srcM), mkexpr(srcM)),
8887 binop(op1, mkexpr(srcN), mkexpr(srcN))));
8888 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8889 const HChar* nm = isTRN1 ? "trn1" : "trn2";
8890 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8891 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8892 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8893 return True;
8896 if (opcode == BITS3(0,1,1) || opcode == BITS3(1,1,1)) {
8897 /* -------- 011 ZIP1 std7_std7_std7 -------- */
8898 /* -------- 111 ZIP2 std7_std7_std7 -------- */
8899 if (bitQ == 0 && size == X11) return False; // implied 1d case
8900 Bool isZIP1 = opcode == BITS3(0,1,1);
8901 IROp op = isZIP1 ? mkVecINTERLEAVELO(size)
8902 : mkVecINTERLEAVEHI(size);
8903 IRTemp preL = newTempV128();
8904 IRTemp preR = newTempV128();
8905 IRTemp res = newTempV128();
8906 if (bitQ == 0 && !isZIP1) {
8907 IRTemp z128 = newTempV128();
8908 assign(z128, mkV128(0x0000));
8909 // preL = Vm shifted left 32 bits
8910 // preR = Vn shifted left 32 bits
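// For the Q == 0 ZIP2 cases only the upper 32 bits of each 64-bit source
// participate.  Shifting each source left by 32 bits (done below with
// Iop_SliceV128 over a zero vector) places those 32 bits just above
// bit 63, so the full-width INTERLEAVEHI then deposits the wanted result
// in the low 64 bits, which math_MAYBE_ZERO_HI64 keeps.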
8911 assign(preL, triop(Iop_SliceV128,
8912 getQReg128(mm), mkexpr(z128), mkU8(12)));
8913 assign(preR, triop(Iop_SliceV128,
8914 getQReg128(nn), mkexpr(z128), mkU8(12)));
8916 } else {
8917 assign(preL, getQReg128(mm));
8918 assign(preR, getQReg128(nn));
8920 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
8921 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8922 const HChar* nm = isZIP1 ? "zip1" : "zip2";
8923 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8924 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8925 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8926 return True;
8929 return False;
8930 # undef INSN
8934 static
8935 Bool dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult* dres, UInt insn)
8937 /* 31 28 23 21 16 11 9 4
8938 0 q u 01110 size 11000 opcode 10 n d
8939 Decode fields: u,size,opcode
8941 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8942 if (INSN(31,31) != 0
8943 || INSN(28,24) != BITS5(0,1,1,1,0)
8944 || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
8945 return False;
8947 UInt bitQ = INSN(30,30);
8948 UInt bitU = INSN(29,29);
8949 UInt size = INSN(23,22);
8950 UInt opcode = INSN(16,12);
8951 UInt nn = INSN(9,5);
8952 UInt dd = INSN(4,0);
8954 if (opcode == BITS5(0,0,0,1,1)) {
8955 /* -------- 0,xx,00011 SADDLV -------- */
8956 /* -------- 1,xx,00011 UADDLV -------- */
8957 /* size is the narrow size */
8958 if (size == X11 || (size == X10 && bitQ == 0)) return False;
8959 Bool isU = bitU == 1;
8960 IRTemp src = newTempV128();
8961 assign(src, getQReg128(nn));
8962 /* The basic plan is to widen the lower half, and if Q = 1,
8963 the upper half too. Add them together (if Q = 1), and in
8964 either case fold with add at twice the lane width.
8966 IRExpr* widened
8967 = mkexpr(math_WIDEN_LO_OR_HI_LANES(
8968 isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
8969 if (bitQ == 1) {
8970 widened
8971 = binop(mkVecADD(size+1),
8972 widened,
8973 mkexpr(math_WIDEN_LO_OR_HI_LANES(
8974 isU, True/*fromUpperHalf*/, size, mkexpr(src)))
8977 /* Now fold. */
8978 IRTemp tWi = newTempV128();
8979 assign(tWi, widened);
8980 IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
8981 putQReg128(dd, mkexpr(res));
8982 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8983 const HChar ch = "bhsd"[size];
8984 DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
8985 nameQReg128(dd), ch, nameQReg128(nn), arr);
8986 return True;
8989 UInt ix = 0;
8990 /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
8991 else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
8992 else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
8993 /**/
8994 if (ix != 0) {
8995 /* -------- 0,xx,01010: SMAXV -------- (1) */
8996 /* -------- 1,xx,01010: UMAXV -------- (2) */
8997 /* -------- 0,xx,11010: SMINV -------- (3) */
8998 /* -------- 1,xx,11010: UMINV -------- (4) */
8999 /* -------- 0,xx,11011: ADDV -------- (5) */
9000 vassert(ix >= 1 && ix <= 5);
9001 if (size == X11) return False; // 1d,2d cases not allowed
9002 if (size == X10 && bitQ == 0) return False; // 2s case not allowed
9003 const IROp opMAXS[3]
9004 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
9005 const IROp opMAXU[3]
9006 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
9007 const IROp opMINS[3]
9008 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
9009 const IROp opMINU[3]
9010 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
9011 const IROp opADD[3]
9012 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4 };
9013 vassert(size < 3);
9014 IROp op = Iop_INVALID;
9015 const HChar* nm = NULL;
9016 switch (ix) {
9017 case 1: op = opMAXS[size]; nm = "smaxv"; break;
9018 case 2: op = opMAXU[size]; nm = "umaxv"; break;
9019 case 3: op = opMINS[size]; nm = "sminv"; break;
9020 case 4: op = opMINU[size]; nm = "uminv"; break;
9021 case 5: op = opADD[size]; nm = "addv"; break;
9022 default: vassert(0);
9024 vassert(op != Iop_INVALID && nm != NULL);
9025 IRTemp tN1 = newTempV128();
9026 assign(tN1, getQReg128(nn));
9027 /* If Q == 0, we're just folding lanes in the lower half of
9028 the value. In which case, copy the lower half of the
9029 source into the upper half, so we can then treat it the
9030 same as the full width case. Except for the addition case,
9031 in which we have to zero out the upper half. */
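/* The reason for the distinction: max/min are idempotent, so folding a
   value against a copy of itself leaves the per-lane result unchanged,
   whereas for ADDV a duplicated upper half would be counted twice.
   Zeroing it instead contributes nothing to the sum. */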
9032 IRTemp tN2 = newTempV128();
9033 assign(tN2, bitQ == 0
9034 ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
9035 : mk_CatEvenLanes64x2(tN1,tN1))
9036 : mkexpr(tN1));
9037 IRTemp res = math_FOLDV(tN2, op);
9038 if (res == IRTemp_INVALID)
9039 return False; /* means math_FOLDV
9040 doesn't handle this case yet */
9041 putQReg128(dd, mkexpr(res));
9042 const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
9043 IRType laneTy = tys[size];
9044 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9045 DIP("%s %s, %s.%s\n", nm,
9046 nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
9047 return True;
9050 if ((size == X00 || size == X10)
9051 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
9052 /* -------- 0,00,01100: FMAXNMV s_4s -------- */
9053 /* -------- 0,10,01100: FMINNMV s_4s -------- */
9054 /* -------- 1,00,01111: FMAXV s_4s -------- */
9055 /* -------- 1,10,01111: FMINV s_4s -------- */
9056 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
9057 if (bitQ == 0) return False; // Only 4s is allowed
9058 Bool isMIN = (size & 2) == 2;
9059 Bool isNM = opcode == BITS5(0,1,1,0,0);
9060 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
9061 IRTemp src = newTempV128();
9062 assign(src, getQReg128(nn));
9063 IRTemp res = math_FOLDV(src, opMXX);
9064 putQReg128(dd, mkexpr(res));
9065 DIP("%s%sv s%u, v%u.4s\n",
9066 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
9067 return True;
9070 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9071 return False;
9072 # undef INSN
9076 static
9077 Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
9079 /* 31 28 20 15 14 10 9 4
9080 0 q op 01110000 imm5 0 imm4 1 n d
9081 Decode fields: q,op,imm4
9083 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9084 if (INSN(31,31) != 0
9085 || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
9086 || INSN(15,15) != 0 || INSN(10,10) != 1) {
9087 return False;
9089 UInt bitQ = INSN(30,30);
9090 UInt bitOP = INSN(29,29);
9091 UInt imm5 = INSN(20,16);
9092 UInt imm4 = INSN(14,11);
9093 UInt nn = INSN(9,5);
9094 UInt dd = INSN(4,0);
9096 /* -------- x,0,0000: DUP (element, vector) -------- */
9097 /* 31 28 20 15 9 4
9098 0q0 01110000 imm5 000001 n d DUP Vd.T, Vn.Ts[index]
9100 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
9101 UInt laneNo = 0;
9102 UInt laneSzLg2 = 0;
9103 HChar laneCh = '?';
9104 IRTemp res = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
9105 getQReg128(nn), imm5);
9106 if (res == IRTemp_INVALID)
9107 return False;
9108 if (bitQ == 0 && laneSzLg2 == X11)
9109 return False; /* .1d case */
9110 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9111 const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
9112 DIP("dup %s.%s, %s.%c[%u]\n",
9113 nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
9114 return True;
9117 /* -------- x,0,0001: DUP (general, vector) -------- */
9118 /* 31 28 20 15 9 4
9119 0q0 01110000 imm5 0 0001 1 n d DUP Vd.T, Rn
9120 Q=0 writes 64, Q=1 writes 128
9121 imm5: xxxx1 8B(q=0) or 16b(q=1), R=W
9122 xxx10 4H(q=0) or 8H(q=1), R=W
9123 xx100 2S(q=0) or 4S(q=1), R=W
9124 x1000 Invalid(q=0) or 2D(q=1), R=X
9125 x0000 Invalid(q=0) or Invalid(q=1)
9126 Require op=0, imm4=0001
9128 if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
9129 Bool isQ = bitQ == 1;
9130 IRTemp w0 = newTemp(Ity_I64);
9131 const HChar* arT = "??";
9132 IRType laneTy = Ity_INVALID;
9133 if (imm5 & 1) {
9134 arT = isQ ? "16b" : "8b";
9135 laneTy = Ity_I8;
9136 assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
9138 else if (imm5 & 2) {
9139 arT = isQ ? "8h" : "4h";
9140 laneTy = Ity_I16;
9141 assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
9143 else if (imm5 & 4) {
9144 arT = isQ ? "4s" : "2s";
9145 laneTy = Ity_I32;
9146 assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
9148 else if ((imm5 & 8) && isQ) {
9149 arT = "2d";
9150 laneTy = Ity_I64;
9151 assign(w0, getIReg64orZR(nn));
9153 else {
9154 /* invalid; leave laneTy unchanged. */
9156 /* */
9157 if (laneTy != Ity_INVALID) {
9158 IRTemp w1 = math_DUP_TO_64(w0, laneTy);
9159 putQReg128(dd, binop(Iop_64HLtoV128,
9160 isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
9161 DIP("dup %s.%s, %s\n",
9162 nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
9163 return True;
9165 /* invalid */
9166 return False;
9169 /* -------- 1,0,0011: INS (general) -------- */
9170 /* 31 28 20 15 9 4
9171 010 01110000 imm5 000111 n d INS Vd.Ts[ix], Rn
9172 where Ts,ix = case imm5 of xxxx1 -> B, xxxx
9173 xxx10 -> H, xxx
9174 xx100 -> S, xx
9175 x1000 -> D, x
9177 if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
9178 HChar ts = '?';
9179 UInt laneNo = 16;
9180 IRExpr* src = NULL;
9181 if (imm5 & 1) {
9182 src = unop(Iop_64to8, getIReg64orZR(nn));
9183 laneNo = (imm5 >> 1) & 15;
9184 ts = 'b';
9186 else if (imm5 & 2) {
9187 src = unop(Iop_64to16, getIReg64orZR(nn));
9188 laneNo = (imm5 >> 2) & 7;
9189 ts = 'h';
9191 else if (imm5 & 4) {
9192 src = unop(Iop_64to32, getIReg64orZR(nn));
9193 laneNo = (imm5 >> 3) & 3;
9194 ts = 's';
9196 else if (imm5 & 8) {
9197 src = getIReg64orZR(nn);
9198 laneNo = (imm5 >> 4) & 1;
9199 ts = 'd';
9201 /* */
9202 if (src) {
9203 vassert(laneNo < 16);
9204 putQRegLane(dd, laneNo, src);
9205 DIP("ins %s.%c[%u], %s\n",
9206 nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
9207 return True;
9209 /* invalid */
9210 return False;
9213 /* -------- x,0,0101: SMOV -------- */
9214 /* -------- x,0,0111: UMOV -------- */
9215 /* 31 28 20 15 9 4
9216 0q0 01110 000 imm5 001111 n d UMOV Xd/Wd, Vn.Ts[index]
9217 0q0 01110 000 imm5 001011 n d SMOV Xd/Wd, Vn.Ts[index]
9218 dest is Xd when q==1, Wd when q==0
9219 UMOV:
9220 Ts,index,ops = case q:imm5 of
9221 0:xxxx1 -> B, xxxx, 8Uto64
9222 1:xxxx1 -> invalid
9223 0:xxx10 -> H, xxx, 16Uto64
9224 1:xxx10 -> invalid
9225 0:xx100 -> S, xx, 32Uto64
9226 1:xx100 -> invalid
9227 1:x1000 -> D, x, copy64
9228 other -> invalid
9229 SMOV:
9230 Ts,index,ops = case q:imm5 of
9231 0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
9232 1:xxxx1 -> B, xxxx, 8Sto64
9233 0:xxx10 -> H, xxx, (32Uto64 . 16Sto32)
9234 1:xxx10 -> H, xxx, 16Sto64
9235 0:xx100 -> invalid
9236 1:xx100 -> S, xx, 32Sto64
9237 1:x1000 -> invalid
9238 other -> invalid
9240 if (bitOP == 0 && (imm4 == BITS4(0,1,0,1) || imm4 == BITS4(0,1,1,1))) {
9241 Bool isU = (imm4 & 2) == 2;
9242 const HChar* arTs = "??";
9243 UInt laneNo = 16; /* invalid */
9244 // Setting 'res' to non-NULL determines valid/invalid
9245 IRExpr* res = NULL;
9246 if (!bitQ && (imm5 & 1)) { // 0:xxxx1
9247 laneNo = (imm5 >> 1) & 15;
9248 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9249 res = isU ? unop(Iop_8Uto64, lane)
9250 : unop(Iop_32Uto64, unop(Iop_8Sto32, lane));
9251 arTs = "b";
9253 else if (bitQ && (imm5 & 1)) { // 1:xxxx1
9254 laneNo = (imm5 >> 1) & 15;
9255 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9256 res = isU ? NULL
9257 : unop(Iop_8Sto64, lane);
9258 arTs = "b";
9260 else if (!bitQ && (imm5 & 2)) { // 0:xxx10
9261 laneNo = (imm5 >> 2) & 7;
9262 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9263 res = isU ? unop(Iop_16Uto64, lane)
9264 : unop(Iop_32Uto64, unop(Iop_16Sto32, lane));
9265 arTs = "h";
9267 else if (bitQ && (imm5 & 2)) { // 1:xxx10
9268 laneNo = (imm5 >> 2) & 7;
9269 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9270 res = isU ? NULL
9271 : unop(Iop_16Sto64, lane);
9272 arTs = "h";
9274 else if (!bitQ && (imm5 & 4)) { // 0:xx100
9275 laneNo = (imm5 >> 3) & 3;
9276 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9277 res = isU ? unop(Iop_32Uto64, lane)
9278 : NULL;
9279 arTs = "s";
9281 else if (bitQ && (imm5 & 4)) { // 1:xx100
9282 laneNo = (imm5 >> 3) & 3;
9283 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9284 res = isU ? NULL
9285 : unop(Iop_32Sto64, lane);
9286 arTs = "s";
9288 else if (bitQ && (imm5 & 8)) { // 1:x1000
9289 laneNo = (imm5 >> 4) & 1;
9290 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64);
9291 res = isU ? lane
9292 : NULL;
9293 arTs = "d";
9295 /* */
9296 if (res) {
9297 vassert(laneNo < 16);
9298 putIReg64orZR(dd, res);
9299 DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's',
9300 nameIRegOrZR(bitQ == 1, dd),
9301 nameQReg128(nn), arTs, laneNo);
9302 return True;
9304 /* invalid */
9305 return False;
9308 /* -------- 1,1,xxxx: INS (element) -------- */
9309 /* 31 28 20 14 9 4
9310 011 01110000 imm5 0 imm4 n d INS Vd.Ts[ix1], Vn.Ts[ix2]
9311 where Ts,ix1,ix2
9312 = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
9313 xxx10 -> H, xxx, imm4[3:1]
9314 xx100 -> S, xx, imm4[3:2]
9315 x1000 -> D, x, imm4[3:3]
9317 if (bitQ == 1 && bitOP == 1) {
9318 HChar ts = '?';
9319 IRType ity = Ity_INVALID;
9320 UInt ix1 = 16;
9321 UInt ix2 = 16;
9322 if (imm5 & 1) {
9323 ts = 'b';
9324 ity = Ity_I8;
9325 ix1 = (imm5 >> 1) & 15;
9326 ix2 = (imm4 >> 0) & 15;
9328 else if (imm5 & 2) {
9329 ts = 'h';
9330 ity = Ity_I16;
9331 ix1 = (imm5 >> 2) & 7;
9332 ix2 = (imm4 >> 1) & 7;
9334 else if (imm5 & 4) {
9335 ts = 's';
9336 ity = Ity_I32;
9337 ix1 = (imm5 >> 3) & 3;
9338 ix2 = (imm4 >> 2) & 3;
9340 else if (imm5 & 8) {
9341 ts = 'd';
9342 ity = Ity_I64;
9343 ix1 = (imm5 >> 4) & 1;
9344 ix2 = (imm4 >> 3) & 1;
9346 /* */
9347 if (ity != Ity_INVALID) {
9348 vassert(ix1 < 16);
9349 vassert(ix2 < 16);
9350 putQRegLane(dd, ix1, getQRegLane(nn, ix2, ity));
9351 DIP("ins %s.%c[%u], %s.%c[%u]\n",
9352 nameQReg128(dd), ts, ix1, nameQReg128(nn), ts, ix2);
9353 return True;
9355 /* invalid */
9356 return False;
9359 return False;
9360 # undef INSN
9364 static
9365 Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
9367 /* 31 28 18 15 11 9 4
9368 0q op 01111 00000 abc cmode 01 defgh d
9369 Decode fields: q,op,cmode
9370 Bit 11 is really "o2", but it is always zero.
9372 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9373 if (INSN(31,31) != 0
9374 || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
9375 || INSN(11,10) != BITS2(0,1)) {
9376 return False;
9378 UInt bitQ = INSN(30,30);
9379 UInt bitOP = INSN(29,29);
9380 UInt cmode = INSN(15,12);
9381 UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
9382 UInt dd = INSN(4,0);
9384 ULong imm64lo = 0;
9385 UInt op_cmode = (bitOP << 4) | cmode;
9386 Bool ok = False;
9387 Bool isORR = False;
9388 Bool isBIC = False;
9389 Bool isMOV = False;
9390 Bool isMVN = False;
9391 Bool isFMOV = False;
9392 switch (op_cmode) {
9393 /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
9394 /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
9395 /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
9396 /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
9397 case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
9398 case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
9399 ok = True; isMOV = True; break;
9401 /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
9402 /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
9403 /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
9404 /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
9405 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
9406 case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
9407 ok = True; isORR = True; break;
9409 /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
9410 /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
9411 case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
9412 ok = True; isMOV = True; break;
9414 /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
9415 /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
9416 case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
9417 ok = True; isORR = True; break;
9419 /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
9420 /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
9421 case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
9422 ok = True; isMOV = True; break;
9424 /* -------- x,0,1110 MOVI 8-bit -------- */
9425 case BITS5(0,1,1,1,0):
9426 ok = True; isMOV = True; break;
9428 /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
9429 case BITS5(0,1,1,1,1): // 0:1111
9430 ok = True; isFMOV = True; break;
9432 /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
9433 /* -------- x,1,0010 MVNI 32-bit shifted imm -------- */
9434 /* -------- x,1,0100 MVNI 32-bit shifted imm -------- */
9435 /* -------- x,1,0110 MVNI 32-bit shifted imm -------- */
9436 case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
9437 case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
9438 ok = True; isMVN = True; break;
9440 /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
9441 /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
9442 /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
9443 /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
9444 case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
9445 case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
9446 ok = True; isBIC = True; break;
9448 /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
9449 /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
9450 case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
9451 ok = True; isMVN = True; break;
9453 /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
9454 /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
9455 case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
9456 ok = True; isBIC = True; break;
9458 /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
9459 /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
9460 case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
9461 ok = True; isMVN = True; break;
9463 /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
9464 /* -------- 1,1,1110 MOVI 64-bit vector -------- */
9465 case BITS5(1,1,1,1,0):
9466 ok = True; isMOV = True; break;
9468 /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
9469 case BITS5(1,1,1,1,1): // 1:1111
9470 ok = bitQ == 1; isFMOV = True; break;
9472 default:
9473 break;
9475 if (ok) {
9476 vassert(1 == (isMOV ? 1 : 0) + (isMVN ? 1 : 0)
9477 + (isORR ? 1 : 0) + (isBIC ? 1 : 0) + (isFMOV ? 1 : 0));
9478 ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, abcdefgh);
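/* AdvSIMDExpandImm follows the ARM ARM pseudocode of the same name.
   For instance (illustrative): op=0, cmode=0xx0 replicates abcdefgh,
   shifted left by 0/8/16/24 bits, into each 32-bit chunk of imm64lo,
   while op=1, cmode=1110 expands each of the 8 bits a..h to a whole
   byte of 0x00 or 0xFF. */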
9480 if (ok) {
9481 if (isORR || isBIC) {
9482 ULong inv
9483 = isORR ? 0ULL : ~0ULL;
9484 IRExpr* immV128
9485 = binop(Iop_64HLtoV128, mkU64(inv ^ imm64lo), mkU64(inv ^ imm64lo));
9486 IRExpr* res
9487 = binop(isORR ? Iop_OrV128 : Iop_AndV128, getQReg128(dd), immV128);
9488 const HChar* nm = isORR ? "orr" : "bic";
9489 if (bitQ == 0) {
9490 putQReg128(dd, unop(Iop_ZeroHI64ofV128, res));
9491 DIP("%s %s.1d, %016llx\n", nm, nameQReg128(dd), imm64lo);
9492 } else {
9493 putQReg128(dd, res);
9494 DIP("%s %s.2d, #0x%016llx'%016llx\n", nm,
9495 nameQReg128(dd), imm64lo, imm64lo);
9498 else if (isMOV || isMVN || isFMOV) {
9499 if (isMVN) imm64lo = ~imm64lo;
9500 ULong imm64hi = bitQ == 0 ? 0 : imm64lo;
9501 IRExpr* immV128 = binop(Iop_64HLtoV128, mkU64(imm64hi),
9502 mkU64(imm64lo));
9503 putQReg128(dd, immV128);
9504 DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi, imm64lo);
9506 return True;
9508 /* else fall through */
9510 return False;
9511 # undef INSN
9515 static
9516 Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
9518 /* 31 28 20 15 14 10 9 4
9519 01 op 11110000 imm5 0 imm4 1 n d
9520 Decode fields: op,imm4
9522 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9523 if (INSN(31,30) != BITS2(0,1)
9524 || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
9525 || INSN(15,15) != 0 || INSN(10,10) != 1) {
9526 return False;
9528 UInt bitOP = INSN(29,29);
9529 UInt imm5 = INSN(20,16);
9530 UInt imm4 = INSN(14,11);
9531 UInt nn = INSN(9,5);
9532 UInt dd = INSN(4,0);
9534 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
9535 /* -------- 0,0000 DUP (element, scalar) -------- */
9536 IRTemp w0 = newTemp(Ity_I64);
9537 const HChar* arTs = "??";
9538 IRType laneTy = Ity_INVALID;
9539 UInt laneNo = 16; /* invalid */
9540 if (imm5 & 1) {
9541 arTs = "b";
9542 laneNo = (imm5 >> 1) & 15;
9543 laneTy = Ity_I8;
9544 assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
9546 else if (imm5 & 2) {
9547 arTs = "h";
9548 laneNo = (imm5 >> 2) & 7;
9549 laneTy = Ity_I16;
9550 assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
9552 else if (imm5 & 4) {
9553 arTs = "s";
9554 laneNo = (imm5 >> 3) & 3;
9555 laneTy = Ity_I32;
9556 assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
9558 else if (imm5 & 8) {
9559 arTs = "d";
9560 laneNo = (imm5 >> 4) & 1;
9561 laneTy = Ity_I64;
9562 assign(w0, getQRegLane(nn, laneNo, laneTy));
9564 else {
9565 /* invalid; leave laneTy unchanged. */
9567 /* */
9568 if (laneTy != Ity_INVALID) {
9569 vassert(laneNo < 16);
9570 putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
9571 DIP("dup %s, %s.%s[%u]\n",
9572 nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
9573 return True;
9575 /* else fall through */
9578 return False;
9579 # undef INSN
9583 static
9584 Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn)
9586 /* 31 28 23 21 16 11 9 4
9587 01 u 11110 sz 11000 opcode 10 n d
9588 Decode fields: u,sz,opcode
9590 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9591 if (INSN(31,30) != BITS2(0,1)
9592 || INSN(28,24) != BITS5(1,1,1,1,0)
9593 || INSN(21,17) != BITS5(1,1,0,0,0)
9594 || INSN(11,10) != BITS2(1,0)) {
9595 return False;
9597 UInt bitU = INSN(29,29);
9598 UInt sz = INSN(23,22);
9599 UInt opcode = INSN(16,12);
9600 UInt nn = INSN(9,5);
9601 UInt dd = INSN(4,0);
9603 if (bitU == 0 && sz == X11 && opcode == BITS5(1,1,0,1,1)) {
9604 /* -------- 0,11,11011 ADDP d_2d -------- */
9605 IRTemp xy = newTempV128();
9606 IRTemp xx = newTempV128();
9607 assign(xy, getQReg128(nn));
9608 assign(xx, binop(Iop_InterleaveHI64x2, mkexpr(xy), mkexpr(xy)));
9609 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
9610 binop(Iop_Add64x2, mkexpr(xy), mkexpr(xx))));
9611 DIP("addp d%u, %s.2d\n", dd, nameQReg128(nn));
9612 return True;
9615 if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
9616 /* -------- 1,00,01101 FADDP s_2s -------- */
9617 /* -------- 1,01,01101 FADDP d_2d -------- */
9618 Bool isD = sz == X01;
9619 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9620 IROp opADD = mkVecADDF(isD ? 3 : 2);
9621 IRTemp src = newTempV128();
9622 IRTemp argL = newTempV128();
9623 IRTemp argR = newTempV128();
9624 assign(src, getQReg128(nn));
9625 assign(argL, unop(opZHI, mkexpr(src)));
9626 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9627 mkU8(isD ? 8 : 4))));
9628 putQReg128(dd, unop(opZHI,
9629 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
9630 mkexpr(argL), mkexpr(argR))));
9631 DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
9632 return True;
9635 if (bitU == 1
9636 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
9637 /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
9638 /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
9639 /* -------- 1,0x,01111 FMAXP d_2d, s_2s -------- */
9640 /* -------- 1,1x,01111 FMINP d_2d, s_2s -------- */
9641 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
9642 Bool isD = (sz & 1) == 1;
9643 Bool isMIN = (sz & 2) == 2;
9644 Bool isNM = opcode == BITS5(0,1,1,0,0);
9645 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9646 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
9647 IRTemp src = newTempV128();
9648 IRTemp argL = newTempV128();
9649 IRTemp argR = newTempV128();
9650 assign(src, getQReg128(nn));
9651 assign(argL, unop(opZHI, mkexpr(src)));
9652 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9653 mkU8(isD ? 8 : 4))));
9654 putQReg128(dd, unop(opZHI,
9655 binop(opMXX, mkexpr(argL), mkexpr(argR))));
9656 HChar c = isD ? 'd' : 's';
9657 DIP("%s%sp %c%u, v%u.2%c\n",
9658 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", c, dd, nn, c);
9659 return True;
9662 return False;
9663 # undef INSN
9667 static
9668 Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
9670 /* 31 28 22 18 15 10 9 4
9671 01 u 111110 immh immb opcode 1 n d
9672 Decode fields: u,immh,opcode
9674 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9675 if (INSN(31,30) != BITS2(0,1)
9676 || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
9677 return False;
9679 UInt bitU = INSN(29,29);
9680 UInt immh = INSN(22,19);
9681 UInt immb = INSN(18,16);
9682 UInt opcode = INSN(15,11);
9683 UInt nn = INSN(9,5);
9684 UInt dd = INSN(4,0);
9685 UInt immhb = (immh << 3) | immb;
9687 if ((immh & 8) == 8
9688 && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
9689 /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
9690 /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
9691 /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
9692 /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
9693 Bool isU = bitU == 1;
9694 Bool isAcc = opcode == BITS5(0,0,0,1,0);
9695 UInt sh = 128 - immhb;
9696 vassert(sh >= 1 && sh <= 64);
9697 IROp op = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
9698 IRExpr* src = getQReg128(nn);
9699 IRTemp shf = newTempV128();
9700 IRTemp res = newTempV128();
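/* The IR shift ops only accept amounts 0..63.  USHR #64 always yields
   zero, so it is special-cased; SSHR #64 gives the same result as
   SSHR #63 (every bit becomes a copy of the sign bit), which is what
   the 'nudge' below arranges. */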
9701 if (sh == 64 && isU) {
9702 assign(shf, mkV128(0x0000));
9703 } else {
9704 UInt nudge = 0;
9705 if (sh == 64) {
9706 vassert(!isU);
9707 nudge = 1;
9709 assign(shf, binop(op, src, mkU8(sh - nudge)));
9711 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
9712 : mkexpr(shf));
9713 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9714 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
9715 : (isU ? "ushr" : "sshr");
9716 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
9717 return True;
9720 if ((immh & 8) == 8
9721 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
9722 /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
9723 /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
9724 /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
9725 /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
9726 Bool isU = bitU == 1;
9727 Bool isAcc = opcode == BITS5(0,0,1,1,0);
9728 UInt sh = 128 - immhb;
9729 vassert(sh >= 1 && sh <= 64);
9730 IROp op = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
9731 vassert(sh >= 1 && sh <= 64);
9732 IRExpr* src = getQReg128(nn);
9733 IRTemp imm8 = newTemp(Ity_I8);
9734 assign(imm8, mkU8((UChar)(-sh)));
9735 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
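/* The Rsh ops take a per-lane signed shift amount (modelled on ARM's
   SRSHL/URSHL), with negative values meaning a rounding shift right;
   hence -sh is broadcast across every byte lane of the amount vector. */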
9736 IRTemp shf = newTempV128();
9737 IRTemp res = newTempV128();
9738 assign(shf, binop(op, src, amt));
9739 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
9740 : mkexpr(shf));
9741 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9742 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
9743 : (isU ? "urshr" : "srshr");
9744 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
9745 return True;
9748 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
9749 /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
9750 UInt sh = 128 - immhb;
9751 vassert(sh >= 1 && sh <= 64);
9752 if (sh == 64) {
9753 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
9754 } else {
9755 /* sh is in range 1 .. 63 */
9756 ULong nmask = (ULong)(((Long)0x8000000000000000ULL) >> (sh-1));
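/* The arithmetic right shift produces a mask of the top 'sh' bits, the
   bits of Dd that SRI must preserve.  E.g. sh == 8 gives
   nmask == 0xFF00000000000000, so the result keeps Dd's top 8 bits and
   takes the remaining 56 bits from Dn >> 8. */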
9757 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
9758 IRTemp res = newTempV128();
9759 assign(res, binop(Iop_OrV128,
9760 binop(Iop_AndV128, getQReg128(dd), nmaskV),
9761 binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
9762 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9764 DIP("sri d%u, d%u, #%u\n", dd, nn, sh);
9765 return True;
9768 if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
9769 /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
9770 UInt sh = immhb - 64;
9771 vassert(sh >= 0 && sh < 64);
9772 putQReg128(dd,
9773 unop(Iop_ZeroHI64ofV128,
9774 sh == 0 ? getQReg128(nn)
9775 : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
9776 DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
9777 return True;
9780 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
9781 /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
9782 UInt sh = immhb - 64;
9783 vassert(sh >= 0 && sh < 64);
9784 if (sh == 0) {
9785 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(nn)));
9786 } else {
9787 /* sh is in range 1 .. 63 */
9788 ULong nmask = (1ULL << sh) - 1;
9789 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
9790 IRTemp res = newTempV128();
9791 assign(res, binop(Iop_OrV128,
9792 binop(Iop_AndV128, getQReg128(dd), nmaskV),
9793 binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
9794 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9796 DIP("sli d%u, d%u, #%u\n", dd, nn, sh);
9797 return True;
9800 if (opcode == BITS5(0,1,1,1,0)
9801 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
9802 /* -------- 0,01110 SQSHL #imm -------- */
9803 /* -------- 1,01110 UQSHL #imm -------- */
9804 /* -------- 1,01100 SQSHLU #imm -------- */
9805 UInt size = 0;
9806 UInt shift = 0;
9807 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
9808 if (!ok) return False;
9809 vassert(size >= 0 && size <= 3);
9810 /* The shift encoding has opposite sign for the leftwards case.
9811 Adjust shift to compensate. */
9812 UInt lanebits = 8 << size;
9813 shift = lanebits - shift;
9814 vassert(shift >= 0 && shift < lanebits);
9815 const HChar* nm = NULL;
9816 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
9817 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
9818 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
9819 else vassert(0);
9820 IRTemp qDiff1 = IRTemp_INVALID;
9821 IRTemp qDiff2 = IRTemp_INVALID;
9822 IRTemp res = IRTemp_INVALID;
9823 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
9824 /* This relies on the fact that the zeroed out lanes generate zeroed
9825 result lanes and don't saturate, so there's no point in trimming
9826 the resulting res, qDiff1 or qDiff2 values. */
9827 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
9828 putQReg128(dd, mkexpr(res));
9829 updateQCFLAGwithDifference(qDiff1, qDiff2);
9830 const HChar arr = "bhsd"[size];
9831 DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
9832 return True;
9835 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
9836 || (bitU == 1
9837 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
9838 /* -------- 0,10010 SQSHRN #imm -------- */
9839 /* -------- 1,10010 UQSHRN #imm -------- */
9840 /* -------- 0,10011 SQRSHRN #imm -------- */
9841 /* -------- 1,10011 UQRSHRN #imm -------- */
9842 /* -------- 1,10000 SQSHRUN #imm -------- */
9843 /* -------- 1,10001 SQRSHRUN #imm -------- */
9844 UInt size = 0;
9845 UInt shift = 0;
9846 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
9847 if (!ok || size == X11) return False;
9848 vassert(size >= X00 && size <= X10);
9849 vassert(shift >= 1 && shift <= (8 << size));
9850 const HChar* nm = "??";
9851 IROp op = Iop_INVALID;
9852 /* Decide on the name and the operation. */
9853 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
9854 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
9856 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
9857 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
9859 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
9860 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
9862 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
9863 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
9865 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
9866 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
9868 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
9869 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
9871 else vassert(0);
9872 /* Compute the result (Q, shifted value) pair. */
9873 IRTemp src128 = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size+1, getQReg128(nn));
9874 IRTemp pair = newTempV128();
9875 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
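/* These QAND..NARROW ops (as used here) produce a 128-bit pair: the
   narrowed, saturated result sits in the lower 64 bits, and the upper
   64 bits are nonzero iff any lane saturated.  The code below writes
   the lower half to Vd and folds the upper half into QC. */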
9876 /* Update the result reg */
9877 IRTemp res64in128 = newTempV128();
9878 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
9879 putQReg128(dd, mkexpr(res64in128));
9880 /* Update the Q flag. */
9881 IRTemp q64q64 = newTempV128();
9882 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
9883 IRTemp z128 = newTempV128();
9884 assign(z128, mkV128(0x0000));
9885 updateQCFLAGwithDifference(q64q64, z128);
9886 /* */
9887 const HChar arrNarrow = "bhsd"[size];
9888 const HChar arrWide = "bhsd"[size+1];
9889 DIP("%s %c%u, %c%u, #%u\n", nm, arrNarrow, dd, arrWide, nn, shift);
9890 return True;
9893 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,0,0)) {
9894 /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
9895 /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
9896 UInt size = 0;
9897 UInt fbits = 0;
9898 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
9899 /* The following holds because immh is never zero. */
9900 vassert(ok);
9901 /* The following holds because immh >= 0100. */
9902 vassert(size == X10 || size == X11);
9903 Bool isD = size == X11;
9904 Bool isU = bitU == 1;
9905 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
9906 Double scale = two_to_the_minus(fbits);
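/* Fixed-point convert: the integer is converted to FP and then scaled
   by 2^-fbits.  E.g. scvtf s0, s1, #8 turns the integer 0x180 (384)
   into 384 * 2^-8 = 1.5. */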
9907 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
9908 : IRExpr_Const(IRConst_F32( (Float)scale ));
9909 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
9910 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
9911 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
9912 IRType tyF = isD ? Ity_F64 : Ity_F32;
9913 IRType tyI = isD ? Ity_I64 : Ity_I32;
9914 IRTemp src = newTemp(tyI);
9915 IRTemp res = newTemp(tyF);
9916 IRTemp rm = mk_get_IR_rounding_mode();
9917 assign(src, getQRegLane(nn, 0, tyI));
9918 assign(res, triop(opMUL, mkexpr(rm),
9919 binop(opCVT, mkexpr(rm), mkexpr(src)), scaleE));
9920 putQRegLane(dd, 0, mkexpr(res));
9921 if (!isD) {
9922 putQRegLane(dd, 1, mkU32(0));
9924 putQRegLane(dd, 1, mkU64(0));
9925 const HChar ch = isD ? 'd' : 's';
9926 DIP("%s %c%u, %c%u, #%u\n", isU ? "ucvtf" : "scvtf",
9927 ch, dd, ch, nn, fbits);
9928 return True;
9931 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,1,1)) {
9932 /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
9933 /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
9934 UInt size = 0;
9935 UInt fbits = 0;
9936 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
9937 /* The following holds because immh is never zero. */
9938 vassert(ok);
9939 /* The following holds because immh >= 0100. */
9940 vassert(size == X10 || size == X11);
9941 Bool isD = size == X11;
9942 Bool isU = bitU == 1;
9943 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
9944 Double scale = two_to_the_plus(fbits);
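/* The inverse of the SCVTF/UCVTF case above: scale by 2^fbits first and
   then convert to integer with round-towards-zero, so e.g. 1.5 with
   #fbits == 8 becomes 384 (0x180). */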
9945 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
9946 : IRExpr_Const(IRConst_F32( (Float)scale ));
9947 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
9948 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
9949 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
9950 IRType tyF = isD ? Ity_F64 : Ity_F32;
9951 IRType tyI = isD ? Ity_I64 : Ity_I32;
9952 IRTemp src = newTemp(tyF);
9953 IRTemp res = newTemp(tyI);
9954 IRTemp rm = newTemp(Ity_I32);
9955 assign(src, getQRegLane(nn, 0, tyF));
9956 assign(rm, mkU32(Irrm_ZERO));
9957 assign(res, binop(opCVT, mkexpr(rm),
9958 triop(opMUL, mkexpr(rm), mkexpr(src), scaleE)));
9959 putQRegLane(dd, 0, mkexpr(res));
9960 if (!isD) {
9961 putQRegLane(dd, 1, mkU32(0));
9963 putQRegLane(dd, 1, mkU64(0));
9964 const HChar ch = isD ? 'd' : 's';
9965 DIP("%s %c%u, %c%u, #%u\n", isU ? "fcvtzu" : "fcvtzs",
9966 ch, dd, ch, nn, fbits);
9967 return True;
9970 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9971 return False;
9972 # undef INSN
9976 static
9977 Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
9979 /* 31 29 28 23 21 20 15 11 9 4
9980 01 U 11110 size 1 m opcode 00 n d
9981 Decode fields: u,opcode
9983 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9984 if (INSN(31,30) != BITS2(0,1)
9985 || INSN(28,24) != BITS5(1,1,1,1,0)
9986 || INSN(21,21) != 1
9987 || INSN(11,10) != BITS2(0,0)) {
9988 return False;
9990 UInt bitU = INSN(29,29);
9991 UInt size = INSN(23,22);
9992 UInt mm = INSN(20,16);
9993 UInt opcode = INSN(15,12);
9994 UInt nn = INSN(9,5);
9995 UInt dd = INSN(4,0);
9996 vassert(size < 4);
9998 if (bitU == 0
9999 && (opcode == BITS4(1,1,0,1)
10000 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
10001 /* -------- 0,1101 SQDMULL -------- */ // 0 (ks)
10002 /* -------- 0,1001 SQDMLAL -------- */ // 1
10003 /* -------- 0,1011 SQDMLSL -------- */ // 2
10004 /* Widens, and size refers to the narrowed lanes. */
10005 UInt ks = 3;
10006 switch (opcode) {
10007 case BITS4(1,1,0,1): ks = 0; break;
10008 case BITS4(1,0,0,1): ks = 1; break;
10009 case BITS4(1,0,1,1): ks = 2; break;
10010 default: vassert(0);
10012 vassert(ks >= 0 && ks <= 2);
10013 if (size == X00 || size == X11) return False;
10014 vassert(size <= 2);
10015 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
10016 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
10017 newTempsV128_3(&vecN, &vecM, &vecD);
10018 assign(vecN, getQReg128(nn));
10019 assign(vecM, getQReg128(mm));
10020 assign(vecD, getQReg128(dd));
10021 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
10022 False/*!is2*/, size, "mas"[ks],
10023 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
10024 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
10025 putQReg128(dd, unop(opZHI, mkexpr(res)));
10026 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
10027 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
10028 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
10029 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
10031 const HChar* nm = ks == 0 ? "sqdmull"
10032 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
10033 const HChar arrNarrow = "bhsd"[size];
10034 const HChar arrWide = "bhsd"[size+1];
10035 DIP("%s %c%u, %c%u, %c%u\n",
10036 nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
10037 return True;
10040 return False;
10041 # undef INSN
10045 static
10046 Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
10048 /* 31 29 28 23 21 20 15 10 9 4
10049 01 U 11110 size 1 m opcode 1 n d
10050 Decode fields: u,size,opcode
10052 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10053 if (INSN(31,30) != BITS2(0,1)
10054 || INSN(28,24) != BITS5(1,1,1,1,0)
10055 || INSN(21,21) != 1
10056 || INSN(10,10) != 1) {
10057 return False;
10059 UInt bitU = INSN(29,29);
10060 UInt size = INSN(23,22);
10061 UInt mm = INSN(20,16);
10062 UInt opcode = INSN(15,11);
10063 UInt nn = INSN(9,5);
10064 UInt dd = INSN(4,0);
10065 vassert(size < 4);
10067 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
10068 /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
10069 /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
10070 /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
10071 /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
10072 Bool isADD = opcode == BITS5(0,0,0,0,1);
10073 Bool isU = bitU == 1;
10074 IROp qop = Iop_INVALID;
10075 IROp nop = Iop_INVALID;
10076 if (isADD) {
10077 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
10078 nop = mkVecADD(size);
10079 } else {
10080 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
10081 nop = mkVecSUB(size);
10083 IRTemp argL = newTempV128();
10084 IRTemp argR = newTempV128();
10085 IRTemp qres = newTempV128();
10086 IRTemp nres = newTempV128();
10087 assign(argL, getQReg128(nn));
10088 assign(argR, getQReg128(mm));
10089 assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10090 size, binop(qop, mkexpr(argL), mkexpr(argR)))));
10091 assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10092 size, binop(nop, mkexpr(argL), mkexpr(argR)))));
10093 putQReg128(dd, mkexpr(qres));
10094 updateQCFLAGwithDifference(qres, nres);
10095 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
10096 : (isU ? "uqsub" : "sqsub");
10097 const HChar arr = "bhsd"[size];
10098 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10099 return True;
10102 if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
10103 /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
10104 /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
10105 Bool isGT = bitU == 0;
10106 IRExpr* argL = getQReg128(nn);
10107 IRExpr* argR = getQReg128(mm);
10108 IRTemp res = newTempV128();
10109 assign(res,
10110 isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
10111 : binop(Iop_CmpGT64Ux2, argL, argR));
10112 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10113 DIP("%s %s, %s, %s\n",isGT ? "cmgt" : "cmhi",
10114 nameQRegLO(dd, Ity_I64),
10115 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10116 return True;
10119 if (size == X11 && opcode == BITS5(0,0,1,1,1)) {
10120 /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
10121 /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
10122 Bool isGE = bitU == 0;
10123 IRExpr* argL = getQReg128(nn);
10124 IRExpr* argR = getQReg128(mm);
10125 IRTemp res = newTempV128();
10126 assign(res,
10127 isGE ? unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL))
10128 : unop(Iop_NotV128, binop(Iop_CmpGT64Ux2, argR, argL)));
10129 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10130 DIP("%s %s, %s, %s\n", isGE ? "cmge" : "cmhs",
10131 nameQRegLO(dd, Ity_I64),
10132 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10133 return True;
10136 if (size == X11 && (opcode == BITS5(0,1,0,0,0)
10137 || opcode == BITS5(0,1,0,1,0))) {
10138 /* -------- 0,xx,01000 SSHL d_d_d -------- */
10139 /* -------- 0,xx,01010 SRSHL d_d_d -------- */
10140 /* -------- 1,xx,01000 USHL d_d_d -------- */
10141 /* -------- 1,xx,01010 URSHL d_d_d -------- */
10142 Bool isU = bitU == 1;
10143 Bool isR = opcode == BITS5(0,1,0,1,0);
10144 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
10145 : (isU ? mkVecSHU(size) : mkVecSHS(size));
10146 IRTemp res = newTempV128();
10147 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
10148 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10149 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
10150 : (isU ? "ushl" : "sshl");
10151 DIP("%s %s, %s, %s\n", nm,
10152 nameQRegLO(dd, Ity_I64),
10153 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10154 return True;
10157 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
10158 /* -------- 0,xx,01001 SQSHL std4_std4_std4 -------- */
10159 /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
10160 /* -------- 1,xx,01001 UQSHL std4_std4_std4 -------- */
10161 /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
10162 Bool isU = bitU == 1;
10163 Bool isR = opcode == BITS5(0,1,0,1,1);
10164 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
10165 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
10166 /* This is a bit tricky. Since we're only interested in the lowest
10167 lane of the result, we zero out all the rest in the operands, so
10168 as to ensure that other lanes don't pollute the returned Q value.
10169 This works because it means, for the lanes we don't care about, we
10170 are shifting zero by zero, which can never saturate. */
10171 IRTemp res256 = newTemp(Ity_V256);
10172 IRTemp resSH = newTempV128();
10173 IRTemp resQ = newTempV128();
10174 IRTemp zero = newTempV128();
10175 assign(
10176 res256,
10177 binop(op,
10178 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
10179 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
10180 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
10181 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
10182 assign(zero, mkV128(0x0000));
10183 putQReg128(dd, mkexpr(resSH));
10184 updateQCFLAGwithDifference(resQ, zero);
10185 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
10186 : (isU ? "uqshl" : "sqshl");
10187 const HChar arr = "bhsd"[size];
10188 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10189 return True;
10192 if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
10193 /* -------- 0,11,10000 ADD d_d_d -------- */
10194 /* -------- 1,11,10000 SUB d_d_d -------- */
10195 Bool isSUB = bitU == 1;
10196 IRTemp res = newTemp(Ity_I64);
10197 assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64,
10198 getQRegLane(nn, 0, Ity_I64),
10199 getQRegLane(mm, 0, Ity_I64)));
10200 putQRegLane(dd, 0, mkexpr(res));
10201 putQRegLane(dd, 1, mkU64(0));
10202 DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add",
10203 nameQRegLO(dd, Ity_I64),
10204 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10205 return True;
10208 if (size == X11 && opcode == BITS5(1,0,0,0,1)) {
10209 /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
10210 /* -------- 1,11,10001 CMEQ d_d_d -------- */ // ==
10211 Bool isEQ = bitU == 1;
10212 IRExpr* argL = getQReg128(nn);
10213 IRExpr* argR = getQReg128(mm);
10214 IRTemp res = newTempV128();
10215 assign(res,
10216 isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
10217 : unop(Iop_NotV128, binop(Iop_CmpEQ64x2,
10218 binop(Iop_AndV128, argL, argR),
10219 mkV128(0x0000))));
10220 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10221 DIP("%s %s, %s, %s\n", isEQ ? "cmeq" : "cmtst",
10222 nameQRegLO(dd, Ity_I64),
10223 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10224 return True;
10227 if (opcode == BITS5(1,0,1,1,0)) {
10228 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
10229 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
10230 if (size == X00 || size == X11) return False;
10231 Bool isR = bitU == 1;
10232 IRTemp res, sat1q, sat1n, vN, vM;
10233 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
10234 newTempsV128_2(&vN, &vM);
10235 assign(vN, getQReg128(nn));
10236 assign(vM, getQReg128(mm));
10237 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
10238 putQReg128(dd,
10239 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
10240 updateQCFLAGwithDifference(
10241 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
10242 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
10243 const HChar arr = "bhsd"[size];
10244 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
10245 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10246 return True;
10249 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
10250 /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
10251 IRType ity = size == X11 ? Ity_F64 : Ity_F32;
10252 IRTemp res = newTemp(ity);
10253 assign(res, unop(mkABSF(ity),
10254 triop(mkSUBF(ity),
10255 mkexpr(mk_get_IR_rounding_mode()),
10256 getQRegLO(nn,ity), getQRegLO(mm,ity))));
10257 putQReg128(dd, mkV128(0x0000));
10258 putQRegLO(dd, mkexpr(res));
10259 DIP("fabd %s, %s, %s\n",
10260 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10261 return True;
10264 if (bitU == 0 && size <= X01 && opcode == BITS5(1,1,0,1,1)) {
10265 /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
10266 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
10267 IRType ity = size == X01 ? Ity_F64 : Ity_F32;
10268 IRTemp res = newTemp(ity);
10269 assign(res, triop(mkMULF(ity),
10270 mkexpr(mk_get_IR_rounding_mode()),
10271 getQRegLO(nn,ity), getQRegLO(mm,ity)));
10272 putQReg128(dd, mkV128(0x0000));
10273 putQRegLO(dd, mkexpr(res));
10274 DIP("fmulx %s, %s, %s\n",
10275 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10276 return True;
10279 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
10280 /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
10281 /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
10282 Bool isD = size == X01;
10283 IRType ity = isD ? Ity_F64 : Ity_F32;
10284 Bool isGE = bitU == 1;
10285 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
10286 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
10287 IRTemp res = newTempV128();
10288 assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
10289 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
10290 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10291 mkexpr(res))));
10292 DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
10293 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10294 return True;
10297 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
10298 /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
10299 Bool isD = size == X11;
10300 IRType ity = isD ? Ity_F64 : Ity_F32;
10301 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
10302 IRTemp res = newTempV128();
10303 assign(res, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
10304 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10305 mkexpr(res))));
10306 DIP("%s %s, %s, %s\n", "fcmgt",
10307 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10308 return True;
10311 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
10312 /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
10313 /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
10314 Bool isD = (size & 1) == 1;
10315 IRType ity = isD ? Ity_F64 : Ity_F32;
10316 Bool isGT = (size & 2) == 2;
10317 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
10318 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
10319 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
10320 IRTemp res = newTempV128();
10321 assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
10322 unop(opABS, getQReg128(nn)))); // swapd
10323 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10324 mkexpr(res))));
10325 DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
10326 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10327 return True;
10330 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
10331 /* -------- 0,0x,11111: FRECPS d_d_d, s_s_s -------- */
10332 /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
10333 Bool isSQRT = (size & 2) == 2;
10334 Bool isD = (size & 1) == 1;
10335 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
10336 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
10337 IRTemp res = newTempV128();
10338 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
10339 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10340 mkexpr(res))));
10341 HChar c = isD ? 'd' : 's';
10342 DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
10343 c, dd, c, nn, c, mm);
10344 return True;
10347 return False;
10348 # undef INSN
10351 static
10352 Bool dis_AdvSIMD_scalar_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
10354 /* 31 29 28 23 21 20 15 10 9 4
10355 01 U 11110 size 0 m opcode 1 n d
10356 Decode fields: u,size,opcode
10358 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10359 if (INSN(31,30) != BITS2(0,1)
10360 || INSN(28,24) != BITS5(1,1,1,1,0)
10361 || INSN(21,21) != 0
10362 || INSN(10,10) != 1) {
10363 return False;
10365 UInt bitU = INSN(29,29);
10366 UInt size = INSN(23,22);
10367 UInt mm = INSN(20,16);
10368 UInt opcode = INSN(15,11);
10369 UInt nn = INSN(9,5);
10370 UInt dd = INSN(4,0);
10371 vassert(size < 4);
10372 vassert(mm < 32 && nn < 32 && dd < 32);
10374 if (bitU == 1 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
10375 /* -------- xx,10000 SQRDMLAH s and h variants only -------- */
10376 /* -------- xx,10001 SQRDMLSH s and h variants only -------- */
10377 if (size == X00 || size == X11) return False;
10378 Bool isAdd = opcode == BITS5(1,0,0,0,0);
10380 IRTemp res, res_nosat, vD, vN, vM;
10381 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
10382 newTempsV128_3(&vD, &vN, &vM);
10383 assign(vD, getQReg128(dd));
10384 assign(vN, getQReg128(nn));
10385 assign(vM, getQReg128(mm));
10387 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
10388 putQReg128(dd,
10389 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
10390 updateQCFLAGwithDifference(
10391 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res)),
10392 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res_nosat)));
10394 const HChar arr = "bhsd"[size];
10395 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
10396 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10397 return True;
10400 return False;
10401 # undef INSN
10405 static
10406 Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
10408 /* 31 29 28 23 21 16 11 9 4
10409 01 U 11110 size 10000 opcode 10 n d
10410 Decode fields: u,size,opcode
10411 */
10412 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10413 if (INSN(31,30) != BITS2(0,1)
10414 || INSN(28,24) != BITS5(1,1,1,1,0)
10415 || INSN(21,17) != BITS5(1,0,0,0,0)
10416 || INSN(11,10) != BITS2(1,0)) {
10417 return False;
10419 UInt bitU = INSN(29,29);
10420 UInt size = INSN(23,22);
10421 UInt opcode = INSN(16,12);
10422 UInt nn = INSN(9,5);
10423 UInt dd = INSN(4,0);
10424 vassert(size < 4);
10426 if (opcode == BITS5(0,0,0,1,1)) {
10427 /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
10428 /* -------- 1,xx,00011: USQADD std4_std4 -------- */
10429 /* These are a bit tricky (to say the least). See comments on
10430 the vector variants (in dis_AdvSIMD_two_reg_misc) below for
10431 details. */
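/* Roughly: the addend in nn has the opposite signedness to the
accumulator in dd, and the sum saturates in dd's own signedness;
that is what the QADDEXT* ops provide.  The plain ADD result is
computed only so that QC can be set when it differs from the
saturated result. */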
10432 Bool isUSQADD = bitU == 1;
10433 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
10434 : mkVecQADDEXTUSSATSS(size);
10435 IROp nop = mkVecADD(size);
10436 IRTemp argL = newTempV128();
10437 IRTemp argR = newTempV128();
10438 assign(argL, getQReg128(nn));
10439 assign(argR, getQReg128(dd));
10440 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10441 size, binop(qop, mkexpr(argL), mkexpr(argR)));
10442 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10443 size, binop(nop, mkexpr(argL), mkexpr(argR)));
10444 putQReg128(dd, mkexpr(qres));
10445 updateQCFLAGwithDifference(qres, nres);
10446 const HChar arr = "bhsd"[size];
10447 DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
10448 return True;
10451 if (opcode == BITS5(0,0,1,1,1)) {
10452 /* -------- 0,xx,00111 SQABS std4_std4 -------- */
10453 /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
10454 Bool isNEG = bitU == 1;
10455 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
10456 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
10457 getQReg128(nn), size );
10458 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
10459 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
10460 putQReg128(dd, mkexpr(qres));
10461 updateQCFLAGwithDifference(qres, nres);
10462 const HChar arr = "bhsd"[size];
10463 DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
10464 return True;
10467 if (size == X11 && opcode == BITS5(0,1,0,0,0)) {
10468 /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
10469 /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
10470 Bool isGT = bitU == 0;
10471 IRExpr* argL = getQReg128(nn);
10472 IRExpr* argR = mkV128(0x0000);
10473 IRTemp res = newTempV128();
10474 assign(res, isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
10475 : unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL)));
10476 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10477 DIP("cm%s d%u, d%u, #0\n", isGT ? "gt" : "ge", dd, nn);
10478 return True;
10481 if (size == X11 && opcode == BITS5(0,1,0,0,1)) {
10482 /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
10483 /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
10484 Bool isEQ = bitU == 0;
10485 IRExpr* argL = getQReg128(nn);
10486 IRExpr* argR = mkV128(0x0000);
10487 IRTemp res = newTempV128();
10488 assign(res, isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
10489 : unop(Iop_NotV128,
10490 binop(Iop_CmpGT64Sx2, argL, argR)));
10491 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10492 DIP("cm%s d%u, d%u, #0\n", isEQ ? "eq" : "le", dd, nn);
10493 return True;
10496 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,0)) {
10497 /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
10498 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10499 binop(Iop_CmpGT64Sx2, mkV128(0x0000),
10500 getQReg128(nn))));
10501 DIP("cm%s d%u, d%u, #0\n", "lt", dd, nn);
10502 return True;
10505 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
10506 /* -------- 0,11,01011 ABS d_d -------- */
10507 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10508 unop(Iop_Abs64x2, getQReg128(nn))));
10509 DIP("abs d%u, d%u\n", dd, nn);
10510 return True;
10513 if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
10514 /* -------- 1,11,01011 NEG d_d -------- */
10515 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10516 binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
10517 DIP("neg d%u, d%u\n", dd, nn);
10518 return True;
10521 UInt ix = 0; /*INVALID*/
10522 if (size >= X10) {
10523 switch (opcode) {
10524 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
10525 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
10526 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
10527 default: break;
10530 if (ix > 0) {
10531 /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
10532 /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
10533 /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
10534 /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
10535 /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
10536 Bool isD = size == X11;
10537 IRType ity = isD ? Ity_F64 : Ity_F32;
10538 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
10539 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
10540 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
10541 IROp opCmp = Iop_INVALID;
10542 Bool swap = False;
10543 const HChar* nm = "??";
10544 switch (ix) {
10545 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
10546 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
10547 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
10548 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
10549 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
10550 default: vassert(0);
10552 IRExpr* zero = mkV128(0x0000);
10553 IRTemp res = newTempV128();
10554 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
10555 : binop(opCmp, getQReg128(nn), zero));
10556 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10557 mkexpr(res))));
10559 DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
10560 return True;
10563 if (opcode == BITS5(1,0,1,0,0)
10564 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
10565 /* -------- 0,xx,10100: SQXTN -------- */
10566 /* -------- 1,xx,10100: UQXTN -------- */
10567 /* -------- 1,xx,10010: SQXTUN -------- */
10568 if (size == X11) return False;
10569 vassert(size < 3);
10570 IROp opN = Iop_INVALID;
10571 Bool zWiden = True;
10572 const HChar* nm = "??";
10573 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
10574 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
10576 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
10577 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
10579 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
10580 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
10582 else vassert(0);
10583 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10584 size+1, getQReg128(nn));
10585 IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10586 size, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
10587 putQReg128(dd, mkexpr(resN));
10588 /* This widens zero lanes to zero, and compares it against zero, so all
10589 of the non-participating lanes make no contribution to the
10590 Q flag state. */
10591 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
10592 size, mkexpr(resN));
10593 updateQCFLAGwithDifference(src, resW);
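/* QC is set exactly when re-widening the narrowed result fails to
reproduce the original source lane, i.e. when the narrowing saturated. */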
10594 const HChar arrNarrow = "bhsd"[size];
10595 const HChar arrWide = "bhsd"[size+1];
10596 DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn);
10597 return True;
10600 if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
10601 /* -------- 1,01,10110 FCVTXN s_d -------- */
10602 /* Using Irrm_NEAREST here isn't right. The docs say "round to
10603 odd" but I don't know what that really means. */
10604 putQRegLO(dd,
10605 binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
10606 getQRegLO(nn, Ity_F64)));
10607 putQRegLane(dd, 1, mkU32(0));
10608 putQRegLane(dd, 1, mkU64(0));
10609 DIP("fcvtxn s%u, d%u\n", dd, nn);
10610 return True;
10613 ix = 0; /*INVALID*/
10614 switch (opcode) {
10615 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
10616 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
10617 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
10618 default: break;
10620 if (ix > 0) {
10621 /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
10622 /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
10623 /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
10624 /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
10625 /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
10626 /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
10627 /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
10628 /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
10629 /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
10630 /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
10631 Bool isD = (size & 1) == 1;
10632 IRType tyF = isD ? Ity_F64 : Ity_F32;
10633 IRType tyI = isD ? Ity_I64 : Ity_I32;
10634 IRRoundingMode irrm = 8; /*impossible*/
10635 HChar ch = '?';
10636 switch (ix) {
10637 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
10638 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
10639 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
10640 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
10641 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
10642 default: vassert(0);
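/* The 'a' (FCVTA*) case really wants round-to-nearest, ties away from
zero; Irrm_NEAREST is ties-to-even, so exact halfway values may convert
differently -- hence the "kludge?" above. */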
10644 IROp cvt = Iop_INVALID;
10645 if (bitU == 1) {
10646 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
10647 } else {
10648 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
10650 IRTemp src = newTemp(tyF);
10651 IRTemp res = newTemp(tyI);
10652 assign(src, getQRegLane(nn, 0, tyF));
10653 assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
10654 putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
10655 if (!isD) {
10656 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
10658 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
10659 HChar sOrD = isD ? 'd' : 's';
10660 DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
10661 sOrD, dd, sOrD, nn);
10662 return True;
10665 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
10666 /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
10667 /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
10668 Bool isU = bitU == 1;
10669 Bool isD = (size & 1) == 1;
10670 IRType tyI = isD ? Ity_I64 : Ity_I32;
10671 IROp iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
10672 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
10673 IRTemp rm = mk_get_IR_rounding_mode();
10674 putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
10675 if (!isD) {
10676 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
10678 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
10679 HChar c = isD ? 'd' : 's';
10680 DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
10681 return True;
10684 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
10685 /* -------- 0,1x,11101: FRECPE d_d, s_s -------- */
10686 /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
10687 Bool isSQRT = bitU == 1;
10688 Bool isD = (size & 1) == 1;
10689 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
10690 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
10691 IRTemp resV = newTempV128();
10692 assign(resV, unop(op, getQReg128(nn)));
10693 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10694 mkexpr(resV))));
10695 HChar c = isD ? 'd' : 's';
10696 DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
10697 return True;
10700 if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
10701 /* -------- 0,1x,11111: FRECPX d_d, s_s -------- */
10702 Bool isD = (size & 1) == 1;
10703 IRType ty = isD ? Ity_F64 : Ity_F32;
10704 IROp op = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
10705 IRTemp res = newTemp(ty);
10706 IRTemp rm = mk_get_IR_rounding_mode();
10707 assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
10708 putQReg128(dd, mkV128(0x0000));
10709 putQRegLane(dd, 0, mkexpr(res));
10710 HChar c = isD ? 'd' : 's';
10711 DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
10712 return True;
10715 return False;
10716 # undef INSN
10720 static
10721 Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
10723 /* 31 28 23 21 20 19 15 11 9 4
10724 01 U 11111 size L M m opcode H 0 n d
10725 Decode fields are: u,size,opcode
10726 M is really part of the mm register number. Individual
10727 cases need to inspect L and H though.
10728 */
10729 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10730 if (INSN(31,30) != BITS2(0,1)
10731 || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) != 0) {
10732 return False;
10734 UInt bitU = INSN(29,29);
10735 UInt size = INSN(23,22);
10736 UInt bitL = INSN(21,21);
10737 UInt bitM = INSN(20,20);
10738 UInt mmLO4 = INSN(19,16);
10739 UInt opcode = INSN(15,12);
10740 UInt bitH = INSN(11,11);
10741 UInt nn = INSN(9,5);
10742 UInt dd = INSN(4,0);
10743 vassert(size < 4);
10744 vassert(bitH < 2 && bitM < 2 && bitL < 2);
10746 if (bitU == 0 && size >= X10
10747 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
10748 /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
10749 /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
10750 Bool isD = (size & 1) == 1;
10751 Bool isSUB = opcode == BITS4(0,1,0,1);
10752 UInt index;
10753 if (!isD) index = (bitH << 1) | bitL;
10754 else if (isD && bitL == 0) index = bitH;
10755 else return False; // sz:L == x11 => unallocated encoding
10756 vassert(index < (isD ? 2 : 4));
10757 IRType ity = isD ? Ity_F64 : Ity_F32;
10758 IRTemp elem = newTemp(ity);
10759 UInt mm = (bitM << 4) | mmLO4;
10760 assign(elem, getQRegLane(mm, index, ity));
10761 IRTemp dupd = math_DUP_TO_V128(elem, ity);
10762 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
10763 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
10764 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
10765 IRTemp rm = mk_get_IR_rounding_mode();
10766 IRTemp t1 = newTempV128();
10767 IRTemp t2 = newTempV128();
10768 // FIXME: double rounding; use FMA primops instead
10769 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
10770 assign(t2, triop(isSUB ? opSUB : opADD,
10771 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
10772 putQReg128(dd,
10773 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
10774 mkexpr(t2))));
10775 const HChar c = isD ? 'd' : 's';
10776 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
10777 c, dd, c, nn, nameQReg128(mm), c, index);
10778 return True;
10781 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
10782 /* -------- 0,1x,1001 FMUL d_d_d[], s_s_s[] -------- */
10783 /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
10784 Bool isD = (size & 1) == 1;
10785 Bool isMULX = bitU == 1;
10786 UInt index;
10787 if (!isD) index = (bitH << 1) | bitL;
10788 else if (isD && bitL == 0) index = bitH;
10789 else return False; // sz:L == x11 => unallocated encoding
10790 vassert(index < (isD ? 2 : 4));
10791 IRType ity = isD ? Ity_F64 : Ity_F32;
10792 IRTemp elem = newTemp(ity);
10793 UInt mm = (bitM << 4) | mmLO4;
10794 assign(elem, getQRegLane(mm, index, ity));
10795 IRTemp dupd = math_DUP_TO_V128(elem, ity);
10796 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
10797 IRTemp rm = mk_get_IR_rounding_mode();
10798 IRTemp t1 = newTempV128();
10799 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
10800 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
10801 putQReg128(dd,
10802 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
10803 mkexpr(t1))));
10804 const HChar c = isD ? 'd' : 's';
10805 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
10806 c, dd, c, nn, nameQReg128(mm), c, index);
10807 return True;
10810 if (bitU == 0
10811 && (opcode == BITS4(1,0,1,1)
10812 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
10813 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
10814 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
10815 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
10816 /* Widens, and size refers to the narrowed lanes. */
10817 UInt ks = 3;
10818 switch (opcode) {
10819 case BITS4(1,0,1,1): ks = 0; break;
10820 case BITS4(0,0,1,1): ks = 1; break;
10821 case BITS4(0,1,1,1): ks = 2; break;
10822 default: vassert(0);
10824 vassert(ks >= 0 && ks <= 2);
10825 UInt mm = 32; // invalid
10826 UInt ix = 16; // invalid
10827 switch (size) {
10828 case X00:
10829 return False; // h_b_b[] case is not allowed
10830 case X01:
10831 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
10832 case X10:
10833 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
10834 case X11:
10835 return False; // q_d_d[] case is not allowed
10836 default:
10837 vassert(0);
10839 vassert(mm < 32 && ix < 16);
10840 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
10841 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
10842 newTempsV128_2(&vecN, &vecD);
10843 assign(vecN, getQReg128(nn));
10844 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
10845 assign(vecD, getQReg128(dd));
10846 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
10847 False/*!is2*/, size, "mas"[ks],
10848 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
10849 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
10850 putQReg128(dd, unop(opZHI, mkexpr(res)));
10851 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
10852 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
10853 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
10854 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
10856 const HChar* nm = ks == 0 ? "sqdmull"
10857 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
10858 const HChar arrNarrow = "bhsd"[size];
10859 const HChar arrWide = "bhsd"[size+1];
10860 DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
10861 nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
10862 return True;
10865 if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
10866 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
10867 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
10868 UInt mm = 32; // invalid
10869 UInt ix = 16; // invalid
10870 switch (size) {
10871 case X00:
10872 return False; // b case is not allowed
10873 case X01:
10874 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
10875 case X10:
10876 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
10877 case X11:
10878 return False; // q case is not allowed
10879 default:
10880 vassert(0);
10882 vassert(mm < 32 && ix < 16);
10883 Bool isR = opcode == BITS4(1,1,0,1);
10884 IRTemp res, sat1q, sat1n, vN, vM;
10885 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
10886 vN = newTempV128();
10887 assign(vN, getQReg128(nn));
10888 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
10889 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
10890 IROp opZHI = mkVecZEROHIxxOFV128(size);
10891 putQReg128(dd, unop(opZHI, mkexpr(res)));
10892 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
10893 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
10894 HChar ch = size == X01 ? 'h' : 's';
10895 DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
10896 return True;
10899 if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
10900 /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
10901 /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
10902 UInt mm = 32; // invalid
10903 UInt ix = 16; // invalid
10904 switch (size) {
10905 case X00:
10906 return False; // b case is not allowed
10907 case X01:
10908 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
10909 case X10:
10910 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
10911 case X11:
10912 return False; // d case is not allowed
10913 default:
10914 vassert(0);
10916 vassert(size < 4);
10917 vassert(mm < 32 && ix < 16);
10918 Bool isAdd = opcode == BITS4(1,1,0,1);
10920 IRTemp res, res_nosat, vD, vN, vM;
10921 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
10922 newTempsV128_2(&vD, &vN);
10923 assign(vD, getQReg128(dd));
10924 assign(vN, getQReg128(nn));
10925 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
10927 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
10928 IROp opZHI = mkVecZEROHIxxOFV128(size);
10929 putQReg128(dd, unop(opZHI, mkexpr(res)));
10930 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
10932 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
10933 HChar ch = size == X01 ? 'h' : 's';
10934 DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
10935 return True;
10938 return False;
10939 # undef INSN
10943 static
10944 Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
10946 /* 31 28 22 18 15 10 9 4
10947 0 q u 011110 immh immb opcode 1 n d
10948 Decode fields: u,opcode
10949 */
10950 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10951 if (INSN(31,31) != 0
10952 || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
10953 return False;
10955 UInt bitQ = INSN(30,30);
10956 UInt bitU = INSN(29,29);
10957 UInt immh = INSN(22,19);
10958 UInt immb = INSN(18,16);
10959 UInt opcode = INSN(15,11);
10960 UInt nn = INSN(9,5);
10961 UInt dd = INSN(4,0);
10963 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
10964 /* -------- 0,00000 SSHR std7_std7_#imm -------- */
10965 /* -------- 1,00000 USHR std7_std7_#imm -------- */
10966 /* -------- 0,00010 SSRA std7_std7_#imm -------- */
10967 /* -------- 1,00010 USRA std7_std7_#imm -------- */
10968 /* laneTy, shift = case immh:immb of
10969 0001:xxx -> B, SHR:8-xxx
10970 001x:xxx -> H, SHR:16-xxxx
10971 01xx:xxx -> S, SHR:32-xxxxx
10972 1xxx:xxx -> D, SHR:64-xxxxxx
10973 other -> invalid
10974 */
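/* Worked example: immh:immb = 0101:100 selects S lanes with
shift = 32 - 12 = 20. */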
10975 UInt size = 0;
10976 UInt shift = 0;
10977 Bool isQ = bitQ == 1;
10978 Bool isU = bitU == 1;
10979 Bool isAcc = opcode == BITS5(0,0,0,1,0);
10980 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10981 if (!ok || (bitQ == 0 && size == X11)) return False;
10982 vassert(size >= 0 && size <= 3);
10983 UInt lanebits = 8 << size;
10984 vassert(shift >= 1 && shift <= lanebits);
10985 IROp op = isU ? mkVecSHRN(size) : mkVecSARN(size);
10986 IRExpr* src = getQReg128(nn);
10987 IRTemp shf = newTempV128();
10988 IRTemp res = newTempV128();
10989 if (shift == lanebits && isU) {
10990 assign(shf, mkV128(0x0000));
10991 } else {
10992 UInt nudge = 0;
10993 if (shift == lanebits) {
10994 vassert(!isU);
10995 nudge = 1;
10997 assign(shf, binop(op, src, mkU8(shift - nudge)));
10999 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
11000 : mkexpr(shf));
11001 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11002 HChar laneCh = "bhsd"[size];
11003 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11004 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
11005 : (isU ? "ushr" : "sshr");
11006 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
11007 nameQReg128(dd), nLanes, laneCh,
11008 nameQReg128(nn), nLanes, laneCh, shift);
11009 return True;
11012 if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
11013 /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
11014 /* -------- 1,00100 URSHR std7_std7_#imm -------- */
11015 /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
11016 /* -------- 1,00110 URSRA std7_std7_#imm -------- */
11017 /* laneTy, shift = case immh:immb of
11018 0001:xxx -> B, SHR:8-xxx
11019 001x:xxx -> H, SHR:16-xxxx
11020 01xx:xxx -> S, SHR:32-xxxxx
11021 1xxx:xxx -> D, SHR:64-xxxxxx
11022 other -> invalid
11023 */
11024 UInt size = 0;
11025 UInt shift = 0;
11026 Bool isQ = bitQ == 1;
11027 Bool isU = bitU == 1;
11028 Bool isAcc = opcode == BITS5(0,0,1,1,0);
11029 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11030 if (!ok || (bitQ == 0 && size == X11)) return False;
11031 vassert(size >= 0 && size <= 3);
11032 UInt lanebits = 8 << size;
11033 vassert(shift >= 1 && shift <= lanebits);
11034 IROp op = isU ? mkVecRSHU(size) : mkVecRSHS(size);
11035 IRExpr* src = getQReg128(nn);
11036 IRTemp imm8 = newTemp(Ity_I8);
11037 assign(imm8, mkU8((UChar)(-shift)));
11038 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
11039 IRTemp shf = newTempV128();
11040 IRTemp res = newTempV128();
11041 assign(shf, binop(op, src, amt));
11042 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
11043 : mkexpr(shf));
11044 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11045 HChar laneCh = "bhsd"[size];
11046 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11047 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
11048 : (isU ? "urshr" : "srshr");
11049 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
11050 nameQReg128(dd), nLanes, laneCh,
11051 nameQReg128(nn), nLanes, laneCh, shift);
11052 return True;
11055 if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
11056 /* -------- 1,01000 SRI std7_std7_#imm -------- */
11057 /* laneTy, shift = case immh:immb of
11058 0001:xxx -> B, SHR:8-xxx
11059 001x:xxx -> H, SHR:16-xxxx
11060 01xx:xxx -> S, SHR:32-xxxxx
11061 1xxx:xxx -> D, SHR:64-xxxxxx
11062 other -> invalid
11063 */
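/* In outline: each result lane is (Vn.lane >>u shift), with the top
'shift' bits of the existing Vd.lane left unchanged -- that is what the
nmask computation below implements.  When shift equals the lane width
the destination is left entirely unmodified. */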
11064 UInt size = 0;
11065 UInt shift = 0;
11066 Bool isQ = bitQ == 1;
11067 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11068 if (!ok || (bitQ == 0 && size == X11)) return False;
11069 vassert(size >= 0 && size <= 3);
11070 UInt lanebits = 8 << size;
11071 vassert(shift >= 1 && shift <= lanebits);
11072 IRExpr* src = getQReg128(nn);
11073 IRTemp res = newTempV128();
11074 if (shift == lanebits) {
11075 assign(res, getQReg128(dd));
11076 } else {
11077 assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
11078 IRExpr* nmask = binop(mkVecSHLN(size),
11079 mkV128(0xFFFF), mkU8(lanebits - shift));
11080 IRTemp tmp = newTempV128();
11081 assign(tmp, binop(Iop_OrV128,
11082 mkexpr(res),
11083 binop(Iop_AndV128, getQReg128(dd), nmask)));
11084 res = tmp;
11086 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11087 HChar laneCh = "bhsd"[size];
11088 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11089 DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
11090 nameQReg128(dd), nLanes, laneCh,
11091 nameQReg128(nn), nLanes, laneCh, shift);
11092 return True;
11095 if (opcode == BITS5(0,1,0,1,0)) {
11096 /* -------- 0,01010 SHL std7_std7_#imm -------- */
11097 /* -------- 1,01010 SLI std7_std7_#imm -------- */
11098 /* laneTy, shift = case immh:immb of
11099 0001:xxx -> B, xxx
11100 001x:xxx -> H, xxxx
11101 01xx:xxx -> S, xxxxx
11102 1xxx:xxx -> D, xxxxxx
11103 other -> invalid
11104 */
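/* For SLI the low 'shift' bits of each Vd lane are preserved and the
remaining bits receive (Vn.lane << shift); plain SHL simply discards
the old Vd value. */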
11105 UInt size = 0;
11106 UInt shift = 0;
11107 Bool isSLI = bitU == 1;
11108 Bool isQ = bitQ == 1;
11109 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11110 if (!ok || (bitQ == 0 && size == X11)) return False;
11111 vassert(size >= 0 && size <= 3);
11112 /* The shift encoding has opposite sign for the leftwards case.
11113 Adjust shift to compensate. */
11114 UInt lanebits = 8 << size;
11115 shift = lanebits - shift;
11116 vassert(shift >= 0 && shift < lanebits);
11117 IROp op = mkVecSHLN(size);
11118 IRExpr* src = getQReg128(nn);
11119 IRTemp res = newTempV128();
11120 if (shift == 0) {
11121 assign(res, src);
11122 } else {
11123 assign(res, binop(op, src, mkU8(shift)));
11124 if (isSLI) {
11125 IRExpr* nmask = binop(mkVecSHRN(size),
11126 mkV128(0xFFFF), mkU8(lanebits - shift));
11127 IRTemp tmp = newTempV128();
11128 assign(tmp, binop(Iop_OrV128,
11129 mkexpr(res),
11130 binop(Iop_AndV128, getQReg128(dd), nmask)));
11131 res = tmp;
11134 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11135 HChar laneCh = "bhsd"[size];
11136 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11137 const HChar* nm = isSLI ? "sli" : "shl";
11138 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
11139 nameQReg128(dd), nLanes, laneCh,
11140 nameQReg128(nn), nLanes, laneCh, shift);
11141 return True;
11144 if (opcode == BITS5(0,1,1,1,0)
11145 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
11146 /* -------- 0,01110 SQSHL std7_std7_#imm -------- */
11147 /* -------- 1,01110 UQSHL std7_std7_#imm -------- */
11148 /* -------- 1,01100 SQSHLU std7_std7_#imm -------- */
11149 UInt size = 0;
11150 UInt shift = 0;
11151 Bool isQ = bitQ == 1;
11152 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11153 if (!ok || (bitQ == 0 && size == X11)) return False;
11154 vassert(size >= 0 && size <= 3);
11155 /* The shift encoding has opposite sign for the leftwards case.
11156 Adjust shift to compensate. */
11157 UInt lanebits = 8 << size;
11158 shift = lanebits - shift;
11159 vassert(shift >= 0 && shift < lanebits);
11160 const HChar* nm = NULL;
11161 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
11162 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
11163 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
11164 else vassert(0);
11165 IRTemp qDiff1 = IRTemp_INVALID;
11166 IRTemp qDiff2 = IRTemp_INVALID;
11167 IRTemp res = IRTemp_INVALID;
11168 IRTemp src = newTempV128();
11169 assign(src, getQReg128(nn));
11170 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
11171 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11172 updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
11173 isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
11174 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11175 DIP("%s %s.%s, %s.%s, #%u\n", nm,
11176 nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
11177 return True;
11180 if (bitU == 0
11181 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
11182 /* -------- 0,10000 SHRN{,2} #imm -------- */
11183 /* -------- 0,10001 RSHRN{,2} #imm -------- */
11184 /* Narrows, and size is the narrow size. */
11185 UInt size = 0;
11186 UInt shift = 0;
11187 Bool is2 = bitQ == 1;
11188 Bool isR = opcode == BITS5(1,0,0,0,1);
11189 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11190 if (!ok || size == X11) return False;
11191 vassert(shift >= 1);
11192 IRTemp t1 = newTempV128();
11193 IRTemp t2 = newTempV128();
11194 IRTemp t3 = newTempV128();
11195 assign(t1, getQReg128(nn));
11196 assign(t2, isR ? binop(mkVecADD(size+1),
11197 mkexpr(t1),
11198 mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
11199 : mkexpr(t1));
11200 assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
11201 IRTemp t4 = math_NARROW_LANES(t3, t3, size);
11202 putLO64andZUorPutHI64(is2, dd, t4);
11203 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11204 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11205 DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
11206 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
11207 return True;
11210 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
11211 || (bitU == 1
11212 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
11213 /* -------- 0,10010 SQSHRN{,2} #imm -------- */
11214 /* -------- 1,10010 UQSHRN{,2} #imm -------- */
11215 /* -------- 0,10011 SQRSHRN{,2} #imm -------- */
11216 /* -------- 1,10011 UQRSHRN{,2} #imm -------- */
11217 /* -------- 1,10000 SQSHRUN{,2} #imm -------- */
11218 /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
11219 UInt size = 0;
11220 UInt shift = 0;
11221 Bool is2 = bitQ == 1;
11222 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11223 if (!ok || size == X11) return False;
11224 vassert(shift >= 1 && shift <= (8 << size));
11225 const HChar* nm = "??";
11226 IROp op = Iop_INVALID;
11227 /* Decide on the name and the operation. */
11228 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
11229 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
11231 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
11232 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
11234 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
11235 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
11237 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
11238 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
11240 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
11241 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
11243 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
11244 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
11246 else vassert(0);
11247 /* Compute the result (Q, shifted value) pair. */
11248 IRTemp src128 = newTempV128();
11249 assign(src128, getQReg128(nn));
11250 IRTemp pair = newTempV128();
11251 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
11252 /* Update the result reg */
11253 IRTemp res64in128 = newTempV128();
11254 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
11255 putLO64andZUorPutHI64(is2, dd, res64in128);
11256 /* Update the Q flag. */
11257 IRTemp q64q64 = newTempV128();
11258 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
11259 IRTemp z128 = newTempV128();
11260 assign(z128, mkV128(0x0000));
11261 updateQCFLAGwithDifference(q64q64, z128);
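/* The QANDq*NARROW ops, as used here, deliver the narrowed lanes in
the lower 64 bits of their result and saturation indications in the
upper 64 bits; duplicating that upper half and comparing it against an
all-zeroes vector therefore sets QC exactly when some lane saturated. */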
11262 /* */
11263 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11264 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11265 DIP("%s %s.%s, %s.%s, #%u\n", nm,
11266 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
11267 return True;
11270 if (opcode == BITS5(1,0,1,0,0)) {
11271 /* -------- 0,10100 SSHLL{,2} #imm -------- */
11272 /* -------- 1,10100 USHLL{,2} #imm -------- */
11273 /* 31 28 22 18 15 9 4
11274 0q0 011110 immh immb 101001 n d SSHLL Vd.Ta, Vn.Tb, #sh
11275 0q1 011110 immh immb 101001 n d USHLL Vd.Ta, Vn.Tb, #sh
11276 where Ta,Tb,sh
11277 = case immh of 1xxx -> invalid
11278 01xx -> 2d, 2s(q0)/4s(q1), immh:immb - 32 (0..31)
11279 001x -> 4s, 4h(q0)/8h(q1), immh:immb - 16 (0..15)
11280 0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8 (0..7)
11281 0000 -> AdvSIMD modified immediate (???)
11282 */
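/* Each narrow source lane is first moved into the top half of its
double-width destination lane by interleaving with a zero vector; an
arithmetic (SSHLL) or logical (USHLL) right shift by
(narrow-lane-bits - sh) then yields the sign- or zero-extended value
multiplied by 2^sh. */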
11283 Bool isQ = bitQ == 1;
11284 Bool isU = bitU == 1;
11285 UInt immhb = (immh << 3) | immb;
11286 IRTemp src = newTempV128();
11287 IRTemp zero = newTempV128();
11288 IRExpr* res = NULL;
11289 UInt sh = 0;
11290 const HChar* ta = "??";
11291 const HChar* tb = "??";
11292 assign(src, getQReg128(nn));
11293 assign(zero, mkV128(0x0000));
11294 if (immh & 8) {
11295 /* invalid; don't assign to res */
11297 else if (immh & 4) {
11298 sh = immhb - 32;
11299 vassert(sh < 32); /* so 32-sh is 1..32 */
11300 ta = "2d";
11301 tb = isQ ? "4s" : "2s";
11302 IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
11303 : mk_InterleaveLO32x4(src, zero);
11304 res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
11306 else if (immh & 2) {
11307 sh = immhb - 16;
11308 vassert(sh < 16); /* so 16-sh is 1..16 */
11309 ta = "4s";
11310 tb = isQ ? "8h" : "4h";
11311 IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
11312 : mk_InterleaveLO16x8(src, zero);
11313 res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
11315 else if (immh & 1) {
11316 sh = immhb - 8;
11317 vassert(sh < 8); /* so 8-sh is 1..8 */
11318 ta = "8h";
11319 tb = isQ ? "16b" : "8b";
11320 IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
11321 : mk_InterleaveLO8x16(src, zero);
11322 res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
11323 } else {
11324 vassert(immh == 0);
11325 /* invalid; don't assign to res */
11327 /* */
11328 if (res) {
11329 putQReg128(dd, res);
11330 DIP("%cshll%s %s.%s, %s.%s, #%u\n",
11331 isU ? 'u' : 's', isQ ? "2" : "",
11332 nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
11333 return True;
11335 return False;
11338 if (opcode == BITS5(1,1,1,0,0)) {
11339 /* -------- 0,11100 SCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
11340 /* -------- 1,11100 UCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
11341 /* If immh is of the form 00xx, the insn is invalid. */
11342 if (immh < BITS4(0,1,0,0)) return False;
11343 UInt size = 0;
11344 UInt fbits = 0;
11345 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
11346 /* The following holds because immh is never zero. */
11347 vassert(ok);
11348 /* The following holds because immh >= 0100. */
11349 vassert(size == X10 || size == X11);
11350 Bool isD = size == X11;
11351 Bool isU = bitU == 1;
11352 Bool isQ = bitQ == 1;
11353 if (isD && !isQ) return False; /* reject .1d case */
11354 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
11355 Double scale = two_to_the_minus(fbits);
11356 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
11357 : IRExpr_Const(IRConst_F32( (Float)scale ));
11358 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
11359 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
11360 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
11361 IRType tyF = isD ? Ity_F64 : Ity_F32;
11362 IRType tyI = isD ? Ity_I64 : Ity_I32;
11363 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
11364 vassert(nLanes == 2 || nLanes == 4);
11365 for (UInt i = 0; i < nLanes; i++) {
11366 IRTemp src = newTemp(tyI);
11367 IRTemp res = newTemp(tyF);
11368 IRTemp rm = mk_get_IR_rounding_mode();
11369 assign(src, getQRegLane(nn, i, tyI));
11370 assign(res, triop(opMUL, mkexpr(rm),
11371 binop(opCVT, mkexpr(rm), mkexpr(src)),
11372 scaleE));
11373 putQRegLane(dd, i, mkexpr(res));
11375 if (!isQ) {
11376 putQRegLane(dd, 1, mkU64(0));
11378 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11379 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
11380 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
11381 return True;
11384 if (opcode == BITS5(1,1,1,1,1)) {
11385 /* -------- 0,11111 FCVTZS {2d_2d,4s_4s,2s_2s}_imm -------- */
11386 /* -------- 1,11111 FCVTZU {2d_2d,4s_4s,2s_2s}_imm -------- */
11387 /* If immh is of the form 00xx, the insn is invalid. */
11388 if (immh < BITS4(0,1,0,0)) return False;
11389 UInt size = 0;
11390 UInt fbits = 0;
11391 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
11392 /* The following holds because immh is never zero. */
11393 vassert(ok);
11394 /* The following holds because immh >= 0100. */
11395 vassert(size == X10 || size == X11);
11396 Bool isD = size == X11;
11397 Bool isU = bitU == 1;
11398 Bool isQ = bitQ == 1;
11399 if (isD && !isQ) return False; /* reject .1d case */
11400 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
11401 Double scale = two_to_the_plus(fbits);
11402 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
11403 : IRExpr_Const(IRConst_F32( (Float)scale ));
11404 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
11405 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
11406 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
11407 IRType tyF = isD ? Ity_F64 : Ity_F32;
11408 IRType tyI = isD ? Ity_I64 : Ity_I32;
11409 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
11410 vassert(nLanes == 2 || nLanes == 4);
11411 for (UInt i = 0; i < nLanes; i++) {
11412 IRTemp src = newTemp(tyF);
11413 IRTemp res = newTemp(tyI);
11414 IRTemp rm = newTemp(Ity_I32);
11415 assign(src, getQRegLane(nn, i, tyF));
11416 assign(rm, mkU32(Irrm_ZERO));
11417 assign(res, binop(opCVT, mkexpr(rm),
11418 triop(opMUL, mkexpr(rm),
11419 mkexpr(src), scaleE)));
11420 putQRegLane(dd, i, mkexpr(res));
11422 if (!isQ) {
11423 putQRegLane(dd, 1, mkU64(0));
11425 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11426 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
11427 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
11428 return True;
11431 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11432 return False;
11433 # undef INSN
11437 static
11438 Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
11440 /* 31 30 29 28 23 21 20 15 11 9 4
11441 0 Q U 01110 size 1 m opcode 00 n d
11442 Decode fields: u,opcode
11443 */
11444 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11445 if (INSN(31,31) != 0
11446 || INSN(28,24) != BITS5(0,1,1,1,0)
11447 || INSN(21,21) != 1
11448 || INSN(11,10) != BITS2(0,0)) {
11449 return False;
11451 UInt bitQ = INSN(30,30);
11452 UInt bitU = INSN(29,29);
11453 UInt size = INSN(23,22);
11454 UInt mm = INSN(20,16);
11455 UInt opcode = INSN(15,12);
11456 UInt nn = INSN(9,5);
11457 UInt dd = INSN(4,0);
11458 vassert(size < 4);
11459 Bool is2 = bitQ == 1;
11461 if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
11462 /* -------- 0,0000 SADDL{2} -------- */
11463 /* -------- 1,0000 UADDL{2} -------- */
11464 /* -------- 0,0010 SSUBL{2} -------- */
11465 /* -------- 1,0010 USUBL{2} -------- */
11466 /* Widens, and size refers to the narrow lanes. */
11467 if (size == X11) return False;
11468 vassert(size <= 2);
11469 Bool isU = bitU == 1;
11470 Bool isADD = opcode == BITS4(0,0,0,0);
11471 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
11472 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11473 IRTemp res = newTempV128();
11474 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11475 mkexpr(argL), mkexpr(argR)));
11476 putQReg128(dd, mkexpr(res));
11477 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11478 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11479 const HChar* nm = isADD ? (isU ? "uaddl" : "saddl")
11480 : (isU ? "usubl" : "ssubl");
11481 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11482 nameQReg128(dd), arrWide,
11483 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11484 return True;
11487 if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
11488 /* -------- 0,0001 SADDW{2} -------- */
11489 /* -------- 1,0001 UADDW{2} -------- */
11490 /* -------- 0,0011 SSUBW{2} -------- */
11491 /* -------- 1,0011 USUBW{2} -------- */
11492 /* Widens, and size refers to the narrow lanes. */
11493 if (size == X11) return False;
11494 vassert(size <= 2);
11495 Bool isU = bitU == 1;
11496 Bool isADD = opcode == BITS4(0,0,0,1);
11497 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11498 IRTemp res = newTempV128();
11499 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11500 getQReg128(nn), mkexpr(argR)));
11501 putQReg128(dd, mkexpr(res));
11502 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11503 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11504 const HChar* nm = isADD ? (isU ? "uaddw" : "saddw")
11505 : (isU ? "usubw" : "ssubw");
11506 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11507 nameQReg128(dd), arrWide,
11508 nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
11509 return True;
11512 if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
11513 /* -------- 0,0100 ADDHN{2} -------- */
11514 /* -------- 1,0100 RADDHN{2} -------- */
11515 /* -------- 0,0110 SUBHN{2} -------- */
11516 /* -------- 1,0110 RSUBHN{2} -------- */
11517 /* Narrows, and size refers to the narrowed lanes. */
11518 if (size == X11) return False;
11519 vassert(size <= 2);
11520 const UInt shift[3] = { 8, 16, 32 };
11521 Bool isADD = opcode == BITS4(0,1,0,0);
11522 Bool isR = bitU == 1;
11523 /* Combined elements in wide lanes */
11524 IRTemp wide = newTempV128();
11525 IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11526 getQReg128(nn), getQReg128(mm));
11527 if (isR) {
11528 wideE = binop(mkVecADD(size+1),
11529 wideE,
11530 mkexpr(math_VEC_DUP_IMM(size+1,
11531 1ULL << (shift[size]-1))));
11533 assign(wide, wideE);
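/* For the rounding (R*) forms, 2^(shift-1) -- half the weight of the
lowest bit that survives the shift below -- has just been added, so
taking the high half rounds to nearest (ties upwards) instead of
truncating. */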
11534 /* Top halves of elements, still in wide lanes */
11535 IRTemp shrd = newTempV128();
11536 assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
11537 /* Elements now compacted into lower 64 bits */
11538 IRTemp new64 = newTempV128();
11539 assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
11540 putLO64andZUorPutHI64(is2, dd, new64);
11541 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11542 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11543 const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
11544 : (isR ? "rsubhn" : "subhn");
11545 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11546 nameQReg128(dd), arrNarrow,
11547 nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
11548 return True;
11551 if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
11552 /* -------- 0,0101 SABAL{2} -------- */
11553 /* -------- 1,0101 UABAL{2} -------- */
11554 /* -------- 0,0111 SABDL{2} -------- */
11555 /* -------- 1,0111 UABDL{2} -------- */
11556 /* Widens, and size refers to the narrow lanes. */
11557 if (size == X11) return False;
11558 vassert(size <= 2);
11559 Bool isU = bitU == 1;
11560 Bool isACC = opcode == BITS4(0,1,0,1);
11561 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
11562 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11563 IRTemp abd = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
11564 IRTemp res = newTempV128();
11565 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
11566 : mkexpr(abd));
11567 putQReg128(dd, mkexpr(res));
11568 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11569 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11570 const HChar* nm = isACC ? (isU ? "uabal" : "sabal")
11571 : (isU ? "uabdl" : "sabdl");
11572 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11573 nameQReg128(dd), arrWide,
11574 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11575 return True;
11578 if (opcode == BITS4(1,1,0,0)
11579 || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
11580 /* -------- 0,1100 SMULL{2} -------- */ // 0 (ks)
11581 /* -------- 1,1100 UMULL{2} -------- */ // 0
11582 /* -------- 0,1000 SMLAL{2} -------- */ // 1
11583 /* -------- 1,1000 UMLAL{2} -------- */ // 1
11584 /* -------- 0,1010 SMLSL{2} -------- */ // 2
11585 /* -------- 1,1010 UMLSL{2} -------- */ // 2
11586 /* Widens, and size refers to the narrow lanes. */
11587 UInt ks = 3;
11588 switch (opcode) {
11589 case BITS4(1,1,0,0): ks = 0; break;
11590 case BITS4(1,0,0,0): ks = 1; break;
11591 case BITS4(1,0,1,0): ks = 2; break;
11592 default: vassert(0);
11594 vassert(ks >= 0 && ks <= 2);
11595 if (size == X11) return False;
11596 vassert(size <= 2);
11597 Bool isU = bitU == 1;
11598 IRTemp vecN = newTempV128();
11599 IRTemp vecM = newTempV128();
11600 IRTemp vecD = newTempV128();
11601 assign(vecN, getQReg128(nn));
11602 assign(vecM, getQReg128(mm));
11603 assign(vecD, getQReg128(dd));
11604 IRTemp res = IRTemp_INVALID;
11605 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
11606 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11607 putQReg128(dd, mkexpr(res));
11608 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11609 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11610 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
11611 DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
11612 nameQReg128(dd), arrWide,
11613 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11614 return True;
11617 if (bitU == 0
11618 && (opcode == BITS4(1,1,0,1)
11619 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
11620 /* -------- 0,1101 SQDMULL{2} -------- */ // 0 (ks)
11621 /* -------- 0,1001 SQDMLAL{2} -------- */ // 1
11622 /* -------- 0,1011 SQDMLSL{2} -------- */ // 2
11623 /* Widens, and size refers to the narrow lanes. */
11624 UInt ks = 3;
11625 switch (opcode) {
11626 case BITS4(1,1,0,1): ks = 0; break;
11627 case BITS4(1,0,0,1): ks = 1; break;
11628 case BITS4(1,0,1,1): ks = 2; break;
11629 default: vassert(0);
11631 vassert(ks >= 0 && ks <= 2);
11632 if (size == X00 || size == X11) return False;
11633 vassert(size <= 2);
11634 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
11635 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
11636 newTempsV128_3(&vecN, &vecM, &vecD);
11637 assign(vecN, getQReg128(nn));
11638 assign(vecM, getQReg128(mm));
11639 assign(vecD, getQReg128(dd));
11640 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
11641 is2, size, "mas"[ks],
11642 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11643 putQReg128(dd, mkexpr(res));
11644 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
11645 updateQCFLAGwithDifference(sat1q, sat1n);
11646 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
11647 updateQCFLAGwithDifference(sat2q, sat2n);
11649 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11650 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11651 const HChar* nm = ks == 0 ? "sqdmull"
11652 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
11653 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11654 nameQReg128(dd), arrWide,
11655 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11656 return True;
11659 if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
11660 /* -------- 0,1110 PMULL{2} -------- */
11661 /* Widens, and size refers to the narrow lanes. */
11662 if (size != X00 && size != X11) return False;
11663 IRTemp res = IRTemp_INVALID;
11664 IRExpr* srcN = getQReg128(nn);
11665 IRExpr* srcM = getQReg128(mm);
11666 const HChar* arrNarrow = NULL;
11667 const HChar* arrWide = NULL;
11668 if (size == X00) {
11669 res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
11670 srcN, srcM);
11671 arrNarrow = nameArr_Q_SZ(bitQ, size);
11672 arrWide = nameArr_Q_SZ(1, size+1);
11673 } else {
11674 /* The same thing as the X00 case, except we have to call
11675 a helper to do it. */
11676 vassert(size == X11);
11677 res = newTemp(Ity_V128);
11678 IROp slice
11679 = is2 ? Iop_V128HIto64 : Iop_V128to64;
11680 IRExpr** args
11681 = mkIRExprVec_3( IRExpr_VECRET(),
11682 unop(slice, srcN), unop(slice, srcM));
11683 IRDirty* di
11684 = unsafeIRDirty_1_N( res, 0/*regparms*/,
11685 "arm64g_dirtyhelper_PMULLQ",
11686 &arm64g_dirtyhelper_PMULLQ, args);
11687 stmt(IRStmt_Dirty(di));
11688 /* We can't use nameArr_Q_SZ for this because it can't deal with
11689 Q-sized (128 bit) results. Hence do it by hand. */
11690 arrNarrow = bitQ == 0 ? "1d" : "2d";
11691 arrWide = "1q";
11693 putQReg128(dd, mkexpr(res));
11694 DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
11695 nameQReg128(dd), arrWide,
11696 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11697 return True;
11700 return False;
11701 # undef INSN
11705 static
11706 Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
11708 /* 31 30 29 28 23 21 20 15 10 9 4
11709 0 Q U 01110 size 1 m opcode 1 n d
11710 Decode fields: u,size,opcode
11711 */
11712 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11713 if (INSN(31,31) != 0
11714 || INSN(28,24) != BITS5(0,1,1,1,0)
11715 || INSN(21,21) != 1
11716 || INSN(10,10) != 1) {
11717 return False;
11719 UInt bitQ = INSN(30,30);
11720 UInt bitU = INSN(29,29);
11721 UInt size = INSN(23,22);
11722 UInt mm = INSN(20,16);
11723 UInt opcode = INSN(15,11);
11724 UInt nn = INSN(9,5);
11725 UInt dd = INSN(4,0);
11726 vassert(size < 4);
11728 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
11729 /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
11730 /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
11731 /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
11732 /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
11733 if (size == X11) return False;
11734 Bool isADD = opcode == BITS5(0,0,0,0,0);
11735 Bool isU = bitU == 1;
11736 /* Widen both args out, do the math, narrow to final result. */
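/* The add/sub in the widened domain cannot wrap, so shifting back down
by one bit (logical for the unsigned forms, arithmetic for the signed
forms) gives the truncated halved result that {S,U}H{ADD,SUB} require. */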
11737 IRTemp argL = newTempV128();
11738 IRTemp argLhi = IRTemp_INVALID;
11739 IRTemp argLlo = IRTemp_INVALID;
11740 IRTemp argR = newTempV128();
11741 IRTemp argRhi = IRTemp_INVALID;
11742 IRTemp argRlo = IRTemp_INVALID;
11743 IRTemp resHi = newTempV128();
11744 IRTemp resLo = newTempV128();
11745 IRTemp res = IRTemp_INVALID;
11746 assign(argL, getQReg128(nn));
11747 argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
11748 argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argL));
11749 assign(argR, getQReg128(mm));
11750 argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
11751 argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argR));
11752 IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
11753 IROp opSxR = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
11754 assign(resHi, binop(opSxR,
11755 binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
11756 mkU8(1)));
11757 assign(resLo, binop(opSxR,
11758 binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
11759 mkU8(1)));
11760 res = math_NARROW_LANES ( resHi, resLo, size );
11761 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11762 const HChar* nm = isADD ? (isU ? "uhadd" : "shadd")
11763 : (isU ? "uhsub" : "shsub");
11764 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11765 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11766 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11767 return True;
11770 if (opcode == BITS5(0,0,0,1,0)) {
11771 /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
11772 /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
11773 if (bitQ == 0 && size == X11) return False; // implied 1d case
11774 Bool isU = bitU == 1;
11775 IRTemp argL = newTempV128();
11776 IRTemp argR = newTempV128();
11777 assign(argL, getQReg128(nn));
11778 assign(argR, getQReg128(mm));
11779 IRTemp res = math_RHADD(size, isU, argL, argR);
11780 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11781 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11782 DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
11783 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11784 return True;
11787 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
11788 /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
11789 /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
11790 /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
11791 /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
11792 if (bitQ == 0 && size == X11) return False; // implied 1d case
11793 Bool isADD = opcode == BITS5(0,0,0,0,1);
11794 Bool isU = bitU == 1;
11795 IROp qop = Iop_INVALID;
11796 IROp nop = Iop_INVALID;
11797 if (isADD) {
11798 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
11799 nop = mkVecADD(size);
11800 } else {
11801 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
11802 nop = mkVecSUB(size);
11804 IRTemp argL = newTempV128();
11805 IRTemp argR = newTempV128();
11806 IRTemp qres = newTempV128();
11807 IRTemp nres = newTempV128();
11808 assign(argL, getQReg128(nn));
11809 assign(argR, getQReg128(mm));
11810 assign(qres, math_MAYBE_ZERO_HI64_fromE(
11811 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
11812 assign(nres, math_MAYBE_ZERO_HI64_fromE(
11813 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
11814 putQReg128(dd, mkexpr(qres));
11815 updateQCFLAGwithDifference(qres, nres);
11816 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
11817 : (isU ? "uqsub" : "sqsub");
11818 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11819 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11820 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11821 return True;
11824 if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
11825 /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
11826 /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
11827 /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
11828 /* -------- 0,11,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
11829 Bool isORx = (size & 2) == 2;
11830 Bool invert = (size & 1) == 1;
11831 IRTemp res = newTempV128();
11832 assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
11833 getQReg128(nn),
11834 invert ? unop(Iop_NotV128, getQReg128(mm))
11835 : getQReg128(mm)));
11836 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11837 const HChar* names[4] = { "and", "bic", "orr", "orn" };
11838 const HChar* ar = bitQ == 1 ? "16b" : "8b";
11839 DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
11840 nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
11841 return True;
11844 if (bitU == 1 && opcode == BITS5(0,0,0,1,1)) {
11845 /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
11846 /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
11847 /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
11848 /* -------- 1,11,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
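/* All four are computed with the selector identity
(sel & n) | (~sel & other) == other ^ ((other ^ n) & sel),
using Vd as the selector for BSL, Vm for BIT and ~Vm for BIF;
EOR is simply Vm ^ Vn. */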
11849 IRTemp argD = newTempV128();
11850 IRTemp argN = newTempV128();
11851 IRTemp argM = newTempV128();
11852 assign(argD, getQReg128(dd));
11853 assign(argN, getQReg128(nn));
11854 assign(argM, getQReg128(mm));
11855 const IROp opXOR = Iop_XorV128;
11856 const IROp opAND = Iop_AndV128;
11857 const IROp opNOT = Iop_NotV128;
11858 IRTemp res = newTempV128();
11859 switch (size) {
11860 case BITS2(0,0): /* EOR */
11861 assign(res, binop(opXOR, mkexpr(argM), mkexpr(argN)));
11862 break;
11863 case BITS2(0,1): /* BSL */
11864 assign(res, binop(opXOR, mkexpr(argM),
11865 binop(opAND,
11866 binop(opXOR, mkexpr(argM), mkexpr(argN)),
11867 mkexpr(argD))));
11868 break;
11869 case BITS2(1,0): /* BIT */
11870 assign(res, binop(opXOR, mkexpr(argD),
11871 binop(opAND,
11872 binop(opXOR, mkexpr(argD), mkexpr(argN)),
11873 mkexpr(argM))));
11874 break;
11875 case BITS2(1,1): /* BIF */
11876 assign(res, binop(opXOR, mkexpr(argD),
11877 binop(opAND,
11878 binop(opXOR, mkexpr(argD), mkexpr(argN)),
11879 unop(opNOT, mkexpr(argM)))));
11880 break;
11881 default:
11882 vassert(0);
11883 }
11884 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11885 const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
11886 const HChar* arr = bitQ == 1 ? "16b" : "8b";
11887 DIP("%s %s.%s, %s.%s, %s.%s\n", nms[size],
11888 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11889 return True;
11892 if (opcode == BITS5(0,0,1,1,0)) {
11893 /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
11894 /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
11895 if (bitQ == 0 && size == X11) return False; // implied 1d case
11896 Bool isGT = bitU == 0;
11897 IRExpr* argL = getQReg128(nn);
11898 IRExpr* argR = getQReg128(mm);
11899 IRTemp res = newTempV128();
11900 assign(res,
11901 isGT ? binop(mkVecCMPGTS(size), argL, argR)
11902 : binop(mkVecCMPGTU(size), argL, argR));
11903 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11904 const HChar* nm = isGT ? "cmgt" : "cmhi";
11905 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11906 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11907 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11908 return True;
11911 if (opcode == BITS5(0,0,1,1,1)) {
11912 /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
11913 /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
11914 if (bitQ == 0 && size == X11) return False; // implied 1d case
11915 Bool isGE = bitU == 0;
11916 IRExpr* argL = getQReg128(nn);
11917 IRExpr* argR = getQReg128(mm);
11918 IRTemp res = newTempV128();
11919 assign(res,
11920 isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
11921 : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
11922 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11923 const HChar* nm = isGE ? "cmge" : "cmhs";
11924 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11925 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11926 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11927 return True;
11930 if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
11931 /* -------- 0,xx,01000 SSHL std7_std7_std7 -------- */
11932 /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
11933 /* -------- 1,xx,01000 USHL std7_std7_std7 -------- */
11934 /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
11935 if (bitQ == 0 && size == X11) return False; // implied 1d case
11936 Bool isU = bitU == 1;
11937 Bool isR = opcode == BITS5(0,1,0,1,0);
11938 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
11939 : (isU ? mkVecSHU(size) : mkVecSHS(size));
11940 IRTemp res = newTempV128();
11941 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
11942 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11943 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
11944 : (isU ? "ushl" : "sshl");
11945 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11946 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11947 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11948 return True;
11951 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
11952 /* -------- 0,xx,01001 SQSHL std7_std7_std7 -------- */
11953 /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
11954 /* -------- 1,xx,01001 UQSHL std7_std7_std7 -------- */
11955 /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
11956 if (bitQ == 0 && size == X11) return False; // implied 1d case
11957 Bool isU = bitU == 1;
11958 Bool isR = opcode == BITS5(0,1,0,1,1);
11959 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
11960 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
11961 /* This is a bit tricky. If we're only interested in the lowest 64 bits
11962 of the result (viz, bitQ == 0), then we must adjust the operands to
11963 ensure that the upper part of the result, that we don't care about,
11964 doesn't pollute the returned Q value. To do this, zero out the upper
11965 operand halves beforehand. This works because it means, for the
11966 lanes we don't care about, we are shifting zero by zero, which can
11967 never saturate. */
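/* (For those ignored lanes, the Q information returned in the other half of
   the V256 result is then guaranteed to be zero, which is why resQ can
   simply be compared against an all-zero vector below.) */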
11968 IRTemp res256 = newTemp(Ity_V256);
11969 IRTemp resSH = newTempV128();
11970 IRTemp resQ = newTempV128();
11971 IRTemp zero = newTempV128();
11972 assign(res256, binop(op,
11973 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
11974 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
11975 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
11976 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
11977 assign(zero, mkV128(0x0000));
11978 putQReg128(dd, mkexpr(resSH));
11979 updateQCFLAGwithDifference(resQ, zero);
11980 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
11981 : (isU ? "uqshl" : "sqshl");
11982 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11983 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11984 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11985 return True;
11988 if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
11989 /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
11990 /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
11991 /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
11992 /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
11993 if (bitQ == 0 && size == X11) return False; // implied 1d case
11994 Bool isU = bitU == 1;
11995 Bool isMAX = (opcode & 1) == 0;
11996 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
11997 : (isU ? mkVecMINU(size) : mkVecMINS(size));
11998 IRTemp t = newTempV128();
11999 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
12000 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
12001 const HChar* nm = isMAX ? (isU ? "umax" : "smax")
12002 : (isU ? "umin" : "smin");
12003 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12004 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12005 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12006 return True;
12009 if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
12010 /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
12011 /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
12012 /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
12013 /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
12014 if (size == X11) return False; // 1d/2d cases not allowed
12015 Bool isU = bitU == 1;
12016 Bool isACC = opcode == BITS5(0,1,1,1,1);
12017 vassert(size <= 2);
12018 IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
12019 IRTemp t2 = newTempV128();
12020 assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
12021 : mkexpr(t1));
12022 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12023 const HChar* nm = isACC ? (isU ? "uaba" : "saba")
12024 : (isU ? "uabd" : "sabd");
12025 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12026 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12027 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12028 return True;
12031 if (opcode == BITS5(1,0,0,0,0)) {
12032 /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
12033 /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
12034 if (bitQ == 0 && size == X11) return False; // implied 1d case
12035 Bool isSUB = bitU == 1;
12036 IROp op = isSUB ? mkVecSUB(size) : mkVecADD(size);
12037 IRTemp t = newTempV128();
12038 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
12039 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
12040 const HChar* nm = isSUB ? "sub" : "add";
12041 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12042 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12043 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12044 return True;
12047 if (opcode == BITS5(1,0,0,0,1)) {
12048 /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
12049 /* -------- 1,xx,10001 CMEQ std7_std7_std7 -------- */ // ==
12050 if (bitQ == 0 && size == X11) return False; // implied 1d case
12051 Bool isEQ = bitU == 1;
12052 IRExpr* argL = getQReg128(nn);
12053 IRExpr* argR = getQReg128(mm);
12054 IRTemp res = newTempV128();
12055 assign(res,
12056 isEQ ? binop(mkVecCMPEQ(size), argL, argR)
12057 : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
12058 binop(Iop_AndV128, argL, argR),
12059 mkV128(0x0000))));
12060 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12061 const HChar* nm = isEQ ? "cmeq" : "cmtst";
12062 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12063 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12064 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12065 return True;
12068 if (opcode == BITS5(1,0,0,1,0)) {
12069 /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
12070 /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
12071 if (bitQ == 0 && size == X11) return False; // implied 1d case
12072 Bool isMLS = bitU == 1;
12073 IROp opMUL = mkVecMUL(size);
12074 IROp opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
12075 IRTemp res = newTempV128();
12076 if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
12077 assign(res, binop(opADDSUB,
12078 getQReg128(dd),
12079 binop(opMUL, getQReg128(nn), getQReg128(mm))));
12080 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12081 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12082 DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
12083 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12084 return True;
12085 }
12086 return False;
12089 if (opcode == BITS5(1,0,0,1,1)) {
12090 /* -------- 0,xx,10011 MUL std7_std7_std7 -------- */
12091 /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
12092 if (bitQ == 0 && size == X11) return False; // implied 1d case
12093 Bool isPMUL = bitU == 1;
12094 const IROp opsPMUL[4]
12095 = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID };
12096 IROp opMUL = isPMUL ? opsPMUL[size] : mkVecMUL(size);
12097 IRTemp res = newTempV128();
12098 if (opMUL != Iop_INVALID) {
12099 assign(res, binop(opMUL, getQReg128(nn), getQReg128(mm)));
12100 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12101 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12102 DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL ? "pmul" : "mul",
12103 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12104 return True;
12105 }
12106 return False;
12109 if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
12110 /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
12111 /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
12112 /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
12113 /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
12114 if (size == X11) return False;
12115 Bool isU = bitU == 1;
12116 Bool isMAX = opcode == BITS5(1,0,1,0,0);
12117 IRTemp vN = newTempV128();
12118 IRTemp vM = newTempV128();
12119 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
12120 : (isU ? mkVecMINU(size) : mkVecMINS(size));
12121 assign(vN, getQReg128(nn));
12122 assign(vM, getQReg128(mm));
12123 IRTemp res128 = newTempV128();
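/* Pairwise ops are built from the even/odd lane concatenations: with
   CatEvenLanes(vM,vN) supplying lanes 0,2,4,.. and CatOddLanes(vM,vN)
   supplying lanes 1,3,5,.., a single lanewise op yields
   op(lane0,lane1), op(lane2,lane3), ..., with the Vn pairs landing in the
   lower half of the result and the Vm pairs in the upper half, exactly as
   the pairwise instructions require. */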
12124 assign(res128,
12125 binop(op,
12126 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
12127 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
12128 /* In the half-width case, use Iop_CatEvenLanes32x4 to extract the
12129 half-width result from the full-width result. */
12130 IRExpr* res
12131 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
12132 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
12133 mkexpr(res128)))
12134 : mkexpr(res128);
12135 putQReg128(dd, res);
12136 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12137 const HChar* nm = isMAX ? (isU ? "umaxp" : "smaxp")
12138 : (isU ? "uminp" : "sminp");
12139 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12140 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12141 return True;
12144 if (opcode == BITS5(1,0,1,1,0)) {
12145 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
12146 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
12147 if (size == X00 || size == X11) return False;
12148 Bool isR = bitU == 1;
12149 IRTemp res, sat1q, sat1n, vN, vM;
12150 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
12151 newTempsV128_2(&vN, &vM);
12152 assign(vN, getQReg128(nn));
12153 assign(vM, getQReg128(mm));
12154 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
12155 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12156 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
12157 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
12158 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12159 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
12160 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12161 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12162 return True;
12165 if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
12166 /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
12167 if (bitQ == 0 && size == X11) return False; // implied 1d case
12168 IRTemp vN = newTempV128();
12169 IRTemp vM = newTempV128();
12170 assign(vN, getQReg128(nn));
12171 assign(vM, getQReg128(mm));
12172 IRTemp res128 = newTempV128();
12173 assign(res128,
12174 binop(mkVecADD(size),
12175 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
12176 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
12177 /* In the half-width case, use Iop_CatEvenLanes32x4 to extract the
12178 half-width result from the full-width result. */
12179 IRExpr* res
12180 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
12181 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
12182 mkexpr(res128)))
12183 : mkexpr(res128);
12184 putQReg128(dd, res);
12185 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12186 DIP("addp %s.%s, %s.%s, %s.%s\n",
12187 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12188 return True;
12191 if (bitU == 0
12192 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
12193 /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12194 /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12195 /* -------- 0,0x,11110 FMAX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12196 /* -------- 0,1x,11110 FMIN 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12197 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
12198 Bool isD = (size & 1) == 1;
12199 if (bitQ == 0 && isD) return False; // implied 1d case
12200 Bool isMIN = (size & 2) == 2;
12201 Bool isNM = opcode == BITS5(1,1,0,0,0);
12202 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
12203 IRTemp res = newTempV128();
12204 assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
12205 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12206 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12207 DIP("%s%s %s.%s, %s.%s, %s.%s\n",
12208 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
12209 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12210 return True;
12213 if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
12214 /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12215 /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12216 Bool isD = (size & 1) == 1;
12217 Bool isSUB = (size & 2) == 2;
12218 if (bitQ == 0 && isD) return False; // implied 1d case
12219 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
12220 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
12221 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
12222 IRTemp rm = mk_get_IR_rounding_mode();
12223 IRTemp t1 = newTempV128();
12224 IRTemp t2 = newTempV128();
12225 // FIXME: double rounding; use FMA primops instead
12226 assign(t1, triop(opMUL,
12227 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12228 assign(t2, triop(isSUB ? opSUB : opADD,
12229 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
12230 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12231 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12232 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
12233 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12234 return True;
12237 if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
12238 /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12239 /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12240 Bool isD = (size & 1) == 1;
12241 Bool isSUB = (size & 2) == 2;
12242 if (bitQ == 0 && isD) return False; // implied 1d case
12243 const IROp ops[4]
12244 = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
12245 IROp op = ops[size];
12246 IRTemp rm = mk_get_IR_rounding_mode();
12247 IRTemp t1 = newTempV128();
12248 IRTemp t2 = newTempV128();
12249 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12250 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
12251 putQReg128(dd, mkexpr(t2));
12252 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12253 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
12254 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12255 return True;
12258 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
12259 /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12260 Bool isD = (size & 1) == 1;
12261 if (bitQ == 0 && isD) return False; // implied 1d case
12262 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
12263 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
12264 IRTemp rm = mk_get_IR_rounding_mode();
12265 IRTemp t1 = newTempV128();
12266 IRTemp t2 = newTempV128();
12267 // FIXME: use Abd primop instead?
12268 assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12269 assign(t2, unop(opABS, mkexpr(t1)));
12270 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12271 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12272 DIP("fabd %s.%s, %s.%s, %s.%s\n",
12273 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12274 return True;
12277 if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
12278 /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12279 /* -------- 1,0x,11011 FMUL 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12280 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
12281 Bool isD = (size & 1) == 1;
12282 Bool isMULX = bitU == 0;
12283 if (bitQ == 0 && isD) return False; // implied 1d case
12284 IRTemp rm = mk_get_IR_rounding_mode();
12285 IRTemp t1 = newTempV128();
12286 assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
12287 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12288 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12289 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12290 DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
12291 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12292 return True;
12295 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
12296 /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12297 /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12298 Bool isD = (size & 1) == 1;
12299 if (bitQ == 0 && isD) return False; // implied 1d case
12300 Bool isGE = bitU == 1;
12301 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
12302 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
12303 IRTemp t1 = newTempV128();
12304 assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
12305 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
12306 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12307 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12308 DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
12309 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12310 return True;
12313 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
12314 /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12315 Bool isD = (size & 1) == 1;
12316 if (bitQ == 0 && isD) return False; // implied 1d case
12317 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
12318 IRTemp t1 = newTempV128();
12319 assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
12320 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12321 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12322 DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
12323 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12324 return True;
12327 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
12328 /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12329 /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12330 Bool isD = (size & 1) == 1;
12331 Bool isGT = (size & 2) == 2;
12332 if (bitQ == 0 && isD) return False; // implied 1d case
12333 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
12334 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
12335 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
12336 IRTemp t1 = newTempV128();
12337 assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
12338 unop(opABS, getQReg128(nn)))); // swapd
12339 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12340 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12341 DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
12342 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12343 return True;
12346 if (bitU == 1
12347 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
12348 /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12349 /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12350 /* -------- 1,0x,11110 FMAXP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12351 /* -------- 1,1x,11110 FMINP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12352 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
12353 Bool isD = (size & 1) == 1;
12354 if (bitQ == 0 && isD) return False; // implied 1d case
12355 Bool isMIN = (size & 2) == 2;
12356 Bool isNM = opcode == BITS5(1,1,0,0,0);
12357 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
12358 IRTemp srcN = newTempV128();
12359 IRTemp srcM = newTempV128();
12360 IRTemp preL = IRTemp_INVALID;
12361 IRTemp preR = IRTemp_INVALID;
12362 assign(srcN, getQReg128(nn));
12363 assign(srcM, getQReg128(mm));
12364 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
12365 srcM, srcN, isD, bitQ);
12366 putQReg128(
12367 dd, math_MAYBE_ZERO_HI64_fromE(
12368 bitQ,
12369 binop(opMXX, mkexpr(preL), mkexpr(preR))));
12370 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12371 DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
12372 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
12373 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12374 return True;
12377 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
12378 /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12379 Bool isD = size == X01;
12380 if (bitQ == 0 && isD) return False; // implied 1d case
12381 IRTemp srcN = newTempV128();
12382 IRTemp srcM = newTempV128();
12383 IRTemp preL = IRTemp_INVALID;
12384 IRTemp preR = IRTemp_INVALID;
12385 assign(srcN, getQReg128(nn));
12386 assign(srcM, getQReg128(mm));
12387 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
12388 srcM, srcN, isD, bitQ);
12389 putQReg128(
12390 dd, math_MAYBE_ZERO_HI64_fromE(
12391 bitQ,
12392 triop(mkVecADDF(isD ? 3 : 2),
12393 mkexpr(mk_get_IR_rounding_mode()),
12394 mkexpr(preL), mkexpr(preR))));
12395 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12396 DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
12397 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12398 return True;
12401 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
12402 /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12403 Bool isD = (size & 1) == 1;
12404 if (bitQ == 0 && isD) return False; // implied 1d case
12405 vassert(size <= 1);
12406 const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
12407 IROp op = ops[size];
12408 IRTemp rm = mk_get_IR_rounding_mode();
12409 IRTemp t1 = newTempV128();
12410 IRTemp t2 = newTempV128();
12411 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12412 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
12413 putQReg128(dd, mkexpr(t2));
12414 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12415 DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
12416 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12417 return True;
12420 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
12421 /* -------- 0,0x,11111: FRECPS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12422 /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12423 Bool isSQRT = (size & 2) == 2;
12424 Bool isD = (size & 1) == 1;
12425 if (bitQ == 0 && isD) return False; // implied 1d case
12426 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
12427 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
12428 IRTemp res = newTempV128();
12429 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
12430 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12431 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12432 DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
12433 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12434 return True;
12437 return False;
12438 # undef INSN
12439 }
12442 static
12443 Bool dis_AdvSIMD_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
12444 {
12445 /* 31 30 29 28 23 21 20 15 14 10 9 4
12446 0 Q U 01110 size 0 m 1 opcode 1 n d
12447 Decode fields: u,size,opcode
12448 */
12449 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12450 if (INSN(31,31) != 0
12451 || INSN(28,24) != BITS5(0,1,1,1,0)
12452 || INSN(21,21) != 0
12453 || INSN(15,15) != 1
12454 || INSN(10,10) != 1) {
12455 return False;
12456 }
12457 UInt bitQ = INSN(30,30);
12458 UInt bitU = INSN(29,29);
12459 UInt size = INSN(23,22);
12460 UInt mm = INSN(20,16);
12461 UInt opcode = INSN(14,11);
12462 UInt nn = INSN(9,5);
12463 UInt dd = INSN(4,0);
12464 vassert(size < 4);
12465 vassert(mm < 32 && nn < 32 && dd < 32);
12467 if (bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,0,1))) {
12468 /* -------- 1,xx,0000 SQRDMLAH s and h variants only -------- */
12469 /* -------- 1,xx,0001 SQRDMLSH s and h variants only -------- */
12470 if (size == X00 || size == X11) return False;
12471 Bool isAdd = opcode == BITS4(0,0,0,0);
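/* Roughly: each lane becomes SignedSat(d +/- rounded-high-half of 2*n*m),
   which is the ARMv8.1 SQRDMLAH/SQRDMLSH operation.  math_SQRDMLAH also
   produces a non-saturating reference value, purely so that
   updateQCFLAGwithDifferenceZHI can set FPSR.QC when the two disagree. */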
12473 IRTemp res, res_nosat, vD, vN, vM;
12474 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
12475 newTempsV128_3(&vD, &vN, &vM);
12476 assign(vD, getQReg128(dd));
12477 assign(vN, getQReg128(nn));
12478 assign(vM, getQReg128(mm));
12480 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
12481 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
12482 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
12483 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12485 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12486 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
12487 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12488 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12489 return True;
12492 return False;
12493 # undef INSN
12494 }
12497 static
12498 Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
12499 {
12500 /* 31 30 29 28 23 21 16 11 9 4
12501 0 Q U 01110 size 10000 opcode 10 n d
12502 Decode fields: U,size,opcode
12503 */
12504 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12505 if (INSN(31,31) != 0
12506 || INSN(28,24) != BITS5(0,1,1,1,0)
12507 || INSN(21,17) != BITS5(1,0,0,0,0)
12508 || INSN(11,10) != BITS2(1,0)) {
12509 return False;
12510 }
12511 UInt bitQ = INSN(30,30);
12512 UInt bitU = INSN(29,29);
12513 UInt size = INSN(23,22);
12514 UInt opcode = INSN(16,12);
12515 UInt nn = INSN(9,5);
12516 UInt dd = INSN(4,0);
12517 vassert(size < 4);
12519 if (bitU == 0 && size <= X10 && opcode == BITS5(0,0,0,0,0)) {
12520 /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
12521 /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
12522 /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
12523 const IROp iops[3] = { Iop_Reverse8sIn64_x2,
12524 Iop_Reverse16sIn64_x2, Iop_Reverse32sIn64_x2 };
12525 vassert(size <= 2);
12526 IRTemp res = newTempV128();
12527 assign(res, unop(iops[size], getQReg128(nn)));
12528 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12529 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12530 DIP("%s %s.%s, %s.%s\n", "rev64",
12531 nameQReg128(dd), arr, nameQReg128(nn), arr);
12532 return True;
12535 if (bitU == 1 && size <= X01 && opcode == BITS5(0,0,0,0,0)) {
12536 /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
12537 /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
12538 Bool isH = size == X01;
12539 IRTemp res = newTempV128();
12540 IROp iop = isH ? Iop_Reverse16sIn32_x4 : Iop_Reverse8sIn32_x4;
12541 assign(res, unop(iop, getQReg128(nn)));
12542 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12543 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12544 DIP("%s %s.%s, %s.%s\n", "rev32",
12545 nameQReg128(dd), arr, nameQReg128(nn), arr);
12546 return True;
12549 if (bitU == 0 && size == X00 && opcode == BITS5(0,0,0,0,1)) {
12550 /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
12551 IRTemp res = newTempV128();
12552 assign(res, unop(Iop_Reverse8sIn16_x8, getQReg128(nn)));
12553 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12554 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12555 DIP("%s %s.%s, %s.%s\n", "rev16",
12556 nameQReg128(dd), arr, nameQReg128(nn), arr);
12557 return True;
12560 if (opcode == BITS5(0,0,0,1,0) || opcode == BITS5(0,0,1,1,0)) {
12561 /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
12562 /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
12563 /* -------- 0,xx,00110: SADALP std6_std6 -------- */
12564 /* -------- 1,xx,00110: UADALP std6_std6 -------- */
12565 /* Widens, and size refers to the narrow size. */
12566 if (size == X11) return False; // no 1d or 2d cases
12567 Bool isU = bitU == 1;
12568 Bool isACC = opcode == BITS5(0,0,1,1,0);
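/* The widening pairwise add is synthesised by sign/zero-extending the even
   and the odd numbered lanes separately to double width and adding the two
   widened vectors; the xADALP forms then fold the previous contents of Vd
   into the sum. */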
12569 IRTemp src = newTempV128();
12570 IRTemp sum = newTempV128();
12571 IRTemp res = newTempV128();
12572 assign(src, getQReg128(nn));
12573 assign(sum,
12574 binop(mkVecADD(size+1),
12575 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12576 isU, True/*fromOdd*/, size, mkexpr(src))),
12577 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12578 isU, False/*!fromOdd*/, size, mkexpr(src)))));
12579 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
12580 : mkexpr(sum));
12581 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12582 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12583 const HChar* arrWide = nameArr_Q_SZ(bitQ, size+1);
12584 DIP("%s %s.%s, %s.%s\n", isACC ? (isU ? "uadalp" : "sadalp")
12585 : (isU ? "uaddlp" : "saddlp"),
12586 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
12587 return True;
12590 if (opcode == BITS5(0,0,0,1,1)) {
12591 /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
12592 /* -------- 1,xx,00011: USQADD std7_std7 -------- */
12593 if (bitQ == 0 && size == X11) return False; // implied 1d case
12594 Bool isUSQADD = bitU == 1;
12595 /* This is switched (in the US vs SU sense) deliberately.
12596 SUQADD corresponds to the ExtUSsatSS variants and
12597 USQADD corresponds to the ExtSUsatUU variants.
12598 See libvex_ir for more details. */
12599 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
12600 : mkVecQADDEXTUSSATSS(size);
12601 IROp nop = mkVecADD(size);
12602 IRTemp argL = newTempV128();
12603 IRTemp argR = newTempV128();
12604 IRTemp qres = newTempV128();
12605 IRTemp nres = newTempV128();
12606 /* Because the two arguments to the addition are implicitly
12607 extended differently (one signedly, the other unsignedly) it is
12608 important to present them to the primop in the correct order. */
12609 assign(argL, getQReg128(nn));
12610 assign(argR, getQReg128(dd));
12611 assign(qres, math_MAYBE_ZERO_HI64_fromE(
12612 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
12613 assign(nres, math_MAYBE_ZERO_HI64_fromE(
12614 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
12615 putQReg128(dd, mkexpr(qres));
12616 updateQCFLAGwithDifference(qres, nres);
12617 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12618 DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
12619 nameQReg128(dd), arr, nameQReg128(nn), arr);
12620 return True;
12623 if (opcode == BITS5(0,0,1,0,0)) {
12624 /* -------- 0,xx,00100: CLS std6_std6 -------- */
12625 /* -------- 1,xx,00100: CLZ std6_std6 -------- */
12626 if (size == X11) return False; // no 1d or 2d cases
12627 const IROp opsCLS[3] = { Iop_Cls8x16, Iop_Cls16x8, Iop_Cls32x4 };
12628 const IROp opsCLZ[3] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4 };
12629 Bool isCLZ = bitU == 1;
12630 IRTemp res = newTempV128();
12631 vassert(size <= 2);
12632 assign(res, unop(isCLZ ? opsCLZ[size] : opsCLS[size], getQReg128(nn)));
12633 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12634 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12635 DIP("%s %s.%s, %s.%s\n", isCLZ ? "clz" : "cls",
12636 nameQReg128(dd), arr, nameQReg128(nn), arr);
12637 return True;
12640 if (size == X00 && opcode == BITS5(0,0,1,0,1)) {
12641 /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
12642 /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
12643 IRTemp res = newTempV128();
12644 assign(res, unop(bitU == 0 ? Iop_Cnt8x16 : Iop_NotV128, getQReg128(nn)));
12645 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12646 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12647 DIP("%s %s.%s, %s.%s\n", bitU == 0 ? "cnt" : "not",
12648 nameQReg128(dd), arr, nameQReg128(nn), arr);
12649 return True;
12652 if (bitU == 1 && size == X01 && opcode == BITS5(0,0,1,0,1)) {
12653 /* -------- 1,01,00101 RBIT 16b_16b, 8b_8b -------- */
12654 IRTemp res = newTempV128();
12655 assign(res, unop(Iop_Reverse1sIn8_x16, getQReg128(nn)));
12656 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12657 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12658 DIP("%s %s.%s, %s.%s\n", "rbit",
12659 nameQReg128(dd), arr, nameQReg128(nn), arr);
12660 return True;
12663 if (opcode == BITS5(0,0,1,1,1)) {
12664 /* -------- 0,xx,00111 SQABS std7_std7 -------- */
12665 /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
12666 if (bitQ == 0 && size == X11) return False; // implied 1d case
12667 Bool isNEG = bitU == 1;
12668 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
12669 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
12670 getQReg128(nn), size );
12671 IRTemp qres = newTempV128(), nres = newTempV128();
12672 assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
12673 assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
12674 putQReg128(dd, mkexpr(qres));
12675 updateQCFLAGwithDifference(qres, nres);
12676 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12677 DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
12678 nameQReg128(dd), arr, nameQReg128(nn), arr);
12679 return True;
12682 if (opcode == BITS5(0,1,0,0,0)) {
12683 /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
12684 /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
12685 if (bitQ == 0 && size == X11) return False; // implied 1d case
12686 Bool isGT = bitU == 0;
12687 IRExpr* argL = getQReg128(nn);
12688 IRExpr* argR = mkV128(0x0000);
12689 IRTemp res = newTempV128();
12690 IROp opGTS = mkVecCMPGTS(size);
12691 assign(res, isGT ? binop(opGTS, argL, argR)
12692 : unop(Iop_NotV128, binop(opGTS, argR, argL)));
12693 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12694 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12695 DIP("cm%s %s.%s, %s.%s, #0\n", isGT ? "gt" : "ge",
12696 nameQReg128(dd), arr, nameQReg128(nn), arr);
12697 return True;
12700 if (opcode == BITS5(0,1,0,0,1)) {
12701 /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
12702 /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
12703 if (bitQ == 0 && size == X11) return False; // implied 1d case
12704 Bool isEQ = bitU == 0;
12705 IRExpr* argL = getQReg128(nn);
12706 IRExpr* argR = mkV128(0x0000);
12707 IRTemp res = newTempV128();
12708 assign(res, isEQ ? binop(mkVecCMPEQ(size), argL, argR)
12709 : unop(Iop_NotV128,
12710 binop(mkVecCMPGTS(size), argL, argR)));
12711 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12712 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12713 DIP("cm%s %s.%s, %s.%s, #0\n", isEQ ? "eq" : "le",
12714 nameQReg128(dd), arr, nameQReg128(nn), arr);
12715 return True;
12718 if (bitU == 0 && opcode == BITS5(0,1,0,1,0)) {
12719 /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
12720 if (bitQ == 0 && size == X11) return False; // implied 1d case
12721 IRExpr* argL = getQReg128(nn);
12722 IRExpr* argR = mkV128(0x0000);
12723 IRTemp res = newTempV128();
12724 assign(res, binop(mkVecCMPGTS(size), argR, argL));
12725 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12726 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12727 DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
12728 nameQReg128(dd), arr, nameQReg128(nn), arr);
12729 return True;
12732 if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
12733 /* -------- 0,xx,01011: ABS std7_std7 -------- */
12734 if (bitQ == 0 && size == X11) return False; // implied 1d case
12735 IRTemp res = newTempV128();
12736 assign(res, unop(mkVecABS(size), getQReg128(nn)));
12737 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12738 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12739 DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
12740 return True;
12743 if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
12744 /* -------- 1,xx,01011: NEG std7_std7 -------- */
12745 if (bitQ == 0 && size == X11) return False; // implied 1d case
12746 IRTemp res = newTempV128();
12747 assign(res, binop(mkVecSUB(size), mkV128(0x0000), getQReg128(nn)));
12748 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12749 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12750 DIP("neg %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
12751 return True;
12754 UInt ix = 0; /*INVALID*/
12755 if (size >= X10) {
12756 switch (opcode) {
12757 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
12758 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
12759 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
12760 default: break;
12763 if (ix > 0) {
12764 /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
12765 /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
12766 /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
12767 /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
12768 /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
12769 if (bitQ == 0 && size == X11) return False; // implied 1d case
12770 Bool isD = size == X11;
12771 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
12772 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
12773 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
12774 IROp opCmp = Iop_INVALID;
12775 Bool swap = False;
12776 const HChar* nm = "??";
12777 switch (ix) {
12778 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
12779 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
12780 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
12781 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
12782 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
12783 default: vassert(0);
12785 IRExpr* zero = mkV128(0x0000);
12786 IRTemp res = newTempV128();
12787 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
12788 : binop(opCmp, getQReg128(nn), zero));
12789 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12790 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12791 DIP("%s %s.%s, %s.%s, #0.0\n", nm,
12792 nameQReg128(dd), arr, nameQReg128(nn), arr);
12793 return True;
12796 if (size >= X10 && opcode == BITS5(0,1,1,1,1)) {
12797 /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
12798 /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
12799 if (bitQ == 0 && size == X11) return False; // implied 1d case
12800 Bool isFNEG = bitU == 1;
12801 IROp op = isFNEG ? (size == X10 ? Iop_Neg32Fx4 : Iop_Neg64Fx2)
12802 : (size == X10 ? Iop_Abs32Fx4 : Iop_Abs64Fx2);
12803 IRTemp res = newTempV128();
12804 assign(res, unop(op, getQReg128(nn)));
12805 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12806 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12807 DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
12808 nameQReg128(dd), arr, nameQReg128(nn), arr);
12809 return True;
12812 if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
12813 /* -------- 0,xx,10010: XTN{,2} -------- */
12814 if (size == X11) return False;
12815 vassert(size < 3);
12816 Bool is2 = bitQ == 1;
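/* XTN narrows into the low 64 bits of Vd and zeroes the upper half; XTN2
   (the is2 case) instead writes the upper 64 bits and leaves the lower half
   untouched -- hence putLO64andZUorPutHI64 below. */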
12817 IROp opN = mkVecNARROWUN(size);
12818 IRTemp resN = newTempV128();
12819 assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn))));
12820 putLO64andZUorPutHI64(is2, dd, resN);
12821 const HChar* nm = "xtn";
12822 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12823 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12824 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
12825 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12826 return True;
12829 if (opcode == BITS5(1,0,1,0,0)
12830 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
12831 /* -------- 0,xx,10100: SQXTN{,2} -------- */
12832 /* -------- 1,xx,10100: UQXTN{,2} -------- */
12833 /* -------- 1,xx,10010: SQXTUN{,2} -------- */
12834 if (size == X11) return False;
12835 vassert(size < 3);
12836 Bool is2 = bitQ == 1;
12837 IROp opN = Iop_INVALID;
12838 Bool zWiden = True;
12839 const HChar* nm = "??";
12840 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
12841 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
12842 }
12843 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
12844 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
12845 }
12846 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
12847 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
12848 }
12849 else vassert(0);
12850 IRTemp src = newTempV128();
12851 assign(src, getQReg128(nn));
12852 IRTemp resN = newTempV128();
12853 assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
12854 putLO64andZUorPutHI64(is2, dd, resN);
12855 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
12856 size, mkexpr(resN));
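/* Saturation detection: re-widen the narrowed result and compare it with
   the original source.  Any lane that saturated will differ, which is what
   updateQCFLAGwithDifference turns into FPSR.QC. */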
12857 updateQCFLAGwithDifference(src, resW);
12858 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12859 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12860 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
12861 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12862 return True;
12865 if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
12866 /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
12867 /* Widens, and size is the narrow size. */
12868 if (size == X11) return False;
12869 Bool is2 = bitQ == 1;
12870 IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
12871 IROp opSHL = mkVecSHLN(size+1);
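/* Interleaving the source with itself leaves a copy of each narrow lane in
   both halves of the corresponding wide lane; shifting each wide lane left
   by the narrow lane width then yields exactly the zero-extended,
   shifted-by-#lane-width value that SHLL is defined to produce. */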
12872 IRTemp src = newTempV128();
12873 IRTemp res = newTempV128();
12874 assign(src, getQReg128(nn));
12875 assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
12876 mkU8(8 << size)));
12877 putQReg128(dd, mkexpr(res));
12878 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12879 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12880 DIP("shll%s %s.%s, %s.%s, #%d\n", is2 ? "2" : "",
12881 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
12882 return True;
12885 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
12886 /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
12887 UInt nLanes = size == X00 ? 4 : 2;
12888 IRType srcTy = size == X00 ? Ity_F32 : Ity_F64;
12889 IROp opCvt = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
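/* bitQ distinguishes FCVTN (write the narrowed lanes to the low half of Vd
   and zero the top 64 bits) from FCVTN2 (write the top half, preserving the
   low 64 bits), which is why the destination lane index below is offset by
   nLanes * bitQ. */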
12890 IRTemp rm = mk_get_IR_rounding_mode();
12891 IRTemp src[nLanes];
12892 for (UInt i = 0; i < nLanes; i++) {
12893 src[i] = newTemp(srcTy);
12894 assign(src[i], getQRegLane(nn, i, srcTy));
12896 for (UInt i = 0; i < nLanes; i++) {
12897 putQRegLane(dd, nLanes * bitQ + i,
12898 binop(opCvt, mkexpr(rm), mkexpr(src[i])));
12900 if (bitQ == 0) {
12901 putQRegLane(dd, 1, mkU64(0));
12903 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12904 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
12905 DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12906 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12907 return True;
12910 if (bitU == 1 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
12911 /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
12912 /* Using Irrm_NEAREST here isn't right. FCVTXN is defined to use "round to
12913 odd": if the conversion is inexact, the result's LSB is forced to 1, so that a later narrowing cannot double-round. VEX's IRRoundingMode appears to have no such mode, so this remains a kludge. */
12914 IRType srcTy = Ity_F64;
12915 IROp opCvt = Iop_F64toF32;
12916 IRTemp src[2];
12917 for (UInt i = 0; i < 2; i++) {
12918 src[i] = newTemp(srcTy);
12919 assign(src[i], getQRegLane(nn, i, srcTy));
12921 for (UInt i = 0; i < 2; i++) {
12922 putQRegLane(dd, 2 * bitQ + i,
12923 binop(opCvt, mkU32(Irrm_NEAREST), mkexpr(src[i])));
12925 if (bitQ == 0) {
12926 putQRegLane(dd, 1, mkU64(0));
12928 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12929 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
12930 DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12931 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12932 return True;
12935 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
12936 /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
12937 UInt nLanes = size == X00 ? 4 : 2;
12938 IRType srcTy = size == X00 ? Ity_F16 : Ity_F32;
12939 IROp opCvt = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
12940 IRTemp src[nLanes];
12941 for (UInt i = 0; i < nLanes; i++) {
12942 src[i] = newTemp(srcTy);
12943 assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
12945 for (UInt i = 0; i < nLanes; i++) {
12946 putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
12948 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12949 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
12950 DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12951 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
12952 return True;
12955 ix = 0;
12956 if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
12957 ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
12958 // = 1 + bitU[0]:size[1]:opcode[0]
12959 vassert(ix >= 1 && ix <= 8);
12960 if (ix == 7) ix = 0;
12962 if (ix > 0) {
12963 /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
12964 /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
12965 /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
12966 /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
12967 /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
12968 /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
12969 /* -------- 1,1x,11000 (apparently unassigned) (7) -------- */
12970 /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
12971 /* rm plan:
12972 FRINTN: tieeven -- !! FIXME KLUDGED !!
12973 FRINTM: -inf
12974 FRINTP: +inf
12975 FRINTZ: zero
12976 FRINTA: tieaway -- !! FIXME KLUDGED !!
12977 FRINTX: per FPCR + "exact = TRUE"
12978 FRINTI: per FPCR
12980 Bool isD = (size & 1) == 1;
12981 if (bitQ == 0 && isD) return False; // implied 1d case
12983 IRTemp irrmRM = mk_get_IR_rounding_mode();
12985 UChar ch = '?';
12986 IRTemp irrm = newTemp(Ity_I32);
12987 switch (ix) {
12988 case 1: ch = 'n'; assign(irrm, mkU32(Irrm_NEAREST)); break;
12989 case 2: ch = 'm'; assign(irrm, mkU32(Irrm_NegINF)); break;
12990 case 3: ch = 'p'; assign(irrm, mkU32(Irrm_PosINF)); break;
12991 case 4: ch = 'z'; assign(irrm, mkU32(Irrm_ZERO)); break;
12992 // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
12993 case 5: ch = 'a'; assign(irrm, mkU32(Irrm_NEAREST)); break;
12994 // FRINTX rounds per FPCR (like FRINTI) but is additionally defined to raise
12995 // the Inexact exception when the result differs from the operand ("integral exact"). That trap isn't modelled here, so FRINTX is treated the same as FRINTI.
12996 case 6: ch = 'x'; assign(irrm, mkexpr(irrmRM)); break;
12997 case 8: ch = 'i'; assign(irrm, mkexpr(irrmRM)); break;
12998 default: vassert(0);
13001 IROp opRND = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
13002 if (isD) {
13003 for (UInt i = 0; i < 2; i++) {
13004 putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
13005 getQRegLane(nn, i, Ity_F64)));
13007 } else {
13008 UInt n = bitQ==1 ? 4 : 2;
13009 for (UInt i = 0; i < n; i++) {
13010 putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
13011 getQRegLane(nn, i, Ity_F32)));
13013 if (bitQ == 0)
13014 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
13016 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13017 DIP("frint%c %s.%s, %s.%s\n", ch,
13018 nameQReg128(dd), arr, nameQReg128(nn), arr);
13019 return True;
13022 ix = 0; /*INVALID*/
13023 switch (opcode) {
13024 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
13025 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
13026 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
13027 default: break;
13029 if (ix > 0) {
13030 /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
13031 /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
13032 /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
13033 /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
13034 /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
13035 /* -------- 1,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
13036 /* -------- 1,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
13037 /* -------- 1,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
13038 /* -------- 1,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
13039 /* -------- 1,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
13040 Bool isD = (size & 1) == 1;
13041 if (bitQ == 0 && isD) return False; // implied 1d case
13043 IRRoundingMode irrm = 8; /*impossible*/
13044 HChar ch = '?';
13045 switch (ix) {
13046 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
13047 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
13048 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
13049 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
13050 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
13051 default: vassert(0);
13053 IROp cvt = Iop_INVALID;
13054 if (bitU == 1) {
13055 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
13056 } else {
13057 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
13059 if (isD) {
13060 for (UInt i = 0; i < 2; i++) {
13061 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
13062 getQRegLane(nn, i, Ity_F64)));
13064 } else {
13065 UInt n = bitQ==1 ? 4 : 2;
13066 for (UInt i = 0; i < n; i++) {
13067 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
13068 getQRegLane(nn, i, Ity_F32)));
13070 if (bitQ == 0)
13071 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
13073 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13074 DIP("fcvt%c%c %s.%s, %s.%s\n", ch, bitU == 1 ? 'u' : 's',
13075 nameQReg128(dd), arr, nameQReg128(nn), arr);
13076 return True;
13079 if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
13080 /* -------- 0,10,11100: URECPE 4s_4s, 2s_2s -------- */
13081 /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
13082 Bool isREC = bitU == 0;
13083 IROp op = isREC ? Iop_RecipEst32Ux4 : Iop_RSqrtEst32Ux4;
13084 IRTemp res = newTempV128();
13085 assign(res, unop(op, getQReg128(nn)));
13086 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13087 const HChar* nm = isREC ? "urecpe" : "ursqrte";
13088 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13089 DIP("%s %s.%s, %s.%s\n", nm,
13090 nameQReg128(dd), arr, nameQReg128(nn), arr);
13091 return True;
13094 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
13095 /* -------- 0,0x,11101: SCVTF -------- */
13096 /* -------- 1,0x,11101: UCVTF -------- */
13097 /* 31 28 22 21 15 9 4
13098 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn
13099 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn
13100 with laneage:
13101 case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
13103 Bool isQ = bitQ == 1;
13104 Bool isU = bitU == 1;
13105 Bool isF64 = (size & 1) == 1;
13106 if (isQ || !isF64) {
13107 IRType tyF = Ity_INVALID, tyI = Ity_INVALID;
13108 UInt nLanes = 0;
13109 Bool zeroHI = False;
13110 const HChar* arrSpec = NULL;
13111 Bool ok = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec,
13112 isQ, isF64 );
13113 IROp iop = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32)
13114 : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32);
13115 IRTemp rm = mk_get_IR_rounding_mode();
13116 UInt i;
13117 vassert(ok); /* the 'if' above should ensure this */
13118 for (i = 0; i < nLanes; i++) {
13119 putQRegLane(dd, i,
13120 binop(iop, mkexpr(rm), getQRegLane(nn, i, tyI)));
13122 if (zeroHI) {
13123 putQRegLane(dd, 1, mkU64(0));
13125 DIP("%ccvtf %s.%s, %s.%s\n", isU ? 'u' : 's',
13126 nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
13127 return True;
13129 /* else fall through */
13132 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
13133 /* -------- 0,1x,11101: FRECPE 2d_2d, 4s_4s, 2s_2s -------- */
13134 /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
13135 Bool isSQRT = bitU == 1;
13136 Bool isD = (size & 1) == 1;
13137 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
13138 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
13139 if (bitQ == 0 && isD) return False; // implied 1d case
13140 IRTemp resV = newTempV128();
13141 assign(resV, unop(op, getQReg128(nn)));
13142 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
13143 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13144 DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
13145 nameQReg128(dd), arr, nameQReg128(nn), arr);
13146 return True;
13149 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
13150 /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
13151 Bool isD = (size & 1) == 1;
13152 IROp op = isD ? Iop_Sqrt64Fx2 : Iop_Sqrt32Fx4;
13153 if (bitQ == 0 && isD) return False; // implied 1d case
13154 IRTemp resV = newTempV128();
13155 assign(resV, binop(op, mkexpr(mk_get_IR_rounding_mode()),
13156 getQReg128(nn)));
13157 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
13158 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13159 DIP("%s %s.%s, %s.%s\n", "fsqrt",
13160 nameQReg128(dd), arr, nameQReg128(nn), arr);
13161 return True;
13164 return False;
13165 # undef INSN
13166 }
13169 static
13170 Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
13171 {
13172 /* 31 28 23 21 20 19 15 11 9 4
13173 0 Q U 01111 size L M m opcode H 0 n d
13174 Decode fields are: u,size,opcode
13175 M is really part of the mm register number. Individual
13176 cases need to inspect L and H though.
13177 */
13178 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13179 if (INSN(31,31) != 0
13180 || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) !=0) {
13181 return False;
13182 }
13183 UInt bitQ = INSN(30,30);
13184 UInt bitU = INSN(29,29);
13185 UInt size = INSN(23,22);
13186 UInt bitL = INSN(21,21);
13187 UInt bitM = INSN(20,20);
13188 UInt mmLO4 = INSN(19,16);
13189 UInt opcode = INSN(15,12);
13190 UInt bitH = INSN(11,11);
13191 UInt nn = INSN(9,5);
13192 UInt dd = INSN(4,0);
13193 vassert(size < 4);
13194 vassert(bitH < 2 && bitM < 2 && bitL < 2);
13196 if (bitU == 0 && size >= X10
13197 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
13198 /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13199 /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13200 if (bitQ == 0 && size == X11) return False; // implied 1d case
13201 Bool isD = (size & 1) == 1;
13202 Bool isSUB = opcode == BITS4(0,1,0,1);
13203 UInt index;
13204 if (!isD) index = (bitH << 1) | bitL;
13205 else if (isD && bitL == 0) index = bitH;
13206 else return False; // sz:L == x11 => unallocated encoding
13207 vassert(index < (isD ? 2 : 4));
13208 IRType ity = isD ? Ity_F64 : Ity_F32;
13209 IRTemp elem = newTemp(ity);
13210 UInt mm = (bitM << 4) | mmLO4;
13211 assign(elem, getQRegLane(mm, index, ity));
13212 IRTemp dupd = math_DUP_TO_V128(elem, ity);
13213 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
13214 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
13215 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
13216 IRTemp rm = mk_get_IR_rounding_mode();
13217 IRTemp t1 = newTempV128();
13218 IRTemp t2 = newTempV128();
13219 // FIXME: double rounding; use FMA primops instead
13220 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
13221 assign(t2, triop(isSUB ? opSUB : opADD,
13222 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
13223 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
13224 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13225 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
13226 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
13227 isD ? 'd' : 's', index);
13228 return True;
13231 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
13232 /* -------- 0,1x,1001 FMUL 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13233 /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13234 if (bitQ == 0 && size == X11) return False; // implied 1d case
13235 Bool isD = (size & 1) == 1;
13236 Bool isMULX = bitU == 1;
13237 UInt index;
13238 if (!isD) index = (bitH << 1) | bitL;
13239 else if (isD && bitL == 0) index = bitH;
13240 else return False; // sz:L == x11 => unallocated encoding
13241 vassert(index < (isD ? 2 : 4));
13242 IRType ity = isD ? Ity_F64 : Ity_F32;
13243 IRTemp elem = newTemp(ity);
13244 UInt mm = (bitM << 4) | mmLO4;
13245 assign(elem, getQRegLane(mm, index, ity));
13246 IRTemp dupd = math_DUP_TO_V128(elem, ity);
13247 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
13248 IRTemp res = newTempV128();
13249 assign(res, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
13250 mkexpr(mk_get_IR_rounding_mode()),
13251 getQReg128(nn), mkexpr(dupd)));
13252 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13253 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13254 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
13255 isMULX ? "fmulx" : "fmul", nameQReg128(dd), arr,
13256 nameQReg128(nn), arr, nameQReg128(mm), isD ? 'd' : 's', index);
13257 return True;
13260 if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
13261 || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
13262 /* -------- 1,xx,0000 MLA s/h variants only -------- */
13263 /* -------- 1,xx,0100 MLS s/h variants only -------- */
13264 /* -------- 0,xx,1000 MUL s/h variants only -------- */
13265 Bool isMLA = opcode == BITS4(0,0,0,0);
13266 Bool isMLS = opcode == BITS4(0,1,0,0);
13267 UInt mm = 32; // invalid
13268 UInt ix = 16; // invalid
13269 switch (size) {
13270 case X00:
13271 return False; // b case is not allowed
13272 case X01:
13273 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13274 case X10:
13275 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13276 case X11:
13277 return False; // d case is not allowed
13278 default:
13279 vassert(0);
13281 vassert(mm < 32 && ix < 16);
13282 IROp opMUL = mkVecMUL(size);
13283 IROp opADD = mkVecADD(size);
13284 IROp opSUB = mkVecSUB(size);
13285 HChar ch = size == X01 ? 'h' : 's';
13286 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13287 IRTemp vecD = newTempV128();
13288 IRTemp vecN = newTempV128();
13289 IRTemp res = newTempV128();
13290 assign(vecD, getQReg128(dd));
13291 assign(vecN, getQReg128(nn));
13292 IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
13293 if (isMLA || isMLS) {
13294 assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
13295 } else {
13296 assign(res, prod);
13298 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13299 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13300 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
13301 : (isMLS ? "mls" : "mul"),
13302 nameQReg128(dd), arr,
13303 nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
13304 return True;
13307 if (opcode == BITS4(1,0,1,0)
13308 || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
13309 /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
13310 /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
13311 /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
13312 /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
13313 /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
13314 /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
13315 /* Widens, and size refers to the narrowed lanes. */
13316 UInt ks = 3;
13317 switch (opcode) {
13318 case BITS4(1,0,1,0): ks = 0; break;
13319 case BITS4(0,0,1,0): ks = 1; break;
13320 case BITS4(0,1,1,0): ks = 2; break;
13321 default: vassert(0);
13323 vassert(ks >= 0 && ks <= 2);
13324 Bool isU = bitU == 1;
13325 Bool is2 = bitQ == 1;
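/* is2 selects the "2" (second-half) form, which takes its narrow
   source elements from the upper 64 bits of the source vectors. */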
13326 UInt mm = 32; // invalid
13327 UInt ix = 16; // invalid
13328 switch (size) {
13329 case X00:
13330 return False; // h_b_b[] case is not allowed
13331 case X01:
13332 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13333 case X10:
13334 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13335 case X11:
13336 return False; // q_d_d[] case is not allowed
13337 default:
13338 vassert(0);
13340 vassert(mm < 32 && ix < 16);
13341 IRTemp vecN = newTempV128();
13342 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13343 IRTemp vecD = newTempV128();
13344 assign(vecN, getQReg128(nn));
13345 assign(vecD, getQReg128(dd));
13346 IRTemp res = IRTemp_INVALID;
13347 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
13348 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
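/* The "mas"[ks] character tells math_MULL_ACC whether to generate a
   plain widening multiply ('m'), a multiply-accumulate ('a') or a
   multiply-subtract ('s'); for ks == 0 there is no accumulator, hence
   the IRTemp_INVALID argument. */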
13349 putQReg128(dd, mkexpr(res));
13350 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
13351 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13352 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13353 HChar ch = size == X01 ? 'h' : 's';
13354 DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13355 isU ? 'u' : 's', nm, is2 ? "2" : "",
13356 nameQReg128(dd), arrWide,
13357 nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
13358 return True;
13361 if (bitU == 0
13362 && (opcode == BITS4(1,0,1,1)
13363 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
13364 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
13365 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
13366 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
13367 /* Widens, and size refers to the narrowed lanes. */
13368 UInt ks = 3;
13369 switch (opcode) {
13370 case BITS4(1,0,1,1): ks = 0; break;
13371 case BITS4(0,0,1,1): ks = 1; break;
13372 case BITS4(0,1,1,1): ks = 2; break;
13373 default: vassert(0);
13375 vassert(ks >= 0 && ks <= 2);
13376 Bool is2 = bitQ == 1;
13377 UInt mm = 32; // invalid
13378 UInt ix = 16; // invalid
13379 switch (size) {
13380 case X00:
13381 return False; // h_b_b[] case is not allowed
13382 case X01:
13383 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13384 case X10:
13385 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13386 case X11:
13387 return False; // q_d_d[] case is not allowed
13388 default:
13389 vassert(0);
13391 vassert(mm < 32 && ix < 16);
13392 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
13393 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
13394 newTempsV128_2(&vecN, &vecD);
13395 assign(vecN, getQReg128(nn));
13396 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13397 assign(vecD, getQReg128(dd));
13398 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
13399 is2, size, "mas"[ks],
13400 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
13401 putQReg128(dd, mkexpr(res));
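/* sat1q/sat1n (and possibly sat2q/sat2n) hold saturated vs
   unsaturated versions of the intermediate results; the
   updateQCFLAGwithDifference calls below set the sticky QC
   (cumulative saturation) flag whenever a pair differs, i.e. whenever
   saturation actually occurred. */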
13402 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
13403 updateQCFLAGwithDifference(sat1q, sat1n);
13404 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
13405 updateQCFLAGwithDifference(sat2q, sat2n);
13407 const HChar* nm = ks == 0 ? "sqdmull"
13408 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
13409 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13410 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13411 HChar ch = size == X01 ? 'h' : 's';
13412 DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13413 nm, is2 ? "2" : "",
13414 nameQReg128(dd), arrWide,
13415 nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
13416 return True;
13419 if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
13420 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
13421 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
13422 UInt mm = 32; // invalid
13423 UInt ix = 16; // invalid
13424 switch (size) {
13425 case X00:
13426 return False; // b case is not allowed
13427 case X01:
13428 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13429 case X10:
13430 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13431 case X11:
13432 return False; // q case is not allowed
13433 default:
13434 vassert(0);
13436 vassert(mm < 32 && ix < 16);
13437 Bool isR = opcode == BITS4(1,1,0,1);
13438 IRTemp res, sat1q, sat1n, vN, vM;
13439 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
13440 vN = newTempV128();
13441 assign(vN, getQReg128(nn));
13442 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13443 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
13444 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13445 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13446 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
13447 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
13448 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13449 HChar ch = size == X01 ? 'h' : 's';
13450 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
13451 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
13452 return True;
13455 if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
13456 /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
13457 /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
13458 UInt mm = 32; // invalid
13459 UInt ix = 16; // invalid
13460 switch (size) {
13461 case X00:
13462 return False; // b case is not allowed
13463 case X01: // h
13464 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13465 case X10: // s
13466 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13467 case X11:
13468 return False; // d case is not allowed
13469 default:
13470 vassert(0);
13472 vassert(mm < 32 && ix < 16);
13474 IRTemp res, res_nosat, vD, vN, vM;
13475 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
13476 newTempsV128_2(&vD, &vN);
13477 assign(vD, getQReg128(dd));
13478 assign(vN, getQReg128(nn));
13480 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13481 Bool isAdd = opcode == BITS4(1,1,0,1);
13482 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
13483 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13484 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
13485 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13487 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13488 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
13489 HChar ch = size == X01 ? 'h' : 's';
13490 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
13491 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
13492 return True;
13495 return False;
13496 # undef INSN
13500 static
13501 Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
13503 /* 31 23 21 16 11 9 4
13504 0100 1110 size 10100 opcode 10 n d
13505 Decode fields are: size,opcode
13506 Size is always 00 in ARMv8, it appears.
13508 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13509 if (INSN(31,24) != BITS8(0,1,0,0,1,1,1,0)
13510 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
13511 return False;
13513 UInt size = INSN(23,22);
13514 UInt opcode = INSN(16,12);
13515 UInt nn = INSN(9,5);
13516 UInt dd = INSN(4,0);
13518 if (size == BITS2(0,0)
13519 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,0,1))) {
13520 /* -------- 00,00100: AESE Vd.16b, Vn.16b -------- */
13521 /* -------- 00,00101: AESD Vd.16b, Vn.16b -------- */
13522 Bool isD = opcode == BITS5(0,0,1,0,1);
13523 IRTemp op1 = newTemp(Ity_V128);
13524 IRTemp op2 = newTemp(Ity_V128);
13525 IRTemp xord = newTemp(Ity_V128);
13526 IRTemp res = newTemp(Ity_V128);
13527 void* helper = isD ? &arm64g_dirtyhelper_AESD
13528 : &arm64g_dirtyhelper_AESE;
13529 const HChar* hname = isD ? "arm64g_dirtyhelper_AESD"
13530 : "arm64g_dirtyhelper_AESE";
13531 assign(op1, getQReg128(dd));
13532 assign(op2, getQReg128(nn));
13533 assign(xord, binop(Iop_XorV128, mkexpr(op1), mkexpr(op2)));
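/* AESE/AESD begin with an AddRoundKey step, which is just the XOR of
   the two operands computed above; the dirty helper then performs the
   remaining SubBytes/ShiftRows steps (or their inverses for AESD) on
   the result. */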
13534 IRDirty* di
13535 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13536 mkIRExprVec_3(
13537 IRExpr_VECRET(),
13538 unop(Iop_V128HIto64, mkexpr(xord)),
13539 unop(Iop_V128to64, mkexpr(xord)) ) );
13540 stmt(IRStmt_Dirty(di));
13541 putQReg128(dd, mkexpr(res));
13542 DIP("aes%c %s.16b, %s.16b\n", isD ? 'd' : 'e',
13543 nameQReg128(dd), nameQReg128(nn));
13544 return True;
13547 if (size == BITS2(0,0)
13548 && (opcode == BITS5(0,0,1,1,0) || opcode == BITS5(0,0,1,1,1))) {
13549 /* -------- 00,00110: AESMC Vd.16b, Vn.16b -------- */
13550 /* -------- 00,00111: AESIMC Vd.16b, Vn.16b -------- */
13551 Bool isI = opcode == BITS5(0,0,1,1,1);
13552 IRTemp src = newTemp(Ity_V128);
13553 IRTemp res = newTemp(Ity_V128);
13554 void* helper = isI ? &arm64g_dirtyhelper_AESIMC
13555 : &arm64g_dirtyhelper_AESMC;
13556 const HChar* hname = isI ? "arm64g_dirtyhelper_AESIMC"
13557 : "arm64g_dirtyhelper_AESMC";
13558 assign(src, getQReg128(nn));
13559 IRDirty* di
13560 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13561 mkIRExprVec_3(
13562 IRExpr_VECRET(),
13563 unop(Iop_V128HIto64, mkexpr(src)),
13564 unop(Iop_V128to64, mkexpr(src)) ) );
13565 stmt(IRStmt_Dirty(di));
13566 putQReg128(dd, mkexpr(res));
13567 DIP("aes%s %s.16b, %s.16b\n", isI ? "imc" : "mc",
13568 nameQReg128(dd), nameQReg128(nn));
13569 return True;
13572 return False;
13573 # undef INSN
13577 static
13578 Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
13580 /* 31 28 23 21 20 15 14 11 9 4
13581 0101 1110 sz 0 m 0 opc 00 n d
13582 Decode fields are: sz,opc
13584 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13585 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0) || INSN(21,21) != 0
13586 || INSN(15,15) != 0 || INSN(11,10) != BITS2(0,0)) {
13587 return False;
13589 UInt sz = INSN(23,22);
13590 UInt mm = INSN(20,16);
13591 UInt opc = INSN(14,12);
13592 UInt nn = INSN(9,5);
13593 UInt dd = INSN(4,0);
13594 if (sz == BITS2(0,0) && opc <= BITS3(1,1,0)) {
13595 /* -------- 00,000 SHA1C Qd, Sn, Vm.4S -------- */
13596 /* -------- 00,001 SHA1P Qd, Sn, Vm.4S -------- */
13597 /* -------- 00,010 SHA1M Qd, Sn, Vm.4S -------- */
13598 /* -------- 00,011 SHA1SU0 Vd.4S, Vn.4S, Vm.4S -------- */
13599 /* -------- 00,100 SHA256H Qd, Qn, Vm.4S -------- */
13600 /* -------- 00,101 SHA256H2 Qd, Qn, Vm.4S -------- */
13601 /* -------- 00,110 SHA256SU1 Vd.4S, Vn.4S, Vm.4S -------- */
13602 vassert(opc < 7);
13603 const HChar* inames[7]
13604 = { "sha1c", "sha1p", "sha1m", "sha1su0",
13605 "sha256h", "sha256h2", "sha256su1" };
13606 void(*helpers[7])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
13607 = { &arm64g_dirtyhelper_SHA1C, &arm64g_dirtyhelper_SHA1P,
13608 &arm64g_dirtyhelper_SHA1M, &arm64g_dirtyhelper_SHA1SU0,
13609 &arm64g_dirtyhelper_SHA256H, &arm64g_dirtyhelper_SHA256H2,
13610 &arm64g_dirtyhelper_SHA256SU1 };
13611 const HChar* hnames[7]
13612 = { "arm64g_dirtyhelper_SHA1C", "arm64g_dirtyhelper_SHA1P",
13613 "arm64g_dirtyhelper_SHA1M", "arm64g_dirtyhelper_SHA1SU0",
13614 "arm64g_dirtyhelper_SHA256H", "arm64g_dirtyhelper_SHA256H2",
13615 "arm64g_dirtyhelper_SHA256SU1" };
13616 IRTemp vD = newTemp(Ity_V128);
13617 IRTemp vN = newTemp(Ity_V128);
13618 IRTemp vM = newTemp(Ity_V128);
13619 IRTemp vDhi = newTemp(Ity_I64);
13620 IRTemp vDlo = newTemp(Ity_I64);
13621 IRTemp vNhiPre = newTemp(Ity_I64);
13622 IRTemp vNloPre = newTemp(Ity_I64);
13623 IRTemp vNhi = newTemp(Ity_I64);
13624 IRTemp vNlo = newTemp(Ity_I64);
13625 IRTemp vMhi = newTemp(Ity_I64);
13626 IRTemp vMlo = newTemp(Ity_I64);
13627 assign(vD, getQReg128(dd));
13628 assign(vN, getQReg128(nn));
13629 assign(vM, getQReg128(mm));
13630 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
13631 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
13632 assign(vNhiPre, unop(Iop_V128HIto64, mkexpr(vN)));
13633 assign(vNloPre, unop(Iop_V128to64, mkexpr(vN)));
13634 assign(vMhi, unop(Iop_V128HIto64, mkexpr(vM)));
13635 assign(vMlo, unop(Iop_V128to64, mkexpr(vM)));
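/* The 128-bit operands are handed to the helpers as hi/lo 64-bit
   halves, since dirty helpers only take integer-typed arguments, it
   appears; the vector result comes back through IRExpr_VECRET(). */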
13636 /* Mask off any bits of the N register operand that aren't actually
13637 needed, so that Memcheck doesn't complain unnecessarily. */
13638 switch (opc) {
13639 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13640 assign(vNhi, mkU64(0));
13641 assign(vNlo, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(vNloPre))));
13642 break;
13643 case BITS3(0,1,1): case BITS3(1,0,0):
13644 case BITS3(1,0,1): case BITS3(1,1,0):
13645 assign(vNhi, mkexpr(vNhiPre));
13646 assign(vNlo, mkexpr(vNloPre));
13647 break;
13648 default:
13649 vassert(0);
13651 IRTemp res = newTemp(Ity_V128);
13652 IRDirty* di
13653 = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
13654 mkIRExprVec_7(
13655 IRExpr_VECRET(),
13656 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
13657 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
13658 stmt(IRStmt_Dirty(di));
13659 putQReg128(dd, mkexpr(res));
13660 switch (opc) {
13661 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13662 DIP("%s q%u, s%u, v%u.4s\n", inames[opc], dd, nn, mm);
13663 break;
13664 case BITS3(0,1,1): case BITS3(1,1,0):
13665 DIP("%s v%u.4s, v%u.4s, v%u.4s\n", inames[opc], dd, nn, mm);
13666 break;
13667 case BITS3(1,0,0): case BITS3(1,0,1):
13668 DIP("%s q%u, q%u, v%u.4s\n", inames[opc], dd, nn, mm);
13669 break;
13670 default:
13671 vassert(0);
13673 return True;
13676 return False;
13677 # undef INSN
13681 static
13682 Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
13684 /* 31 28 23 21 16 11 9 4
13685 0101 1110 sz 10100 opc 10 n d
13686 Decode fields are: sz,opc
13688 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13689 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0)
13690 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
13691 return False;
13693 UInt sz = INSN(23,22);
13694 UInt opc = INSN(16,12);
13695 UInt nn = INSN(9,5);
13696 UInt dd = INSN(4,0);
13697 if (sz == BITS2(0,0) && opc <= BITS5(0,0,0,1,0)) {
13698 /* -------- 00,00000 SHA1H Sd, Sn -------- */
13699 /* -------- 00,00001 SHA1SU1 Vd.4S, Vn.4S -------- */
13700 /* -------- 00,00010 SHA256SU0 Vd.4S, Vn.4S -------- */
13701 vassert(opc < 3);
13702 const HChar* inames[3] = { "sha1h", "sha1su1", "sha256su0" };
13703 IRTemp vD = newTemp(Ity_V128);
13704 IRTemp vN = newTemp(Ity_V128);
13705 IRTemp vDhi = newTemp(Ity_I64);
13706 IRTemp vDlo = newTemp(Ity_I64);
13707 IRTemp vNhi = newTemp(Ity_I64);
13708 IRTemp vNlo = newTemp(Ity_I64);
13709 assign(vD, getQReg128(dd));
13710 assign(vN, getQReg128(nn));
13711 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
13712 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
13713 assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
13714 assign(vNlo, unop(Iop_V128to64, mkexpr(vN)));
13715 /* Mask off any bits of the N register operand that aren't actually
13716 needed, so that Memcheck doesn't complain unnecessarily. Also
13717 construct the calls, given that the helper functions don't take
13718 the same number of arguments. */
13719 IRDirty* di = NULL;
13720 IRTemp res = newTemp(Ity_V128);
13721 switch (opc) {
13722 case BITS5(0,0,0,0,0): {
13723 IRExpr* vNloMasked = unop(Iop_32Uto64,
13724 unop(Iop_64to32, mkexpr(vNlo)));
13725 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13726 "arm64g_dirtyhelper_SHA1H",
13727 &arm64g_dirtyhelper_SHA1H,
13728 mkIRExprVec_3(
13729 IRExpr_VECRET(),
13730 mkU64(0), vNloMasked) );
13731 break;
13733 case BITS5(0,0,0,0,1):
13734 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13735 "arm64g_dirtyhelper_SHA1SU1",
13736 &arm64g_dirtyhelper_SHA1SU1,
13737 mkIRExprVec_5(
13738 IRExpr_VECRET(),
13739 mkexpr(vDhi), mkexpr(vDlo),
13740 mkexpr(vNhi), mkexpr(vNlo)) );
13741 break;
13742 case BITS5(0,0,0,1,0):
13743 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13744 "arm64g_dirtyhelper_SHA256SU0",
13745 &arm64g_dirtyhelper_SHA256SU0,
13746 mkIRExprVec_5(
13747 IRExpr_VECRET(),
13748 mkexpr(vDhi), mkexpr(vDlo),
13749 mkexpr(vNhi), mkexpr(vNlo)) );
13750 break;
13751 default:
13752 vassert(0);
13754 stmt(IRStmt_Dirty(di));
13755 putQReg128(dd, mkexpr(res));
13756 switch (opc) {
13757 case BITS5(0,0,0,0,0):
13758 DIP("%s s%u, s%u\n", inames[opc], dd, nn);
13759 break;
13760 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,0):
13761 DIP("%s v%u.4s, v%u.4s\n", inames[opc], dd, nn);
13762 break;
13763 default:
13764 vassert(0);
13766 return True;
13769 return False;
13770 # undef INSN
13774 static
13775 Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
13777 /* 31 28 23 21 20 15 13 9 4
13778 000 11110 ty 1 m op 1000 n opcode2
13779 The first 3 bits are really "M 0 S", but M and S are always zero.
13780 Decode fields are: ty,op,opcode2
13782 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13783 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13784 || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
13785 return False;
13787 UInt ty = INSN(23,22);
13788 UInt mm = INSN(20,16);
13789 UInt op = INSN(15,14);
13790 UInt nn = INSN(9,5);
13791 UInt opcode2 = INSN(4,0);
13792 vassert(ty < 4);
13794 if (ty <= X01 && op == X00
13795 && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
13796 /* -------- 0x,00,00000 FCMP d_d, s_s -------- */
13797 /* -------- 0x,00,01000 FCMP d_#0, s_#0 -------- */
13798 /* -------- 0x,00,10000 FCMPE d_d, s_s -------- */
13799 /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
13800 /* 31 23 20 15 9 4
13801 000 11110 01 1 m 00 1000 n 10 000 FCMPE Dn, Dm
13802 000 11110 01 1 00000 00 1000 n 11 000 FCMPE Dn, #0.0
13803 000 11110 01 1 m 00 1000 n 00 000 FCMP Dn, Dm
13804 000 11110 01 1 00000 00 1000 n 01 000 FCMP Dn, #0.0
13806 000 11110 00 1 m 00 1000 n 10 000 FCMPE Sn, Sm
13807 000 11110 00 1 00000 00 1000 n 11 000 FCMPE Sn, #0.0
13808 000 11110 00 1 m 00 1000 n 00 000 FCMP Sn, Sm
13809 000 11110 00 1 00000 00 1000 n 01 000 FCMP Sn, #0.0
13811 FCMPE generates Invalid Operation exn if either arg is any kind
13812 of NaN. FCMP generates Invalid Operation exn if either arg is a
13813 signalling NaN. We ignore this detail here and produce the same
13814 IR for both.
13816 Bool isD = (ty & 1) == 1;
13817 Bool isCMPE = (opcode2 & 16) == 16;
13818 Bool cmpZero = (opcode2 & 8) == 8;
13819 IRType ity = isD ? Ity_F64 : Ity_F32;
13820 Bool valid = True;
13821 if (cmpZero && mm != 0) valid = False;
13822 if (valid) {
13823 IRTemp argL = newTemp(ity);
13824 IRTemp argR = newTemp(ity);
13825 IRTemp irRes = newTemp(Ity_I32);
13826 assign(argL, getQRegLO(nn, ity));
13827 assign(argR,
13828 cmpZero
13829 ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
13830 : getQRegLO(mm, ity));
13831 assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
13832 mkexpr(argL), mkexpr(argR)));
13833 IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
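/* The comparison result is converted to an ARM NZCV nibble and then
   shifted into bits 31:28, which is where setFlags_COPY expects the
   flags to live in the 64-bit value it is given. */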
13834 IRTemp nzcv_28x0 = newTemp(Ity_I64);
13835 assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
13836 setFlags_COPY(nzcv_28x0);
13837 DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
13838 cmpZero ? "#0.0" : nameQRegLO(mm, ity));
13839 return True;
13841 return False;
13844 return False;
13845 # undef INSN
13849 static
13850 Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn)
13852 /* 31 28 23 21 20 15 11 9 4 3
13853 000 11110 ty 1 m cond 01 n op nzcv
13854 The first 3 bits are really "M 0 S", but M and S are always zero.
13855 Decode fields are: ty,op
13857 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13858 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13859 || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
13860 return False;
13862 UInt ty = INSN(23,22);
13863 UInt mm = INSN(20,16);
13864 UInt cond = INSN(15,12);
13865 UInt nn = INSN(9,5);
13866 UInt op = INSN(4,4);
13867 UInt nzcv = INSN(3,0);
13868 vassert(ty < 4 && op <= 1);
13870 if (ty <= BITS2(0,1)) {
13871 /* -------- 00,0 FCCMP s_s -------- */
13872 /* -------- 00,1 FCCMPE s_s -------- */
13873 /* -------- 01,0 FCCMP d_d -------- */
13874 /* -------- 01,1 FCCMPE d_d -------- */
13876 /* FCCMPE generates Invalid Operation exn if either arg is any kind
13877 of NaN. FCCMP generates Invalid Operation exn if either arg is a
13878 signalling NaN. We ignore this detail here and produce the same
13879 IR for both.
13881 Bool isD = (ty & 1) == 1;
13882 Bool isCMPE = op == 1;
13883 IRType ity = isD ? Ity_F64 : Ity_F32;
13884 IRTemp argL = newTemp(ity);
13885 IRTemp argR = newTemp(ity);
13886 IRTemp irRes = newTemp(Ity_I32);
13887 assign(argL, getQRegLO(nn, ity));
13888 assign(argR, getQRegLO(mm, ity));
13889 assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
13890 mkexpr(argL), mkexpr(argR)));
13891 IRTemp condT = newTemp(Ity_I1);
13892 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
13893 IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);
13895 IRTemp nzcvT_28x0 = newTemp(Ity_I64);
13896 assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));
13898 IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);
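/* Per the architecture: if the condition holds, the flags come from
   the comparison just done; otherwise they are set directly from the
   immediate nzcv field. */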
13900 IRTemp nzcv_28x0 = newTemp(Ity_I64);
13901 assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
13902 mkexpr(nzcvT_28x0), nzcvF_28x0));
13903 setFlags_COPY(nzcv_28x0);
13904 DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
13905 nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
13906 return True;
13909 return False;
13910 # undef INSN
13914 static
13915 Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
13917 /* 31 23 21 20 15 11 9 4
13918 000 11110 ty 1 m cond 11 n d
13919 The first 3 bits are really "M 0 S", but M and S are always zero.
13920 Decode fields: ty
13922 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13923 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
13924 || INSN(11,10) != BITS2(1,1)) {
13925 return False;
13927 UInt ty = INSN(23,22);
13928 UInt mm = INSN(20,16);
13929 UInt cond = INSN(15,12);
13930 UInt nn = INSN(9,5);
13931 UInt dd = INSN(4,0);
13932 if (ty <= X01) {
13933 /* -------- 00: FCSEL s_s -------- */
13934 /* -------- 01: FCSEL d_d -------- */
13935 IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
13936 IRTemp srcT = newTemp(ity);
13937 IRTemp srcF = newTemp(ity);
13938 IRTemp res = newTemp(ity);
13939 assign(srcT, getQRegLO(nn, ity));
13940 assign(srcF, getQRegLO(mm, ity));
13941 assign(res, IRExpr_ITE(
13942 unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
13943 mkexpr(srcT), mkexpr(srcF)));
13944 putQReg128(dd, mkV128(0x0000));
13945 putQRegLO(dd, mkexpr(res));
13946 DIP("fcsel %s, %s, %s, %s\n",
13947 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
13948 nameCC(cond));
13949 return True;
13951 return False;
13952 # undef INSN
13956 static
13957 Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
13959 /* 31 28 23 21 20 14 9 4
13960 000 11110 ty 1 opcode 10000 n d
13961 The first 3 bits are really "M 0 S", but M and S are always zero.
13962 Decode fields: ty,opcode
13964 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13965 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13966 || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
13967 return False;
13969 UInt ty = INSN(23,22);
13970 UInt opcode = INSN(20,15);
13971 UInt nn = INSN(9,5);
13972 UInt dd = INSN(4,0);
13974 if (ty <= X01 && opcode <= BITS6(0,0,0,0,1,1)) {
13975 /* -------- 0x,000000: FMOV d_d, s_s -------- */
13976 /* -------- 0x,000001: FABS d_d, s_s -------- */
13977 /* -------- 0x,000010: FNEG d_d, s_s -------- */
13978 /* -------- 0x,000011: FSQRT d_d, s_s -------- */
13979 IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
13980 IRTemp src = newTemp(ity);
13981 IRTemp res = newTemp(ity);
13982 const HChar* nm = "??";
13983 assign(src, getQRegLO(nn, ity));
13984 switch (opcode) {
13985 case BITS6(0,0,0,0,0,0):
13986 nm = "fmov"; assign(res, mkexpr(src)); break;
13987 case BITS6(0,0,0,0,0,1):
13988 nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
13989 case BITS6(0,0,0,0,1,0):
13990 nm = "fabs"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
13991 case BITS6(0,0,0,0,1,1):
13992 nm = "fsqrt";
13993 assign(res, binop(mkSQRTF(ity),
13994 mkexpr(mk_get_IR_rounding_mode()),
13995 mkexpr(src))); break;
13996 default:
13997 vassert(0);
13999 putQReg128(dd, mkV128(0x0000));
14000 putQRegLO(dd, mkexpr(res));
14001 DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
14002 return True;
14005 if ( (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
14006 || opcode == BITS6(0,0,0,1,0,1)))
14007 || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
14008 || opcode == BITS6(0,0,0,1,0,1)))
14009 || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
14010 || opcode == BITS6(0,0,0,1,0,0)))) {
14011 /* -------- 11,000100: FCVT s_h -------- */
14012 /* -------- 11,000101: FCVT d_h -------- */
14013 /* -------- 00,000111: FCVT h_s -------- */
14014 /* -------- 00,000101: FCVT d_s -------- */
14015 /* -------- 01,000111: FCVT h_d -------- */
14016 /* -------- 01,000100: FCVT s_d -------- */
14017 /* 31 23 21 16 14 9 4
14018 000 11110 11 10001 00 10000 n d FCVT Sd, Hn
14019 --------- 11 ----- 01 --------- FCVT Dd, Hn
14020 --------- 00 ----- 11 --------- FCVT Hd, Sn
14021 --------- 00 ----- 01 --------- FCVT Dd, Sn
14022 --------- 01 ----- 11 --------- FCVT Hd, Dn
14023 --------- 01 ----- 00 --------- FCVT Sd, Dn
14024 Rounding, when dst is smaller than src, is per the FPCR.
14026 UInt b2322 = ty;
14027 UInt b1615 = opcode & BITS2(1,1);
14028 switch ((b2322 << 2) | b1615) {
14029 case BITS4(0,0,0,1): // S -> D
14030 case BITS4(1,1,0,1): { // H -> D
14031 Bool srcIsH = b2322 == BITS2(1,1);
14032 IRType srcTy = srcIsH ? Ity_F16 : Ity_F32;
14033 IRTemp res = newTemp(Ity_F64);
14034 assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
14035 getQRegLO(nn, srcTy)));
14036 putQReg128(dd, mkV128(0x0000));
14037 putQRegLO(dd, mkexpr(res));
14038 DIP("fcvt %s, %s\n",
14039 nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
14040 return True;
14042 case BITS4(0,1,0,0): // D -> S
14043 case BITS4(0,1,1,1): { // D -> H
14044 Bool dstIsH = b1615 == BITS2(1,1);
14045 IRType dstTy = dstIsH ? Ity_F16 : Ity_F32;
14046 IRTemp res = newTemp(dstTy);
14047 assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
14048 mkexpr(mk_get_IR_rounding_mode()),
14049 getQRegLO(nn, Ity_F64)));
14050 putQReg128(dd, mkV128(0x0000));
14051 putQRegLO(dd, mkexpr(res));
14052 DIP("fcvt %s, %s\n",
14053 nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
14054 return True;
14056 case BITS4(0,0,1,1): // S -> H
14057 case BITS4(1,1,0,0): { // H -> S
14058 Bool toH = b1615 == BITS2(1,1);
14059 IRType srcTy = toH ? Ity_F32 : Ity_F16;
14060 IRType dstTy = toH ? Ity_F16 : Ity_F32;
14061 IRTemp res = newTemp(dstTy);
14062 if (toH) {
14063 assign(res, binop(Iop_F32toF16,
14064 mkexpr(mk_get_IR_rounding_mode()),
14065 getQRegLO(nn, srcTy)));
14067 } else {
14068 assign(res, unop(Iop_F16toF32,
14069 getQRegLO(nn, srcTy)));
14071 putQReg128(dd, mkV128(0x0000));
14072 putQRegLO(dd, mkexpr(res));
14073 DIP("fcvt %s, %s\n",
14074 nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
14075 return True;
14077 default:
14078 break;
14080 /* else unhandled */
14081 return False;
14084 if (ty <= X01
14085 && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
14086 && opcode != BITS6(0,0,1,1,0,1)) {
14087 /* -------- 0x,001000 FRINTN d_d, s_s -------- */
14088 /* -------- 0x,001001 FRINTP d_d, s_s -------- */
14089 /* -------- 0x,001010 FRINTM d_d, s_s -------- */
14090 /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
14091 /* -------- 0x,001100 FRINTA d_d, s_s -------- */
14092 /* -------- 0x,001110 FRINTX d_d, s_s -------- */
14093 /* -------- 0x,001111 FRINTI d_d, s_s -------- */
14094 /* 31 23 21 17 14 9 4
14095 000 11110 0x 1001 111 10000 n d FRINTI Fd, Fm (round per FPCR)
14097 x==0 => S-registers, x==1 => D-registers
14098 rm (17:15) encodings:
14099 111 per FPCR (FRINTI)
14100 001 +inf (FRINTP)
14101 010 -inf (FRINTM)
14102 011 zero (FRINTZ)
14103 000 tieeven (FRINTN) -- !! FIXME KLUDGED !!
14104 100 tieaway (FRINTA) -- !! FIXME KLUDGED !!
14105 110 per FPCR + "exact = TRUE" (FRINTX)
14106 101 unallocated
14108 Bool isD = (ty & 1) == 1;
14109 UInt rm = opcode & BITS6(0,0,0,1,1,1);
14110 IRType ity = isD ? Ity_F64 : Ity_F32;
14111 IRExpr* irrmE = NULL;
14112 UChar ch = '?';
14113 switch (rm) {
14114 case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
14115 case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
14116 case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
14117 // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
14118 case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
14119 // I am unsure about the following, due to the "integral exact"
14120 // description in the manual. What does it mean? (frintx, that is)
14121 case BITS3(1,1,0):
14122 ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
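// (FRINTX is architecturally like FRINTI except that it also raises
//  the Inexact exception when the result differs from the operand;
//  since FP exceptions aren't modelled here, the two come out the
//  same.)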
14123 case BITS3(1,1,1):
14124 ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
14125 // The following is a kludge. There's no Irrm_ value to represent
14126 // this ("to nearest, with ties to even")
14127 case BITS3(0,0,0): ch = 'n'; irrmE = mkU32(Irrm_NEAREST); break;
14128 default: break;
14130 if (irrmE) {
14131 IRTemp src = newTemp(ity);
14132 IRTemp dst = newTemp(ity);
14133 assign(src, getQRegLO(nn, ity));
14134 assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
14135 irrmE, mkexpr(src)));
14136 putQReg128(dd, mkV128(0x0000));
14137 putQRegLO(dd, mkexpr(dst));
14138 DIP("frint%c %s, %s\n",
14139 ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
14140 return True;
14142 return False;
14145 return False;
14146 # undef INSN
14150 static
14151 Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn)
14153 /* 31 28 23 21 20 15 11 9 4
14154 000 11110 ty 1 m opcode 10 n d
14155 The first 3 bits are really "M 0 S", but M and S are always zero.
14156 Decode fields: ty, opcode
14158 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14159 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
14160 || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
14161 return False;
14163 UInt ty = INSN(23,22);
14164 UInt mm = INSN(20,16);
14165 UInt opcode = INSN(15,12);
14166 UInt nn = INSN(9,5);
14167 UInt dd = INSN(4,0);
14169 if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
14170 /* ------- 0x,0000: FMUL d_d, s_s ------- */
14171 /* ------- 0x,0001: FDIV d_d, s_s ------- */
14172 /* ------- 0x,0010: FADD d_d, s_s ------- */
14173 /* ------- 0x,0011: FSUB d_d, s_s ------- */
14174 /* ------- 0x,0100: FMAX d_d, s_s ------- */
14175 /* ------- 0x,0101: FMIN d_d, s_s ------- */
14176 /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
14177 /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
14178 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
14179 IROp iop = Iop_INVALID;
14180 const HChar* nm = "???";
14181 switch (opcode) {
14182 case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
14183 case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
14184 case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
14185 case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
14186 case BITS4(0,1,0,0): nm = "fmax"; iop = mkVecMAXF(ty+2); break;
14187 case BITS4(0,1,0,1): nm = "fmin"; iop = mkVecMINF(ty+2); break;
14188 case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
14189 case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
14190 default: vassert(0);
14192 if (opcode <= BITS4(0,0,1,1)) {
14193 // This is really not good code. TODO: avoid width-changing
14194 IRTemp res = newTemp(ity);
14195 assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
14196 getQRegLO(nn, ity), getQRegLO(mm, ity)));
14197 putQReg128(dd, mkV128(0));
14198 putQRegLO(dd, mkexpr(res));
14199 } else {
14200 putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
14201 binop(iop, getQReg128(nn), getQReg128(mm))));
14203 DIP("%s %s, %s, %s\n",
14204 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
14205 return True;
14208 if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
14209 /* ------- 0x,1000: FNMUL d_d, s_s ------- */
14210 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
14211 IROp iop = mkMULF(ity);
14212 IROp iopn = mkNEGF(ity);
14213 const HChar* nm = "fnmul";
14214 IRExpr* resE = unop(iopn,
14215 triop(iop, mkexpr(mk_get_IR_rounding_mode()),
14216 getQRegLO(nn, ity), getQRegLO(mm, ity)));
14217 IRTemp res = newTemp(ity);
14218 assign(res, resE);
14219 putQReg128(dd, mkV128(0));
14220 putQRegLO(dd, mkexpr(res));
14221 DIP("%s %s, %s, %s\n",
14222 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
14223 return True;
14226 return False;
14227 # undef INSN
14231 static
14232 Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
14234 /* 31 28 23 21 20 15 14 9 4
14235 000 11111 ty o1 m o0 a n d
14236 The first 3 bits are really "M 0 S", but M and S are always zero.
14237 Decode fields: ty,o1,o0
14239 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14240 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
14241 return False;
14243 UInt ty = INSN(23,22);
14244 UInt bitO1 = INSN(21,21);
14245 UInt mm = INSN(20,16);
14246 UInt bitO0 = INSN(15,15);
14247 UInt aa = INSN(14,10);
14248 UInt nn = INSN(9,5);
14249 UInt dd = INSN(4,0);
14250 vassert(ty < 4);
14252 if (ty <= X01) {
14253 /* -------- 0x,0,0 FMADD d_d_d_d, s_s_s_s -------- */
14254 /* -------- 0x,0,1 FMSUB d_d_d_d, s_s_s_s -------- */
14255 /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
14256 /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
14257 /* -------------------- F{N}M{ADD,SUB} -------------------- */
14258 /* 31 22 20 15 14 9 4 ix
14259 000 11111 0 sz 0 m 0 a n d 0 FMADD Fd,Fn,Fm,Fa
14260 000 11111 0 sz 0 m 1 a n d 1 FMSUB Fd,Fn,Fm,Fa
14261 000 11111 0 sz 1 m 0 a n d 2 FNMADD Fd,Fn,Fm,Fa
14262 000 11111 0 sz 1 m 1 a n d 3 FNMSUB Fd,Fn,Fm,Fa
14263 where Fx=Dx when sz=1, Fx=Sx when sz=0
14265 -----SPEC------ ----IMPL----
14266 fmadd a + n * m a + n * m
14267 fmsub a + (-n) * m a - n * m
14268 fnmadd (-a) + (-n) * m -(a + n * m)
14269 fnmsub (-a) + n * m -(a - n * m)
14271 Bool isD = (ty & 1) == 1;
14272 UInt ix = (bitO1 << 1) | bitO0;
14273 IRType ity = isD ? Ity_F64 : Ity_F32;
14274 IROp opADD = mkADDF(ity);
14275 IROp opSUB = mkSUBF(ity);
14276 IROp opMUL = mkMULF(ity);
14277 IROp opNEG = mkNEGF(ity);
14278 IRTemp res = newTemp(ity);
14279 IRExpr* eA = getQRegLO(aa, ity);
14280 IRExpr* eN = getQRegLO(nn, ity);
14281 IRExpr* eM = getQRegLO(mm, ity);
14282 IRExpr* rm = mkexpr(mk_get_IR_rounding_mode());
14283 IRExpr* eNxM = triop(opMUL, rm, eN, eM);
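/* Note eNxM is rounded separately before the add/sub, so this is not
   a true fused multiply-add and can differ from hardware in the last
   mantissa bit. */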
14284 switch (ix) {
14285 case 0: assign(res, triop(opADD, rm, eA, eNxM)); break;
14286 case 1: assign(res, triop(opSUB, rm, eA, eNxM)); break;
14287 case 2: assign(res, unop(opNEG, triop(opADD, rm, eA, eNxM))); break;
14288 case 3: assign(res, unop(opNEG, triop(opSUB, rm, eA, eNxM))); break;
14289 default: vassert(0);
14291 putQReg128(dd, mkV128(0x0000));
14292 putQRegLO(dd, mkexpr(res));
14293 const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
14294 DIP("%s %s, %s, %s, %s\n",
14295 names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
14296 nameQRegLO(mm, ity), nameQRegLO(aa, ity));
14297 return True;
14300 return False;
14301 # undef INSN
14305 static
14306 Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
14308 /* 31 28 23 21 20 12 9 4
14309 000 11110 ty 1 imm8 100 imm5 d
14310 The first 3 bits are really "M 0 S", but M and S are always zero.
14312 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14313 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
14314 || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
14315 return False;
14317 UInt ty = INSN(23,22);
14318 UInt imm8 = INSN(20,13);
14319 UInt imm5 = INSN(9,5);
14320 UInt dd = INSN(4,0);
14322 /* ------- 00,00000: FMOV s_imm ------- */
14323 /* ------- 01,00000: FMOV d_imm ------- */
14324 if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
14325 Bool isD = (ty & 1) == 1;
14326 ULong imm = VFPExpandImm(imm8, isD ? 64 : 32);
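/* VFPExpandImm expands imm8 per the ARM ARM pseudocode: the result is
   +/- (16..31)/16 * 2^r with r in -3..4, so only 256 distinct FP
   constants are encodable this way. */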
14327 if (!isD) {
14328 vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
14330 putQReg128(dd, mkV128(0));
14331 putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
14332 DIP("fmov %s, #0x%llx\n",
14333 nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
14334 return True;
14337 return False;
14338 # undef INSN
14342 static
14343 Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
14346 /* 31 30 29 28 23 21 20 18 15 9 4
14347 sf 0 0 11110 type 0 rmode opcode scale n d
14348 The first 3 bits are really "sf 0 S", but S is always zero.
14349 Decode fields: sf,type,rmode,opcode
14351 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14352 if (INSN(30,29) != BITS2(0,0)
14353 || INSN(28,24) != BITS5(1,1,1,1,0)
14354 || INSN(21,21) != 0) {
14355 return False;
14357 UInt bitSF = INSN(31,31);
14358 UInt ty = INSN(23,22); // type
14359 UInt rm = INSN(20,19); // rmode
14360 UInt op = INSN(18,16); // opcode
14361 UInt sc = INSN(15,10); // scale
14362 UInt nn = INSN(9,5);
14363 UInt dd = INSN(4,0);
14365 if (ty <= X01 && rm == X11
14366 && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
14367 /* -------- (ix) sf ty rm opc -------- */
14368 /* -------- 0 0 00 11 000: FCVTZS w_s_#fbits -------- */
14369 /* -------- 1 0 01 11 000: FCVTZS w_d_#fbits -------- */
14370 /* -------- 2 1 00 11 000: FCVTZS x_s_#fbits -------- */
14371 /* -------- 3 1 01 11 000: FCVTZS x_d_#fbits -------- */
14373 /* -------- 4 0 00 11 001: FCVTZU w_s_#fbits -------- */
14374 /* -------- 5 0 01 11 001: FCVTZU w_d_#fbits -------- */
14375 /* -------- 6 1 00 11 001: FCVTZU x_s_#fbits -------- */
14376 /* -------- 7 1 01 11 001: FCVTZU x_d_#fbits -------- */
14377 Bool isI64 = bitSF == 1;
14378 Bool isF64 = (ty & 1) == 1;
14379 Bool isU = (op & 1) == 1;
14380 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14382 Int fbits = 64 - sc;
14383 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
14385 Double scale = two_to_the_plus(fbits);
14386 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14387 : IRExpr_Const(IRConst_F32( (Float)scale ));
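/* So, for example, FCVTZS Wd, Sn, #4 is implemented here as
   truncate-toward-zero(Sn * 16.0): the operand is multiplied by
   2^fbits and the product converted to integer with Irrm_ZERO. */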
14388 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
14390 const IROp ops[8]
14391 = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
14392 Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
14393 IRTemp irrm = newTemp(Ity_I32);
14394 assign(irrm, mkU32(Irrm_ZERO));
14396 IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
14397 IRExpr* res = binop(ops[ix], mkexpr(irrm),
14398 triop(opMUL, mkexpr(irrm), src, scaleE));
14399 putIRegOrZR(isI64, dd, res);
14401 DIP("fcvtz%c %s, %s, #%d\n",
14402 isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
14403 nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
14404 return True;
14407 /* ------ sf,ty,rm,opc ------ */
14408 /* ------ x,0x,00,010 SCVTF s/d, w/x, #fbits ------ */
14409 /* ------ x,0x,00,011 UCVTF s/d, w/x, #fbits ------ */
14410 /* (ix) sf S 28 ty rm opc 15 9 4
14411 0 0 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Wn, #fbits
14412 1 0 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Wn, #fbits
14413 2 1 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Xn, #fbits
14414 3 1 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Xn, #fbits
14416 4 0 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Wn, #fbits
14417 5 0 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Wn, #fbits
14418 6 1 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Xn, #fbits
14419 7 1 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Xn, #fbits
14421 These are signed/unsigned conversion from integer registers to
14422 FP registers, all 4 32/64-bit combinations, rounded per FPCR,
14423 scaled per |scale|.
14425 if (ty <= X01 && rm == X00
14426 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
14427 && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
14428 Bool isI64 = bitSF == 1;
14429 Bool isF64 = (ty & 1) == 1;
14430 Bool isU = (op & 1) == 1;
14431 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14433 Int fbits = 64 - sc;
14434 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
14436 Double scale = two_to_the_minus(fbits);
14437 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14438 : IRExpr_Const(IRConst_F32( (Float)scale ));
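/* For example SCVTF Sd, Wn, #8 becomes (float)Wn * (1/256.0): the
   integer is converted to FP (rounded per FPCR) and then scaled by
   2^-fbits. */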
14439 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
14441 const IROp ops[8]
14442 = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
14443 Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
14444 IRExpr* src = getIRegOrZR(isI64, nn);
14445 IRExpr* res = (isF64 && !isI64)
14446 ? unop(ops[ix], src)
14447 : binop(ops[ix],
14448 mkexpr(mk_get_IR_rounding_mode()), src);
14449 putQReg128(dd, mkV128(0));
14450 putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));
14452 DIP("%ccvtf %s, %s, #%d\n",
14453 isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
14454 nameIRegOrZR(isI64, nn), fbits);
14455 return True;
14458 return False;
14459 # undef INSN
14463 static
14464 Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
14466 /* 31 30 29 28 23 21 20 18 15 9 4
14467 sf 0 0 11110 type 1 rmode opcode 000000 n d
14468 The first 3 bits are really "sf 0 S", but S is always zero.
14469 Decode fields: sf,type,rmode,opcode
14471 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14472 if (INSN(30,29) != BITS2(0,0)
14473 || INSN(28,24) != BITS5(1,1,1,1,0)
14474 || INSN(21,21) != 1
14475 || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
14476 return False;
14478 UInt bitSF = INSN(31,31);
14479 UInt ty = INSN(23,22); // type
14480 UInt rm = INSN(20,19); // rmode
14481 UInt op = INSN(18,16); // opcode
14482 UInt nn = INSN(9,5);
14483 UInt dd = INSN(4,0);
14485 // op = 000, 001
14486 /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
14487 /* 30 23 20 18 15 9 4
14488 sf 00 11110 0x 1 00 000 000000 n d FCVTNS Rd, Fn (round to
14489 sf 00 11110 0x 1 00 001 000000 n d FCVTNU Rd, Fn nearest)
14490 ---------------- 01 -------------- FCVTP-------- (round to +inf)
14491 ---------------- 10 -------------- FCVTM-------- (round to -inf)
14492 ---------------- 11 -------------- FCVTZ-------- (round to zero)
14493 ---------------- 00 100 ---------- FCVTAS------- (nearest, ties away)
14494 ---------------- 00 101 ---------- FCVTAU------- (nearest, ties away)
14496 Rd is Xd when sf==1, Wd when sf==0
14497 Fn is Dn when x==1, Sn when x==0
14498 20:19 carry the rounding mode, using the same encoding as FPCR
14500 if (ty <= X01
14501 && ( ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
14502 || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
14505 Bool isI64 = bitSF == 1;
14506 Bool isF64 = (ty & 1) == 1;
14507 Bool isU = (op & 1) == 1;
14508 /* Decide on the IR rounding mode to use. */
14509 IRRoundingMode irrm = 8; /*impossible*/
14510 HChar ch = '?';
14511 if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
14512 switch (rm) {
14513 case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
14514 case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
14515 case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
14516 case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
14517 default: vassert(0);
14519 } else {
14520 vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
14521 switch (rm) {
14522 case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
14523 default: vassert(0);
14526 vassert(irrm != 8);
14527 /* Decide on the conversion primop, based on the source size,
14528 dest size and signedness (8 possibilities). Case coding:
14529 F32 ->s I32 0
14530 F32 ->u I32 1
14531 F32 ->s I64 2
14532 F32 ->u I64 3
14533 F64 ->s I32 4
14534 F64 ->u I32 5
14535 F64 ->s I64 6
14536 F64 ->u I64 7
14538 UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);
14539 vassert(ix < 8);
14540 const IROp iops[8]
14541 = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
14542 Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
14543 IROp iop = iops[ix];
14544 // A bit of ATCery: bounce all cases we haven't seen an example of.
14545 if (/* F32toI32S */
14546 (iop == Iop_F32toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Sn */
14547 || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
14548 || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
14549 || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */
14550 /* F32toI32U */
14551 || (iop == Iop_F32toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Sn */
14552 || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
14553 || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
14554 || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */
14555 /* F32toI64S */
14556 || (iop == Iop_F32toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Sn */
14557 || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
14558 || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
14559 || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,S */
14560 /* F32toI64U */
14561 || (iop == Iop_F32toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Sn */
14562 || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
14563 || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
14564 || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,S */
14565 /* F64toI32S */
14566 || (iop == Iop_F64toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Dn */
14567 || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
14568 || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
14569 || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */
14570 /* F64toI32U */
14571 || (iop == Iop_F64toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Dn */
14572 || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
14573 || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
14574 || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,D */
14575 /* F64toI64S */
14576 || (iop == Iop_F64toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Dn */
14577 || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
14578 || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
14579 || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */
14580 /* F64toI64U */
14581 || (iop == Iop_F64toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Dn */
14582 || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
14583 || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
14584 || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,D */
14586 /* validated */
14587 } else {
14588 return False;
14590 IRType srcTy = isF64 ? Ity_F64 : Ity_F32;
14591 IRType dstTy = isI64 ? Ity_I64 : Ity_I32;
14592 IRTemp src = newTemp(srcTy);
14593 IRTemp dst = newTemp(dstTy);
14594 assign(src, getQRegLO(nn, srcTy));
14595 assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
14596 putIRegOrZR(isI64, dd, mkexpr(dst));
14597 DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
14598 nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
14599 return True;
14602 // op = 010, 011
14603 /* -------------- {S,U}CVTF (scalar, integer) -------------- */
14604 /* (ix) sf S 28 ty rm op 15 9 4
14605 0 0 0 0 11110 00 1 00 010 000000 n d SCVTF Sd, Wn
14606 1 0 0 0 11110 01 1 00 010 000000 n d SCVTF Dd, Wn
14607 2 1 0 0 11110 00 1 00 010 000000 n d SCVTF Sd, Xn
14608 3 1 0 0 11110 01 1 00 010 000000 n d SCVTF Dd, Xn
14610 4 0 0 0 11110 00 1 00 011 000000 n d UCVTF Sd, Wn
14611 5 0 0 0 11110 01 1 00 011 000000 n d UCVTF Dd, Wn
14612 6 1 0 0 11110 00 1 00 011 000000 n d UCVTF Sd, Xn
14613 7 1 0 0 11110 01 1 00 011 000000 n d UCVTF Dd, Xn
14615 These are signed/unsigned conversion from integer registers to
14616 FP registers, all 4 32/64-bit combinations, rounded per FPCR.
14618 if (ty <= X01 && rm == X00 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
14619 Bool isI64 = bitSF == 1;
14620 Bool isF64 = (ty & 1) == 1;
14621 Bool isU = (op & 1) == 1;
14622 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14623 const IROp ops[8]
14624 = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
14625 Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
14626 IRExpr* src = getIRegOrZR(isI64, nn);
14627 IRExpr* res = (isF64 && !isI64)
14628 ? unop(ops[ix], src)
14629 : binop(ops[ix],
14630 mkexpr(mk_get_IR_rounding_mode()), src);
14631 putQReg128(dd, mkV128(0));
14632 putQRegLO(dd, res);
14633 DIP("%ccvtf %s, %s\n",
14634 isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
14635 nameIRegOrZR(isI64, nn));
14636 return True;
14639 // op = 110, 111
14640 /* -------- FMOV (general) -------- */
14641 /* case sf S ty rm op 15 9 4
14642 (1) 0 0 0 11110 00 1 00 111 000000 n d FMOV Sd, Wn
14643 (2) 1 0 0 11110 01 1 00 111 000000 n d FMOV Dd, Xn
14644 (3) 1 0 0 11110 10 1 01 111 000000 n d FMOV Vd.D[1], Xn
14646 (4) 0 0 0 11110 00 1 00 110 000000 n d FMOV Wd, Sn
14647 (5) 1 0 0 11110 01 1 00 110 000000 n d FMOV Xd, Dn
14648 (6) 1 0 0 11110 10 1 01 110 000000 n d FMOV Xd, Vn.D[1]
14650 if (1) {
14651 UInt ix = 0; // case
14652 if (bitSF == 0) {
14653 if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
14654 ix = 1;
14655 else
14656 if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
14657 ix = 4;
14658 } else {
14659 vassert(bitSF == 1);
14660 if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
14661 ix = 2;
14662 else
14663 if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
14664 ix = 5;
14665 else
14666 if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
14667 ix = 3;
14668 else
14669 if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
14670 ix = 6;
14672 if (ix > 0) {
14673 switch (ix) {
14674 case 1:
14675 putQReg128(dd, mkV128(0));
14676 putQRegLO(dd, getIReg32orZR(nn));
14677 DIP("fmov s%u, w%u\n", dd, nn);
14678 break;
14679 case 2:
14680 putQReg128(dd, mkV128(0));
14681 putQRegLO(dd, getIReg64orZR(nn));
14682 DIP("fmov d%u, x%u\n", dd, nn);
14683 break;
14684 case 3:
14685 putQRegHI64(dd, getIReg64orZR(nn));
14686 DIP("fmov v%u.d[1], x%u\n", dd, nn);
14687 break;
14688 case 4:
14689 putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
14690 DIP("fmov w%u, s%u\n", dd, nn);
14691 break;
14692 case 5:
14693 putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
14694 DIP("fmov x%u, d%u\n", dd, nn);
14695 break;
14696 case 6:
14697 putIReg64orZR(dd, getQRegHI64(nn));
14698 DIP("fmov x%u, v%u.d[1]\n", dd, nn);
14699 break;
14700 default:
14701 vassert(0);
14703 return True;
14705 /* undecodable; fall through */
14708 return False;
14709 # undef INSN
14713 static
14714 Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
14716 Bool ok;
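/* Try each SIMD/FP decoder in turn.  Every handler begins by checking
   its own fixed opcode fields and returns False immediately on a
   mismatch, so an insn is normally claimed by at most one of them. */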
14717 ok = dis_AdvSIMD_EXT(dres, insn);
14718 if (UNLIKELY(ok)) return True;
14719 ok = dis_AdvSIMD_TBL_TBX(dres, insn);
14720 if (UNLIKELY(ok)) return True;
14721 ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
14722 if (UNLIKELY(ok)) return True;
14723 ok = dis_AdvSIMD_across_lanes(dres, insn);
14724 if (UNLIKELY(ok)) return True;
14725 ok = dis_AdvSIMD_copy(dres, insn);
14726 if (UNLIKELY(ok)) return True;
14727 ok = dis_AdvSIMD_modified_immediate(dres, insn);
14728 if (UNLIKELY(ok)) return True;
14729 ok = dis_AdvSIMD_scalar_copy(dres, insn);
14730 if (UNLIKELY(ok)) return True;
14731 ok = dis_AdvSIMD_scalar_pairwise(dres, insn);
14732 if (UNLIKELY(ok)) return True;
14733 ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
14734 if (UNLIKELY(ok)) return True;
14735 ok = dis_AdvSIMD_scalar_three_different(dres, insn);
14736 if (UNLIKELY(ok)) return True;
14737 ok = dis_AdvSIMD_scalar_three_same(dres, insn);
14738 if (UNLIKELY(ok)) return True;
14739 ok = dis_AdvSIMD_scalar_three_same_extra(dres, insn);
14740 if (UNLIKELY(ok)) return True;
14741 ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
14742 if (UNLIKELY(ok)) return True;
14743 ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
14744 if (UNLIKELY(ok)) return True;
14745 ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
14746 if (UNLIKELY(ok)) return True;
14747 ok = dis_AdvSIMD_three_different(dres, insn);
14748 if (UNLIKELY(ok)) return True;
14749 ok = dis_AdvSIMD_three_same(dres, insn);
14750 if (UNLIKELY(ok)) return True;
14751 ok = dis_AdvSIMD_three_same_extra(dres, insn);
14752 if (UNLIKELY(ok)) return True;
14753 ok = dis_AdvSIMD_two_reg_misc(dres, insn);
14754 if (UNLIKELY(ok)) return True;
14755 ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
14756 if (UNLIKELY(ok)) return True;
14757 ok = dis_AdvSIMD_crypto_aes(dres, insn);
14758 if (UNLIKELY(ok)) return True;
14759 ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
14760 if (UNLIKELY(ok)) return True;
14761 ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
14762 if (UNLIKELY(ok)) return True;
14763 ok = dis_AdvSIMD_fp_compare(dres, insn);
14764 if (UNLIKELY(ok)) return True;
14765 ok = dis_AdvSIMD_fp_conditional_compare(dres, insn);
14766 if (UNLIKELY(ok)) return True;
14767 ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
14768 if (UNLIKELY(ok)) return True;
14769 ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
14770 if (UNLIKELY(ok)) return True;
14771 ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn);
14772 if (UNLIKELY(ok)) return True;
14773 ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
14774 if (UNLIKELY(ok)) return True;
14775 ok = dis_AdvSIMD_fp_immediate(dres, insn);
14776 if (UNLIKELY(ok)) return True;
14777 ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
14778 if (UNLIKELY(ok)) return True;
14779 ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
14780 if (UNLIKELY(ok)) return True;
14781 return False;
14785 /*------------------------------------------------------------*/
14786 /*--- Disassemble a single ARM64 instruction ---*/
14787 /*------------------------------------------------------------*/
/* Disassemble a single ARM64 instruction into IR.  The instruction
   is located at |guest_instr| and has a guest IP of
   |guest_PC_curr_instr|, which will have been set before the call
   here.  Returns True iff the instruction was decoded, in which case
   *dres will be set accordingly, or False, in which case *dres should
   be ignored by the caller. */
static
Bool disInstr_ARM64_WRK (
        /*MB_OUT*/DisResult* dres,
        const UChar* guest_instr,
        const VexArchInfo* archinfo,
        const VexAbiInfo*  abiinfo
     )
{
   // A macro to fish bits out of 'insn'.
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
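   /* Illustrative example of the macro (the value is hypothetical,
      chosen only to show the field extraction): for
      insn = 0x91000420, which is "add x0, x1, #1", INSN(28,25)
      evaluates to 0b1000 and INSN(4,0) to 0 (the Rd field).  The
      decoders below rely on exactly this kind of slicing. */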
//ZZ    DisResult dres;
//ZZ    UInt      insn;
//ZZ    //Bool      allow_VFP = False;
//ZZ    //UInt      hwcaps = archinfo->hwcaps;
//ZZ    IRTemp    condT; /* :: Ity_I32 */
//ZZ    UInt      summary;
//ZZ    HChar     dis_buf[128];  // big enough to hold LDMIA etc text
//ZZ
//ZZ    /* What insn variants are we supporting today? */
//ZZ    //allow_VFP  = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
//ZZ    // etc etc

   /* Set result defaults. */
   dres->whatNext    = Dis_Continue;
   dres->len         = 4;
   dres->jk_StopHere = Ijk_INVALID;
   dres->hint        = Dis_HintNone;

   /* At least this is simple on ARM64: insns are all 4 bytes long, and
      4-aligned.  So just fish the whole thing out of memory right now
      and have done. */
   UInt insn = getUIntLittleEndianly( guest_instr );

   if (0) vex_printf("insn: 0x%x\n", insn);

   DIP("\t(arm64) 0x%llx: ", (ULong)guest_PC_curr_instr);

   vassert(0 == (guest_PC_curr_instr & 3ULL));
   /* ----------------------------------------------------------- */

   /* Spot "Special" instructions (see comment at top of file). */
   {
      const UChar* code = guest_instr;
      /* Spot the 16-byte preamble:
            93CC0D8C   ror x12, x12, #3
            93CC358C   ror x12, x12, #13
            93CCCD8C   ror x12, x12, #51
            93CCF58C   ror x12, x12, #61
      */
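      /* For reference, client code typically materialises this
         preamble with inline asm roughly along the lines of the
         sketch below.  This is an illustration only, based on the
         encodings listed above; the canonical definitions live in
         valgrind.h, not here:

            __asm__ volatile(
               "ror x12, x12, #3  ;  ror x12, x12, #13 \n\t"
               "ror x12, x12, #51 ;  ror x12, x12, #61 \n\t"
               "orr x10, x10, x10 \n\t"   // marker: X3 = client_request(X4)
               : : : "cc", "memory", "x12");
      */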
      UInt word1 = 0x93CC0D8C;
      UInt word2 = 0x93CC358C;
      UInt word3 = 0x93CCCD8C;
      UInt word4 = 0x93CCF58C;
      if (getUIntLittleEndianly(code+ 0) == word1 &&
          getUIntLittleEndianly(code+ 4) == word2 &&
          getUIntLittleEndianly(code+ 8) == word3 &&
          getUIntLittleEndianly(code+12) == word4) {
         /* Got a "Special" instruction preamble.  Which one is it? */
         if (getUIntLittleEndianly(code+16) == 0xAA0A014A
                                               /* orr x10,x10,x10 */) {
            /* X3 = client_request ( X4 ) */
            DIP("x3 = client_request ( x4 )\n");
            putPC(mkU64( guest_PC_curr_instr + 20 ));
            dres->jk_StopHere = Ijk_ClientReq;
            dres->whatNext    = Dis_StopHere;
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA0B016B
                                               /* orr x11,x11,x11 */) {
            /* X3 = guest_NRADDR */
            DIP("x3 = guest_NRADDR\n");
            dres->len = 20;
            putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA0C018C
                                               /* orr x12,x12,x12 */) {
            /* branch-and-link-to-noredir X8 */
            DIP("branch-and-link-to-noredir x8\n");
            putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20));
            putPC(getIReg64orZR(8));
            dres->jk_StopHere = Ijk_NoRedir;
            dres->whatNext    = Dis_StopHere;
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA090129
                                               /* orr x9,x9,x9 */) {
            /* IR injection */
            DIP("IR injection\n");
            vex_inject_ir(irsb, Iend_LE);
            // Invalidate the current insn.  The reason is that the IRop
            // we're injecting here can change; in that case the translation
            // has to be redone.  For ease of handling, we simply invalidate
            // all the time.
            stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr)));
            stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(20)));
            putPC(mkU64( guest_PC_curr_instr + 20 ));
            dres->whatNext    = Dis_StopHere;
            dres->jk_StopHere = Ijk_InvalICache;
            return True;
         }
         /* We don't know what it is. */
         return False;
         /*NOTREACHED*/
      }
   }

   /* ----------------------------------------------------------- */

   /* Main ARM64 instruction decoder starts here. */

   Bool ok = False;

   /* insn[28:25] determines the top-level grouping, so let's start
      off with that.

      For all of these dis_ARM64_ functions, we pass *dres with the
      normal default results "insn OK, 4 bytes long, keep decoding" so
      they don't need to change it.  However, decodes of control-flow
      insns may cause *dres to change.
   */
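   /* Worked example (illustrative only): insn = 0x91000420 is
      "add x0, x1, #1"; its INSN(28,25) is BITS4(1,0,0,0), so the
      switch below hands it to dis_ARM64_data_processing_immediate. */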
   switch (INSN(28,25)) {
      case BITS4(1,0,0,0): case BITS4(1,0,0,1):
         // Data processing - immediate
         ok = dis_ARM64_data_processing_immediate(dres, insn);
         break;
      case BITS4(1,0,1,0): case BITS4(1,0,1,1):
         // Branch, exception generation and system instructions
         ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo);
         break;
      case BITS4(0,1,0,0): case BITS4(0,1,1,0):
      case BITS4(1,1,0,0): case BITS4(1,1,1,0):
         // Loads and stores
         ok = dis_ARM64_load_store(dres, insn, abiinfo);
         break;
      case BITS4(0,1,0,1): case BITS4(1,1,0,1):
         // Data processing - register
         ok = dis_ARM64_data_processing_register(dres, insn);
         break;
      case BITS4(0,1,1,1): case BITS4(1,1,1,1):
         // Data processing - SIMD and floating point
         ok = dis_ARM64_simd_and_fp(dres, insn);
         break;
      case BITS4(0,0,0,0): case BITS4(0,0,0,1):
      case BITS4(0,0,1,0): case BITS4(0,0,1,1):
         // UNALLOCATED
         break;
      default:
         vassert(0); /* Can't happen */
   }

   /* If the next-level down decoders failed, make sure |dres| didn't
      get changed. */
   if (!ok) {
      vassert(dres->whatNext    == Dis_Continue);
      vassert(dres->len         == 4);
      vassert(dres->jk_StopHere == Ijk_INVALID);
   }

   return ok;

#  undef INSN
}

/*------------------------------------------------------------*/
/*--- Top-level fn                                         ---*/
/*------------------------------------------------------------*/
/* Disassemble a single instruction into IR.  The instruction
   is located in host memory at &guest_code[delta]. */

DisResult disInstr_ARM64 ( IRSB*        irsb_IN,
                           const UChar* guest_code_IN,
                           Long         delta_IN,
                           Addr         guest_IP,
                           VexArch      guest_arch,
                           const VexArchInfo* archinfo,
                           const VexAbiInfo*  abiinfo,
                           VexEndness   host_endness_IN,
                           Bool         sigill_diag_IN )
{
   DisResult dres;
   vex_bzero(&dres, sizeof(dres));

   /* Set globals (see top of this file) */
   vassert(guest_arch == VexArchARM64);

   irsb                = irsb_IN;
   host_endness        = host_endness_IN;
   guest_PC_curr_instr = (Addr64)guest_IP;
   /* Sanity checks */
   /* (x::UInt - 2) <= 15   ===   x >= 2 && x <= 17 (I hope) */
   vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
   vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);
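   /* The single unsigned comparison does cover both bounds: if the
      field is below 2, the subtraction wraps around to a huge UInt,
      which is certainly greater than 15, so the vassert fails. */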
   /* Try to decode */
   Bool ok = disInstr_ARM64_WRK( &dres,
                                 &guest_code_IN[delta_IN],
                                 archinfo, abiinfo );
   if (ok) {
      /* All decode successes end up here. */
      vassert(dres.len == 4 || dres.len == 20);
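      /* len is 20 only for the "special" sequences recognised in
         disInstr_ARM64_WRK: a 16-byte preamble followed by a 4-byte
         marker instruction. */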
      switch (dres.whatNext) {
         case Dis_Continue:
            putPC( mkU64(dres.len + guest_PC_curr_instr) );
            break;
         case Dis_StopHere:
            break;
         default:
            vassert(0);
      }
      DIP("\n");
   } else {
      /* All decode failures end up here. */
      if (sigill_diag_IN) {
         Int   i, j;
         UChar buf[64];
         UInt  insn
                  = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
         vex_bzero(buf, sizeof(buf));
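         /* Render the undecoded instruction word as binary, grouped
            for readability: a space every 8 bits and a tick every 4,
            e.g. 1001'0001 0000'0000 0000'0100 0010'0000. */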
         for (i = j = 0; i < 32; i++) {
            if (i > 0) {
               if ((i & 7) == 0) buf[j++] = ' ';
               else if ((i & 3) == 0) buf[j++] = '\'';
            }
            buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
         }
         vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
         vex_printf("disInstr(arm64): %s\n", buf);
      }

      /* Tell the dispatcher that this insn cannot be decoded, and so
         has not been executed, and (is currently) the next to be
         executed.  PC should be up-to-date since it is made so at the
         start of each insn, but nevertheless be paranoid and update
         it again right now. */
      putPC( mkU64(guest_PC_curr_instr) );
      dres.len         = 0;
      dres.whatNext    = Dis_StopHere;
      dres.jk_StopHere = Ijk_NoDecode;
   }
   return dres;
}

/*--------------------------------------------------------------------*/
/*--- end                                       guest_arm64_toIR.c ---*/
/*--------------------------------------------------------------------*/