Bug 431556 - Complete arm64 FADDP v8.2 instruction support started in 413547.
[valgrind.git] / VEX / priv / guest_arm64_toIR.c
blob 89231be29d01fe3c257da8a179fc43b457bb81a7
1 /* -*- mode: C; c-basic-offset: 3; -*- */
3 /*--------------------------------------------------------------------*/
4 /*--- begin guest_arm64_toIR.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of Valgrind, a dynamic binary instrumentation
9 framework.
11 Copyright (C) 2013-2017 OpenWorks
12 info@open-works.net
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 /* KNOWN LIMITATIONS 2014-Nov-16
32 * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.
34 Also FP comparison "unordered" .. is implemented as normal FP
35 comparison.
37 Both should be fixed. They behave incorrectly in the presence of
38 NaNs.
40 FMULX is treated the same as FMUL. That's also not correct.
42 * Floating multiply-add (etc) insns are split into a multiply and
43 an add, and so suffer double rounding and hence sometimes the
44 least significant mantissa bit is incorrect. Fix: use the IR
45 multiply-add IROps instead.
47 * FRINTA, FRINTN are kludged .. they just round to nearest. No special
48 handling for the "ties" case. FRINTX might be dubious too.
50 * Ditto FCVTXN. No idea what "round to odd" means. This implementation
51 just rounds to nearest.
54 /* "Special" instructions.
56 This instruction decoder can decode four special instructions
57 which mean nothing natively (are no-ops as far as regs/mem are
58 concerned) but have meaning for supporting Valgrind. A special
59 instruction is flagged by a 16-byte preamble:
61 93CC0D8C 93CC358C 93CCCD8C 93CCF58C
62 (ror x12, x12, #3; ror x12, x12, #13
63 ror x12, x12, #51; ror x12, x12, #61)
65 Following that, one of the following 4 is allowed
66 (standard interpretation in parentheses):
68 AA0A014A (orr x10,x10,x10) X3 = client_request ( X4 )
69 AA0B016B (orr x11,x11,x11) X3 = guest_NRADDR
70 AA0C018C (orr x12,x12,x12) branch-and-link-to-noredir X8
71 AA090129 (orr x9,x9,x9) IR injection
73 Any other bytes following the 16-byte preamble are illegal and
74 constitute a failure in instruction decoding. This all assumes
75 that the preamble will never occur except in specific code
76 fragments designed for Valgrind to catch.
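/* A minimal illustrative sketch (not part of the decoder itself) of how
   the 16-byte preamble above could be recognised, assuming the four words
   have already been fetched little-endianly.  The helper name here is
   hypothetical; the actual check is performed in the main decode routine
   later in this file.

      static Bool isSpecialPreamble ( const UChar* code )
      {
         return getUIntLittleEndianly(code +  0) == 0x93CC0D8C
             && getUIntLittleEndianly(code +  4) == 0x93CC358C
             && getUIntLittleEndianly(code +  8) == 0x93CCCD8C
             && getUIntLittleEndianly(code + 12) == 0x93CCF58C;
      }
*/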
79 /* Translates ARM64 code to IR. */
81 #include "libvex_basictypes.h"
82 #include "libvex_ir.h"
83 #include "libvex.h"
84 #include "libvex_guest_arm64.h"
86 #include "main_util.h"
87 #include "main_globals.h"
88 #include "guest_generic_bb_to_IR.h"
89 #include "guest_arm64_defs.h"
92 /*------------------------------------------------------------*/
93 /*--- Globals ---*/
94 /*------------------------------------------------------------*/
96 /* These are set at the start of the translation of an instruction, so
97 that we don't have to pass them around endlessly. CONST means does
98 not change during translation of the instruction.
101 /* CONST: what is the host's endianness? We need to know this in
102 order to do sub-register accesses to the SIMD/FP registers
103 correctly. */
104 static VexEndness host_endness;
106 /* CONST: The guest address for the instruction currently being
107 translated. */
108 static Addr64 guest_PC_curr_instr;
110 /* MOD: The IRSB* into which we're generating code. */
111 static IRSB* irsb;
114 /*------------------------------------------------------------*/
115 /*--- Debugging output ---*/
116 /*------------------------------------------------------------*/
118 #define DIP(format, args...) \
119 if (vex_traceflags & VEX_TRACE_FE) \
120 vex_printf(format, ## args)
122 #define DIS(buf, format, args...) \
123 if (vex_traceflags & VEX_TRACE_FE) \
124 vex_sprintf(buf, format, ## args)
127 /*------------------------------------------------------------*/
128 /*--- Helper bits and pieces for deconstructing the ---*/
129 /*--- arm insn stream. ---*/
130 /*------------------------------------------------------------*/
132 /* Do a little-endian load of a 32-bit word, regardless of the
133 endianness of the underlying host. */
134 static inline UInt getUIntLittleEndianly ( const UChar* p )
136 UInt w = 0;
137 w = (w << 8) | p[3];
138 w = (w << 8) | p[2];
139 w = (w << 8) | p[1];
140 w = (w << 8) | p[0];
141 return w;
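/* For example, given the byte sequence { 0x78, 0x56, 0x34, 0x12 } this
   returns 0x12345678, irrespective of the host's own byte order. */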
144 /* Sign extend an N-bit value up to 64 bits, by copying
145 bit N-1 into all higher positions. */
146 static ULong sx_to_64 ( ULong x, UInt n )
148 vassert(n > 1 && n < 64);
149 x <<= (64-n);
150 Long r = (Long)x;
151 r >>= (64-n);
152 return (ULong)r;
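/* For example, sx_to_64(0x80, 8) == 0xFFFFFFFFFFFFFF80ULL, whereas
   sx_to_64(0x7F, 8) == 0x7FULL, since bit 7 is the sign bit of an
   8-bit value. */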
155 //ZZ /* Do a little-endian load of a 16-bit word, regardless of the
156 //ZZ endianness of the underlying host. */
157 //ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
158 //ZZ {
159 //ZZ UShort w = 0;
160 //ZZ w = (w << 8) | p[1];
161 //ZZ w = (w << 8) | p[0];
162 //ZZ return w;
163 //ZZ }
164 //ZZ
165 //ZZ static UInt ROR32 ( UInt x, UInt sh ) {
166 //ZZ vassert(sh >= 0 && sh < 32);
167 //ZZ if (sh == 0)
168 //ZZ return x;
169 //ZZ else
170 //ZZ return (x << (32-sh)) | (x >> sh);
171 //ZZ }
172 //ZZ
173 //ZZ static Int popcount32 ( UInt x )
174 //ZZ {
175 //ZZ Int res = 0, i;
176 //ZZ for (i = 0; i < 32; i++) {
177 //ZZ res += (x & 1);
178 //ZZ x >>= 1;
179 //ZZ }
180 //ZZ return res;
181 //ZZ }
182 //ZZ
183 //ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
184 //ZZ {
185 //ZZ UInt mask = 1 << ix;
186 //ZZ x &= ~mask;
187 //ZZ x |= ((b << ix) & mask);
188 //ZZ return x;
189 //ZZ }
191 #define BITS2(_b1,_b0) \
192 (((_b1) << 1) | (_b0))
194 #define BITS3(_b2,_b1,_b0) \
195 (((_b2) << 2) | ((_b1) << 1) | (_b0))
197 #define BITS4(_b3,_b2,_b1,_b0) \
198 (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))
200 #define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
201 ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4) \
202 | BITS4((_b3),(_b2),(_b1),(_b0)))
204 #define BITS5(_b4,_b3,_b2,_b1,_b0) \
205 (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
206 #define BITS6(_b5,_b4,_b3,_b2,_b1,_b0) \
207 (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
208 #define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
209 (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
211 #define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
212 (((_b8) << 8) \
213 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
215 #define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
216 (((_b9) << 9) | ((_b8) << 8) \
217 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
219 #define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
220 (((_b10) << 10) \
221 | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
223 #define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
224 (((_b11) << 11) \
225 | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
227 #define X00 BITS2(0,0)
228 #define X01 BITS2(0,1)
229 #define X10 BITS2(1,0)
230 #define X11 BITS2(1,1)
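/* For example, BITS4(1,0,1,1) == 0xB and BITS8(1,1,0,0,1,1,0,0) == 0xCC,
   so decode patterns can be written out bit by bit and compared directly
   against fields sliced from the instruction word. */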
232 // produces _uint[_bMax:_bMin]
233 #define SLICE_UInt(_uint,_bMax,_bMin) \
234 (( ((UInt)(_uint)) >> (_bMin)) \
235 & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
238 /*------------------------------------------------------------*/
239 /*--- Helper bits and pieces for creating IR fragments. ---*/
240 /*------------------------------------------------------------*/
242 static IRExpr* mkV128 ( UShort w )
244 return IRExpr_Const(IRConst_V128(w));
247 static IRExpr* mkU64 ( ULong i )
249 return IRExpr_Const(IRConst_U64(i));
252 static IRExpr* mkU32 ( UInt i )
254 return IRExpr_Const(IRConst_U32(i));
257 static IRExpr* mkU16 ( UInt i )
259 vassert(i < 65536);
260 return IRExpr_Const(IRConst_U16(i));
263 static IRExpr* mkU8 ( UInt i )
265 vassert(i < 256);
266 return IRExpr_Const(IRConst_U8( (UChar)i ));
269 static IRExpr* mkexpr ( IRTemp tmp )
271 return IRExpr_RdTmp(tmp);
274 static IRExpr* unop ( IROp op, IRExpr* a )
276 return IRExpr_Unop(op, a);
279 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
281 return IRExpr_Binop(op, a1, a2);
284 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
286 return IRExpr_Triop(op, a1, a2, a3);
289 static IRExpr* qop ( IROp op, IRExpr* a1, IRExpr* a2,
290 IRExpr* a3, IRExpr* a4 )
292 return IRExpr_Qop(op, a1, a2, a3, a4);
295 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
297 return IRExpr_Load(Iend_LE, ty, addr);
300 /* Add a statement to the list held by "irsb". */
301 static void stmt ( IRStmt* st )
303 addStmtToIRSB( irsb, st );
306 static void assign ( IRTemp dst, IRExpr* e )
308 stmt( IRStmt_WrTmp(dst, e) );
311 static void storeLE ( IRExpr* addr, IRExpr* data )
313 stmt( IRStmt_Store(Iend_LE, addr, data) );
316 //ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
317 //ZZ {
318 //ZZ if (guardT == IRTemp_INVALID) {
319 //ZZ /* unconditional */
320 //ZZ storeLE(addr, data);
321 //ZZ } else {
322 //ZZ stmt( IRStmt_StoreG(Iend_LE, addr, data,
323 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
324 //ZZ }
325 //ZZ }
326 //ZZ
327 //ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
328 //ZZ IRExpr* addr, IRExpr* alt,
329 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
330 //ZZ {
331 //ZZ if (guardT == IRTemp_INVALID) {
332 //ZZ /* unconditional */
333 //ZZ IRExpr* loaded = NULL;
334 //ZZ switch (cvt) {
335 //ZZ case ILGop_Ident32:
336 //ZZ loaded = loadLE(Ity_I32, addr); break;
337 //ZZ case ILGop_8Uto32:
338 //ZZ loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
339 //ZZ case ILGop_8Sto32:
340 //ZZ loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
341 //ZZ case ILGop_16Uto32:
342 //ZZ loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
343 //ZZ case ILGop_16Sto32:
344 //ZZ loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
345 //ZZ default:
346 //ZZ vassert(0);
347 //ZZ }
348 //ZZ vassert(loaded != NULL);
349 //ZZ assign(dst, loaded);
350 //ZZ } else {
351 //ZZ /* Generate a guarded load into 'dst', but apply 'cvt' to the
352 //ZZ loaded data before putting the data in 'dst'. If the load
353 //ZZ does not take place, 'alt' is placed directly in 'dst'. */
354 //ZZ stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
355 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
356 //ZZ }
357 //ZZ }
359 /* Generate a new temporary of the given type. */
360 static IRTemp newTemp ( IRType ty )
362 vassert(isPlausibleIRType(ty));
363 return newIRTemp( irsb->tyenv, ty );
366 /* This is used in many places, so the brevity is an advantage. */
367 static IRTemp newTempV128(void)
369 return newTemp(Ity_V128);
372 /* Initialise V128 temporaries en masse. */
373 static
374 void newTempsV128_2(IRTemp* t1, IRTemp* t2)
376 vassert(t1 && *t1 == IRTemp_INVALID);
377 vassert(t2 && *t2 == IRTemp_INVALID);
378 *t1 = newTempV128();
379 *t2 = newTempV128();
382 static
383 void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
385 vassert(t1 && *t1 == IRTemp_INVALID);
386 vassert(t2 && *t2 == IRTemp_INVALID);
387 vassert(t3 && *t3 == IRTemp_INVALID);
388 *t1 = newTempV128();
389 *t2 = newTempV128();
390 *t3 = newTempV128();
393 static
394 void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
396 vassert(t1 && *t1 == IRTemp_INVALID);
397 vassert(t2 && *t2 == IRTemp_INVALID);
398 vassert(t3 && *t3 == IRTemp_INVALID);
399 vassert(t4 && *t4 == IRTemp_INVALID);
400 *t1 = newTempV128();
401 *t2 = newTempV128();
402 *t3 = newTempV128();
403 *t4 = newTempV128();
406 static
407 void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
408 IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
410 vassert(t1 && *t1 == IRTemp_INVALID);
411 vassert(t2 && *t2 == IRTemp_INVALID);
412 vassert(t3 && *t3 == IRTemp_INVALID);
413 vassert(t4 && *t4 == IRTemp_INVALID);
414 vassert(t5 && *t5 == IRTemp_INVALID);
415 vassert(t6 && *t6 == IRTemp_INVALID);
416 vassert(t7 && *t7 == IRTemp_INVALID);
417 *t1 = newTempV128();
418 *t2 = newTempV128();
419 *t3 = newTempV128();
420 *t4 = newTempV128();
421 *t5 = newTempV128();
422 *t6 = newTempV128();
423 *t7 = newTempV128();
426 //ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
427 //ZZ IRRoundingMode. */
428 //ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
429 //ZZ {
430 //ZZ return mkU32(Irrm_NEAREST);
431 //ZZ }
432 //ZZ
433 //ZZ /* Generate an expression for SRC rotated right by ROT. */
434 //ZZ static IRExpr* genROR32( IRTemp src, Int rot )
435 //ZZ {
436 //ZZ vassert(rot >= 0 && rot < 32);
437 //ZZ if (rot == 0)
438 //ZZ return mkexpr(src);
439 //ZZ return
440 //ZZ binop(Iop_Or32,
441 //ZZ binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
442 //ZZ binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
443 //ZZ }
444 //ZZ
445 //ZZ static IRExpr* mkU128 ( ULong i )
446 //ZZ {
447 //ZZ return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
448 //ZZ }
449 //ZZ
450 //ZZ /* Generate a 4-aligned version of the given expression if
451 //ZZ the given condition is true. Else return it unchanged. */
452 //ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
453 //ZZ {
454 //ZZ if (b)
455 //ZZ return binop(Iop_And32, e, mkU32(~3));
456 //ZZ else
457 //ZZ return e;
458 //ZZ }
460 /* Other IR construction helpers. */
461 static IROp mkAND ( IRType ty ) {
462 switch (ty) {
463 case Ity_I32: return Iop_And32;
464 case Ity_I64: return Iop_And64;
465 default: vpanic("mkAND");
469 static IROp mkOR ( IRType ty ) {
470 switch (ty) {
471 case Ity_I32: return Iop_Or32;
472 case Ity_I64: return Iop_Or64;
473 default: vpanic("mkOR");
477 static IROp mkXOR ( IRType ty ) {
478 switch (ty) {
479 case Ity_I32: return Iop_Xor32;
480 case Ity_I64: return Iop_Xor64;
481 default: vpanic("mkXOR");
485 static IROp mkSHL ( IRType ty ) {
486 switch (ty) {
487 case Ity_I32: return Iop_Shl32;
488 case Ity_I64: return Iop_Shl64;
489 default: vpanic("mkSHL");
493 static IROp mkSHR ( IRType ty ) {
494 switch (ty) {
495 case Ity_I32: return Iop_Shr32;
496 case Ity_I64: return Iop_Shr64;
497 default: vpanic("mkSHR");
501 static IROp mkSAR ( IRType ty ) {
502 switch (ty) {
503 case Ity_I32: return Iop_Sar32;
504 case Ity_I64: return Iop_Sar64;
505 default: vpanic("mkSAR");
509 static IROp mkNOT ( IRType ty ) {
510 switch (ty) {
511 case Ity_I32: return Iop_Not32;
512 case Ity_I64: return Iop_Not64;
513 default: vpanic("mkNOT");
517 static IROp mkADD ( IRType ty ) {
518 switch (ty) {
519 case Ity_I32: return Iop_Add32;
520 case Ity_I64: return Iop_Add64;
521 default: vpanic("mkADD");
525 static IROp mkSUB ( IRType ty ) {
526 switch (ty) {
527 case Ity_I32: return Iop_Sub32;
528 case Ity_I64: return Iop_Sub64;
529 default: vpanic("mkSUB");
533 static IROp mkADDF ( IRType ty ) {
534 switch (ty) {
535 case Ity_F32: return Iop_AddF32;
536 case Ity_F64: return Iop_AddF64;
537 default: vpanic("mkADDF");
541 static IROp mkFMADDF ( IRType ty ) {
542 switch (ty) {
543 case Ity_F32: return Iop_MAddF32;
544 case Ity_F64: return Iop_MAddF64;
545 default: vpanic("mkFMADDF");
549 static IROp mkFMSUBF ( IRType ty ) {
550 switch (ty) {
551 case Ity_F32: return Iop_MSubF32;
552 case Ity_F64: return Iop_MSubF64;
553 default: vpanic("mkFMSUBF");
557 static IROp mkSUBF ( IRType ty ) {
558 switch (ty) {
559 case Ity_F32: return Iop_SubF32;
560 case Ity_F64: return Iop_SubF64;
561 default: vpanic("mkSUBF");
565 static IROp mkMULF ( IRType ty ) {
566 switch (ty) {
567 case Ity_F32: return Iop_MulF32;
568 case Ity_F64: return Iop_MulF64;
569 default: vpanic("mkMULF");
573 static IROp mkDIVF ( IRType ty ) {
574 switch (ty) {
575 case Ity_F32: return Iop_DivF32;
576 case Ity_F64: return Iop_DivF64;
577 default: vpanic("mkDIVF");
581 static IROp mkNEGF ( IRType ty ) {
582 switch (ty) {
583 case Ity_F32: return Iop_NegF32;
584 case Ity_F64: return Iop_NegF64;
585 default: vpanic("mkNEGF");
589 static IROp mkABSF ( IRType ty ) {
590 switch (ty) {
591 case Ity_F32: return Iop_AbsF32;
592 case Ity_F64: return Iop_AbsF64;
593 default: vpanic("mkABSF");
597 static IROp mkSQRTF ( IRType ty ) {
598 switch (ty) {
599 case Ity_F32: return Iop_SqrtF32;
600 case Ity_F64: return Iop_SqrtF64;
601 default: vpanic("mkSQRTF");
605 static IROp mkVecADD ( UInt size ) {
606 const IROp ops[4]
607 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
608 vassert(size < 4);
609 return ops[size];
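/* In this and the other mkVec* helpers below, 'size' follows the usual
   AArch64 SIMD encoding: 0 means 8-bit lanes, 1 means 16-bit, 2 means
   32-bit and 3 means 64-bit.  So, for example, mkVecADD(2) == Iop_Add32x4
   and mkVecADD(3) == Iop_Add64x2. */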
612 static IROp mkVecQADDU ( UInt size ) {
613 const IROp ops[4]
614 = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
615 vassert(size < 4);
616 return ops[size];
619 static IROp mkVecQADDS ( UInt size ) {
620 const IROp ops[4]
621 = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
622 vassert(size < 4);
623 return ops[size];
626 static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
627 const IROp ops[4]
628 = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
629 Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
630 vassert(size < 4);
631 return ops[size];
634 static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
635 const IROp ops[4]
636 = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
637 Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
638 vassert(size < 4);
639 return ops[size];
642 static IROp mkVecSUB ( UInt size ) {
643 const IROp ops[4]
644 = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
645 vassert(size < 4);
646 return ops[size];
649 static IROp mkVecQSUBU ( UInt size ) {
650 const IROp ops[4]
651 = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
652 vassert(size < 4);
653 return ops[size];
656 static IROp mkVecQSUBS ( UInt size ) {
657 const IROp ops[4]
658 = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
659 vassert(size < 4);
660 return ops[size];
663 static IROp mkVecSARN ( UInt size ) {
664 const IROp ops[4]
665 = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
666 vassert(size < 4);
667 return ops[size];
670 static IROp mkVecSHRN ( UInt size ) {
671 const IROp ops[4]
672 = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
673 vassert(size < 4);
674 return ops[size];
677 static IROp mkVecSHLN ( UInt size ) {
678 const IROp ops[4]
679 = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
680 vassert(size < 4);
681 return ops[size];
684 static IROp mkVecCATEVENLANES ( UInt size ) {
685 const IROp ops[4]
686 = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
687 Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
688 vassert(size < 4);
689 return ops[size];
692 static IROp mkVecCATODDLANES ( UInt size ) {
693 const IROp ops[4]
694 = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
695 Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
696 vassert(size < 4);
697 return ops[size];
700 static IROp mkVecINTERLEAVELO ( UInt size ) {
701 const IROp ops[4]
702 = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
703 Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
704 vassert(size < 4);
705 return ops[size];
708 static IROp mkVecINTERLEAVEHI ( UInt size ) {
709 const IROp ops[4]
710 = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
711 Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
712 vassert(size < 4);
713 return ops[size];
716 static IROp mkVecMAXU ( UInt size ) {
717 const IROp ops[4]
718 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
719 vassert(size < 4);
720 return ops[size];
723 static IROp mkVecMAXS ( UInt size ) {
724 const IROp ops[4]
725 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
726 vassert(size < 4);
727 return ops[size];
730 static IROp mkVecMINU ( UInt size ) {
731 const IROp ops[4]
732 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
733 vassert(size < 4);
734 return ops[size];
737 static IROp mkVecMINS ( UInt size ) {
738 const IROp ops[4]
739 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
740 vassert(size < 4);
741 return ops[size];
744 static IROp mkVecMUL ( UInt size ) {
745 const IROp ops[4]
746 = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
747 vassert(size < 3);
748 return ops[size];
751 static IROp mkVecMULLU ( UInt sizeNarrow ) {
752 const IROp ops[4]
753 = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
754 vassert(sizeNarrow < 3);
755 return ops[sizeNarrow];
758 static IROp mkVecMULLS ( UInt sizeNarrow ) {
759 const IROp ops[4]
760 = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
761 vassert(sizeNarrow < 3);
762 return ops[sizeNarrow];
765 static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
766 const IROp ops[4]
767 = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
768 vassert(sizeNarrow < 3);
769 return ops[sizeNarrow];
772 static IROp mkVecCMPEQ ( UInt size ) {
773 const IROp ops[4]
774 = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
775 vassert(size < 4);
776 return ops[size];
779 static IROp mkVecCMPGTU ( UInt size ) {
780 const IROp ops[4]
781 = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
782 vassert(size < 4);
783 return ops[size];
786 static IROp mkVecCMPGTS ( UInt size ) {
787 const IROp ops[4]
788 = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
789 vassert(size < 4);
790 return ops[size];
793 static IROp mkVecABS ( UInt size ) {
794 const IROp ops[4]
795 = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
796 vassert(size < 4);
797 return ops[size];
800 static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
801 const IROp ops[4]
802 = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
803 Iop_ZeroHI96ofV128, Iop_ZeroHI64ofV128 };
804 vassert(size < 4);
805 return ops[size];
808 static IRExpr* mkU ( IRType ty, ULong imm ) {
809 switch (ty) {
810 case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
811 case Ity_I64: return mkU64(imm);
812 default: vpanic("mkU");
816 static IROp mkVecQDMULHIS ( UInt size ) {
817 const IROp ops[4]
818 = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
819 vassert(size < 4);
820 return ops[size];
823 static IROp mkVecQRDMULHIS ( UInt size ) {
824 const IROp ops[4]
825 = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
826 vassert(size < 4);
827 return ops[size];
830 static IROp mkVecQANDUQSH ( UInt size ) {
831 const IROp ops[4]
832 = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
833 Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
834 vassert(size < 4);
835 return ops[size];
838 static IROp mkVecQANDSQSH ( UInt size ) {
839 const IROp ops[4]
840 = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
841 Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
842 vassert(size < 4);
843 return ops[size];
846 static IROp mkVecQANDUQRSH ( UInt size ) {
847 const IROp ops[4]
848 = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
849 Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
850 vassert(size < 4);
851 return ops[size];
854 static IROp mkVecQANDSQRSH ( UInt size ) {
855 const IROp ops[4]
856 = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
857 Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
858 vassert(size < 4);
859 return ops[size];
862 static IROp mkVecSHU ( UInt size ) {
863 const IROp ops[4]
864 = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
865 vassert(size < 4);
866 return ops[size];
869 static IROp mkVecSHS ( UInt size ) {
870 const IROp ops[4]
871 = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
872 vassert(size < 4);
873 return ops[size];
876 static IROp mkVecRSHU ( UInt size ) {
877 const IROp ops[4]
878 = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
879 vassert(size < 4);
880 return ops[size];
883 static IROp mkVecRSHS ( UInt size ) {
884 const IROp ops[4]
885 = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
886 vassert(size < 4);
887 return ops[size];
890 static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
891 const IROp ops[4]
892 = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
893 Iop_NarrowUn64to32x2, Iop_INVALID };
894 vassert(sizeNarrow < 4);
895 return ops[sizeNarrow];
898 static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
899 const IROp ops[4]
900 = { Iop_QNarrowUn16Sto8Ux8, Iop_QNarrowUn32Sto16Ux4,
901 Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
902 vassert(sizeNarrow < 4);
903 return ops[sizeNarrow];
906 static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
907 const IROp ops[4]
908 = { Iop_QNarrowUn16Sto8Sx8, Iop_QNarrowUn32Sto16Sx4,
909 Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
910 vassert(sizeNarrow < 4);
911 return ops[sizeNarrow];
914 static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
915 const IROp ops[4]
916 = { Iop_QNarrowUn16Uto8Ux8, Iop_QNarrowUn32Uto16Ux4,
917 Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
918 vassert(sizeNarrow < 4);
919 return ops[sizeNarrow];
922 static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
923 const IROp ops[4]
924 = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4,
925 Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
926 vassert(sizeNarrow < 4);
927 return ops[sizeNarrow];
930 static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
931 const IROp ops[4]
932 = { Iop_QandQSarNnarrow16Sto8Sx8, Iop_QandQSarNnarrow32Sto16Sx4,
933 Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
934 vassert(sizeNarrow < 4);
935 return ops[sizeNarrow];
938 static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
939 const IROp ops[4]
940 = { Iop_QandQSarNnarrow16Sto8Ux8, Iop_QandQSarNnarrow32Sto16Ux4,
941 Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
942 vassert(sizeNarrow < 4);
943 return ops[sizeNarrow];
946 static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
947 const IROp ops[4]
948 = { Iop_QandQRShrNnarrow16Uto8Ux8, Iop_QandQRShrNnarrow32Uto16Ux4,
949 Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
950 vassert(sizeNarrow < 4);
951 return ops[sizeNarrow];
954 static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
955 const IROp ops[4]
956 = { Iop_QandQRSarNnarrow16Sto8Sx8, Iop_QandQRSarNnarrow32Sto16Sx4,
957 Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
958 vassert(sizeNarrow < 4);
959 return ops[sizeNarrow];
962 static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
963 const IROp ops[4]
964 = { Iop_QandQRSarNnarrow16Sto8Ux8, Iop_QandQRSarNnarrow32Sto16Ux4,
965 Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
966 vassert(sizeNarrow < 4);
967 return ops[sizeNarrow];
970 static IROp mkVecQSHLNSATUU ( UInt size ) {
971 const IROp ops[4]
972 = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
973 Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
974 vassert(size < 4);
975 return ops[size];
978 static IROp mkVecQSHLNSATSS ( UInt size ) {
979 const IROp ops[4]
980 = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
981 Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
982 vassert(size < 4);
983 return ops[size];
986 static IROp mkVecQSHLNSATSU ( UInt size ) {
987 const IROp ops[4]
988 = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
989 Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
990 vassert(size < 4);
991 return ops[size];
994 static IROp mkVecADDF ( UInt size ) {
995 const IROp ops[4]
996 = { Iop_INVALID, Iop_Add16Fx8, Iop_Add32Fx4, Iop_Add64Fx2 };
997 vassert(size < 4);
998 return ops[size];
1001 static IROp mkVecMAXF ( UInt size ) {
1002 const IROp ops[4]
1003 = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
1004 vassert(size < 4);
1005 return ops[size];
1008 static IROp mkVecMINF ( UInt size ) {
1009 const IROp ops[4]
1010 = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
1011 vassert(size < 4);
1012 return ops[size];
1015 /* Generate IR to create 'arg rotated right by imm', for sane values
1016 of 'ty' and 'imm'. */
1017 static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
1019 UInt w = 0;
1020 if (ty == Ity_I64) {
1021 w = 64;
1022 } else {
1023 vassert(ty == Ity_I32);
1024 w = 32;
1026 vassert(w != 0);
1027 vassert(imm < w);
1028 if (imm == 0) {
1029 return arg;
1031 IRTemp res = newTemp(ty);
1032 assign(res, binop(mkOR(ty),
1033 binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
1034 binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
1035 return res;
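/* For example, with ty == Ity_I32, imm == 4 and an 'arg' value of
   0x000000F0, the generated IR computes
   (0xF0 << 28) | (0xF0 >> 4) == 0x0000000F. */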
1038 /* Generate IR to set the returned temp to either all-zeroes or
1039 all ones, as a copy of arg<imm>. */
1040 static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
1042 UInt w = 0;
1043 if (ty == Ity_I64) {
1044 w = 64;
1045 } else {
1046 vassert(ty == Ity_I32);
1047 w = 32;
1049 vassert(w != 0);
1050 vassert(imm < w);
1051 IRTemp res = newTemp(ty);
1052 assign(res, binop(mkSAR(ty),
1053 binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
1054 mkU8(w - 1)));
1055 return res;
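/* For example, with ty == Ity_I32 and imm == 3, an 'arg' value of
   0x00000008 (bit 3 set) produces 0xFFFFFFFF, whereas any value with
   bit 3 clear produces 0x00000000. */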
1058 /* S-widen 8/16/32/64 bit int expr to 64. */
1059 static IRExpr* widenSto64 ( IRType srcTy, IRExpr* e )
1061 switch (srcTy) {
1062 case Ity_I64: return e;
1063 case Ity_I32: return unop(Iop_32Sto64, e);
1064 case Ity_I16: return unop(Iop_16Sto64, e);
1065 case Ity_I8: return unop(Iop_8Sto64, e);
1066 default: vpanic("widenSto64(arm64)");
1070 /* U-widen 8/16/32/64 bit int expr to 64. */
1071 static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
1073 switch (srcTy) {
1074 case Ity_I64: return e;
1075 case Ity_I32: return unop(Iop_32Uto64, e);
1076 case Ity_I16: return unop(Iop_16Uto64, e);
1077 case Ity_I8: return unop(Iop_8Uto64, e);
1078 default: vpanic("widenUto64(arm64)");
1082 /* Narrow 64 bit int expr to 8/16/32/64. Clearly only some
1083 of these combinations make sense. */
1084 static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
1086 switch (dstTy) {
1087 case Ity_I64: return e;
1088 case Ity_I32: return unop(Iop_64to32, e);
1089 case Ity_I16: return unop(Iop_64to16, e);
1090 case Ity_I8: return unop(Iop_64to8, e);
1091 default: vpanic("narrowFrom64(arm64)");
1096 /*------------------------------------------------------------*/
1097 /*--- Helpers for accessing guest registers. ---*/
1098 /*------------------------------------------------------------*/
1100 #define OFFB_X0 offsetof(VexGuestARM64State,guest_X0)
1101 #define OFFB_X1 offsetof(VexGuestARM64State,guest_X1)
1102 #define OFFB_X2 offsetof(VexGuestARM64State,guest_X2)
1103 #define OFFB_X3 offsetof(VexGuestARM64State,guest_X3)
1104 #define OFFB_X4 offsetof(VexGuestARM64State,guest_X4)
1105 #define OFFB_X5 offsetof(VexGuestARM64State,guest_X5)
1106 #define OFFB_X6 offsetof(VexGuestARM64State,guest_X6)
1107 #define OFFB_X7 offsetof(VexGuestARM64State,guest_X7)
1108 #define OFFB_X8 offsetof(VexGuestARM64State,guest_X8)
1109 #define OFFB_X9 offsetof(VexGuestARM64State,guest_X9)
1110 #define OFFB_X10 offsetof(VexGuestARM64State,guest_X10)
1111 #define OFFB_X11 offsetof(VexGuestARM64State,guest_X11)
1112 #define OFFB_X12 offsetof(VexGuestARM64State,guest_X12)
1113 #define OFFB_X13 offsetof(VexGuestARM64State,guest_X13)
1114 #define OFFB_X14 offsetof(VexGuestARM64State,guest_X14)
1115 #define OFFB_X15 offsetof(VexGuestARM64State,guest_X15)
1116 #define OFFB_X16 offsetof(VexGuestARM64State,guest_X16)
1117 #define OFFB_X17 offsetof(VexGuestARM64State,guest_X17)
1118 #define OFFB_X18 offsetof(VexGuestARM64State,guest_X18)
1119 #define OFFB_X19 offsetof(VexGuestARM64State,guest_X19)
1120 #define OFFB_X20 offsetof(VexGuestARM64State,guest_X20)
1121 #define OFFB_X21 offsetof(VexGuestARM64State,guest_X21)
1122 #define OFFB_X22 offsetof(VexGuestARM64State,guest_X22)
1123 #define OFFB_X23 offsetof(VexGuestARM64State,guest_X23)
1124 #define OFFB_X24 offsetof(VexGuestARM64State,guest_X24)
1125 #define OFFB_X25 offsetof(VexGuestARM64State,guest_X25)
1126 #define OFFB_X26 offsetof(VexGuestARM64State,guest_X26)
1127 #define OFFB_X27 offsetof(VexGuestARM64State,guest_X27)
1128 #define OFFB_X28 offsetof(VexGuestARM64State,guest_X28)
1129 #define OFFB_X29 offsetof(VexGuestARM64State,guest_X29)
1130 #define OFFB_X30 offsetof(VexGuestARM64State,guest_X30)
1132 #define OFFB_XSP offsetof(VexGuestARM64State,guest_XSP)
1133 #define OFFB_PC offsetof(VexGuestARM64State,guest_PC)
1135 #define OFFB_CC_OP offsetof(VexGuestARM64State,guest_CC_OP)
1136 #define OFFB_CC_DEP1 offsetof(VexGuestARM64State,guest_CC_DEP1)
1137 #define OFFB_CC_DEP2 offsetof(VexGuestARM64State,guest_CC_DEP2)
1138 #define OFFB_CC_NDEP offsetof(VexGuestARM64State,guest_CC_NDEP)
1140 #define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
1141 #define OFFB_NRADDR offsetof(VexGuestARM64State,guest_NRADDR)
1143 #define OFFB_Q0 offsetof(VexGuestARM64State,guest_Q0)
1144 #define OFFB_Q1 offsetof(VexGuestARM64State,guest_Q1)
1145 #define OFFB_Q2 offsetof(VexGuestARM64State,guest_Q2)
1146 #define OFFB_Q3 offsetof(VexGuestARM64State,guest_Q3)
1147 #define OFFB_Q4 offsetof(VexGuestARM64State,guest_Q4)
1148 #define OFFB_Q5 offsetof(VexGuestARM64State,guest_Q5)
1149 #define OFFB_Q6 offsetof(VexGuestARM64State,guest_Q6)
1150 #define OFFB_Q7 offsetof(VexGuestARM64State,guest_Q7)
1151 #define OFFB_Q8 offsetof(VexGuestARM64State,guest_Q8)
1152 #define OFFB_Q9 offsetof(VexGuestARM64State,guest_Q9)
1153 #define OFFB_Q10 offsetof(VexGuestARM64State,guest_Q10)
1154 #define OFFB_Q11 offsetof(VexGuestARM64State,guest_Q11)
1155 #define OFFB_Q12 offsetof(VexGuestARM64State,guest_Q12)
1156 #define OFFB_Q13 offsetof(VexGuestARM64State,guest_Q13)
1157 #define OFFB_Q14 offsetof(VexGuestARM64State,guest_Q14)
1158 #define OFFB_Q15 offsetof(VexGuestARM64State,guest_Q15)
1159 #define OFFB_Q16 offsetof(VexGuestARM64State,guest_Q16)
1160 #define OFFB_Q17 offsetof(VexGuestARM64State,guest_Q17)
1161 #define OFFB_Q18 offsetof(VexGuestARM64State,guest_Q18)
1162 #define OFFB_Q19 offsetof(VexGuestARM64State,guest_Q19)
1163 #define OFFB_Q20 offsetof(VexGuestARM64State,guest_Q20)
1164 #define OFFB_Q21 offsetof(VexGuestARM64State,guest_Q21)
1165 #define OFFB_Q22 offsetof(VexGuestARM64State,guest_Q22)
1166 #define OFFB_Q23 offsetof(VexGuestARM64State,guest_Q23)
1167 #define OFFB_Q24 offsetof(VexGuestARM64State,guest_Q24)
1168 #define OFFB_Q25 offsetof(VexGuestARM64State,guest_Q25)
1169 #define OFFB_Q26 offsetof(VexGuestARM64State,guest_Q26)
1170 #define OFFB_Q27 offsetof(VexGuestARM64State,guest_Q27)
1171 #define OFFB_Q28 offsetof(VexGuestARM64State,guest_Q28)
1172 #define OFFB_Q29 offsetof(VexGuestARM64State,guest_Q29)
1173 #define OFFB_Q30 offsetof(VexGuestARM64State,guest_Q30)
1174 #define OFFB_Q31 offsetof(VexGuestARM64State,guest_Q31)
1176 #define OFFB_FPCR offsetof(VexGuestARM64State,guest_FPCR)
1177 #define OFFB_QCFLAG offsetof(VexGuestARM64State,guest_QCFLAG)
1179 #define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART)
1180 #define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN)
1182 #define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
1183 #define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
1184 #define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
1187 /* ---------------- Integer registers ---------------- */
1189 static Int offsetIReg64 ( UInt iregNo )
1191 /* Do we care about endianness here? We do if sub-parts of integer
1192 registers are accessed. */
1193 switch (iregNo) {
1194 case 0: return OFFB_X0;
1195 case 1: return OFFB_X1;
1196 case 2: return OFFB_X2;
1197 case 3: return OFFB_X3;
1198 case 4: return OFFB_X4;
1199 case 5: return OFFB_X5;
1200 case 6: return OFFB_X6;
1201 case 7: return OFFB_X7;
1202 case 8: return OFFB_X8;
1203 case 9: return OFFB_X9;
1204 case 10: return OFFB_X10;
1205 case 11: return OFFB_X11;
1206 case 12: return OFFB_X12;
1207 case 13: return OFFB_X13;
1208 case 14: return OFFB_X14;
1209 case 15: return OFFB_X15;
1210 case 16: return OFFB_X16;
1211 case 17: return OFFB_X17;
1212 case 18: return OFFB_X18;
1213 case 19: return OFFB_X19;
1214 case 20: return OFFB_X20;
1215 case 21: return OFFB_X21;
1216 case 22: return OFFB_X22;
1217 case 23: return OFFB_X23;
1218 case 24: return OFFB_X24;
1219 case 25: return OFFB_X25;
1220 case 26: return OFFB_X26;
1221 case 27: return OFFB_X27;
1222 case 28: return OFFB_X28;
1223 case 29: return OFFB_X29;
1224 case 30: return OFFB_X30;
1225 /* but not 31 */
1226 default: vassert(0);
1230 static Int offsetIReg64orSP ( UInt iregNo )
1232 return iregNo == 31 ? OFFB_XSP : offsetIReg64(iregNo);
1235 static const HChar* nameIReg64orZR ( UInt iregNo )
1237 vassert(iregNo < 32);
1238 static const HChar* names[32]
1239 = { "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
1240 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
1241 "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
1242 "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
1243 return names[iregNo];
1246 static const HChar* nameIReg64orSP ( UInt iregNo )
1248 if (iregNo == 31) {
1249 return "sp";
1251 vassert(iregNo < 31);
1252 return nameIReg64orZR(iregNo);
1255 static IRExpr* getIReg64orSP ( UInt iregNo )
1257 vassert(iregNo < 32);
1258 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1261 static IRExpr* getIReg64orZR ( UInt iregNo )
1263 if (iregNo == 31) {
1264 return mkU64(0);
1266 vassert(iregNo < 31);
1267 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1270 static void putIReg64orSP ( UInt iregNo, IRExpr* e )
1272 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1273 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1276 static void putIReg64orZR ( UInt iregNo, IRExpr* e )
1278 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1279 if (iregNo == 31) {
1280 return;
1282 vassert(iregNo < 31);
1283 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1286 static const HChar* nameIReg32orZR ( UInt iregNo )
1288 vassert(iregNo < 32);
1289 static const HChar* names[32]
1290 = { "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7",
1291 "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15",
1292 "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
1293 "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
1294 return names[iregNo];
1297 static const HChar* nameIReg32orSP ( UInt iregNo )
1299 if (iregNo == 31) {
1300 return "wsp";
1302 vassert(iregNo < 31);
1303 return nameIReg32orZR(iregNo);
1306 static IRExpr* getIReg32orSP ( UInt iregNo )
1308 vassert(iregNo < 32);
1309 return unop(Iop_64to32,
1310 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1313 static IRExpr* getIReg32orZR ( UInt iregNo )
1315 if (iregNo == 31) {
1316 return mkU32(0);
1318 vassert(iregNo < 31);
1319 return unop(Iop_64to32,
1320 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1323 static void putIReg32orSP ( UInt iregNo, IRExpr* e )
1325 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1326 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1329 static void putIReg32orZR ( UInt iregNo, IRExpr* e )
1331 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1332 if (iregNo == 31) {
1333 return;
1335 vassert(iregNo < 31);
1336 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1339 static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
1341 vassert(is64 == True || is64 == False);
1342 return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
1345 static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
1347 vassert(is64 == True || is64 == False);
1348 return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
1351 static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
1353 vassert(is64 == True || is64 == False);
1354 return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
1357 static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
1359 vassert(is64 == True || is64 == False);
1360 if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
1363 static void putPC ( IRExpr* e )
1365 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1366 stmt( IRStmt_Put(OFFB_PC, e) );
1370 /* ---------------- Vector (Q) registers ---------------- */
1372 static Int offsetQReg128 ( UInt qregNo )
1374 /* We don't care about endianness at this point. It only becomes
1375 relevant when dealing with sections of these registers. */
1376 switch (qregNo) {
1377 case 0: return OFFB_Q0;
1378 case 1: return OFFB_Q1;
1379 case 2: return OFFB_Q2;
1380 case 3: return OFFB_Q3;
1381 case 4: return OFFB_Q4;
1382 case 5: return OFFB_Q5;
1383 case 6: return OFFB_Q6;
1384 case 7: return OFFB_Q7;
1385 case 8: return OFFB_Q8;
1386 case 9: return OFFB_Q9;
1387 case 10: return OFFB_Q10;
1388 case 11: return OFFB_Q11;
1389 case 12: return OFFB_Q12;
1390 case 13: return OFFB_Q13;
1391 case 14: return OFFB_Q14;
1392 case 15: return OFFB_Q15;
1393 case 16: return OFFB_Q16;
1394 case 17: return OFFB_Q17;
1395 case 18: return OFFB_Q18;
1396 case 19: return OFFB_Q19;
1397 case 20: return OFFB_Q20;
1398 case 21: return OFFB_Q21;
1399 case 22: return OFFB_Q22;
1400 case 23: return OFFB_Q23;
1401 case 24: return OFFB_Q24;
1402 case 25: return OFFB_Q25;
1403 case 26: return OFFB_Q26;
1404 case 27: return OFFB_Q27;
1405 case 28: return OFFB_Q28;
1406 case 29: return OFFB_Q29;
1407 case 30: return OFFB_Q30;
1408 case 31: return OFFB_Q31;
1409 default: vassert(0);
1413 /* Write to a complete Qreg. */
1414 static void putQReg128 ( UInt qregNo, IRExpr* e )
1416 vassert(qregNo < 32);
1417 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
1418 stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
1421 /* Read a complete Qreg. */
1422 static IRExpr* getQReg128 ( UInt qregNo )
1424 vassert(qregNo < 32);
1425 return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
1428 /* Produce the IR type for some sub-part of a vector. For 32- and 64-
1429 bit sub-parts we can choose either integer or float types, and
1430 choose float on the basis that that is the common use case and so
1431 will give least interference with Put-to-Get forwarding later
1432 on. */
1433 static IRType preferredVectorSubTypeFromSize ( UInt szB )
1435 switch (szB) {
1436 case 1: return Ity_I8;
1437 case 2: return Ity_I16;
1438 case 4: return Ity_I32; //Ity_F32;
1439 case 8: return Ity_F64;
1440 case 16: return Ity_V128;
1441 default: vassert(0);
1445 /* Find the offset of the laneNo'th lane of type laneTy in the given
1446 Qreg. Since the host is little-endian, the least significant lane
1447 has the lowest offset. */
1448 static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
1450 vassert(host_endness == VexEndnessLE);
1451 Int base = offsetQReg128(qregNo);
1452 /* Since the host is little-endian, the least significant lane
1453 will be at the lowest address. */
1454 /* Restrict this to known types, so as to avoid silently accepting
1455 stupid types. */
1456 UInt laneSzB = 0;
1457 switch (laneTy) {
1458 case Ity_I8: laneSzB = 1; break;
1459 case Ity_F16: case Ity_I16: laneSzB = 2; break;
1460 case Ity_F32: case Ity_I32: laneSzB = 4; break;
1461 case Ity_F64: case Ity_I64: laneSzB = 8; break;
1462 case Ity_V128: laneSzB = 16; break;
1463 default: break;
1465 vassert(laneSzB > 0);
1466 UInt minOff = laneNo * laneSzB;
1467 UInt maxOff = minOff + laneSzB - 1;
1468 vassert(maxOff < 16);
1469 return base + minOff;
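/* For example, lane 2 of type Ity_I32 in Qreg 5 lives at
   offsetQReg128(5) + 8, occupying bytes 8..11 of that register's
   guest-state slot. */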
1472 /* Put to the least significant lane of a Qreg. */
1473 static void putQRegLO ( UInt qregNo, IRExpr* e )
1475 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1476 Int off = offsetQRegLane(qregNo, ty, 0);
1477 switch (ty) {
1478 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
1479 case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
1480 break;
1481 default:
1482 vassert(0); // Other cases are probably invalid
1484 stmt(IRStmt_Put(off, e));
1487 /* Get from the least significant lane of a Qreg. */
1488 static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
1490 Int off = offsetQRegLane(qregNo, ty, 0);
1491 switch (ty) {
1492 case Ity_I8:
1493 case Ity_F16: case Ity_I16:
1494 case Ity_I32: case Ity_I64:
1495 case Ity_F32: case Ity_F64: case Ity_V128:
1496 break;
1497 default:
1498 vassert(0); // Other cases are ATC
1500 return IRExpr_Get(off, ty);
1503 static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
1505 static const HChar* namesQ[32]
1506 = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1507 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
1508 "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
1509 "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
1510 static const HChar* namesD[32]
1511 = { "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1512 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
1513 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1514 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
1515 static const HChar* namesS[32]
1516 = { "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
1517 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
1518 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
1519 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
1520 static const HChar* namesH[32]
1521 = { "h0", "h1", "h2", "h3", "h4", "h5", "h6", "h7",
1522 "h8", "h9", "h10", "h11", "h12", "h13", "h14", "h15",
1523 "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
1524 "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
1525 static const HChar* namesB[32]
1526 = { "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7",
1527 "b8", "b9", "b10", "b11", "b12", "b13", "b14", "b15",
1528 "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
1529 "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
1530 vassert(qregNo < 32);
1531 switch (sizeofIRType(laneTy)) {
1532 case 1: return namesB[qregNo];
1533 case 2: return namesH[qregNo];
1534 case 4: return namesS[qregNo];
1535 case 8: return namesD[qregNo];
1536 case 16: return namesQ[qregNo];
1537 default: vassert(0);
1539 /*NOTREACHED*/
1542 static const HChar* nameQReg128 ( UInt qregNo )
1544 return nameQRegLO(qregNo, Ity_V128);
1547 /* Find the offset of the most significant half (8 bytes) of the given
1548 Qreg. This requires knowing the endianness of the host. */
1549 static Int offsetQRegHI64 ( UInt qregNo )
1551 return offsetQRegLane(qregNo, Ity_I64, 1);
1554 static IRExpr* getQRegHI64 ( UInt qregNo )
1556 return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
1559 static void putQRegHI64 ( UInt qregNo, IRExpr* e )
1561 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1562 Int off = offsetQRegHI64(qregNo);
1563 switch (ty) {
1564 case Ity_I64: case Ity_F64:
1565 break;
1566 default:
1567 vassert(0); // Other cases are plain wrong
1569 stmt(IRStmt_Put(off, e));
1572 /* Put to a specified lane of a Qreg. */
1573 static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
1575 IRType laneTy = typeOfIRExpr(irsb->tyenv, e);
1576 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1577 switch (laneTy) {
1578 case Ity_F64: case Ity_I64:
1579 case Ity_I32: case Ity_F32:
1580 case Ity_I16: case Ity_F16:
1581 case Ity_I8:
1582 break;
1583 default:
1584 vassert(0); // Other cases are ATC
1586 stmt(IRStmt_Put(off, e));
1589 /* Get from a specified lane of a Qreg. */
1590 static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
1592 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1593 switch (laneTy) {
1594 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
1595 case Ity_F64: case Ity_F32: case Ity_F16:
1596 break;
1597 default:
1598 vassert(0); // Other cases are ATC
1600 return IRExpr_Get(off, laneTy);
1604 //ZZ /* ---------------- Misc registers ---------------- */
1605 //ZZ
1606 //ZZ static void putMiscReg32 ( UInt gsoffset,
1607 //ZZ IRExpr* e, /* :: Ity_I32 */
1608 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
1609 //ZZ {
1610 //ZZ switch (gsoffset) {
1611 //ZZ case OFFB_FPSCR: break;
1612 //ZZ case OFFB_QFLAG32: break;
1613 //ZZ case OFFB_GEFLAG0: break;
1614 //ZZ case OFFB_GEFLAG1: break;
1615 //ZZ case OFFB_GEFLAG2: break;
1616 //ZZ case OFFB_GEFLAG3: break;
1617 //ZZ default: vassert(0); /* awaiting more cases */
1618 //ZZ }
1619 //ZZ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1620 //ZZ
1621 //ZZ if (guardT == IRTemp_INVALID) {
1622 //ZZ /* unconditional write */
1623 //ZZ stmt(IRStmt_Put(gsoffset, e));
1624 //ZZ } else {
1625 //ZZ stmt(IRStmt_Put(
1626 //ZZ gsoffset,
1627 //ZZ IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
1628 //ZZ e, IRExpr_Get(gsoffset, Ity_I32) )
1629 //ZZ ));
1630 //ZZ }
1631 //ZZ }
1632 //ZZ
1633 //ZZ static IRTemp get_ITSTATE ( void )
1634 //ZZ {
1635 //ZZ ASSERT_IS_THUMB;
1636 //ZZ IRTemp t = newTemp(Ity_I32);
1637 //ZZ assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
1638 //ZZ return t;
1639 //ZZ }
1640 //ZZ
1641 //ZZ static void put_ITSTATE ( IRTemp t )
1642 //ZZ {
1643 //ZZ ASSERT_IS_THUMB;
1644 //ZZ stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
1645 //ZZ }
1646 //ZZ
1647 //ZZ static IRTemp get_QFLAG32 ( void )
1648 //ZZ {
1649 //ZZ IRTemp t = newTemp(Ity_I32);
1650 //ZZ assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
1651 //ZZ return t;
1652 //ZZ }
1653 //ZZ
1654 //ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
1655 //ZZ {
1656 //ZZ putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
1657 //ZZ }
1658 //ZZ
1659 //ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
1660 //ZZ Status Register) to indicate that overflow or saturation occurred.
1661 //ZZ Nb: t must be zero to denote no saturation, and any nonzero
1662 //ZZ value to indicate saturation. */
1663 //ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
1664 //ZZ {
1665 //ZZ IRTemp old = get_QFLAG32();
1666 //ZZ IRTemp nyu = newTemp(Ity_I32);
1667 //ZZ assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
1668 //ZZ put_QFLAG32(nyu, condT);
1669 //ZZ }
1672 /* ---------------- FPCR stuff ---------------- */
1674 /* Generate IR to get hold of the rounding mode bits in FPCR, and
1675 convert them to IR format. Bind the final result to the
1676 returned temp. */
1677 static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
1679 /* The ARMvfp encoding for rounding mode bits is:
1680 00 to nearest
1681 01 to +infinity
1682 10 to -infinity
1683 11 to zero
1684 We need to convert that to the IR encoding:
1685 00 to nearest (the default)
1686 10 to +infinity
1687 01 to -infinity
1688 11 to zero
1689 Which can be done by swapping bits 0 and 1.
1690 The rmode bits are at 23:22 in FPCR.
1692 IRTemp armEncd = newTemp(Ity_I32);
1693 IRTemp swapped = newTemp(Ity_I32);
1694 /* Fish FPCR[23:22] out, and slide to bottom. Doesn't matter that
1695 we don't zero out bits 24 and above, since the assignment to
1696 'swapped' will mask them out anyway. */
1697 assign(armEncd,
1698 binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
1699 /* Now swap them. */
1700 assign(swapped,
1701 binop(Iop_Or32,
1702 binop(Iop_And32,
1703 binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
1704 mkU32(2)),
1705 binop(Iop_And32,
1706 binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
1707 mkU32(1))
1709 return swapped;
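/* In plain C terms the swap above is (shown for clarity only):
      UInt ir = ((arm << 1) & 2) | ((arm >> 1) & 1);
   so ARM 01 (to +infinity) becomes IR 10 (Irrm_PosINF), ARM 10
   (to -infinity) becomes IR 01 (Irrm_NegINF), and 00/11 are unchanged. */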
1713 /*------------------------------------------------------------*/
1714 /*--- Helpers for flag handling and conditional insns ---*/
1715 /*------------------------------------------------------------*/
1717 static const HChar* nameARM64Condcode ( ARM64Condcode cond )
1719 switch (cond) {
1720 case ARM64CondEQ: return "eq";
1721 case ARM64CondNE: return "ne";
1722 case ARM64CondCS: return "cs"; // or 'hs'
1723 case ARM64CondCC: return "cc"; // or 'lo'
1724 case ARM64CondMI: return "mi";
1725 case ARM64CondPL: return "pl";
1726 case ARM64CondVS: return "vs";
1727 case ARM64CondVC: return "vc";
1728 case ARM64CondHI: return "hi";
1729 case ARM64CondLS: return "ls";
1730 case ARM64CondGE: return "ge";
1731 case ARM64CondLT: return "lt";
1732 case ARM64CondGT: return "gt";
1733 case ARM64CondLE: return "le";
1734 case ARM64CondAL: return "al";
1735 case ARM64CondNV: return "nv";
1736 default: vpanic("name_ARM64Condcode");
1740 /* and a handy shorthand for it */
1741 static const HChar* nameCC ( ARM64Condcode cond ) {
1742 return nameARM64Condcode(cond);
1746 /* Build IR to calculate some particular condition from stored
1747 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1748 Ity_I64, suitable for narrowing. Although the return type is
1749 Ity_I64, the returned value is either 0 or 1. 'cond' must be
1750 :: Ity_I64 and must denote the condition to compute in
1751 bits 7:4, and be zero everywhere else.
1753 static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
1755 vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
1756 /* And 'cond' had better produce a value in which only bits 7:4 are
1757 nonzero. However, obviously we can't assert for that. */
1759 /* So what we're constructing for the first argument is
1760 "(cond << 4) | stored-operation".
1761 However, as per comments above, 'cond' must be supplied
1762 pre-shifted to this function.
1764 This pairing scheme requires that the ARM64_CC_OP_ values all fit
1765 in 4 bits. Hence we are passing a (COND, OP) pair in the lowest
1766 8 bits of the first argument. */
1767 IRExpr** args
1768 = mkIRExprVec_4(
1769 binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
1770 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1771 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1772 IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
1774 IRExpr* call
1775 = mkIRExprCCall(
1776 Ity_I64,
1777 0/*regparm*/,
1778 "arm64g_calculate_condition", &arm64g_calculate_condition,
1779 args
1782 /* Exclude the requested condition, OP and NDEP from definedness
1783 checking. We're only interested in DEP1 and DEP2. */
1784 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1785 return call;
1789 /* Build IR to calculate some particular condition from stored
1790 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1791 Ity_I64, suitable for narrowing. Although the return type is
1792 Ity_I64, the returned value is either 0 or 1.
1794 static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
1796 /* First arg is "(cond << 4) | stored-operation". This requires that the
1797 ARM64_CC_OP_ values all fit in 4 bits. Hence we are passing a
1798 (COND, OP) pair in the lowest 8 bits of the first argument. */
1799 vassert(cond >= 0 && cond <= 15);
1800 return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
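/* For example, a request for ARM64CondNE (encoded as 1) passes (1 << 4)
   to the dynamic variant above, which ORs in the stored CC_OP, so the
   helper receives a (COND, OP) pair packed into the low 8 bits of its
   first argument. */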
1804 /* Build IR to calculate just the carry flag from stored
1805 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1806 Ity_I64. */
1807 static IRExpr* mk_arm64g_calculate_flag_c ( void )
1809 IRExpr** args
1810 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1811 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1812 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1813 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1814 IRExpr* call
1815 = mkIRExprCCall(
1816 Ity_I64,
1817 0/*regparm*/,
1818 "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
1819 args
1821 /* Exclude OP and NDEP from definedness checking. We're only
1822 interested in DEP1 and DEP2. */
1823 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1824 return call;
1828 //ZZ /* Build IR to calculate just the overflow flag from stored
1829 //ZZ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1830 //ZZ Ity_I32. */
1831 //ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
1832 //ZZ {
1833 //ZZ IRExpr** args
1834 //ZZ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
1835 //ZZ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
1836 //ZZ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
1837 //ZZ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
1838 //ZZ IRExpr* call
1839 //ZZ = mkIRExprCCall(
1840 //ZZ Ity_I32,
1841 //ZZ 0/*regparm*/,
1842 //ZZ "armg_calculate_flag_v", &armg_calculate_flag_v,
1843 //ZZ args
1844 //ZZ );
1845 //ZZ /* Exclude OP and NDEP from definedness checking. We're only
1846 //ZZ interested in DEP1 and DEP2. */
1847 //ZZ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1848 //ZZ return call;
1849 //ZZ }
1852 /* Build IR to calculate N Z C V in bits 31:28 of the
1853 returned word. */
1854 static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
1856 IRExpr** args
1857 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1858 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1859 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1860 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1861 IRExpr* call
1862 = mkIRExprCCall(
1863 Ity_I64,
1864 0/*regparm*/,
1865 "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
1866 args
1868 /* Exclude OP and NDEP from definedness checking. We're only
1869 interested in DEP1 and DEP2. */
1870 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1871 return call;
1875 /* Build IR to set the flags thunk, in the most general case. */
1876 static
1877 void setFlags_D1_D2_ND ( UInt cc_op,
1878 IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
1880 vassert(typeOfIRTemp(irsb->tyenv, t_dep1) == Ity_I64);
1881 vassert(typeOfIRTemp(irsb->tyenv, t_dep2) == Ity_I64);
1882 vassert(typeOfIRTemp(irsb->tyenv, t_ndep) == Ity_I64);
1883 vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
1884 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(cc_op) ));
1885 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
1886 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
1887 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
1890 /* Build IR to set the flags thunk after ADD or SUB. */
1891 static
1892 void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
1894 IRTemp argL64 = IRTemp_INVALID;
1895 IRTemp argR64 = IRTemp_INVALID;
1896 IRTemp z64 = newTemp(Ity_I64);
1897 if (is64) {
1898 argL64 = argL;
1899 argR64 = argR;
1900 } else {
1901 argL64 = newTemp(Ity_I64);
1902 argR64 = newTemp(Ity_I64);
1903 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1904 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1906 assign(z64, mkU64(0));
1907 UInt cc_op = ARM64G_CC_OP_NUMBER;
1908 /**/ if ( isSUB && is64) { cc_op = ARM64G_CC_OP_SUB64; }
1909 else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
1910 else if (!isSUB && is64) { cc_op = ARM64G_CC_OP_ADD64; }
1911 else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
1912 else { vassert(0); }
1913 setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
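/* Usage sketch (illustrative): a decoder handling, say, a 64-bit ADDS
   follows the pattern

      assign(argL, getIReg64orSP(nn));
      assign(argR, mkU64(uimm12));
      putIReg64orZR(dd, binop(Iop_Add64, mkexpr(argL), mkexpr(argR)));
      setFlags_ADD_SUB(True/*is64*/, False/*!isSUB*/, argL, argR);

   exactly as the imm12 case further down does. */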
1916 /* Build IR to set the flags thunk after ADC or SBC. */
1917 static
1918 void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
1919 IRTemp argL, IRTemp argR, IRTemp oldC )
1921 IRTemp argL64 = IRTemp_INVALID;
1922 IRTemp argR64 = IRTemp_INVALID;
1923 IRTemp oldC64 = IRTemp_INVALID;
1924 if (is64) {
1925 argL64 = argL;
1926 argR64 = argR;
1927 oldC64 = oldC;
1928 } else {
1929 argL64 = newTemp(Ity_I64);
1930 argR64 = newTemp(Ity_I64);
1931 oldC64 = newTemp(Ity_I64);
1932 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1933 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1934 assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
1936 UInt cc_op = ARM64G_CC_OP_NUMBER;
1937 /**/ if ( isSBC && is64) { cc_op = ARM64G_CC_OP_SBC64; }
1938 else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
1939 else if (!isSBC && is64) { cc_op = ARM64G_CC_OP_ADC64; }
1940 else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
1941 else { vassert(0); }
1942 setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
1945 /* Build IR to set the flags thunk after ADD or SUB, if the given
1946 condition evaluates to True at run time. If not, the flags are set
1947 to the specified NZCV value. */
1948 static
1949 void setFlags_ADD_SUB_conditionally (
1950 Bool is64, Bool isSUB,
1951 IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
1954 /* Generate IR as follows:
1955 CC_OP = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
1956 CC_DEP1 = ITE(cond, argL64, nzcv << 28)
1957 CC_DEP2 = ITE(cond, argR64, 0)
1958 CC_NDEP = 0
1961 IRTemp z64 = newTemp(Ity_I64);
1962 assign(z64, mkU64(0));
1964 /* Establish the operation and operands for the True case. */
1965 IRTemp t_dep1 = IRTemp_INVALID;
1966 IRTemp t_dep2 = IRTemp_INVALID;
1967 UInt t_op = ARM64G_CC_OP_NUMBER;
1968 /**/ if ( isSUB && is64) { t_op = ARM64G_CC_OP_SUB64; }
1969 else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
1970 else if (!isSUB && is64) { t_op = ARM64G_CC_OP_ADD64; }
1971 else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
1972 else { vassert(0); }
1973 /* */
1974 if (is64) {
1975 t_dep1 = argL;
1976 t_dep2 = argR;
1977 } else {
1978 t_dep1 = newTemp(Ity_I64);
1979 t_dep2 = newTemp(Ity_I64);
1980 assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
1981 assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
1984 /* Establish the operation and operands for the False case. */
1985 IRTemp f_dep1 = newTemp(Ity_I64);
1986 IRTemp f_dep2 = z64;
1987 UInt f_op = ARM64G_CC_OP_COPY;
1988 assign(f_dep1, mkU64(nzcv << 28));
1990 /* Final thunk values */
1991 IRTemp dep1 = newTemp(Ity_I64);
1992 IRTemp dep2 = newTemp(Ity_I64);
1993 IRTemp op = newTemp(Ity_I64);
1995 assign(op, IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
1996 assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
1997 assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));
1999 /* finally .. */
2000 stmt( IRStmt_Put( OFFB_CC_OP, mkexpr(op) ));
2001 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
2002 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
2003 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) ));
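/* Worked illustration: if |cond| evaluates to 0 at run time, the thunk
   degenerates to CC_OP = ARM64G_CC_OP_COPY with CC_DEP1 = nzcv << 28,
   which is the same state setFlags_COPY below would establish -- the
   flags simply become the literal NZCV field from the instruction. */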
2006 /* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
2007 static
2008 void setFlags_LOGIC ( Bool is64, IRTemp res )
2010 IRTemp res64 = IRTemp_INVALID;
2011 IRTemp z64 = newTemp(Ity_I64);
2012 UInt cc_op = ARM64G_CC_OP_NUMBER;
2013 if (is64) {
2014 res64 = res;
2015 cc_op = ARM64G_CC_OP_LOGIC64;
2016 } else {
2017 res64 = newTemp(Ity_I64);
2018 assign(res64, unop(Iop_32Uto64, mkexpr(res)));
2019 cc_op = ARM64G_CC_OP_LOGIC32;
2021 assign(z64, mkU64(0));
2022 setFlags_D1_D2_ND(cc_op, res64, z64, z64);
2025 /* Build IR to set the flags thunk to a given NZCV value. NZCV is
2026 located in bits 31:28 of the supplied value. */
2027 static
2028 void setFlags_COPY ( IRTemp nzcv_28x0 )
2030 IRTemp z64 = newTemp(Ity_I64);
2031 assign(z64, mkU64(0));
2032 setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
2036 //ZZ /* Minor variant of the above that sets NDEP to zero (if it
2037 //ZZ sets it at all) */
2038 //ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
2039 //ZZ IRTemp t_dep2,
2040 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2041 //ZZ {
2042 //ZZ IRTemp z32 = newTemp(Ity_I32);
2043 //ZZ assign( z32, mkU32(0) );
2044 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
2045 //ZZ }
2046 //ZZ
2047 //ZZ
2048 //ZZ /* Minor variant of the above that sets DEP2 to zero (if it
2049 //ZZ sets it at all) */
2050 //ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
2051 //ZZ IRTemp t_ndep,
2052 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2053 //ZZ {
2054 //ZZ IRTemp z32 = newTemp(Ity_I32);
2055 //ZZ assign( z32, mkU32(0) );
2056 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
2057 //ZZ }
2058 //ZZ
2059 //ZZ
2060 //ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
2061 //ZZ sets them at all) */
2062 //ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
2063 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2064 //ZZ {
2065 //ZZ IRTemp z32 = newTemp(Ity_I32);
2066 //ZZ assign( z32, mkU32(0) );
2067 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
2068 //ZZ }
2071 /*------------------------------------------------------------*/
2072 /*--- Misc math helpers ---*/
2073 /*------------------------------------------------------------*/
2075 /* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
2076 static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
2078 IRTemp maskT = newTemp(Ity_I64);
2079 IRTemp res = newTemp(Ity_I64);
2080 vassert(sh >= 1 && sh <= 63);
2081 assign(maskT, mkU64(mask));
2082 assign( res,
2083 binop(Iop_Or64,
2084 binop(Iop_Shr64,
2085 binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
2086 mkU8(sh)),
2087 binop(Iop_And64,
2088 binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
2089 mkexpr(maskT))
2092 return res;
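/* Worked example (illustrative): with mask 0xFF00FF00FF00FF00 and sh 8,
   x = 0x1122334455667788 gives
      (x & mask) >>u 8  =  0x0011003300550077
      (x << 8) & mask   =  0x2200440066008800
   whose OR is 0x2211443366558877 -- the two bytes in each 16-bit lane
   have been exchanged. */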
2095 /* Generates byte swaps within 32-bit lanes. */
2096 static IRTemp math_UINTSWAP64 ( IRTemp src )
2098 IRTemp res;
2099 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2100 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2101 return res;
2104 /* Generates byte swaps within 16-bit lanes. */
2105 static IRTemp math_USHORTSWAP64 ( IRTemp src )
2107 IRTemp res;
2108 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2109 return res;
2112 /* Generates a 64-bit byte swap. */
2113 static IRTemp math_BYTESWAP64 ( IRTemp src )
2115 IRTemp res;
2116 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2117 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2118 res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
2119 return res;
2122 /* Generates a 64-bit bit swap. */
2123 static IRTemp math_BITSWAP64 ( IRTemp src )
2125 IRTemp res;
2126 res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
2127 res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
2128 res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
2129 return math_BYTESWAP64(res);
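/* Worked example (illustrative): for src = 0x1 the three SWAPHELPER
   passes give 0x2, then 0x8, then 0x80 (the set bit climbs to the top
   of its byte), and the final BYTESWAP64 moves that byte to the top of
   the word, giving 0x8000000000000000 -- the bit-reversed value. */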
2132 /* Duplicates the bits at the bottom of the given word to fill the
2133 whole word. src :: Ity_I64 is assumed to have zeroes everywhere
2134 except for the bottom bits. */
2135 static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
2137 if (srcTy == Ity_I8) {
2138 IRTemp t16 = newTemp(Ity_I64);
2139 assign(t16, binop(Iop_Or64, mkexpr(src),
2140 binop(Iop_Shl64, mkexpr(src), mkU8(8))));
2141 IRTemp t32 = newTemp(Ity_I64);
2142 assign(t32, binop(Iop_Or64, mkexpr(t16),
2143 binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
2144 IRTemp t64 = newTemp(Ity_I64);
2145 assign(t64, binop(Iop_Or64, mkexpr(t32),
2146 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2147 return t64;
2149 if (srcTy == Ity_I16) {
2150 IRTemp t32 = newTemp(Ity_I64);
2151 assign(t32, binop(Iop_Or64, mkexpr(src),
2152 binop(Iop_Shl64, mkexpr(src), mkU8(16))));
2153 IRTemp t64 = newTemp(Ity_I64);
2154 assign(t64, binop(Iop_Or64, mkexpr(t32),
2155 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2156 return t64;
2158 if (srcTy == Ity_I32) {
2159 IRTemp t64 = newTemp(Ity_I64);
2160 assign(t64, binop(Iop_Or64, mkexpr(src),
2161 binop(Iop_Shl64, mkexpr(src), mkU8(32))));
2162 return t64;
2164 if (srcTy == Ity_I64) {
2165 return src;
2167 vassert(0);
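/* Illustrative example: for srcTy == Ity_I8 and src = 0x5A the three
   or-with-shift steps produce 0x5A5A, then 0x5A5A5A5A, and finally
   0x5A5A5A5A5A5A5A5A.  This relies on the stated precondition that all
   bits above the bottom lane are already zero. */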
2171 /* Duplicates the src element exactly so as to fill a V128 value. */
2172 static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
2174 IRTemp res = newTempV128();
2175 if (srcTy == Ity_F64) {
2176 IRTemp i64 = newTemp(Ity_I64);
2177 assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
2178 assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
2179 return res;
2181 if (srcTy == Ity_F32) {
2182 IRTemp i64a = newTemp(Ity_I64);
2183 assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
2184 IRTemp i64b = newTemp(Ity_I64);
2185 assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
2186 mkexpr(i64a)));
2187 assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
2188 return res;
2190 if (srcTy == Ity_I64) {
2191 assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
2192 return res;
2194 if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
2195 IRTemp t1 = newTemp(Ity_I64);
2196 assign(t1, widenUto64(srcTy, mkexpr(src)));
2197 IRTemp t2 = math_DUP_TO_64(t1, srcTy);
2198 assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
2199 return res;
2201 vassert(0);
2205 /* |fullWidth| is a full V128 width result. Depending on bitQ,
2206 zero out the upper half. */
2207 static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
2209 if (bitQ == 1) return mkexpr(fullWidth);
2210 if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
2211 vassert(0);
2214 /* The same, but from an expression instead. */
2215 static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
2217 IRTemp fullWidthT = newTempV128();
2218 assign(fullWidthT, fullWidth);
2219 return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
2223 /*------------------------------------------------------------*/
2224 /*--- FP comparison helpers ---*/
2225 /*------------------------------------------------------------*/
2227 /* irRes :: Ity_I32 holds a floating point comparison result encoded
2228 as an IRCmpF64Result. Generate code to convert it to an
2229 ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
2230 Assign a new temp to hold that value, and return the temp. */
2231 static
2232 IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
2234 IRTemp ix = newTemp(Ity_I64);
2235 IRTemp termL = newTemp(Ity_I64);
2236 IRTemp termR = newTemp(Ity_I64);
2237 IRTemp nzcv = newTemp(Ity_I64);
2238 IRTemp irRes = newTemp(Ity_I64);
2240 /* This is where the fun starts. We have to convert 'irRes' from
2241 an IR-convention return result (IRCmpF64Result) to an
2242 ARM-encoded (N,Z,C,V) group. The final result is in the bottom
2243 4 bits of 'nzcv'. */
2244 /* Map compare result from IR to ARM(nzcv) */
2246 FP cmp result | IR | ARM(nzcv)
2247 --------------------------------
2248 UN 0x45 0011
2249 LT 0x01 1000
2250 GT 0x00 0010
2251 EQ 0x40 0110
2253 /* Now since you're probably wondering WTF ..
2255 ix fishes the useful bits out of the IR value, bits 6 and 0, and
2256 places them side by side, giving a number which is 0, 1, 2 or 3.
2258 termL is a sequence cooked up by GNU superopt. It converts ix
2259 into an almost correct NZCV value (incredibly), except
2260 for the case of UN, where it produces 0100 instead of the
2261 required 0011.
2263 termR is therefore a correction term, also computed from ix. It
2264 is 1 in the UN case and 0 for LT, GT and EQ. Hence, to get
2265 the final correct value, we subtract termR from termL.
2267 Don't take my word for it. There's a test program at the bottom
2268 of guest_arm_toIR.c, to try this out with.
2270 assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));
2272 assign(
2274 binop(Iop_Or64,
2275 binop(Iop_And64,
2276 binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
2277 mkU64(3)),
2278 binop(Iop_And64, mkexpr(irRes), mkU64(1))));
2280 assign(
2281 termL,
2282 binop(Iop_Add64,
2283 binop(Iop_Shr64,
2284 binop(Iop_Sub64,
2285 binop(Iop_Shl64,
2286 binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
2287 mkU8(62)),
2288 mkU64(1)),
2289 mkU8(61)),
2290 mkU64(1)));
2292 assign(
2293 termR,
2294 binop(Iop_And64,
2295 binop(Iop_And64,
2296 mkexpr(ix),
2297 binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
2298 mkU64(1)));
2300 assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
2301 return nzcv;
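/* Hand check of the above (illustrative; all arithmetic is unsigned
   64-bit, and termL = ((((ix ^ 1) << 62) - 1) >> 61) + 1):
      ix = 0 (GT): termL = 2 (0010), termR = 0  ->  0010
      ix = 1 (LT): termL = 8 (1000), termR = 0  ->  1000
      ix = 2 (EQ): termL = 6 (0110), termR = 0  ->  0110
      ix = 3 (UN): termL = 4 (0100), termR = 1  ->  0011
   which agrees with the table above. */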
2305 /*------------------------------------------------------------*/
2306 /*--- Data processing (immediate) ---*/
2307 /*------------------------------------------------------------*/
2309 /* Helper functions for supporting "DecodeBitMasks" */
2311 static ULong dbm_ROR ( Int width, ULong x, Int rot )
2313 vassert(width > 0 && width <= 64);
2314 vassert(rot >= 0 && rot < width);
2315 if (rot == 0) return x;
2316 ULong res = x >> rot;
2317 res |= (x << (width - rot));
2318 if (width < 64)
2319 res &= ((1ULL << width) - 1);
2320 return res;
2323 static ULong dbm_RepTo64( Int esize, ULong x )
2325 switch (esize) {
2326 case 64:
2327 return x;
2328 case 32:
2329 x &= 0xFFFFFFFF; x |= (x << 32);
2330 return x;
2331 case 16:
2332 x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
2333 return x;
2334 case 8:
2335 x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
2336 return x;
2337 case 4:
2338 x &= 0xF; x |= (x << 4); x |= (x << 8);
2339 x |= (x << 16); x |= (x << 32);
2340 return x;
2341 case 2:
2342 x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
2343 x |= (x << 16); x |= (x << 32);
2344 return x;
2345 default:
2346 break;
2348 vpanic("dbm_RepTo64");
2349 /*NOTREACHED*/
2350 return 0;
2353 static Int dbm_highestSetBit ( ULong x )
2355 Int i;
2356 for (i = 63; i >= 0; i--) {
2357 if (x & (1ULL << i))
2358 return i;
2360 vassert(x == 0);
2361 return -1;
2364 static
2365 Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
2366 ULong immN, ULong imms, ULong immr, Bool immediate,
2367 UInt M /*32 or 64*/)
2369 vassert(immN < (1ULL << 1));
2370 vassert(imms < (1ULL << 6));
2371 vassert(immr < (1ULL << 6));
2372 vassert(immediate == False || immediate == True);
2373 vassert(M == 32 || M == 64);
2375 Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
2376 if (len < 1) { /* printf("fail1\n"); */ return False; }
2377 vassert(len <= 6);
2378 vassert(M >= (1 << len));
2380 vassert(len >= 1 && len <= 6);
2381 ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
2382 (1 << len) - 1;
2383 vassert(levels >= 1 && levels <= 63);
2385 if (immediate && ((imms & levels) == levels)) {
2386 /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
2387 return False;
2390 ULong S = imms & levels;
2391 ULong R = immr & levels;
2392 Int diff = S - R;
2393 diff &= 63;
2394 Int esize = 1 << len;
2395 vassert(2 <= esize && esize <= 64);
2397 /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
2398 same below with d. S can be 63 in which case we have an out of
2399 range and hence undefined shift. */
2400 vassert(S >= 0 && S <= 63);
2401 vassert(esize >= (S+1));
2402 ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
2403 //(1ULL << (S+1)) - 1;
2404 ((1ULL << S) - 1) + (1ULL << S);
2406 Int d = // diff<len-1:0>
2407 diff & ((1 << len)-1);
2408 vassert(esize >= (d+1));
2409 vassert(d >= 0 && d <= 63);
2411 ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
2412 //(1ULL << (d+1)) - 1;
2413 ((1ULL << d) - 1) + (1ULL << d);
2415 if (esize != 64) vassert(elem_s < (1ULL << esize));
2416 if (esize != 64) vassert(elem_d < (1ULL << esize));
2418 if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
2419 if (tmask) *tmask = dbm_RepTo64(esize, elem_d);
2421 return True;
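/* Worked example (illustrative): the 64-bit logical immediate 0xFF is
   encoded as N=1, imms=0b000111, immr=0.  That gives len = 6,
   esize = 64, S = 7, R = 0, elem_s = 0xFF and hence wmask = 0xFF.
   Likewise N=0, imms=0b110011, immr=0 gives len = 3, esize = 8, S = 3,
   elem_s = 0xF, and replication produces 0x0F0F0F0F0F0F0F0F. */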
2425 static
2426 Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
2427 UInt insn, Bool sigill_diag)
2429 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2431 /* insn[28:23]
2432 10000x PC-rel addressing
2433 10001x Add/subtract (immediate)
2434 100100 Logical (immediate)
2435 100101 Move Wide (immediate)
2436 100110 Bitfield
2437 100111 Extract
2440 /* ------------------ ADD/SUB{,S} imm12 ------------------ */
2441 if (INSN(28,24) == BITS5(1,0,0,0,1)) {
2442 Bool is64 = INSN(31,31) == 1;
2443 Bool isSub = INSN(30,30) == 1;
2444 Bool setCC = INSN(29,29) == 1;
2445 UInt sh = INSN(23,22);
2446 UInt uimm12 = INSN(21,10);
2447 UInt nn = INSN(9,5);
2448 UInt dd = INSN(4,0);
2449 const HChar* nm = isSub ? "sub" : "add";
2450 if (sh >= 2) {
2451 /* Invalid; fall through */
2452 } else {
2453 vassert(sh <= 1);
2454 uimm12 <<= (12 * sh);
2455 if (is64) {
2456 IRTemp argL = newTemp(Ity_I64);
2457 IRTemp argR = newTemp(Ity_I64);
2458 IRTemp res = newTemp(Ity_I64);
2459 assign(argL, getIReg64orSP(nn));
2460 assign(argR, mkU64(uimm12));
2461 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
2462 mkexpr(argL), mkexpr(argR)));
2463 if (setCC) {
2464 putIReg64orZR(dd, mkexpr(res));
2465 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
2466 DIP("%ss %s, %s, 0x%x\n",
2467 nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
2468 } else {
2469 putIReg64orSP(dd, mkexpr(res));
2470 DIP("%s %s, %s, 0x%x\n",
2471 nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
2473 } else {
2474 IRTemp argL = newTemp(Ity_I32);
2475 IRTemp argR = newTemp(Ity_I32);
2476 IRTemp res = newTemp(Ity_I32);
2477 assign(argL, getIReg32orSP(nn));
2478 assign(argR, mkU32(uimm12));
2479 assign(res, binop(isSub ? Iop_Sub32 : Iop_Add32,
2480 mkexpr(argL), mkexpr(argR)));
2481 if (setCC) {
2482 putIReg32orZR(dd, mkexpr(res));
2483 setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
2484 DIP("%ss %s, %s, 0x%x\n",
2485 nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
2486 } else {
2487 putIReg32orSP(dd, mkexpr(res));
2488 DIP("%s %s, %s, 0x%x\n",
2489 nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
2492 return True;
2496 /* -------------------- ADR/ADRP -------------------- */
2497 if (INSN(28,24) == BITS5(1,0,0,0,0)) {
2498 UInt bP = INSN(31,31);
2499 UInt immLo = INSN(30,29);
2500 UInt immHi = INSN(23,5);
2501 UInt rD = INSN(4,0);
2502 ULong uimm = (immHi << 2) | immLo;
2503 ULong simm = sx_to_64(uimm, 21);
2504 ULong val;
2505 if (bP) {
2506 val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
2507 } else {
2508 val = guest_PC_curr_instr + simm;
2510 putIReg64orZR(rD, mkU64(val));
2511 DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
2512 return True;
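/* Illustrative example: for ADRP with simm = 1 and a (made-up) PC of
   0x400123, the result is (0x400123 & ~0xFFFULL) + (1 << 12)
   = 0x401000, whereas plain ADR would compute 0x400123 + 1. */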
2515 /* -------------------- LOGIC(imm) -------------------- */
2516 if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
2517 /* 31 30 28 22 21 15 9 4
2518 sf op 100100 N immr imms Rn Rd
2519 op=00: AND Rd|SP, Rn, #imm
2520 op=01: ORR Rd|SP, Rn, #imm
2521 op=10: EOR Rd|SP, Rn, #imm
2522 op=11: ANDS Rd|ZR, Rn, #imm
2524 Bool is64 = INSN(31,31) == 1;
2525 UInt op = INSN(30,29);
2526 UInt N = INSN(22,22);
2527 UInt immR = INSN(21,16);
2528 UInt immS = INSN(15,10);
2529 UInt nn = INSN(9,5);
2530 UInt dd = INSN(4,0);
2531 ULong imm = 0;
2532 Bool ok;
2533 if (N == 1 && !is64)
2534 goto after_logic_imm; /* not allowed; fall through */
2535 ok = dbm_DecodeBitMasks(&imm, NULL,
2536 N, immS, immR, True, is64 ? 64 : 32);
2537 if (!ok)
2538 goto after_logic_imm;
2540 const HChar* names[4] = { "and", "orr", "eor", "ands" };
2541 const IROp ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
2542 const IROp ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };
2544 vassert(op < 4);
2545 if (is64) {
2546 IRExpr* argL = getIReg64orZR(nn);
2547 IRExpr* argR = mkU64(imm);
2548 IRTemp res = newTemp(Ity_I64);
2549 assign(res, binop(ops64[op], argL, argR));
2550 if (op < 3) {
2551 putIReg64orSP(dd, mkexpr(res));
2552 DIP("%s %s, %s, 0x%llx\n", names[op],
2553 nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
2554 } else {
2555 putIReg64orZR(dd, mkexpr(res));
2556 setFlags_LOGIC(True/*is64*/, res);
2557 DIP("%s %s, %s, 0x%llx\n", names[op],
2558 nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
2560 } else {
2561 IRExpr* argL = getIReg32orZR(nn);
2562 IRExpr* argR = mkU32((UInt)imm);
2563 IRTemp res = newTemp(Ity_I32);
2564 assign(res, binop(ops32[op], argL, argR));
2565 if (op < 3) {
2566 putIReg32orSP(dd, mkexpr(res));
2567 DIP("%s %s, %s, 0x%x\n", names[op],
2568 nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
2569 } else {
2570 putIReg32orZR(dd, mkexpr(res));
2571 setFlags_LOGIC(False/*!is64*/, res);
2572 DIP("%s %s, %s, 0x%x\n", names[op],
2573 nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
2576 return True;
2578 after_logic_imm:
2580 /* -------------------- MOV{Z,N,K} -------------------- */
2581 if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
2582 /* 31 30 28 22 20 4
2583 | | | | | |
2584 sf 10 100 101 hw imm16 Rd MOV(Z) Rd, (imm16 << (16*hw))
2585 sf 00 100 101 hw imm16 Rd MOV(N) Rd, ~(imm16 << (16*hw))
2586 sf 11 100 101 hw imm16 Rd MOV(K) Rd, (imm16 << (16*hw))
2588 Bool is64 = INSN(31,31) == 1;
2589 UInt subopc = INSN(30,29);
2590 UInt hw = INSN(22,21);
2591 UInt imm16 = INSN(20,5);
2592 UInt dd = INSN(4,0);
2593 if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
2594 /* invalid; fall through */
2595 } else {
2596 ULong imm64 = ((ULong)imm16) << (16 * hw);
2597 if (!is64)
2598 vassert(imm64 < 0x100000000ULL);
2599 switch (subopc) {
2600 case BITS2(1,0): // MOVZ
2601 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2602 DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2603 break;
2604 case BITS2(0,0): // MOVN
2605 imm64 = ~imm64;
2606 if (!is64)
2607 imm64 &= 0xFFFFFFFFULL;
2608 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2609 DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2610 break;
2611 case BITS2(1,1): // MOVK
2612 /* This is more complex. We are inserting a slice into
2613 the destination register, so we need to have the old
2614 value of it. */
2615 if (is64) {
2616 IRTemp old = newTemp(Ity_I64);
2617 assign(old, getIReg64orZR(dd));
2618 ULong mask = 0xFFFFULL << (16 * hw);
2619 IRExpr* res
2620 = binop(Iop_Or64,
2621 binop(Iop_And64, mkexpr(old), mkU64(~mask)),
2622 mkU64(imm64));
2623 putIReg64orZR(dd, res);
2624 DIP("movk %s, 0x%x, lsl %u\n",
2625 nameIReg64orZR(dd), imm16, 16*hw);
2626 } else {
2627 IRTemp old = newTemp(Ity_I32);
2628 assign(old, getIReg32orZR(dd));
2629 vassert(hw <= 1);
2630 UInt mask = ((UInt)0xFFFF) << (16 * hw);
2631 IRExpr* res
2632 = binop(Iop_Or32,
2633 binop(Iop_And32, mkexpr(old), mkU32(~mask)),
2634 mkU32((UInt)imm64));
2635 putIReg32orZR(dd, res);
2636 DIP("movk %s, 0x%x, lsl %u\n",
2637 nameIReg32orZR(dd), imm16, 16*hw);
2639 break;
2640 default:
2641 vassert(0);
2643 return True;
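/* Illustrative example for the MOVK case: with the 64-bit destination
   currently holding 0xFFFFFFFFFFFFFFFF, "movk Xd, #0x1234, lsl 16"
   uses mask = 0xFFFF0000, so the result is
   (old & ~mask) | (0x1234 << 16) = 0xFFFFFFFF1234FFFF. */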
2647 /* -------------------- {U,S,}BFM -------------------- */
2648 /* 30 28 22 21 15 9 4
2650 sf 10 100110 N immr imms nn dd
2651 UBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2652 UBFM Xd, Xn, #immr, #imms when sf=1, N=1
2654 sf 00 100110 N immr imms nn dd
2655 SBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2656 SBFM Xd, Xn, #immr, #imms when sf=1, N=1
2658 sf 01 100110 N immr imms nn dd
2659 BFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2660 BFM Xd, Xn, #immr, #imms when sf=1, N=1
2662 if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
2663 UInt sf = INSN(31,31);
2664 UInt opc = INSN(30,29);
2665 UInt N = INSN(22,22);
2666 UInt immR = INSN(21,16);
2667 UInt immS = INSN(15,10);
2668 UInt nn = INSN(9,5);
2669 UInt dd = INSN(4,0);
2670 Bool inZero = False;
2671 Bool extend = False;
2672 const HChar* nm = "???";
2673 /* skip invalid combinations */
2674 switch (opc) {
2675 case BITS2(0,0):
2676 inZero = True; extend = True; nm = "sbfm"; break;
2677 case BITS2(0,1):
2678 inZero = False; extend = False; nm = "bfm"; break;
2679 case BITS2(1,0):
2680 inZero = True; extend = False; nm = "ubfm"; break;
2681 case BITS2(1,1):
2682 goto after_bfm; /* invalid */
2683 default:
2684 vassert(0);
2686 if (sf == 1 && N != 1) goto after_bfm;
2687 if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
2688 || ((immS >> 5) & 1) != 0)) goto after_bfm;
2689 ULong wmask = 0, tmask = 0;
2690 Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
2691 N, immS, immR, False, sf == 1 ? 64 : 32);
2692 if (!ok) goto after_bfm; /* hmmm */
2694 Bool is64 = sf == 1;
2695 IRType ty = is64 ? Ity_I64 : Ity_I32;
2697 // Handle plain shifts explicitly. These are functionally identical to
2698 // the general case below, but iropt isn't clever enough to reduce those
2699 // sequences to plain shifts. So give it a hand.
2700 if (is64 && immS == 63 && immR >= 1 && immR <= 63) {
2701 if (opc == BITS2(0,0)) {
2702 // 64-bit signed shift right
2703 putIReg64orZR(dd, binop(Iop_Sar64, getIReg64orZR(nn), mkU8(immR)));
2704 DIP("asr %s, %s, #%u\n",
2705 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2706 return True;
2708 if (opc == BITS2(1,0)) {
2709 // 64-bit unsigned shift right
2710 putIReg64orZR(dd, binop(Iop_Shr64, getIReg64orZR(nn), mkU8(immR)));
2711 DIP("lsr %s, %s, #%u\n",
2712 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2713 return True;
2717 if (!is64 && immS == 31 && immR >= 1 && immR <= 31) {
2718 if (opc == BITS2(0,0)) {
2719 // 32-bit signed shift right
2720 putIReg32orZR(dd, binop(Iop_Sar32, getIReg32orZR(nn), mkU8(immR)));
2721 DIP("asr %s, %s, #%u\n",
2722 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2723 return True;
2725 if (opc == BITS2(1,0)) {
2726 // 32-bit unsigned shift right
2727 putIReg32orZR(dd, binop(Iop_Shr32, getIReg32orZR(nn), mkU8(immR)));
2728 DIP("lsr %s, %s, #%u\n",
2729 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2730 return True;
2734 if (is64 && immS >= 0 && immS <= 62
2735 && immR == immS + 1 && opc == BITS2(1,0)) {
2736 // 64-bit shift left
2737 UInt shift = 64 - immR;
2738 vassert(shift >= 1 && shift <= 63);
2739 putIReg64orZR(dd, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(shift)));
2740 DIP("lsl %s, %s, #%u\n",
2741 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), shift);
2742 return True;
2744 if (!is64 && immS >= 0 && immS <= 30
2745 && immR == immS + 1 && opc == BITS2(1,0)) {
2746 // 32-bit shift left
2747 UInt shift = 32 - immR;
2748 vassert(shift >= 1 && shift <= 31);
2749 putIReg32orZR(dd, binop(Iop_Shl32, getIReg32orZR(nn), mkU8(shift)));
2750 DIP("lsl %s, %s, #%u\n",
2751 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), shift);
2752 return True;
2755 // Also special-case SXTW, SXTH and SXTB (the sign-extensions, immR == 0).
2756 if (opc == BITS2(0,0) && immR == 0) {
2757 if (is64) {
2758 // The destination size is 64 bits.
2759 if (immS == 31) {
2760 putIReg64orZR(dd, unop(Iop_32Sto64, getIReg32orZR(nn)));
2761 DIP("sxtw %s, %s\n", nameIReg64orZR(dd), nameIReg32orZR(nn));
2762 return True;
2764 if (immS == 15) {
2765 putIReg64orZR(dd, unop(Iop_16Sto64,
2766 unop(Iop_64to16, getIReg64orZR(nn))));
2767 DIP("sxth %s, %s\n", nameIReg64orZR(dd), nameIReg32orZR(nn));
2768 return True;
2770 if (immS == 7) {
2771 putIReg64orZR(dd, unop(Iop_8Sto64,
2772 unop(Iop_64to8, getIReg64orZR(nn))));
2773 DIP("sxtb %s, %s\n", nameIReg64orZR(dd), nameIReg32orZR(nn));
2774 return True;
2776 } else {
2777 // The destination size is 32 bits.
2778 if (immS == 15) {
2779 putIReg32orZR(dd, unop(Iop_16Sto32,
2780 unop(Iop_64to16, getIReg64orZR(nn))));
2781 DIP("sxth %s, %s\n", nameIReg32orZR(dd), nameIReg32orZR(nn));
2782 return True;
2784 if (immS == 7) {
2785 putIReg32orZR(dd, unop(Iop_8Sto32,
2786 unop(Iop_64to8, getIReg64orZR(nn))));
2787 DIP("sxtb %s, %s\n", nameIReg32orZR(dd), nameIReg32orZR(nn));
2788 return True;
2793 // None of the special cases apply. We have to use the (slow) general
2794 // case.
2795 IRTemp dst = newTemp(ty);
2796 IRTemp src = newTemp(ty);
2797 IRTemp bot = newTemp(ty);
2798 IRTemp top = newTemp(ty);
2799 IRTemp res = newTemp(ty);
2800 assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
2801 assign(src, getIRegOrZR(is64, nn));
2802 /* perform bitfield move on low bits */
2803 assign(bot, binop(mkOR(ty),
2804 binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
2805 binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
2806 mkU(ty, wmask))));
2807 /* determine extension bits (sign, zero or dest register) */
2808 assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
2809 /* combine extension bits and result bits */
2810 assign(res, binop(mkOR(ty),
2811 binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
2812 binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
2813 putIRegOrZR(is64, dd, mkexpr(res));
2814 DIP("%s %s, %s, immR=%u, immS=%u\n",
2815 nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
2816 return True;
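/* Worked example of the general case (illustrative): UBFM Xd, Xn, #8,
   #15 -- in effect UBFX of bits 15:8 -- decodes to
   wmask = 0xFF000000000000FF and tmask = 0xFF.  With inZero set,
   bot = ROR(src, 8) & wmask and res = bot & tmask, i.e. src<15:8>
   placed at the bottom of Xd with everything above it zeroed. */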
2818 after_bfm:
2820 /* ---------------------- EXTR ---------------------- */
2821 /* 30 28 22 20 15 9 4
2822 1 00 100111 10 m imm6 n d EXTR Xd, Xn, Xm, #imm6
2823 0 00 100111 00 m imm6 n d EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
2825 if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
2826 Bool is64 = INSN(31,31) == 1;
2827 UInt mm = INSN(20,16);
2828 UInt imm6 = INSN(15,10);
2829 UInt nn = INSN(9,5);
2830 UInt dd = INSN(4,0);
2831 Bool valid = True;
2832 if (INSN(31,31) != INSN(22,22))
2833 valid = False;
2834 if (!is64 && imm6 >= 32)
2835 valid = False;
2836 if (!valid) goto after_extr;
2837 IRType ty = is64 ? Ity_I64 : Ity_I32;
2838 IRTemp srcHi = newTemp(ty);
2839 IRTemp srcLo = newTemp(ty);
2840 IRTemp res = newTemp(ty);
2841 assign(srcHi, getIRegOrZR(is64, nn));
2842 assign(srcLo, getIRegOrZR(is64, mm));
2843 if (imm6 == 0) {
2844 assign(res, mkexpr(srcLo));
2845 } else {
2846 UInt szBits = 8 * sizeofIRType(ty);
2847 vassert(imm6 > 0 && imm6 < szBits);
2848 assign(res, binop(mkOR(ty),
2849 binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
2850 binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
2852 putIRegOrZR(is64, dd, mkexpr(res));
2853 DIP("extr %s, %s, %s, #%u\n",
2854 nameIRegOrZR(is64,dd),
2855 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
2856 return True;
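/* Illustrative example: "extr x0, x1, x2, #8" yields
   (x1 << 56) | (x2 >>u 8): the bottom 8 bits of x1 concatenated with
   the top 56 bits of x2, i.e. bits 71:8 of the 128-bit value x1:x2. */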
2858 after_extr:
2860 if (sigill_diag) {
2861 vex_printf("ARM64 front end: data_processing_immediate\n");
2863 return False;
2864 # undef INSN
2868 /*------------------------------------------------------------*/
2869 /*--- Data processing (register) instructions ---*/
2870 /*------------------------------------------------------------*/
2872 static const HChar* nameSH ( UInt sh ) {
2873 switch (sh) {
2874 case 0: return "lsl";
2875 case 1: return "lsr";
2876 case 2: return "asr";
2877 case 3: return "ror";
2878 default: vassert(0);
2882 /* Generate IR to get a register value, possibly shifted by an
2883 immediate. Returns either a 32- or 64-bit temporary holding the
2884 result. After the shift, the value can optionally be NOT-ed
2885 too.
2887 sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR. sh_amt may only be
2888 in the range 0 to (is64 ? 64 : 32)-1. For some instructions, ROR
2889 isn't allowed, but it's the job of the caller to check that.
2891 static IRTemp getShiftedIRegOrZR ( Bool is64,
2892 UInt sh_how, UInt sh_amt, UInt regNo,
2893 Bool invert )
2895 vassert(sh_how < 4);
2896 vassert(sh_amt < (is64 ? 64 : 32));
2897 IRType ty = is64 ? Ity_I64 : Ity_I32;
2898 IRTemp t0 = newTemp(ty);
2899 assign(t0, getIRegOrZR(is64, regNo));
2900 IRTemp t1 = newTemp(ty);
2901 switch (sh_how) {
2902 case BITS2(0,0):
2903 assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
2904 break;
2905 case BITS2(0,1):
2906 assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
2907 break;
2908 case BITS2(1,0):
2909 assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
2910 break;
2911 case BITS2(1,1):
2912 assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
2913 break;
2914 default:
2915 vassert(0);
2917 if (invert) {
2918 IRTemp t2 = newTemp(ty);
2919 assign(t2, unop(mkNOT(ty), mkexpr(t1)));
2920 return t2;
2921 } else {
2922 return t1;
2927 static
2928 Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
2929 UInt insn, Bool sigill_diag)
2931 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2933 /* ------------------- ADD/SUB(reg) ------------------- */
2934 /* x==0 => 32 bit op x==1 => 64 bit op
2935 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)
2937 31 30 29 28 23 21 20 15 9 4
2938 | | | | | | | | | |
2939 x 0 0 01011 sh 0 Rm imm6 Rn Rd ADD Rd,Rn, sh(Rm,imm6)
2940 x 0 1 01011 sh 0 Rm imm6 Rn Rd ADDS Rd,Rn, sh(Rm,imm6)
2941 x 1 0 01011 sh 0 Rm imm6 Rn Rd SUB Rd,Rn, sh(Rm,imm6)
2942 x 1 1 01011 sh 0 Rm imm6 Rn Rd SUBS Rd,Rn, sh(Rm,imm6)
2944 if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
2945 UInt bX = INSN(31,31);
2946 UInt bOP = INSN(30,30); /* 0: ADD, 1: SUB */
2947 UInt bS = INSN(29, 29); /* set flags? */
2948 UInt sh = INSN(23,22);
2949 UInt rM = INSN(20,16);
2950 UInt imm6 = INSN(15,10);
2951 UInt rN = INSN(9,5);
2952 UInt rD = INSN(4,0);
2953 Bool isSUB = bOP == 1;
2954 Bool is64 = bX == 1;
2955 IRType ty = is64 ? Ity_I64 : Ity_I32;
2956 if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
2957 /* invalid; fall through */
2958 } else {
2959 IRTemp argL = newTemp(ty);
2960 assign(argL, getIRegOrZR(is64, rN));
2961 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
2962 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
2963 IRTemp res = newTemp(ty);
2964 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2965 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2966 if (bS) {
2967 setFlags_ADD_SUB(is64, isSUB, argL, argR);
2969 DIP("%s%s %s, %s, %s, %s #%u\n",
2970 bOP ? "sub" : "add", bS ? "s" : "",
2971 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2972 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2973 return True;
2977 /* ------------------- ADC/SBC(reg) ------------------- */
2978 /* x==0 => 32 bit op x==1 => 64 bit op
2980 31 30 29 28 23 21 20 15 9 4
2981 | | | | | | | | | |
2982 x 0 0 11010 00 0 Rm 000000 Rn Rd ADC Rd,Rn,Rm
2983 x 0 1 11010 00 0 Rm 000000 Rn Rd ADCS Rd,Rn,Rm
2984 x 1 0 11010 00 0 Rm 000000 Rn Rd SBC Rd,Rn,Rm
2985 x 1 1 11010 00 0 Rm 000000 Rn Rd SBCS Rd,Rn,Rm
2988 if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
2989 UInt bX = INSN(31,31);
2990 UInt bOP = INSN(30,30); /* 0: ADC, 1: SBC */
2991 UInt bS = INSN(29,29); /* set flags */
2992 UInt rM = INSN(20,16);
2993 UInt rN = INSN(9,5);
2994 UInt rD = INSN(4,0);
2996 Bool isSUB = bOP == 1;
2997 Bool is64 = bX == 1;
2998 IRType ty = is64 ? Ity_I64 : Ity_I32;
3000 IRTemp oldC = newTemp(ty);
3001 assign(oldC,
3002 is64 ? mk_arm64g_calculate_flag_c()
3003 : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );
3005 IRTemp argL = newTemp(ty);
3006 assign(argL, getIRegOrZR(is64, rN));
3007 IRTemp argR = newTemp(ty);
3008 assign(argR, getIRegOrZR(is64, rM));
3010 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
3011 IRTemp res = newTemp(ty);
3012 if (isSUB) {
3013 IRExpr* one = is64 ? mkU64(1) : mkU32(1);
3014 IROp xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
3015 assign(res,
3016 binop(op,
3017 binop(op, mkexpr(argL), mkexpr(argR)),
3018 binop(xorOp, mkexpr(oldC), one)));
3019 } else {
3020 assign(res,
3021 binop(op,
3022 binop(op, mkexpr(argL), mkexpr(argR)),
3023 mkexpr(oldC)));
3026 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
3028 if (bS) {
3029 setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
3032 DIP("%s%s %s, %s, %s\n",
3033 bOP ? "sbc" : "adc", bS ? "s" : "",
3034 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
3035 nameIRegOrZR(is64, rM));
3036 return True;
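/* Illustrative check of the SBC arithmetic above: AArch64 SBC computes
   Rn - Rm - (1 - C).  When the carry (no-borrow) flag is 1, the
   correction term (oldC ^ 1) is 0 and this is a plain subtract; when
   it is 0, one extra unit is subtracted, as required. */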
3039 /* -------------------- LOGIC(reg) -------------------- */
3040 /* x==0 => 32 bit op x==1 => 64 bit op
3041 N==0 => inv? is no-op (no inversion)
3042 N==1 => inv? is NOT
3043 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR
3045 31 30 28 23 21 20 15 9 4
3046 | | | | | | | | |
3047 x 00 01010 sh N Rm imm6 Rn Rd AND Rd,Rn, inv?(sh(Rm,imm6))
3048 x 01 01010 sh N Rm imm6 Rn Rd ORR Rd,Rn, inv?(sh(Rm,imm6))
3049 x 10 01010 sh N Rm imm6 Rn Rd EOR Rd,Rn, inv?(sh(Rm,imm6))
3050 x 11 01010 sh N Rm imm6 Rn Rd ANDS Rd,Rn, inv?(sh(Rm,imm6))
3051 With N=1, the names are: BIC ORN EON BICS
3053 if (INSN(28,24) == BITS5(0,1,0,1,0)) {
3054 UInt bX = INSN(31,31);
3055 UInt sh = INSN(23,22);
3056 UInt bN = INSN(21,21);
3057 UInt rM = INSN(20,16);
3058 UInt imm6 = INSN(15,10);
3059 UInt rN = INSN(9,5);
3060 UInt rD = INSN(4,0);
3061 Bool is64 = bX == 1;
3062 IRType ty = is64 ? Ity_I64 : Ity_I32;
3063 if (!is64 && imm6 > 31) {
3064 /* invalid; fall through */
3065 } else {
3066 IRTemp argL = newTemp(ty);
3067 assign(argL, getIRegOrZR(is64, rN));
3068 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
3069 IROp op = Iop_INVALID;
3070 switch (INSN(30,29)) {
3071 case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
3072 case BITS2(0,1): op = mkOR(ty); break;
3073 case BITS2(1,0): op = mkXOR(ty); break;
3074 default: vassert(0);
3076 IRTemp res = newTemp(ty);
3077 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
3078 if (INSN(30,29) == BITS2(1,1)) {
3079 setFlags_LOGIC(is64, res);
3081 putIRegOrZR(is64, rD, mkexpr(res));
3083 static const HChar* names_op[8]
3084 = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
3085 vassert(((bN << 2) | INSN(30,29)) < 8);
3086 const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
3087 /* Special-case the printing of "MOV" */
3088 if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
3089 DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
3090 nameIRegOrZR(is64, rM));
3091 } else {
3092 DIP("%s %s, %s, %s, %s #%u\n", nm_op,
3093 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
3094 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
3096 return True;
3100 /* -------------------- {U,S}MULH -------------------- */
3101 /* 31 23 22 20 15 9 4
3102 10011011 1 10 Rm 011111 Rn Rd UMULH Xd,Xn,Xm
3103 10011011 0 10 Rm 011111 Rn Rd SMULH Xd,Xn,Xm
3105 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
3106 && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
3107 Bool isU = INSN(23,23) == 1;
3108 UInt mm = INSN(20,16);
3109 UInt nn = INSN(9,5);
3110 UInt dd = INSN(4,0);
3111 putIReg64orZR(dd, unop(Iop_128HIto64,
3112 binop(isU ? Iop_MullU64 : Iop_MullS64,
3113 getIReg64orZR(nn), getIReg64orZR(mm))));
3114 DIP("%cmulh %s, %s, %s\n",
3115 isU ? 'u' : 's',
3116 nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
3117 return True;
3120 /* -------------------- M{ADD,SUB} -------------------- */
3121 /* 31 30 20 15 14 9 4
3122 sf 00 11011 000 m 0 a n r MADD Rd,Rn,Rm,Ra d = a+m*n
3123 sf 00 11011 000 m 1 a n r MSUB Rd,Rn,Rm,Ra d = a-m*n
3125 if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
3126 Bool is64 = INSN(31,31) == 1;
3127 UInt mm = INSN(20,16);
3128 Bool isAdd = INSN(15,15) == 0;
3129 UInt aa = INSN(14,10);
3130 UInt nn = INSN(9,5);
3131 UInt dd = INSN(4,0);
3132 if (is64) {
3133 putIReg64orZR(
3135 binop(isAdd ? Iop_Add64 : Iop_Sub64,
3136 getIReg64orZR(aa),
3137 binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
3138 } else {
3139 putIReg32orZR(
3141 binop(isAdd ? Iop_Add32 : Iop_Sub32,
3142 getIReg32orZR(aa),
3143 binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
3145 DIP("%s %s, %s, %s, %s\n",
3146 isAdd ? "madd" : "msub",
3147 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3148 nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
3149 return True;
3152 /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
3153 /* 31 30 28 20 15 11 9 4
3154 sf 00 1101 0100 mm cond 00 nn dd CSEL Rd,Rn,Rm
3155 sf 00 1101 0100 mm cond 01 nn dd CSINC Rd,Rn,Rm
3156 sf 10 1101 0100 mm cond 00 nn dd CSINV Rd,Rn,Rm
3157 sf 10 1101 0100 mm cond 01 nn dd CSNEG Rd,Rn,Rm
3158 In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
3160 if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
3161 Bool is64 = INSN(31,31) == 1;
3162 UInt b30 = INSN(30,30);
3163 UInt mm = INSN(20,16);
3164 UInt cond = INSN(15,12);
3165 UInt b10 = INSN(10,10);
3166 UInt nn = INSN(9,5);
3167 UInt dd = INSN(4,0);
3168 UInt op = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
3169 IRType ty = is64 ? Ity_I64 : Ity_I32;
3170 IRExpr* argL = getIRegOrZR(is64, nn);
3171 IRExpr* argR = getIRegOrZR(is64, mm);
3172 switch (op) {
3173 case BITS2(0,0):
3174 break;
3175 case BITS2(0,1):
3176 argR = binop(mkADD(ty), argR, mkU(ty,1));
3177 break;
3178 case BITS2(1,0):
3179 argR = unop(mkNOT(ty), argR);
3180 break;
3181 case BITS2(1,1):
3182 argR = binop(mkSUB(ty), mkU(ty,0), argR);
3183 break;
3184 default:
3185 vassert(0);
3187 putIRegOrZR(
3188 is64, dd,
3189 IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
3190 argL, argR)
3192 const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
3193 DIP("%s %s, %s, %s, %s\n", op_nm[op],
3194 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3195 nameIRegOrZR(is64, mm), nameCC(cond));
3196 return True;
3199 /* -------------- ADD/SUB(extended reg) -------------- */
3200 /* 28 20 15 12 9 4
3201 000 01011 00 1 m opt imm3 n d ADD Wd|SP, Wn|SP, Wm ext&lsld
3202 100 01011 00 1 m opt imm3 n d ADD Xd|SP, Xn|SP, Rm ext&lsld
3204 001 01011 00 1 m opt imm3 n d ADDS Wd, Wn|SP, Wm ext&lsld
3205 101 01011 00 1 m opt imm3 n d ADDS Xd, Xn|SP, Rm ext&lsld
3207 010 01011 00 1 m opt imm3 n d SUB Wd|SP, Wn|SP, Wm ext&lsld
3208 110 01011 00 1 m opt imm3 n d SUB Xd|SP, Xn|SP, Rm ext&lsld
3210 011 01011 00 1 m opt imm3 n d SUBS Wd, Wn|SP, Wm ext&lsld
3211 111 01011 00 1 m opt imm3 n d SUBS Xd, Xn|SP, Rm ext&lsld
3213 The 'm' operand is extended per opt, thusly:
3215 000 Xm & 0xFF UXTB
3216 001 Xm & 0xFFFF UXTH
3217 010 Xm & (2^32)-1 UXTW
3218 011 Xm UXTX
3220 100 Xm sx from bit 7 SXTB
3221 101 Xm sx from bit 15 SXTH
3222 110 Xm sx from bit 31 SXTW
3223 111 Xm SXTX
3225 In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
3226 operation on Xm. In the 32 bit case, UXTW, UXTX, SXTW and SXTX
3227 are the identity operation on Wm.
3229 After extension, the value is shifted left by imm3 bits, which
3230 may only be in the range 0 .. 4 inclusive.
3232 if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
3233 Bool is64 = INSN(31,31) == 1;
3234 Bool isSub = INSN(30,30) == 1;
3235 Bool setCC = INSN(29,29) == 1;
3236 UInt mm = INSN(20,16);
3237 UInt opt = INSN(15,13);
3238 UInt imm3 = INSN(12,10);
3239 UInt nn = INSN(9,5);
3240 UInt dd = INSN(4,0);
3241 const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
3242 "sxtb", "sxth", "sxtw", "sxtx" };
3243 /* Do almost the same thing in the 32- and 64-bit cases. */
3244 IRTemp xN = newTemp(Ity_I64);
3245 IRTemp xM = newTemp(Ity_I64);
3246 assign(xN, getIReg64orSP(nn));
3247 assign(xM, getIReg64orZR(mm));
3248 IRExpr* xMw = mkexpr(xM); /* "xM widened" */
3249 Int shSX = 0;
3250 /* widen Xm .. */
3251 switch (opt) {
3252 case BITS3(0,0,0): // UXTB
3253 xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
3254 case BITS3(0,0,1): // UXTH
3255 xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
3256 case BITS3(0,1,0): // UXTW -- noop for the 32bit case
3257 if (is64) {
3258 xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
3260 break;
3261 case BITS3(0,1,1): // UXTX -- always a noop
3262 break;
3263 case BITS3(1,0,0): // SXTB
3264 shSX = 56; goto sxTo64;
3265 case BITS3(1,0,1): // SXTH
3266 shSX = 48; goto sxTo64;
3267 case BITS3(1,1,0): // SXTW -- noop for the 32bit case
3268 if (is64) {
3269 shSX = 32; goto sxTo64;
3271 break;
3272 case BITS3(1,1,1): // SXTX -- always a noop
3273 break;
3274 sxTo64:
3275 vassert(shSX >= 32);
3276 xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
3277 mkU8(shSX));
3278 break;
3279 default:
3280 vassert(0);
3282 /* and now shift */
3283 IRTemp argL = xN;
3284 IRTemp argR = newTemp(Ity_I64);
3285 assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
3286 IRTemp res = newTemp(Ity_I64);
3287 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
3288 mkexpr(argL), mkexpr(argR)));
3289 if (is64) {
3290 if (setCC) {
3291 putIReg64orZR(dd, mkexpr(res));
3292 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
3293 } else {
3294 putIReg64orSP(dd, mkexpr(res));
3296 } else {
3297 if (setCC) {
3298 IRTemp argL32 = newTemp(Ity_I32);
3299 IRTemp argR32 = newTemp(Ity_I32);
3300 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
3301 assign(argL32, unop(Iop_64to32, mkexpr(argL)));
3302 assign(argR32, unop(Iop_64to32, mkexpr(argR)));
3303 setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
3304 } else {
3305 putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
3308 DIP("%s%s %s, %s, %s %s lsl %u\n",
3309 isSub ? "sub" : "add", setCC ? "s" : "",
3310 setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
3311 nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm),
3312 nameExt[opt], imm3);
3313 return True;
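/* Illustrative example: "add x1, sp, w2, uxtb #2" follows the UXTB row
   of the table above: X1 = SP + ((X2 & 0xFF) << 2).  The extension is
   always applied before the left shift by imm3. */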
3316 /* ---------------- CCMP/CCMN(imm) ---------------- */
3317 /* Bizarrely, these appear in the "data processing register"
3318 category, even though they are operations against an
3319 immediate. */
3320 /* 31 29 20 15 11 9 3
3321 sf 1 111010010 imm5 cond 10 Rn 0 nzcv CCMP Rn, #imm5, #nzcv, cond
3322 sf 0 111010010 imm5 cond 10 Rn 0 nzcv CCMN Rn, #imm5, #nzcv, cond
3324 Operation is:
3325 (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
3326 (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
3328 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3329 && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
3330 Bool is64 = INSN(31,31) == 1;
3331 Bool isSUB = INSN(30,30) == 1;
3332 UInt imm5 = INSN(20,16);
3333 UInt cond = INSN(15,12);
3334 UInt nn = INSN(9,5);
3335 UInt nzcv = INSN(3,0);
3337 IRTemp condT = newTemp(Ity_I1);
3338 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3340 IRType ty = is64 ? Ity_I64 : Ity_I32;
3341 IRTemp argL = newTemp(ty);
3342 IRTemp argR = newTemp(ty);
3344 if (is64) {
3345 assign(argL, getIReg64orZR(nn));
3346 assign(argR, mkU64(imm5));
3347 } else {
3348 assign(argL, getIReg32orZR(nn));
3349 assign(argR, mkU32(imm5));
3351 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3353 DIP("ccm%c %s, #%u, #%u, %s\n",
3354 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3355 imm5, nzcv, nameCC(cond));
3356 return True;
3359 /* ---------------- CCMP/CCMN(reg) ---------------- */
3360 /* 31 29 20 15 11 9 3
3361 sf 1 111010010 Rm cond 00 Rn 0 nzcv CCMP Rn, Rm, #nzcv, cond
3362 sf 0 111010010 Rm cond 00 Rn 0 nzcv CCMN Rn, Rm, #nzcv, cond
3363 Operation is:
3364 (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
3365 (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
3367 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3368 && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
3369 Bool is64 = INSN(31,31) == 1;
3370 Bool isSUB = INSN(30,30) == 1;
3371 UInt mm = INSN(20,16);
3372 UInt cond = INSN(15,12);
3373 UInt nn = INSN(9,5);
3374 UInt nzcv = INSN(3,0);
3376 IRTemp condT = newTemp(Ity_I1);
3377 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3379 IRType ty = is64 ? Ity_I64 : Ity_I32;
3380 IRTemp argL = newTemp(ty);
3381 IRTemp argR = newTemp(ty);
3383 if (is64) {
3384 assign(argL, getIReg64orZR(nn));
3385 assign(argR, getIReg64orZR(mm));
3386 } else {
3387 assign(argL, getIReg32orZR(nn));
3388 assign(argR, getIReg32orZR(mm));
3390 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3392 DIP("ccm%c %s, %s, #%u, %s\n",
3393 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3394 nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
3395 return True;
3399 /* -------------- REV/REV16/REV32/RBIT -------------- */
3400 /* 31 30 28 20 15 11 9 4
3402 1 10 11010110 00000 0000 11 n d (1) REV Xd, Xn
3403 0 10 11010110 00000 0000 10 n d (2) REV Wd, Wn
3405 1 10 11010110 00000 0000 00 n d (3) RBIT Xd, Xn
3406 0 10 11010110 00000 0000 00 n d (4) RBIT Wd, Wn
3408 1 10 11010110 00000 0000 01 n d (5) REV16 Xd, Xn
3409 0 10 11010110 00000 0000 01 n d (6) REV16 Wd, Wn
3411 1 10 11010110 00000 0000 10 n d (7) REV32 Xd, Xn
3413 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3414 && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
3415 UInt b31 = INSN(31,31);
3416 UInt opc = INSN(11,10);
3418 UInt ix = 0;
3419 /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1;
3420 else if (b31 == 0 && opc == BITS2(1,0)) ix = 2;
3421 else if (b31 == 1 && opc == BITS2(0,0)) ix = 3;
3422 else if (b31 == 0 && opc == BITS2(0,0)) ix = 4;
3423 else if (b31 == 1 && opc == BITS2(0,1)) ix = 5;
3424 else if (b31 == 0 && opc == BITS2(0,1)) ix = 6;
3425 else if (b31 == 1 && opc == BITS2(1,0)) ix = 7;
3426 if (ix >= 1 && ix <= 7) {
3427 Bool is64 = ix == 1 || ix == 3 || ix == 5 || ix == 7;
3428 UInt nn = INSN(9,5);
3429 UInt dd = INSN(4,0);
3430 IRTemp src = newTemp(Ity_I64);
3431 IRTemp dst = IRTemp_INVALID;
3432 IRTemp (*math)(IRTemp) = NULL;
3433 switch (ix) {
3434 case 1: case 2: math = math_BYTESWAP64; break;
3435 case 3: case 4: math = math_BITSWAP64; break;
3436 case 5: case 6: math = math_USHORTSWAP64; break;
3437 case 7: math = math_UINTSWAP64; break;
3438 default: vassert(0);
3440 const HChar* names[7]
3441 = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
3442 const HChar* nm = names[ix-1];
3443 vassert(math);
3444 if (ix == 6) {
3445 /* This has to be special cased, since the logic below doesn't
3446 handle it correctly. */
3447 assign(src, getIReg64orZR(nn));
3448 dst = math(src);
3449 putIReg64orZR(dd,
3450 unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
3451 } else if (is64) {
3452 assign(src, getIReg64orZR(nn));
3453 dst = math(src);
3454 putIReg64orZR(dd, mkexpr(dst));
3455 } else {
3456 assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
3457 dst = math(src);
3458 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3460 DIP("%s %s, %s\n", nm,
3461 nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
3462 return True;
3464 /* else fall through */
3467 /* -------------------- CLZ/CLS -------------------- */
3468 /* 30 28 24 20 15 9 4
3469 sf 10 1101 0110 00000 00010 0 n d CLZ Rd, Rn
3470 sf 10 1101 0110 00000 00010 1 n d CLS Rd, Rn
3472 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3473 && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
3474 Bool is64 = INSN(31,31) == 1;
3475 Bool isCLS = INSN(10,10) == 1;
3476 UInt nn = INSN(9,5);
3477 UInt dd = INSN(4,0);
3478 IRTemp src = newTemp(Ity_I64);
3479 IRTemp srcZ = newTemp(Ity_I64);
3480 IRTemp dst = newTemp(Ity_I64);
3481 /* Get the argument, widened out to 64 bit */
3482 if (is64) {
3483 assign(src, getIReg64orZR(nn));
3484 } else {
3485 assign(src, binop(Iop_Shl64,
3486 unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
3488 /* If this is CLS, mash the arg around accordingly */
3489 if (isCLS) {
3490 IRExpr* one = mkU8(1);
3491 assign(srcZ,
3492 binop(Iop_Xor64,
3493 binop(Iop_Shl64, mkexpr(src), one),
3494 binop(Iop_Shl64, binop(Iop_Shr64, mkexpr(src), one), one)));
3495 } else {
3496 assign(srcZ, mkexpr(src));
3498 /* And compute CLZ. */
3499 if (is64) {
3500 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3501 mkU64(isCLS ? 63 : 64),
3502 unop(Iop_Clz64, mkexpr(srcZ))));
3503 putIReg64orZR(dd, mkexpr(dst));
3504 } else {
3505 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3506 mkU64(isCLS ? 31 : 32),
3507 unop(Iop_Clz64, mkexpr(srcZ))));
3508 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3510 DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
3511 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
3512 return True;
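/* Illustrative check of the CLS mash: bit i of srcZ (for i >= 1) ends
   up as src[i] ^ src[i-1], so counting leading zeroes of srcZ counts
   how many top bits agree with their neighbour.  E.g. for
   src = 0xFFFF000000000000 the first mismatch is at bit 48, so
   Clz64(srcZ) = 15, which is indeed CLS of that value. */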
3515 /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
3516 /* 30 28 20 15 11 9 4
3517 sf 00 1101 0110 m 0010 00 n d LSLV Rd,Rn,Rm
3518 sf 00 1101 0110 m 0010 01 n d LSRV Rd,Rn,Rm
3519 sf 00 1101 0110 m 0010 10 n d ASRV Rd,Rn,Rm
3520 sf 00 1101 0110 m 0010 11 n d RORV Rd,Rn,Rm
3522 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3523 && INSN(15,12) == BITS4(0,0,1,0)) {
3524 Bool is64 = INSN(31,31) == 1;
3525 UInt mm = INSN(20,16);
3526 UInt op = INSN(11,10);
3527 UInt nn = INSN(9,5);
3528 UInt dd = INSN(4,0);
3529 IRType ty = is64 ? Ity_I64 : Ity_I32;
3530 IRTemp srcL = newTemp(ty);
3531 IRTemp srcR = newTemp(Ity_I64);
3532 IRTemp res = newTemp(ty);
3533 IROp iop = Iop_INVALID;
3534 assign(srcL, getIRegOrZR(is64, nn));
3535 assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
3536 mkU64(is64 ? 63 : 31)));
3537 if (op < 3) {
3538 // LSLV, LSRV, ASRV
3539 switch (op) {
3540 case BITS2(0,0): iop = mkSHL(ty); break;
3541 case BITS2(0,1): iop = mkSHR(ty); break;
3542 case BITS2(1,0): iop = mkSAR(ty); break;
3543 default: vassert(0);
3545 assign(res, binop(iop, mkexpr(srcL),
3546 unop(Iop_64to8, mkexpr(srcR))));
3547 } else {
3548 // RORV
3549 IROp opSHL = mkSHL(ty);
3550 IROp opSHR = mkSHR(ty);
3551 IROp opOR = mkOR(ty);
3552 IRExpr* width = mkU64(is64 ? 64: 32);
3553 assign(
3554 res,
3555 IRExpr_ITE(
3556 binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
3557 mkexpr(srcL),
3558 binop(opOR,
3559 binop(opSHL,
3560 mkexpr(srcL),
3561 unop(Iop_64to8, binop(Iop_Sub64, width,
3562 mkexpr(srcR)))),
3563 binop(opSHR,
3564 mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
3567 putIRegOrZR(is64, dd, mkexpr(res));
3568 vassert(op < 4);
3569 const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
3570 DIP("%s %s, %s, %s\n",
3571 names[op], nameIRegOrZR(is64,dd),
3572 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
3573 return True;
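/* Note on the RORV case above (rationale, illustrative): the ITE guards
   the rotate-by-zero case because the general expression would otherwise
   shift srcL left by (width - 0), an out-of-range shift amount which the
   IR leaves unspecified; for a zero rotate the value must simply pass
   through unchanged. */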
3576 /* -------------------- SDIV/UDIV -------------------- */
3577 /* 30 28 20 15 10 9 4
3578 sf 00 1101 0110 m 00001 1 n d SDIV Rd,Rn,Rm
3579 sf 00 1101 0110 m 00001 0 n d UDIV Rd,Rn,Rm
3581 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3582 && INSN(15,11) == BITS5(0,0,0,0,1)) {
3583 Bool is64 = INSN(31,31) == 1;
3584 UInt mm = INSN(20,16);
3585 Bool isS = INSN(10,10) == 1;
3586 UInt nn = INSN(9,5);
3587 UInt dd = INSN(4,0);
3588 if (isS) {
3589 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
3590 getIRegOrZR(is64, nn),
3591 getIRegOrZR(is64, mm)));
3592 } else {
3593 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
3594 getIRegOrZR(is64, nn),
3595 getIRegOrZR(is64, mm)));
3597 DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
3598 nameIRegOrZR(is64, dd),
3599 nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
3600 return True;
3603 /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
3604 /* 31 23 20 15 14 9 4
3605 1001 1011 101 m 0 a n d UMADDL Xd,Wn,Wm,Xa
3606 1001 1011 001 m 0 a n d SMADDL Xd,Wn,Wm,Xa
3607 1001 1011 101 m 1 a n d UMSUBL Xd,Wn,Wm,Xa
3608 1001 1011 001 m 1 a n d SMSUBL Xd,Wn,Wm,Xa
3609 with operation
3610 Xd = Xa +/- (Wn *u/s Wm)
3612 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
3613 Bool isU = INSN(23,23) == 1;
3614 UInt mm = INSN(20,16);
3615 Bool isAdd = INSN(15,15) == 0;
3616 UInt aa = INSN(14,10);
3617 UInt nn = INSN(9,5);
3618 UInt dd = INSN(4,0);
3619 IRTemp wN = newTemp(Ity_I32);
3620 IRTemp wM = newTemp(Ity_I32);
3621 IRTemp xA = newTemp(Ity_I64);
3622 IRTemp muld = newTemp(Ity_I64);
3623 IRTemp res = newTemp(Ity_I64);
3624 assign(wN, getIReg32orZR(nn));
3625 assign(wM, getIReg32orZR(mm));
3626 assign(xA, getIReg64orZR(aa));
3627 assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
3628 mkexpr(wN), mkexpr(wM)));
3629 assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
3630 mkexpr(xA), mkexpr(muld)));
3631 putIReg64orZR(dd, mkexpr(res));
3632 DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
3633 nameIReg64orZR(dd), nameIReg32orZR(nn),
3634 nameIReg32orZR(mm), nameIReg64orZR(aa));
3635 return True;
3638 /* -------------------- CRC32/CRC32C -------------------- */
3639 /* 31 30 20 15 11 9 4
3640 sf 00 1101 0110 m 0100 sz n d CRC32<sz> Wd, Wn, Wm|Xm
3641 sf 00 1101 0110 m 0101 sz n d CRC32C<sz> Wd, Wn, Wm|Xm
3643 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3644 && INSN(15,13) == BITS3(0,1,0)) {
3645 UInt bitSF = INSN(31,31);
3646 UInt mm = INSN(20,16);
3647 UInt bitC = INSN(12,12);
3648 UInt sz = INSN(11,10);
3649 UInt nn = INSN(9,5);
3650 UInt dd = INSN(4,0);
3651 vassert(sz >= 0 && sz <= 3);
3652 if ((bitSF == 0 && sz <= BITS2(1,0))
3653 || (bitSF == 1 && sz == BITS2(1,1))) {
3654 UInt ix = (bitC == 1 ? 4 : 0) | sz;
3655 void* helpers[8]
3656 = { &arm64g_calc_crc32b, &arm64g_calc_crc32h,
3657 &arm64g_calc_crc32w, &arm64g_calc_crc32x,
3658 &arm64g_calc_crc32cb, &arm64g_calc_crc32ch,
3659 &arm64g_calc_crc32cw, &arm64g_calc_crc32cx };
3660 const HChar* hNames[8]
3661 = { "arm64g_calc_crc32b", "arm64g_calc_crc32h",
3662 "arm64g_calc_crc32w", "arm64g_calc_crc32x",
3663 "arm64g_calc_crc32cb", "arm64g_calc_crc32ch",
3664 "arm64g_calc_crc32cw", "arm64g_calc_crc32cx" };
3665 const HChar* iNames[8]
3666 = { "crc32b", "crc32h", "crc32w", "crc32x",
3667 "crc32cb", "crc32ch", "crc32cw", "crc32cx" };
3669 IRTemp srcN = newTemp(Ity_I64);
3670 assign(srcN, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
3672 IRTemp srcM = newTemp(Ity_I64);
3673 IRExpr* at64 = getIReg64orZR(mm);
3674 switch (sz) {
3675 case BITS2(0,0):
3676 assign(srcM, binop(Iop_And64, at64, mkU64(0xFF))); break;
3677 case BITS2(0,1):
3678 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFF))); break;
3679 case BITS2(1,0):
3680 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFFFFFF))); break;
3681 case BITS2(1,1):
3682 assign(srcM, at64); break;
3683 default:
3684 vassert(0);
3687 vassert(ix >= 0 && ix <= 7);
3689 putIReg64orZR(
3690                dd,
3691 unop(Iop_32Uto64,
3692 unop(Iop_64to32,
3693 mkIRExprCCall(Ity_I64, 0/*regparm*/,
3694 hNames[ix], helpers[ix],
3695 mkIRExprVec_2(mkexpr(srcN),
3696 mkexpr(srcM))))));
3698 DIP("%s %s, %s, %s\n", iNames[ix],
3699 nameIReg32orZR(dd),
3700 nameIReg32orZR(nn), nameIRegOrZR(bitSF == 1, mm));
3701 return True;
3703 /* fall through */
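      /* Illustrative only: a bit-at-a-time sketch of the accumulation the
         arm64g_calc_crc32* helpers perform, one byte per call.  This helper
         is hypothetical and not part of the decoder; the polynomials shown
         are the standard reflected forms used by these instructions
         (0xEDB88320 for CRC32, 0x82F63B78 for CRC32C).

            static UInt ref_crc32_update_byte ( UInt acc, UChar b, Bool isC )
            {
               UInt poly = isC ? 0x82F63B78u : 0xEDB88320u;
               UInt i;
               acc ^= b;
               for (i = 0; i < 8; i++)
                  acc = (acc >> 1) ^ ((acc & 1) ? poly : 0);
               return acc;
            }

         The wider forms simply feed the bytes of the masked Wm/Xm value
         through this update, least significant byte first, which is why
         srcM above is masked down to the transfer width. */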
3706 if (sigill_diag) {
3707 vex_printf("ARM64 front end: data_processing_register\n");
3709 return False;
3710 # undef INSN
3714 /*------------------------------------------------------------*/
3715 /*--- Math helpers for vector interleave/deinterleave ---*/
3716 /*------------------------------------------------------------*/
3718 #define EX(_tmp) \
3719 mkexpr(_tmp)
3720 #define SL(_hi128,_lo128,_nbytes) \
3721 ( (_nbytes) == 0 \
3722 ? (_lo128) \
3723 : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
3724 #define ROR(_v128,_nbytes) \
3725 SL((_v128),(_v128),(_nbytes))
3726 #define ROL(_v128,_nbytes) \
3727 SL((_v128),(_v128),16-(_nbytes))
3728 #define SHR(_v128,_nbytes) \
3729 binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
3730 #define SHL(_v128,_nbytes) \
3731 binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
3732 #define ILO64x2(_argL,_argR) \
3733 binop(Iop_InterleaveLO64x2,(_argL),(_argR))
3734 #define IHI64x2(_argL,_argR) \
3735 binop(Iop_InterleaveHI64x2,(_argL),(_argR))
3736 #define ILO32x4(_argL,_argR) \
3737 binop(Iop_InterleaveLO32x4,(_argL),(_argR))
3738 #define IHI32x4(_argL,_argR) \
3739 binop(Iop_InterleaveHI32x4,(_argL),(_argR))
3740 #define ILO16x8(_argL,_argR) \
3741 binop(Iop_InterleaveLO16x8,(_argL),(_argR))
3742 #define IHI16x8(_argL,_argR) \
3743 binop(Iop_InterleaveHI16x8,(_argL),(_argR))
3744 #define ILO8x16(_argL,_argR) \
3745 binop(Iop_InterleaveLO8x16,(_argL),(_argR))
3746 #define IHI8x16(_argL,_argR) \
3747 binop(Iop_InterleaveHI8x16,(_argL),(_argR))
3748 #define CEV32x4(_argL,_argR) \
3749 binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
3750 #define COD32x4(_argL,_argR) \
3751 binop(Iop_CatOddLanes32x4,(_argL),(_argR))
3752 #define COD16x8(_argL,_argR) \
3753 binop(Iop_CatOddLanes16x8,(_argL),(_argR))
3754 #define COD8x16(_argL,_argR) \
3755 binop(Iop_CatOddLanes8x16,(_argL),(_argR))
3756 #define CEV8x16(_argL,_argR) \
3757 binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
3758 #define AND(_arg1,_arg2) \
3759 binop(Iop_AndV128,(_arg1),(_arg2))
3760 #define OR2(_arg1,_arg2) \
3761 binop(Iop_OrV128,(_arg1),(_arg2))
3762 #define OR3(_arg1,_arg2,_arg3) \
3763 binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
3764 #define OR4(_arg1,_arg2,_arg3,_arg4) \
3765 binop(Iop_OrV128, \
3766 binop(Iop_OrV128,(_arg1),(_arg2)), \
3767 binop(Iop_OrV128,(_arg3),(_arg4)))
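/* Illustrative only: the macros above lean heavily on Iop_SliceV128, so here
   is a scalar model of it on 16-byte little-endian arrays, purely as a
   reading aid (hypothetical helper, not used by the decoder).  Assuming
   SL(hi,lo,n) selects bytes n .. n+15 of the 32-byte concatenation hi:lo
   -- which is how the macros above use it -- SL(v,v,n), i.e. ROR, rotates v
   right by n bytes, and ROL(v,n) rotates left by n bytes.

      static void ref_SliceV128 ( UChar* res, const UChar* hi128,
                                  const UChar* lo128, UInt nbytes )
      {
         UInt i;
         for (i = 0; i < 16; i++) {
            UInt j = i + nbytes;            // 0 .. 30
            res[i] = j < 16 ? lo128[j] : hi128[j - 16];
         }
      }
*/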
3770 /* Do interleaving for 1 128 bit vector, for ST1 insns. */
3771 static
3772 void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
3773 UInt laneSzBlg2, IRTemp u0 )
3775 assign(*i0, mkexpr(u0));
3779 /* Do interleaving for 2 128 bit vectors, for ST2 insns. */
3780 static
3781 void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
3782 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
3784 /* This is pretty easy, since we have primitives directly to
3785 hand. */
3786 if (laneSzBlg2 == 3) {
3787 // 64x2
3788 // u1 == B1 B0, u0 == A1 A0
3789 // i1 == B1 A1, i0 == B0 A0
3790 assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
3791 assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
3792 return;
3794 if (laneSzBlg2 == 2) {
3795 // 32x4
3796 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
3797 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
3798 assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
3799 assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
3800 return;
3802 if (laneSzBlg2 == 1) {
3803 // 16x8
3804 // u1 == B{7..0}, u0 == A{7..0}
3805 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
3806 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
3807 assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
3808 assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
3809 return;
3811 if (laneSzBlg2 == 0) {
3812 // 8x16
3813 // u1 == B{f..0}, u0 == A{f..0}
3814 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
3815 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
3816 assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
3817 assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
3818 return;
3820 /*NOTREACHED*/
3821 vassert(0);
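/* Illustrative only: a scalar reference for what the ST1..ST4 interleavers
   below compute, expressed on byte arrays.  Lane j of register r (lanes of
   laneSzB bytes each) ends up at element j*nRegs + r of the flattened
   output, whose low 16 bytes are i0, the next 16 bytes i1, and so on.  The
   helper is hypothetical and only meant for cross-checking the vector
   versions.

      static void ref_interleave ( UChar* dst,        // nRegs*16 bytes
                                   const UChar* src,  // nRegs*16 bytes, reg-major
                                   UInt nRegs, UInt laneSzB )
      {
         UInt lanesPerReg = 16 / laneSzB;
         UInt r, j, b;
         for (r = 0; r < nRegs; r++)
            for (j = 0; j < lanesPerReg; j++)
               for (b = 0; b < laneSzB; b++)
                  dst[(j * nRegs + r) * laneSzB + b]
                     = src[(r * lanesPerReg + j) * laneSzB + b];
      }

   With nRegs == 2 and laneSzB == 8 this reproduces the 64x2 case above:
   dst holds A0 B0 A1 B1 from low to high, i.e. i0 == B0:A0, i1 == B1:A1. */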
3825 /* Do interleaving for 3 128 bit vectors, for ST3 insns. */
3826 static
3827 void math_INTERLEAVE3_128(
3828 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
3829 UInt laneSzBlg2,
3830 IRTemp u0, IRTemp u1, IRTemp u2 )
3832 if (laneSzBlg2 == 3) {
3833 // 64x2
3834 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
3835 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
3836 assign(*i2, IHI64x2( EX(u2), EX(u1) ));
3837 assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
3838 assign(*i0, ILO64x2( EX(u1), EX(u0) ));
3839 return;
3842 if (laneSzBlg2 == 2) {
3843 // 32x4
3844 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
3845 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
3844       // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
3847 IRTemp p0 = newTempV128();
3848 IRTemp p1 = newTempV128();
3849 IRTemp p2 = newTempV128();
3850 IRTemp c1100 = newTempV128();
3851 IRTemp c0011 = newTempV128();
3852 IRTemp c0110 = newTempV128();
3853 assign(c1100, mkV128(0xFF00));
3854 assign(c0011, mkV128(0x00FF));
3855 assign(c0110, mkV128(0x0FF0));
3856 // First interleave them at 64x2 granularity,
3857 // generating partial ("p") values.
3858 math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
3859 // And more shuffling around for the final answer
3860 assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
3861 AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
3862 assign(*i1, OR3( SHL(EX(p2),12),
3863 AND(EX(p1),EX(c0110)),
3864 SHR(EX(p0),12) ));
3865 assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
3866 AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
3867 return;
3870 if (laneSzBlg2 == 1) {
3871 // 16x8
3872 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
3873 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
3874 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
3876 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
3877 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
3878 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
3880 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
3881       // i1 == A5 C4 B4 A4 C3 B3 A3 C2
3882 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
3883 IRTemp p0 = newTempV128();
3884 IRTemp p1 = newTempV128();
3885 IRTemp p2 = newTempV128();
3886 IRTemp c1000 = newTempV128();
3887 IRTemp c0100 = newTempV128();
3888 IRTemp c0010 = newTempV128();
3889 IRTemp c0001 = newTempV128();
3890 assign(c1000, mkV128(0xF000));
3891 assign(c0100, mkV128(0x0F00));
3892 assign(c0010, mkV128(0x00F0));
3893 assign(c0001, mkV128(0x000F));
3894 // First interleave them at 32x4 granularity,
3895 // generating partial ("p") values.
3896 math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
3897 // And more shuffling around for the final answer
3898 assign(*i2,
3899 OR4( AND( IHI16x8( EX(p2), ROL(EX(p2),4) ), EX(c1000) ),
3900 AND( IHI16x8( ROL(EX(p2),6), EX(p2) ), EX(c0100) ),
3901 AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
3902 AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
3903                  ));
3904 assign(*i1,
3905 OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
3906 AND( IHI16x8( EX(p1), ROL(EX(p1),4) ), EX(c0100) ),
3907 AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
3908 AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
3909                  ));
3910 assign(*i0,
3911 OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
3912 AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
3913 AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
3914 AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
3915                  ));
3916 return;
3919 if (laneSzBlg2 == 0) {
3920 // 8x16. It doesn't seem worth the hassle of first doing a
3921 // 16x8 interleave, so just generate all 24 partial results
3922 // directly :-(
3923 // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
3924 // i2 == Cf Bf Af Ce .. Bb Ab Ca
3925 // i1 == Ba Aa C9 B9 .. A6 C5 B5
3926 // i0 == A5 C4 B4 A4 .. C0 B0 A0
3928 IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
3929 IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
3930 IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
3931 IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
3932 IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
3933 IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
3934 IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
3935 IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
3936 IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();
3938       // eg XXXX(qqq, CC, 0xF, BB, 0xA) sets qqq to be a vector
3939 // of the form 14 bytes junk : CC[0xF] : BB[0xA]
3941 # define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
3942 IRTemp t_##_tempName = newTempV128(); \
3943 assign(t_##_tempName, \
3944 ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
3945 ROR(EX(_srcVec2),(_srcShift2)) ) )
3947 // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
3948 IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;
3950 // The slicing and reassembly are done as interleavedly as possible,
3951 // so as to minimise the demand for registers in the back end, which
3952 // was observed to be a problem in testing.
3954 XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
3955 XXXX(AfCe, AA, 0xf, CC, 0xe);
3956 assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));
3958 XXXX(BeAe, BB, 0xe, AA, 0xe);
3959 XXXX(CdBd, CC, 0xd, BB, 0xd);
3960 assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
3961 assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));
3963 XXXX(AdCc, AA, 0xd, CC, 0xc);
3964 XXXX(BcAc, BB, 0xc, AA, 0xc);
3965 assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));
3967 XXXX(CbBb, CC, 0xb, BB, 0xb);
3968 XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
3969 assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
3970 assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
3971 assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));
3973 XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
3974 XXXX(C9B9, CC, 0x9, BB, 0x9);
3975 assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));
3977 XXXX(A9C8, AA, 0x9, CC, 0x8);
3978 XXXX(B8A8, BB, 0x8, AA, 0x8);
3979 assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
3980 assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));
3982 XXXX(C7B7, CC, 0x7, BB, 0x7);
3983 XXXX(A7C6, AA, 0x7, CC, 0x6);
3984 assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));
3986 XXXX(B6A6, BB, 0x6, AA, 0x6);
3987 XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
3988 assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
3989 assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
3990 assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));
3992 XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
3993 XXXX(B4A4, BB, 0x4, AA, 0x4);
3994 assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));
3996 XXXX(C3B3, CC, 0x3, BB, 0x3);
3997 XXXX(A3C2, AA, 0x3, CC, 0x2);
3998 assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
3999 assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));
4001 XXXX(B2A2, BB, 0x2, AA, 0x2);
4002 XXXX(C1B1, CC, 0x1, BB, 0x1);
4003 assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));
4005 XXXX(A1C0, AA, 0x1, CC, 0x0);
4006 XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
4007 assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
4008 assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
4009 assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));
4011 # undef XXXX
4012 return;
4015 /*NOTREACHED*/
4016 vassert(0);
4020 /* Do interleaving for 4 128 bit vectors, for ST4 insns. */
4021 static
4022 void math_INTERLEAVE4_128(
4023 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
4024 UInt laneSzBlg2,
4025 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
4027 if (laneSzBlg2 == 3) {
4028 // 64x2
4029 assign(*i0, ILO64x2(EX(u1), EX(u0)));
4030 assign(*i1, ILO64x2(EX(u3), EX(u2)));
4031 assign(*i2, IHI64x2(EX(u1), EX(u0)));
4032 assign(*i3, IHI64x2(EX(u3), EX(u2)));
4033 return;
4035 if (laneSzBlg2 == 2) {
4036 // 32x4
4037 // First, interleave at the 64-bit lane size.
4038 IRTemp p0 = newTempV128();
4039 IRTemp p1 = newTempV128();
4040 IRTemp p2 = newTempV128();
4041 IRTemp p3 = newTempV128();
4042 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
4043 // And interleave (cat) at the 32 bit size.
4044 assign(*i0, CEV32x4(EX(p1), EX(p0)));
4045 assign(*i1, COD32x4(EX(p1), EX(p0)));
4046 assign(*i2, CEV32x4(EX(p3), EX(p2)));
4047 assign(*i3, COD32x4(EX(p3), EX(p2)));
4048 return;
4050 if (laneSzBlg2 == 1) {
4051 // 16x8
4052 // First, interleave at the 32-bit lane size.
4053 IRTemp p0 = newTempV128();
4054 IRTemp p1 = newTempV128();
4055 IRTemp p2 = newTempV128();
4056 IRTemp p3 = newTempV128();
4057 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
4058 // And rearrange within each vector, to get the right 16 bit lanes.
4059 assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
4060 assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
4061 assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
4062 assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
4063 return;
4065 if (laneSzBlg2 == 0) {
4066 // 8x16
4067 // First, interleave at the 16-bit lane size.
4068 IRTemp p0 = newTempV128();
4069 IRTemp p1 = newTempV128();
4070 IRTemp p2 = newTempV128();
4071 IRTemp p3 = newTempV128();
4072 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
4073 // And rearrange within each vector, to get the right 8 bit lanes.
4074 assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
4075 assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
4076 assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
4077 assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
4078 return;
4080 /*NOTREACHED*/
4081 vassert(0);
4085 /* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
4086 static
4087 void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
4088 UInt laneSzBlg2, IRTemp i0 )
4090 assign(*u0, mkexpr(i0));
4094 /* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
4095 static
4096 void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
4097 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
4099 /* This is pretty easy, since we have primitives directly to
4100 hand. */
4101 if (laneSzBlg2 == 3) {
4102 // 64x2
4103 // i1 == B1 A1, i0 == B0 A0
4104 // u1 == B1 B0, u0 == A1 A0
4105 assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
4106 assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
4107 return;
4109 if (laneSzBlg2 == 2) {
4110 // 32x4
4111 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
4112 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
4113 assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
4114 assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
4115 return;
4117 if (laneSzBlg2 == 1) {
4118 // 16x8
4119 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
4120 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
4121 // u1 == B{7..0}, u0 == A{7..0}
4122 assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
4123 assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0)));
4124 return;
4126 if (laneSzBlg2 == 0) {
4127 // 8x16
4128 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
4129 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
4130 // u1 == B{f..0}, u0 == A{f..0}
4131 assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
4132 assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0)));
4133 return;
4135 /*NOTREACHED*/
4136 vassert(0);
4140 /* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
4141 static
4142 void math_DEINTERLEAVE3_128(
4143 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4144 UInt laneSzBlg2,
4145 IRTemp i0, IRTemp i1, IRTemp i2 )
4147 if (laneSzBlg2 == 3) {
4148 // 64x2
4149 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
4150 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
4151 assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1) ));
4152 assign(*u1, ILO64x2( EX(i2), ROL(EX(i0),8) ));
4153 assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0) ));
4154 return;
4157 if (laneSzBlg2 == 2) {
4158 // 32x4
4159       // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
4160 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
4161 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
4162 IRTemp t_a1c0b0a0 = newTempV128();
4163 IRTemp t_a2c1b1a1 = newTempV128();
4164 IRTemp t_a3c2b2a2 = newTempV128();
4165 IRTemp t_a0c3b3a3 = newTempV128();
4166 IRTemp p0 = newTempV128();
4167 IRTemp p1 = newTempV128();
4168 IRTemp p2 = newTempV128();
4169 // Compute some intermediate values.
4170 assign(t_a1c0b0a0, EX(i0));
4171 assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
4172 assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
4173 assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
4174 // First deinterleave into lane-pairs
4175 assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
4176 assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
4177 IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
4178 assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
4179 // Then deinterleave at 64x2 granularity.
4180 math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
4181 return;
4184 if (laneSzBlg2 == 1) {
4185 // 16x8
4186 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
4187 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
4188 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
4190 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
4191       // i1 == A5 C4 B4 A4 C3 B3 A3 C2
4192 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
4194 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
4195 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
4196 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
4198 IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
4199 s0 = s1 = s2 = s3
4200 = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
4201 newTempsV128_4(&s0, &s1, &s2, &s3);
4202 newTempsV128_4(&t0, &t1, &t2, &t3);
4203 newTempsV128_4(&p0, &p1, &p2, &c00111111);
4205 // s0 == b2a2 c1b1a1 c0b0a0
4206       // s1 == b4a4 c3b3a3 c2b2a2
4207 // s2 == b6a6 c5b5a5 c4b4a4
4208 // s3 == b0a0 c7b7a7 c6b6a6
4209 assign(s0, EX(i0));
4210 assign(s1, SL(EX(i1),EX(i0),6*2));
4211 assign(s2, SL(EX(i2),EX(i1),4*2));
4212 assign(s3, SL(EX(i0),EX(i2),2*2));
4214 // t0 == 0 0 c1c0 b1b0 a1a0
4215 // t1 == 0 0 c3c2 b3b2 a3a2
4216 // t2 == 0 0 c5c4 b5b4 a5a4
4217 // t3 == 0 0 c7c6 b7b6 a7a6
4218 assign(c00111111, mkV128(0x0FFF));
4219 assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
4220 assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
4221 assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
4222 assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));
4224 assign(p0, OR2(EX(t0), SHL(EX(t1),6*2)));
4225 assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
4226 assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));
4228 // Then deinterleave at 32x4 granularity.
4229 math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
4230 return;
4233 if (laneSzBlg2 == 0) {
4234 // 8x16. This is the same scheme as for 16x8, with twice the
4235 // number of intermediate values.
4237 // u2 == C{f..0}
4238 // u1 == B{f..0}
4239 // u0 == A{f..0}
4241 // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
4242 // i1 == BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
4243 // i0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4245 // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
4246 // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
4247 // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
4249 IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
4250 t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
4251 s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
4252 = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
4253 = IRTemp_INVALID;
4254 newTempsV128_4(&s0, &s1, &s2, &s3);
4255 newTempsV128_4(&s4, &s5, &s6, &s7);
4256 newTempsV128_4(&t0, &t1, &t2, &t3);
4257 newTempsV128_4(&t4, &t5, &t6, &t7);
4258 newTempsV128_4(&p0, &p1, &p2, &cMASK);
4260 // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4261 // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
4262 // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
4263 // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
4264 // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
4265 // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
4266 // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
4267 // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
4268 assign(s0, SL(EX(i1),EX(i0), 0));
4269 assign(s1, SL(EX(i1),EX(i0), 6));
4270 assign(s2, SL(EX(i1),EX(i0),12));
4271 assign(s3, SL(EX(i2),EX(i1), 2));
4272 assign(s4, SL(EX(i2),EX(i1), 8));
4273 assign(s5, SL(EX(i2),EX(i1),14));
4274 assign(s6, SL(EX(i0),EX(i2), 4));
4275 assign(s7, SL(EX(i0),EX(i2),10));
4277 // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
4278 // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
4279 // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
4280 // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
4281 // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
4282 // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
4283 // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
4284 // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
4285 assign(cMASK, mkV128(0x003F));
4286 assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
4287 assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
4288 assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
4289 assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
4290 assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
4291 assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
4292 assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
4293 assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));
4295 assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
4296 assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
4297 SHL(EX(t3),2), SHR(EX(t2),4) ));
4298 assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));
4300 // Then deinterleave at 16x8 granularity.
4301 math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
4302 return;
4305 /*NOTREACHED*/
4306 vassert(0);
4310 /* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
4311 static
4312 void math_DEINTERLEAVE4_128(
4313 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4314 UInt laneSzBlg2,
4315 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4317 if (laneSzBlg2 == 3) {
4318 // 64x2
4319 assign(*u0, ILO64x2(EX(i2), EX(i0)));
4320 assign(*u1, IHI64x2(EX(i2), EX(i0)));
4321 assign(*u2, ILO64x2(EX(i3), EX(i1)));
4322 assign(*u3, IHI64x2(EX(i3), EX(i1)));
4323 return;
4325 if (laneSzBlg2 == 2) {
4326 // 32x4
4327 IRTemp p0 = newTempV128();
4328 IRTemp p2 = newTempV128();
4329 IRTemp p1 = newTempV128();
4330 IRTemp p3 = newTempV128();
4331 assign(p0, ILO32x4(EX(i1), EX(i0)));
4332 assign(p1, IHI32x4(EX(i1), EX(i0)));
4333 assign(p2, ILO32x4(EX(i3), EX(i2)));
4334 assign(p3, IHI32x4(EX(i3), EX(i2)));
4335 // And now do what we did for the 64-bit case.
4336 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
4337 return;
4339 if (laneSzBlg2 == 1) {
4340 // 16x8
4341 // Deinterleave into 32-bit chunks, then do as the 32-bit case.
4342 IRTemp p0 = newTempV128();
4343 IRTemp p1 = newTempV128();
4344 IRTemp p2 = newTempV128();
4345 IRTemp p3 = newTempV128();
4346 assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
4347 assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
4348 assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
4349 assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
4350 // From here on is like the 32 bit case.
4351 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
4352 return;
4354 if (laneSzBlg2 == 0) {
4355 // 8x16
4356 // Deinterleave into 16-bit chunks, then do as the 16-bit case.
4357 IRTemp p0 = newTempV128();
4358 IRTemp p1 = newTempV128();
4359 IRTemp p2 = newTempV128();
4360 IRTemp p3 = newTempV128();
4361 assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
4362 ILO8x16(EX(i0),ROL(EX(i0),4)) ));
4363 assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
4364 ILO8x16(EX(i1),ROL(EX(i1),4)) ));
4365 assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
4366 ILO8x16(EX(i2),ROL(EX(i2),4)) ));
4367 assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
4368 ILO8x16(EX(i3),ROL(EX(i3),4)) ));
4369 // From here on is like the 16 bit case.
4370 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
4371 return;
4373 /*NOTREACHED*/
4374 vassert(0);
4378 /* Wrappers that use the full-width (de)interleavers to do half-width
4379 (de)interleaving. The scheme is to clone each input lane in the
4380 lower half of each incoming value, do a full width (de)interleave
4381    at the next lane size up, and remove every other lane of the
4382 result. The returned values may have any old junk in the upper
4383 64 bits -- the caller must ignore that. */
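/* Illustrative only: a scalar sketch of the "clone, widen, drop odd lanes"
   trick described above, for a single 64-bit input.  The doubler duplicates
   each lane of the low 64 bits into an adjacent pair, so the value can be
   fed to the 128-bit routines at the next lane size up; the halver then
   keeps only the even lanes of the result.  Hypothetical helper, shown only
   to make the scheme concrete.

      static void ref_double_lanes ( UChar* dst,        // 16 bytes
                                     const UChar* src,  // low 8 bytes used
                                     UInt laneSzB )
      {
         UInt nLanes = 8 / laneSzB;
         UInt j, b;
         for (j = 0; j < nLanes; j++)
            for (b = 0; b < laneSzB; b++) {
               dst[(2*j + 0) * laneSzB + b] = src[j * laneSzB + b];
               dst[(2*j + 1) * laneSzB + b] = src[j * laneSzB + b];
            }
      }
*/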
4385 /* Helper function -- get doubling and narrowing operations. */
4386 static
4387 void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
4388 /*OUT*/IROp* halver,
4389 UInt laneSzBlg2 )
4391 switch (laneSzBlg2) {
4392 case 2:
4393 *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
4394 break;
4395 case 1:
4396 *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
4397 break;
4398 case 0:
4399 *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
4400 break;
4401 default:
4402 vassert(0);
4406 /* Do interleaving for 1 64 bit vector, for ST1 insns. */
4407 static
4408 void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
4409 UInt laneSzBlg2, IRTemp u0 )
4411 assign(*i0, mkexpr(u0));
4415 /* Do interleaving for 2 64 bit vectors, for ST2 insns. */
4416 static
4417 void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
4418 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
4420 if (laneSzBlg2 == 3) {
4421 // 1x64, degenerate case
4422 assign(*i0, EX(u0));
4423 assign(*i1, EX(u1));
4424 return;
4427 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4428 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4429 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4431 IRTemp du0 = newTempV128();
4432 IRTemp du1 = newTempV128();
4433 assign(du0, binop(doubler, EX(u0), EX(u0)));
4434 assign(du1, binop(doubler, EX(u1), EX(u1)));
4435 IRTemp di0 = newTempV128();
4436 IRTemp di1 = newTempV128();
4437 math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
4438 assign(*i0, binop(halver, EX(di0), EX(di0)));
4439 assign(*i1, binop(halver, EX(di1), EX(di1)));
4443 /* Do interleaving for 3 64 bit vectors, for ST3 insns. */
4444 static
4445 void math_INTERLEAVE3_64(
4446 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
4447 UInt laneSzBlg2,
4448 IRTemp u0, IRTemp u1, IRTemp u2 )
4450 if (laneSzBlg2 == 3) {
4451 // 1x64, degenerate case
4452 assign(*i0, EX(u0));
4453 assign(*i1, EX(u1));
4454 assign(*i2, EX(u2));
4455 return;
4458 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4459 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4460 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4462 IRTemp du0 = newTempV128();
4463 IRTemp du1 = newTempV128();
4464 IRTemp du2 = newTempV128();
4465 assign(du0, binop(doubler, EX(u0), EX(u0)));
4466 assign(du1, binop(doubler, EX(u1), EX(u1)));
4467 assign(du2, binop(doubler, EX(u2), EX(u2)));
4468 IRTemp di0 = newTempV128();
4469 IRTemp di1 = newTempV128();
4470 IRTemp di2 = newTempV128();
4471 math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
4472 assign(*i0, binop(halver, EX(di0), EX(di0)));
4473 assign(*i1, binop(halver, EX(di1), EX(di1)));
4474 assign(*i2, binop(halver, EX(di2), EX(di2)));
4478 /* Do interleaving for 4 64 bit vectors, for ST4 insns. */
4479 static
4480 void math_INTERLEAVE4_64(
4481 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
4482 UInt laneSzBlg2,
4483 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
4485 if (laneSzBlg2 == 3) {
4486 // 1x64, degenerate case
4487 assign(*i0, EX(u0));
4488 assign(*i1, EX(u1));
4489 assign(*i2, EX(u2));
4490 assign(*i3, EX(u3));
4491 return;
4494 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4495 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4496 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4498 IRTemp du0 = newTempV128();
4499 IRTemp du1 = newTempV128();
4500 IRTemp du2 = newTempV128();
4501 IRTemp du3 = newTempV128();
4502 assign(du0, binop(doubler, EX(u0), EX(u0)));
4503 assign(du1, binop(doubler, EX(u1), EX(u1)));
4504 assign(du2, binop(doubler, EX(u2), EX(u2)));
4505 assign(du3, binop(doubler, EX(u3), EX(u3)));
4506 IRTemp di0 = newTempV128();
4507 IRTemp di1 = newTempV128();
4508 IRTemp di2 = newTempV128();
4509 IRTemp di3 = newTempV128();
4510 math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
4511 laneSzBlg2 + 1, du0, du1, du2, du3);
4512 assign(*i0, binop(halver, EX(di0), EX(di0)));
4513 assign(*i1, binop(halver, EX(di1), EX(di1)));
4514 assign(*i2, binop(halver, EX(di2), EX(di2)));
4515 assign(*i3, binop(halver, EX(di3), EX(di3)));
4519 /* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
4520 static
4521 void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
4522 UInt laneSzBlg2, IRTemp i0 )
4524 assign(*u0, mkexpr(i0));
4528 /* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
4529 static
4530 void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
4531 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
4533 if (laneSzBlg2 == 3) {
4534 // 1x64, degenerate case
4535 assign(*u0, EX(i0));
4536 assign(*u1, EX(i1));
4537 return;
4540 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4541 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4542 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4544 IRTemp di0 = newTempV128();
4545 IRTemp di1 = newTempV128();
4546 assign(di0, binop(doubler, EX(i0), EX(i0)));
4547 assign(di1, binop(doubler, EX(i1), EX(i1)));
4549 IRTemp du0 = newTempV128();
4550 IRTemp du1 = newTempV128();
4551 math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
4552 assign(*u0, binop(halver, EX(du0), EX(du0)));
4553 assign(*u1, binop(halver, EX(du1), EX(du1)));
4557 /* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
4558 static
4559 void math_DEINTERLEAVE3_64(
4560 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4561 UInt laneSzBlg2,
4562 IRTemp i0, IRTemp i1, IRTemp i2 )
4564 if (laneSzBlg2 == 3) {
4565 // 1x64, degenerate case
4566 assign(*u0, EX(i0));
4567 assign(*u1, EX(i1));
4568 assign(*u2, EX(i2));
4569 return;
4572 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4573 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4574 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4576 IRTemp di0 = newTempV128();
4577 IRTemp di1 = newTempV128();
4578 IRTemp di2 = newTempV128();
4579 assign(di0, binop(doubler, EX(i0), EX(i0)));
4580 assign(di1, binop(doubler, EX(i1), EX(i1)));
4581 assign(di2, binop(doubler, EX(i2), EX(i2)));
4582 IRTemp du0 = newTempV128();
4583 IRTemp du1 = newTempV128();
4584 IRTemp du2 = newTempV128();
4585 math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
4586 assign(*u0, binop(halver, EX(du0), EX(du0)));
4587 assign(*u1, binop(halver, EX(du1), EX(du1)));
4588 assign(*u2, binop(halver, EX(du2), EX(du2)));
4592 /* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
4593 static
4594 void math_DEINTERLEAVE4_64(
4595 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4596 UInt laneSzBlg2,
4597 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4599 if (laneSzBlg2 == 3) {
4600 // 1x64, degenerate case
4601 assign(*u0, EX(i0));
4602 assign(*u1, EX(i1));
4603 assign(*u2, EX(i2));
4604 assign(*u3, EX(i3));
4605 return;
4608 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4609 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4610 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4612 IRTemp di0 = newTempV128();
4613 IRTemp di1 = newTempV128();
4614 IRTemp di2 = newTempV128();
4615 IRTemp di3 = newTempV128();
4616 assign(di0, binop(doubler, EX(i0), EX(i0)));
4617 assign(di1, binop(doubler, EX(i1), EX(i1)));
4618 assign(di2, binop(doubler, EX(i2), EX(i2)));
4619 assign(di3, binop(doubler, EX(i3), EX(i3)));
4620 IRTemp du0 = newTempV128();
4621 IRTemp du1 = newTempV128();
4622 IRTemp du2 = newTempV128();
4623 IRTemp du3 = newTempV128();
4624 math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
4625 laneSzBlg2 + 1, di0, di1, di2, di3);
4626 assign(*u0, binop(halver, EX(du0), EX(du0)));
4627 assign(*u1, binop(halver, EX(du1), EX(du1)));
4628 assign(*u2, binop(halver, EX(du2), EX(du2)));
4629 assign(*u3, binop(halver, EX(du3), EX(du3)));
4633 #undef EX
4634 #undef SL
4635 #undef ROR
4636 #undef ROL
4637 #undef SHR
4638 #undef SHL
4639 #undef ILO64x2
4640 #undef IHI64x2
4641 #undef ILO32x4
4642 #undef IHI32x4
4643 #undef ILO16x8
4644 #undef IHI16x8
4645 #undef ILO8x16
4646 #undef IHI8x16
4647 #undef CEV32x4
4648 #undef COD32x4
4649 #undef COD16x8
4650 #undef COD8x16
4651 #undef CEV8x16
4652 #undef AND
4653 #undef OR2
4654 #undef OR3
4655 #undef OR4
4658 /*------------------------------------------------------------*/
4659 /*--- Load and Store instructions ---*/
4660 /*------------------------------------------------------------*/
4662 /* Generate the EA for a "reg + reg" style amode. This is done from
4663    parts of the insn, but for sanity checking's sake it takes the whole
4664 insn. This appears to depend on insn[15:12], with opt=insn[15:13]
4665 and S=insn[12]:
4667 The possible forms, along with their opt:S values, are:
4668 011:0 Xn|SP + Xm
4669 111:0 Xn|SP + Xm
4670 011:1 Xn|SP + Xm * transfer_szB
4671 111:1 Xn|SP + Xm * transfer_szB
4672 010:0 Xn|SP + 32Uto64(Wm)
4673 010:1 Xn|SP + 32Uto64(Wm) * transfer_szB
4674 110:0 Xn|SP + 32Sto64(Wm)
4675 110:1 Xn|SP + 32Sto64(Wm) * transfer_szB
4677 Rm is insn[20:16]. Rn is insn[9:5]. Rt is insn[4:0]. Log2 of
4678 the transfer size is insn[23,31,30]. For integer loads/stores,
4679 insn[23] is zero, hence szLg2 can be at most 3 in such cases.
4681 If the decoding fails, it returns IRTemp_INVALID.
4683    isInt is True iff this decoding is for transfers to/from integer
4684 registers. If False it is for transfers to/from vector registers.
4686 static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
4688 UInt optS = SLICE_UInt(insn, 15, 12);
4689 UInt mm = SLICE_UInt(insn, 20, 16);
4690 UInt nn = SLICE_UInt(insn, 9, 5);
4691 UInt szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
4692 | SLICE_UInt(insn, 31, 30); // Log2 of the size
4694 buf[0] = 0;
4696 /* Sanity checks, that this really is a load/store insn. */
4697 if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
4698 goto fail;
4700 if (isInt
4701 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
4702 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
4703 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
4704 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
4705 goto fail;
4707 if (!isInt
4708 && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
4709 goto fail;
4711 /* Throw out non-verified but possibly valid cases. */
4712 switch (szLg2) {
4713 case BITS3(0,0,0): break; // 8 bit, valid for both int and vec
4714 case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
4715 case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
4716 case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
4717 case BITS3(1,0,0): // can only ever be valid for the vector case
4718 if (isInt) goto fail; else break;
4719 case BITS3(1,0,1): // these sizes are never valid
4720 case BITS3(1,1,0):
4721 case BITS3(1,1,1): goto fail;
4723 default: vassert(0);
4726 IRExpr* rhs = NULL;
4727 switch (optS) {
4728 case BITS4(1,1,1,0): goto fail; //ATC
4729 case BITS4(0,1,1,0):
4730 rhs = getIReg64orZR(mm);
4731 vex_sprintf(buf, "[%s, %s]",
4732 nameIReg64orZR(nn), nameIReg64orZR(mm));
4733 break;
4734 case BITS4(1,1,1,1): goto fail; //ATC
4735 case BITS4(0,1,1,1):
4736 rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
4737 vex_sprintf(buf, "[%s, %s lsl %u]",
4738 nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
4739 break;
4740 case BITS4(0,1,0,0):
4741 rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
4742 vex_sprintf(buf, "[%s, %s uxtx]",
4743 nameIReg64orZR(nn), nameIReg32orZR(mm));
4744 break;
4745 case BITS4(0,1,0,1):
4746 rhs = binop(Iop_Shl64,
4747 unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
4748 vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
4749 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4750 break;
4751 case BITS4(1,1,0,0):
4752 rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
4753 vex_sprintf(buf, "[%s, %s sxtx]",
4754 nameIReg64orZR(nn), nameIReg32orZR(mm));
4755 break;
4756 case BITS4(1,1,0,1):
4757 rhs = binop(Iop_Shl64,
4758 unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
4759 vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
4760 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4761 break;
4762 default:
4763 /* The rest appear to be genuinely invalid */
4764 goto fail;
4767 vassert(rhs);
4768 IRTemp res = newTemp(Ity_I64);
4769 assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
4770 return res;
4772 fail:
4773 if (0 /*really, sigill_diag, but that causes too much plumbing*/) {
4774 vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
4776 return IRTemp_INVALID;
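/* Illustrative only: a plain-C model of the EA forms gen_indexed_EA accepts,
   with the register values passed in directly.  It also models the two opt:S
   forms (111:0 and 111:1) that the decoder currently rejects as untested,
   since the comment above documents them; the helper itself is hypothetical
   and not used by the decoder.

      static ULong ref_indexed_EA ( Bool* ok, ULong xnOrSP, ULong xm,
                                    UInt optS, UInt szLg2 )
      {
         ULong idx = 0;
         *ok = True;
         switch (optS) {
            case BITS4(0,1,1,0): case BITS4(1,1,1,0):   // Xm
               idx = xm; break;
            case BITS4(0,1,1,1): case BITS4(1,1,1,1):   // Xm scaled by szB
               idx = xm << szLg2; break;
            case BITS4(0,1,0,0):                        // 32Uto64(Wm)
               idx = xm & 0xFFFFFFFFULL; break;
            case BITS4(0,1,0,1):
               idx = (xm & 0xFFFFFFFFULL) << szLg2; break;
            case BITS4(1,1,0,0):                        // 32Sto64(Wm)
               idx = (ULong)(Long)(Int)xm; break;
            case BITS4(1,1,0,1):
               idx = ((ULong)(Long)(Int)xm) << szLg2; break;
            default:
               *ok = False; return 0;
         }
         return xnOrSP + idx;
      }
*/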
4780 /* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
4781 bits of DATAE :: Ity_I64. */
4782 static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
4784 IRExpr* addrE = mkexpr(addr);
4785 switch (szB) {
4786 case 8:
4787 storeLE(addrE, dataE);
4788 break;
4789 case 4:
4790 storeLE(addrE, unop(Iop_64to32, dataE));
4791 break;
4792 case 2:
4793 storeLE(addrE, unop(Iop_64to16, dataE));
4794 break;
4795 case 1:
4796 storeLE(addrE, unop(Iop_64to8, dataE));
4797 break;
4798 default:
4799 vassert(0);
4804 /* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
4805 placing the result in an Ity_I64 temporary. */
4806 static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
4808 IRTemp res = newTemp(Ity_I64);
4809 IRExpr* addrE = mkexpr(addr);
4810 switch (szB) {
4811 case 8:
4812 assign(res, loadLE(Ity_I64,addrE));
4813 break;
4814 case 4:
4815 assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
4816 break;
4817 case 2:
4818 assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
4819 break;
4820 case 1:
4821 assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
4822 break;
4823 default:
4824 vassert(0);
4826 return res;
4830 /* Generate a "standard 7" name, from bitQ and size. But also
4831 allow ".1d" since that's occasionally useful. */
4832 static
4833 const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
4835 vassert(bitQ <= 1 && size <= 3);
4836 const HChar* nms[8]
4837 = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
4838 UInt ix = (bitQ << 2) | size;
4839 vassert(ix < 8);
4840 return nms[ix];
4844 static
4845 Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4846 const VexAbiInfo* abiinfo, Bool sigill_diag)
4848 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
4850 /* ------------ LDR,STR (immediate, uimm12) ----------- */
4851 /* uimm12 is scaled by the transfer size
4853 31 29 26 21 9 4
4854 | | | | | |
4855 11 111 00100 imm12 nn tt STR Xt, [Xn|SP, #imm12 * 8]
4856 11 111 00101 imm12 nn tt LDR Xt, [Xn|SP, #imm12 * 8]
4858 10 111 00100 imm12 nn tt STR Wt, [Xn|SP, #imm12 * 4]
4859 10 111 00101 imm12 nn tt LDR Wt, [Xn|SP, #imm12 * 4]
4861 01 111 00100 imm12 nn tt STRH Wt, [Xn|SP, #imm12 * 2]
4862 01 111 00101 imm12 nn tt LDRH Wt, [Xn|SP, #imm12 * 2]
4864 00 111 00100 imm12 nn tt STRB Wt, [Xn|SP, #imm12 * 1]
4865 00 111 00101 imm12 nn tt LDRB Wt, [Xn|SP, #imm12 * 1]
4867 if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
4868 UInt szLg2 = INSN(31,30);
4869 UInt szB = 1 << szLg2;
4870 Bool isLD = INSN(22,22) == 1;
4871 UInt offs = INSN(21,10) * szB;
4872 UInt nn = INSN(9,5);
4873 UInt tt = INSN(4,0);
4874 IRTemp ta = newTemp(Ity_I64);
4875 assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
4876 if (nn == 31) { /* FIXME generate stack alignment check */ }
4877 vassert(szLg2 < 4);
4878 if (isLD) {
4879 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
4880 } else {
4881 gen_narrowing_store(szB, ta, getIReg64orZR(tt));
4883 const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
4884 const HChar* st_name[4] = { "strb", "strh", "str", "str" };
4885 DIP("%s %s, [%s, #%u]\n",
4886 (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
4887 nameIReg64orSP(nn), offs);
4888 return True;
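      /* Illustrative only: the uimm12 scaling rule above as a tiny,
         hypothetical helper, plus a worked example: with szLg2 == 3 and
         imm12 == 5 the byte offset is 5 << 3 == 40, so "ldr x1, [x2, #40]"
         encodes imm12 == 5.

            static ULong ref_uimm12_EA ( ULong xnOrSP, UInt imm12, UInt szLg2 )
            {
               return xnOrSP + ((ULong)imm12 << szLg2);
            }
      */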
4891 /* ------------ LDUR,STUR (immediate, simm9) ----------- */
4893 31 29 26 20 11 9 4
4894 | | | | | | |
4895       (at-Rn-then-Rn=EA)
4896 sz 111 00000 0 imm9 01 Rn Rt STR Rt, [Xn|SP], #simm9
4897 sz 111 00001 0 imm9 01 Rn Rt LDR Rt, [Xn|SP], #simm9
4899 (at-EA-then-Rn=EA)
4900 sz 111 00000 0 imm9 11 Rn Rt STR Rt, [Xn|SP, #simm9]!
4901 sz 111 00001 0 imm9 11 Rn Rt LDR Rt, [Xn|SP, #simm9]!
4903 (at-EA)
4904 sz 111 00000 0 imm9 00 Rn Rt STR Rt, [Xn|SP, #simm9]
4905 sz 111 00001 0 imm9 00 Rn Rt LDR Rt, [Xn|SP, #simm9]
4907 simm9 is unscaled.
4909 The case 'wback && Rn == Rt && Rt != 31' is disallowed. In the
4910       load case this is because it would create two competing values for
4911 Rt. In the store case the reason is unclear, but the spec
4912 disallows it anyway.
4914 Stores are narrowing, loads are unsigned widening. sz encodes
4915 the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
4917 if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
4918 == BITS9(1,1,1, 0,0,0,0,0, 0)) {
4919 UInt szLg2 = INSN(31,30);
4920 UInt szB = 1 << szLg2;
4921 Bool isLoad = INSN(22,22) == 1;
4922 UInt imm9 = INSN(20,12);
4923 UInt nn = INSN(9,5);
4924 UInt tt = INSN(4,0);
4925 Bool wBack = INSN(10,10) == 1;
4926 UInt how = INSN(11,10);
4927 if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
4928 /* undecodable; fall through */
4929 } else {
4930 if (nn == 31) { /* FIXME generate stack alignment check */ }
4932 // Compute the transfer address TA and the writeback address WA.
4933 IRTemp tRN = newTemp(Ity_I64);
4934 assign(tRN, getIReg64orSP(nn));
4935 IRTemp tEA = newTemp(Ity_I64);
4936 Long simm9 = (Long)sx_to_64(imm9, 9);
4937 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
4939 IRTemp tTA = newTemp(Ity_I64);
4940 IRTemp tWA = newTemp(Ity_I64);
4941 switch (how) {
4942 case BITS2(0,1):
4943 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4944 case BITS2(1,1):
4945 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4946 case BITS2(0,0):
4947 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4948 default:
4949 vassert(0); /* NOTREACHED */
4952 /* Normally rN would be updated after the transfer. However, in
4953             the special cases typified by
4954 str x30, [sp,#-16]!
4955 str w1, [sp,#-32]!
4956 it is necessary to update SP before the transfer, (1)
4957 because Memcheck will otherwise complain about a write
4958 below the stack pointer, and (2) because the segfault
4959 stack extension mechanism will otherwise extend the stack
4960 only down to SP before the instruction, which might not be
4961             far enough, if the -16/-32 offset takes the actual access
4962 address to the next page.
4964 Bool earlyWBack
4965 = wBack && simm9 < 0 && (szB == 8 || szB == 4)
4966 && how == BITS2(1,1) && nn == 31 && !isLoad;
4968 if (wBack && earlyWBack)
4969 putIReg64orSP(nn, mkexpr(tEA));
4971 if (isLoad) {
4972 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
4973 } else {
4974 gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
4977 if (wBack && !earlyWBack)
4978 putIReg64orSP(nn, mkexpr(tEA));
4980 const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
4981 const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
4982 const HChar* fmt_str = NULL;
4983 switch (how) {
4984 case BITS2(0,1):
4985 fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
4986 break;
4987 case BITS2(1,1):
4988 fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
4989 break;
4990 case BITS2(0,0):
4991 fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
4992 break;
4993 default:
4994 vassert(0);
4996 DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
4997 nameIRegOrZR(szB == 8, tt),
4998 nameIReg64orSP(nn), simm9);
4999 return True;
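      /* Illustrative only: a plain-C summary of the three simm9 addressing
         variants handled above (hypothetical helper, not used by the
         decoder).  It returns the transfer address and reports whether, and
         with what value, Rn is written back; the early-writeback refinement
         for SP-relative stores is an ordering concern and is not modelled.

            static ULong ref_simm9_xfer_addr ( ULong rn, Long simm9, UInt how,
                                               Bool* doWback, ULong* wbackVal )
            {
               ULong ea = rn + (ULong)simm9;
               switch (how) {
                  case BITS2(0,1):             // post-index: use Rn, Rn := EA
                     *doWback = True;  *wbackVal = ea; return rn;
                  case BITS2(1,1):             // pre-index: use EA, Rn := EA
                     *doWback = True;  *wbackVal = ea; return ea;
                  case BITS2(0,0):             // no writeback: use EA
                     *doWback = False; return ea;
                  default:
                     vassert(0);
               }
               // not reached
               return 0;
            }
      */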
5003 /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
5004 /* L==1 => mm==LD
5005 L==0 => mm==ST
5006 x==0 => 32 bit transfers, and zero extended loads
5007 x==1 => 64 bit transfers
5008 simm7 is scaled by the (single-register) transfer size
5010 (at-Rn-then-Rn=EA)
5011 x0 101 0001 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP], #imm
5013 (at-EA-then-Rn=EA)
5014 x0 101 0011 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]!
5016 (at-EA)
5017 x0 101 0010 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]
5019 UInt insn_30_23 = INSN(30,23);
5020 if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1)
5021 || insn_30_23 == BITS8(0,1,0,1,0,0,1,1)
5022 || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) {
5023 UInt bL = INSN(22,22);
5024 UInt bX = INSN(31,31);
5025 UInt bWBack = INSN(23,23);
5026 UInt rT1 = INSN(4,0);
5027 UInt rN = INSN(9,5);
5028 UInt rT2 = INSN(14,10);
5029 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5030 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
5031 || (bL && rT1 == rT2)) {
5032 /* undecodable; fall through */
5033 } else {
5034 if (rN == 31) { /* FIXME generate stack alignment check */ }
5036 // Compute the transfer address TA and the writeback address WA.
5037 IRTemp tRN = newTemp(Ity_I64);
5038 assign(tRN, getIReg64orSP(rN));
5039 IRTemp tEA = newTemp(Ity_I64);
5040 simm7 = (bX ? 8 : 4) * simm7;
5041 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5043 IRTemp tTA = newTemp(Ity_I64);
5044 IRTemp tWA = newTemp(Ity_I64);
5045 switch (INSN(24,23)) {
5046 case BITS2(0,1):
5047 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5048 case BITS2(1,1):
5049 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5050 case BITS2(1,0):
5051 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5052 default:
5053 vassert(0); /* NOTREACHED */
5056 /* Normally rN would be updated after the transfer. However, in
5057             the special case typified by
5058 stp x29, x30, [sp,#-112]!
5059 it is necessary to update SP before the transfer, (1)
5060 because Memcheck will otherwise complain about a write
5061 below the stack pointer, and (2) because the segfault
5062 stack extension mechanism will otherwise extend the stack
5063 only down to SP before the instruction, which might not be
5064             far enough, if the -112 offset takes the actual access
5065 address to the next page.
5067 Bool earlyWBack
5068 = bWBack && simm7 < 0
5069 && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0;
5071 if (bWBack && earlyWBack)
5072 putIReg64orSP(rN, mkexpr(tEA));
5074 /**/ if (bL == 1 && bX == 1) {
5075 // 64 bit load
5076 putIReg64orZR(rT1, loadLE(Ity_I64,
5077 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
5078 putIReg64orZR(rT2, loadLE(Ity_I64,
5079 binop(Iop_Add64,mkexpr(tTA),mkU64(8))));
5080 } else if (bL == 1 && bX == 0) {
5081 // 32 bit load
5082 putIReg32orZR(rT1, loadLE(Ity_I32,
5083 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
5084 putIReg32orZR(rT2, loadLE(Ity_I32,
5085 binop(Iop_Add64,mkexpr(tTA),mkU64(4))));
5086 } else if (bL == 0 && bX == 1) {
5087 // 64 bit store
5088 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
5089 getIReg64orZR(rT1));
5090 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)),
5091 getIReg64orZR(rT2));
5092 } else {
5093 vassert(bL == 0 && bX == 0);
5094 // 32 bit store
5095 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
5096 getIReg32orZR(rT1));
5097 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)),
5098 getIReg32orZR(rT2));
5101 if (bWBack && !earlyWBack)
5102 putIReg64orSP(rN, mkexpr(tEA));
5104 const HChar* fmt_str = NULL;
5105 switch (INSN(24,23)) {
5106 case BITS2(0,1):
5107 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5108 break;
5109 case BITS2(1,1):
5110 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5111 break;
5112 case BITS2(1,0):
5113 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5114 break;
5115 default:
5116 vassert(0);
5118 DIP(fmt_str, bL == 0 ? "st" : "ld",
5119 nameIRegOrZR(bX == 1, rT1),
5120 nameIRegOrZR(bX == 1, rT2),
5121 nameIReg64orSP(rN), simm7);
5122 return True;
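      /* Illustrative only: the address arithmetic for the pair transfers
         above, as a small hypothetical sketch.  The raw 7-bit immediate is
         sign-extended and scaled by the single-register size, and the second
         register goes at EA plus that size; for example
         "stp x29, x30, [sp, #-112]!" has a raw immediate of -14.

            static void ref_pair_addrs ( ULong* ea1, ULong* ea2,
                                         ULong base, Long simm7raw, Bool is64 )
            {
               Long scale = is64 ? 8 : 4;
               ULong ea   = base + (ULong)(simm7raw * scale);
               *ea1 = ea;
               *ea2 = ea + (ULong)scale;
            }
      */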
5126 /* -------- LDPSW (immediate, simm7) (INT REGS) -------- */
5127 /* Does 32 bit transfers which are sign extended to 64 bits.
5128 simm7 is scaled by the (single-register) transfer size
5130 (at-Rn-then-Rn=EA)
5131 01 101 0001 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP], #imm
5133 (at-EA-then-Rn=EA)
5134 01 101 0011 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]!
5136 (at-EA)
5137 01 101 0010 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]
5139 UInt insn_31_22 = INSN(31,22);
5140 if (insn_31_22 == BITS10(0,1,1,0,1,0,0,0,1,1)
5141 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,1,1)
5142 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,0,1)) {
5143 UInt bWBack = INSN(23,23);
5144 UInt rT1 = INSN(4,0);
5145 UInt rN = INSN(9,5);
5146 UInt rT2 = INSN(14,10);
5147 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5148 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
5149 || (rT1 == rT2)) {
5150 /* undecodable; fall through */
5151 } else {
5152 if (rN == 31) { /* FIXME generate stack alignment check */ }
5154 // Compute the transfer address TA and the writeback address WA.
5155 IRTemp tRN = newTemp(Ity_I64);
5156 assign(tRN, getIReg64orSP(rN));
5157 IRTemp tEA = newTemp(Ity_I64);
5158 simm7 = 4 * simm7;
5159 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5161 IRTemp tTA = newTemp(Ity_I64);
5162 IRTemp tWA = newTemp(Ity_I64);
5163 switch (INSN(24,23)) {
5164 case BITS2(0,1):
5165 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5166 case BITS2(1,1):
5167 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5168 case BITS2(1,0):
5169 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5170 default:
5171 vassert(0); /* NOTREACHED */
5174 // 32 bit load, sign extended to 64 bits
5175 putIReg64orZR(rT1, unop(Iop_32Sto64,
5176 loadLE(Ity_I32, binop(Iop_Add64,
5177 mkexpr(tTA),
5178 mkU64(0)))));
5179 putIReg64orZR(rT2, unop(Iop_32Sto64,
5180 loadLE(Ity_I32, binop(Iop_Add64,
5181 mkexpr(tTA),
5182 mkU64(4)))));
5183 if (bWBack)
5184 putIReg64orSP(rN, mkexpr(tEA));
5186 const HChar* fmt_str = NULL;
5187 switch (INSN(24,23)) {
5188 case BITS2(0,1):
5189 fmt_str = "ldpsw %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5190 break;
5191 case BITS2(1,1):
5192 fmt_str = "ldpsw %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5193 break;
5194 case BITS2(1,0):
5195 fmt_str = "ldpsw %s, %s, [%s, #%lld] (at-Rn)\n";
5196 break;
5197 default:
5198 vassert(0);
5200 DIP(fmt_str, nameIReg64orZR(rT1),
5201 nameIReg64orZR(rT2),
5202 nameIReg64orSP(rN), simm7);
5203 return True;
5207 /* ---------------- LDR (literal, int reg) ---------------- */
5208 /* 31 29 23 4
5209 00 011 000 imm19 Rt LDR Wt, [PC + sxTo64(imm19 << 2)]
5210 01 011 000 imm19 Rt LDR Xt, [PC + sxTo64(imm19 << 2)]
5211 10 011 000 imm19 Rt LDRSW Xt, [PC + sxTo64(imm19 << 2)]
5212 11 011 000 imm19 Rt prefetch [PC + sxTo64(imm19 << 2)]
5213 Just handles the first two cases for now.
5215 if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
5216 UInt imm19 = INSN(23,5);
5217 UInt rT = INSN(4,0);
5218 UInt bX = INSN(30,30);
5219 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5220 if (bX) {
5221 putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea)));
5222 } else {
5223 putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea)));
5225 DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea);
5226 return True;
5229 /* -------------- {LD,ST}R (integer register) --------------- */
5230 /* 31 29 20 15 12 11 9 4
5231 | | | | | | | |
5232 11 111000011 Rm option S 10 Rn Rt LDR Xt, [Xn|SP, R<m>{ext/sh}]
5233 10 111000011 Rm option S 10 Rn Rt LDR Wt, [Xn|SP, R<m>{ext/sh}]
5234 01 111000011 Rm option S 10 Rn Rt LDRH Wt, [Xn|SP, R<m>{ext/sh}]
5235 00 111000011 Rm option S 10 Rn Rt LDRB Wt, [Xn|SP, R<m>{ext/sh}]
5237 11 111000001 Rm option S 10 Rn Rt STR Xt, [Xn|SP, R<m>{ext/sh}]
5238 10 111000001 Rm option S 10 Rn Rt STR Wt, [Xn|SP, R<m>{ext/sh}]
5239 01 111000001 Rm option S 10 Rn Rt STRH Wt, [Xn|SP, R<m>{ext/sh}]
5240 00 111000001 Rm option S 10 Rn Rt STRB Wt, [Xn|SP, R<m>{ext/sh}]
5242 if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
5243 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5244 HChar dis_buf[64];
5245 UInt szLg2 = INSN(31,30);
5246 Bool isLD = INSN(22,22) == 1;
5247 UInt tt = INSN(4,0);
5248 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5249 if (ea != IRTemp_INVALID) {
5250 switch (szLg2) {
5251 case 3: /* 64 bit */
5252 if (isLD) {
5253 putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea)));
5254 DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf);
5255 } else {
5256 storeLE(mkexpr(ea), getIReg64orZR(tt));
5257 DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf);
5259 break;
5260 case 2: /* 32 bit */
5261 if (isLD) {
5262 putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea)));
5263 DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf);
5264 } else {
5265 storeLE(mkexpr(ea), getIReg32orZR(tt));
5266 DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf);
5268 break;
5269 case 1: /* 16 bit */
5270 if (isLD) {
5271 putIReg64orZR(tt, unop(Iop_16Uto64,
5272 loadLE(Ity_I16, mkexpr(ea))));
5273 DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5274 } else {
5275 storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt)));
5276 DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5278 break;
5279 case 0: /* 8 bit */
5280 if (isLD) {
5281 putIReg64orZR(tt, unop(Iop_8Uto64,
5282 loadLE(Ity_I8, mkexpr(ea))));
5283 DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf);
5284 } else {
5285 storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt)));
5286 DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5288 break;
5289 default:
5290 vassert(0);
5292 return True;
5296 /* -------------- LDRS{B,H,W} (uimm12) -------------- */
5297 /* 31 29 26 23 21 9 4
5298 10 111 001 10 imm12 n t LDRSW Xt, [Xn|SP, #pimm12 * 4]
5299 01 111 001 1x imm12 n t LDRSH Rt, [Xn|SP, #pimm12 * 2]
5300 00 111 001 1x imm12 n t LDRSB Rt, [Xn|SP, #pimm12 * 1]
5301 where
5302 Rt is Wt when x==1, Xt when x==0
5304 if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
5305 /* Further checks on bits 31:30 and 22 */
5306 Bool valid = False;
5307 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5308 case BITS3(1,0,0):
5309 case BITS3(0,1,0): case BITS3(0,1,1):
5310 case BITS3(0,0,0): case BITS3(0,0,1):
5311 valid = True;
5312 break;
5314 if (valid) {
5315 UInt szLg2 = INSN(31,30);
5316 UInt bitX = INSN(22,22);
5317 UInt imm12 = INSN(21,10);
5318 UInt nn = INSN(9,5);
5319 UInt tt = INSN(4,0);
5320 UInt szB = 1 << szLg2;
5321 IRExpr* ea = binop(Iop_Add64,
5322 getIReg64orSP(nn), mkU64(imm12 * szB));
5323 switch (szB) {
5324 case 4:
5325 vassert(bitX == 0);
5326 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea)));
5327 DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt),
5328 nameIReg64orSP(nn), imm12 * szB);
5329 break;
5330 case 2:
5331 if (bitX == 1) {
5332 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea)));
5333 } else {
5334 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea)));
5336 DIP("ldrsh %s, [%s, #%u]\n",
5337 nameIRegOrZR(bitX == 0, tt),
5338 nameIReg64orSP(nn), imm12 * szB);
5339 break;
5340 case 1:
5341 if (bitX == 1) {
5342 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea)));
5343 } else {
5344 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea)));
5346 DIP("ldrsb %s, [%s, #%u]\n",
5347 nameIRegOrZR(bitX == 0, tt),
5348 nameIReg64orSP(nn), imm12 * szB);
5349 break;
5350 default:
5351 vassert(0);
5353 return True;
5355 /* else fall through */
5358 /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
5359 /* (at-Rn-then-Rn=EA)
5360 31 29 23 21 20 11 9 4
5361 00 111 000 1x 0 imm9 01 n t LDRSB Rt, [Xn|SP], #simm9
5362 01 111 000 1x 0 imm9 01 n t LDRSH Rt, [Xn|SP], #simm9
5363 10 111 000 10 0 imm9 01 n t LDRSW Xt, [Xn|SP], #simm9
5365 (at-EA-then-Rn=EA)
5366 00 111 000 1x 0 imm9 11 n t LDRSB Rt, [Xn|SP, #simm9]!
5367 01 111 000 1x 0 imm9 11 n t LDRSH Rt, [Xn|SP, #simm9]!
5368 10 111 000 10 0 imm9 11 n t LDRSW Xt, [Xn|SP, #simm9]!
5369 where
5370 Rt is Wt when x==1, Xt when x==0
5371 transfer-at-Rn when [11]==0, at EA when [11]==1
5373 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5374 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5375 /* Further checks on bits 31:30 and 22 */
5376 Bool valid = False;
5377 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5378 case BITS3(1,0,0): // LDRSW Xt
5379 case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
5380 case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
5381 valid = True;
5382 break;
5384 if (valid) {
5385 UInt szLg2 = INSN(31,30);
5386 UInt imm9 = INSN(20,12);
5387 Bool atRN = INSN(11,11) == 0;
5388 UInt nn = INSN(9,5);
5389 UInt tt = INSN(4,0);
5390 IRTemp tRN = newTemp(Ity_I64);
5391 IRTemp tEA = newTemp(Ity_I64);
5392 IRTemp tTA = IRTemp_INVALID;
5393 ULong simm9 = sx_to_64(imm9, 9);
5394 Bool is64 = INSN(22,22) == 0;
5395 assign(tRN, getIReg64orSP(nn));
5396 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5397 tTA = atRN ? tRN : tEA;
5398 HChar ch = '?';
5399 /* There are 5 cases:
5400 byte load, SX to 64
5401 byte load, SX to 32, ZX to 64
5402 halfword load, SX to 64
5403 halfword load, SX to 32, ZX to 64
5404 word load, SX to 64
5405 The ifs below handle them in the listed order.
5407 if (szLg2 == 0) {
5408 ch = 'b';
5409 if (is64) {
5410 putIReg64orZR(tt, unop(Iop_8Sto64,
5411 loadLE(Ity_I8, mkexpr(tTA))));
5412 } else {
5413 putIReg32orZR(tt, unop(Iop_8Sto32,
5414 loadLE(Ity_I8, mkexpr(tTA))));
5417 else if (szLg2 == 1) {
5418 ch = 'h';
5419 if (is64) {
5420 putIReg64orZR(tt, unop(Iop_16Sto64,
5421 loadLE(Ity_I16, mkexpr(tTA))));
5422 } else {
5423 putIReg32orZR(tt, unop(Iop_16Sto32,
5424 loadLE(Ity_I16, mkexpr(tTA))));
5427 else if (szLg2 == 2 && is64) {
5428 ch = 'w';
5429 putIReg64orZR(tt, unop(Iop_32Sto64,
5430 loadLE(Ity_I32, mkexpr(tTA))));
5432 else {
5433 vassert(0);
5435 putIReg64orSP(nn, mkexpr(tEA));
5436 DIP(atRN ? "ldrs%c %s, [%s], #%llu\n" : "ldrs%c %s, [%s, #%llu]!\n",
5437 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9);
5438 return True;
5440 /* else fall through */
5443 /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */
5444 /* 31 29 23 21 20 11 9 4
5445 00 111 000 1x 0 imm9 00 n t LDURSB Rt, [Xn|SP, #simm9]
5446 01 111 000 1x 0 imm9 00 n t LDURSH Rt, [Xn|SP, #simm9]
5447 10 111 000 10 0 imm9 00 n t LDURSW Xt, [Xn|SP, #simm9]
5448 where
5449 Rt is Wt when x==1, Xt when x==0
5451 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5452 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5453 /* Further checks on bits 31:30 and 22 */
5454 Bool valid = False;
5455 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5456 case BITS3(1,0,0): // LDURSW Xt
5457 case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
5458 case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
5459 valid = True;
5460 break;
5462 if (valid) {
5463 UInt szLg2 = INSN(31,30);
5464 UInt imm9 = INSN(20,12);
5465 UInt nn = INSN(9,5);
5466 UInt tt = INSN(4,0);
5467 IRTemp tRN = newTemp(Ity_I64);
5468 IRTemp tEA = newTemp(Ity_I64);
5469 ULong simm9 = sx_to_64(imm9, 9);
5470 Bool is64 = INSN(22,22) == 0;
5471 assign(tRN, getIReg64orSP(nn));
5472 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5473 HChar ch = '?';
5474 /* There are 5 cases:
5475 byte load, SX to 64
5476 byte load, SX to 32, ZX to 64
5477 halfword load, SX to 64
5478 halfword load, SX to 32, ZX to 64
5479 word load, SX to 64
5480 The ifs below handle them in the listed order.
5482 if (szLg2 == 0) {
5483 ch = 'b';
5484 if (is64) {
5485 putIReg64orZR(tt, unop(Iop_8Sto64,
5486 loadLE(Ity_I8, mkexpr(tEA))));
5487 } else {
5488 putIReg32orZR(tt, unop(Iop_8Sto32,
5489 loadLE(Ity_I8, mkexpr(tEA))));
5492 else if (szLg2 == 1) {
5493 ch = 'h';
5494 if (is64) {
5495 putIReg64orZR(tt, unop(Iop_16Sto64,
5496 loadLE(Ity_I16, mkexpr(tEA))));
5497 } else {
5498 putIReg32orZR(tt, unop(Iop_16Sto32,
5499 loadLE(Ity_I16, mkexpr(tEA))));
5502 else if (szLg2 == 2 && is64) {
5503 ch = 'w';
5504 putIReg64orZR(tt, unop(Iop_32Sto64,
5505 loadLE(Ity_I32, mkexpr(tEA))));
5507 else {
5508 vassert(0);
5510 DIP("ldurs%c %s, [%s, #%lld]\n",
5511 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
5512 return True;
5514 /* else fall through */
5517 /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
5518 /* L==1 => mm==LD
5519 L==0 => mm==ST
5520 sz==00 => 32 bit (S) transfers
5521 sz==01 => 64 bit (D) transfers
5522 sz==10 => 128 bit (Q) transfers
5523 sz==11 isn't allowed
5524 simm7 is scaled by the (single-register) transfer size
5526 31 29 26 22 21 14 9 4
5528 sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm]
5529 (at-EA, with nontemporal hint)
5531 sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
5532 (at-Rn-then-Rn=EA)
5534 sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]
5535 (at-EA)
5537 sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
5538 (at-EA-then-Rn=EA)
5540 if (INSN(29,25) == BITS5(1,0,1,1,0)) {
5541 UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
5542 Bool isLD = INSN(22,22) == 1;
5543 Bool wBack = INSN(23,23) == 1;
5544 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5545 UInt tt2 = INSN(14,10);
5546 UInt nn = INSN(9,5);
5547 UInt tt1 = INSN(4,0);
5548 if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) {
5549 /* undecodable; fall through */
5550 } else {
5551 if (nn == 31) { /* FIXME generate stack alignment check */ }
5553 // Compute the transfer address TA and the writeback address WA.
5554 UInt szB = 4 << szSlg2; /* szB is the per-register size */
5555 IRTemp tRN = newTemp(Ity_I64);
5556 assign(tRN, getIReg64orSP(nn));
5557 IRTemp tEA = newTemp(Ity_I64);
5558 simm7 = szB * simm7;
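/* simm7 is now a byte offset; e.g. with Q registers (szB == 16) an encoded imm7 of -32 gives -512, matching the stp q0, q1, [sp,#-512]! example below. */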
5559 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5561 IRTemp tTA = newTemp(Ity_I64);
5562 IRTemp tWA = newTemp(Ity_I64);
5563 switch (INSN(24,23)) {
5564 case BITS2(0,1):
5565 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5566 case BITS2(1,1):
5567 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5568 case BITS2(1,0):
5569 case BITS2(0,0):
5570 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5571 default:
5572 vassert(0); /* NOTREACHED */
5575 IRType ty = Ity_INVALID;
5576 switch (szB) {
5577 case 4: ty = Ity_F32; break;
5578 case 8: ty = Ity_F64; break;
5579 case 16: ty = Ity_V128; break;
5580 default: vassert(0);
5583 /* Normally rN would be updated after the transfer. However, in
5584 the special cases typified by
5585 stp q0, q1, [sp,#-512]!
5586 stp d0, d1, [sp,#-512]!
5587 stp s0, s1, [sp,#-512]!
5588 it is necessary to update SP before the transfer, (1)
5589 because Memcheck will otherwise complain about a write
5590 below the stack pointer, and (2) because the segfault
5591 stack extension mechanism will otherwise extend the stack
5592 only down to SP before the instruction, which might not be
5593 far enough, if the -512 bit takes the actual access
5594 address to the next page.
5596 Bool earlyWBack
5597 = wBack && simm7 < 0
5598 && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD;
5600 if (wBack && earlyWBack)
5601 putIReg64orSP(nn, mkexpr(tEA));
5603 if (isLD) {
5604 if (szB < 16) {
5605 putQReg128(tt1, mkV128(0x0000));
5607 putQRegLO(tt1,
5608 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0))));
5609 if (szB < 16) {
5610 putQReg128(tt2, mkV128(0x0000));
5612 putQRegLO(tt2,
5613 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB))));
5614 } else {
5615 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)),
5616 getQRegLO(tt1, ty));
5617 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)),
5618 getQRegLO(tt2, ty));
5621 if (wBack && !earlyWBack)
5622 putIReg64orSP(nn, mkexpr(tEA));
5624 const HChar* fmt_str = NULL;
5625 switch (INSN(24,23)) {
5626 case BITS2(0,1):
5627 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5628 break;
5629 case BITS2(1,1):
5630 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5631 break;
5632 case BITS2(1,0):
5633 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5634 break;
5635 case BITS2(0,0):
5636 fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
5637 break;
5638 default:
5639 vassert(0);
5641 DIP(fmt_str, isLD ? "ld" : "st",
5642 nameQRegLO(tt1, ty), nameQRegLO(tt2, ty),
5643 nameIReg64orSP(nn), simm7);
5644 return True;
5648 /* -------------- {LD,ST}R (vector register) --------------- */
5649 /* 31 29 23 20 15 12 11 9 4
5650 | | | | | | | | |
5651 00 111100 011 Rm option S 10 Rn Rt LDR Bt, [Xn|SP, R<m>{ext/sh}]
5652 01 111100 011 Rm option S 10 Rn Rt LDR Ht, [Xn|SP, R<m>{ext/sh}]
5653 10 111100 011 Rm option S 10 Rn Rt LDR St, [Xn|SP, R<m>{ext/sh}]
5654 11 111100 011 Rm option S 10 Rn Rt LDR Dt, [Xn|SP, R<m>{ext/sh}]
5655 00 111100 111 Rm option S 10 Rn Rt LDR Qt, [Xn|SP, R<m>{ext/sh}]
5657 00 111100 001 Rm option S 10 Rn Rt STR Bt, [Xn|SP, R<m>{ext/sh}]
5658 01 111100 001 Rm option S 10 Rn Rt STR Ht, [Xn|SP, R<m>{ext/sh}]
5659 10 111100 001 Rm option S 10 Rn Rt STR St, [Xn|SP, R<m>{ext/sh}]
5660 11 111100 001 Rm option S 10 Rn Rt STR Dt, [Xn|SP, R<m>{ext/sh}]
5661 00 111100 101 Rm option S 10 Rn Rt STR Qt, [Xn|SP, R<m>{ext/sh}]
5663 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5664 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5665 HChar dis_buf[64];
5666 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5667 Bool isLD = INSN(22,22) == 1;
5668 UInt tt = INSN(4,0);
5669 if (szLg2 > 4) goto after_LDR_STR_vector_register;
5670 IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
5671 if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
5672 switch (szLg2) {
5673 case 0: /* 8 bit */
5674 if (isLD) {
5675 putQReg128(tt, mkV128(0x0000));
5676 putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea)));
5677 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5678 } else {
5679 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8));
5680 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5682 break;
5683 case 1:
5684 if (isLD) {
5685 putQReg128(tt, mkV128(0x0000));
5686 putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea)));
5687 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5688 } else {
5689 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16));
5690 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5692 break;
5693 case 2: /* 32 bit */
5694 if (isLD) {
5695 putQReg128(tt, mkV128(0x0000));
5696 putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea)));
5697 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5698 } else {
5699 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32));
5700 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5702 break;
5703 case 3: /* 64 bit */
5704 if (isLD) {
5705 putQReg128(tt, mkV128(0x0000));
5706 putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea)));
5707 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5708 } else {
5709 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64));
5710 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5712 break;
5713 case 4:
5714 if (isLD) {
5715 putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
5716 DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
5717 } else {
5718 storeLE(mkexpr(ea), getQReg128(tt));
5719 DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
5721 break;
5722 default:
5723 vassert(0);
5725 return True;
5727 after_LDR_STR_vector_register:
5729 /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
5730 /* 31 29 22 20 15 12 11 9 4
5731 | | | | | | | | |
5732 10 1110001 01 Rm opt S 10 Rn Rt LDRSW Xt, [Xn|SP, R<m>{ext/sh}]
5734 01 1110001 01 Rm opt S 10 Rn Rt LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
5735 01 1110001 11 Rm opt S 10 Rn Rt LDRSH Wt, [Xn|SP, R<m>{ext/sh}]
5737 00 1110001 01 Rm opt S 10 Rn Rt LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
5738 00 1110001 11 Rm opt S 10 Rn Rt LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
5740 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5741 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5742 HChar dis_buf[64];
5743 UInt szLg2 = INSN(31,30);
5744 Bool sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64
5745 UInt tt = INSN(4,0);
5746 if (szLg2 == 3) goto after_LDRS_integer_register;
5747 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5748 if (ea == IRTemp_INVALID) goto after_LDRS_integer_register;
5749 /* Enumerate the 5 variants explicitly. */
5750 if (szLg2 == 2/*32 bit*/ && sxTo64) {
5751 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea))));
5752 DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf);
5753 return True;
5755 else
5756 if (szLg2 == 1/*16 bit*/) {
5757 if (sxTo64) {
5758 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea))));
5759 DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf);
5760 } else {
5761 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea))));
5762 DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5764 return True;
5766 else
5767 if (szLg2 == 0/*8 bit*/) {
5768 if (sxTo64) {
5769 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea))));
5770 DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf);
5771 } else {
5772 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea))));
5773 DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5775 return True;
5777 /* else it's an invalid combination */
5779 after_LDRS_integer_register:
5781 /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
5782 /* This is the Unsigned offset variant only. The Post-Index and
5783 Pre-Index variants are below.
5785 31 29 23 21 9 4
5786 00 111 101 01 imm12 n t LDR Bt, [Xn|SP + imm12 * 1]
5787 01 111 101 01 imm12 n t LDR Ht, [Xn|SP + imm12 * 2]
5788 10 111 101 01 imm12 n t LDR St, [Xn|SP + imm12 * 4]
5789 11 111 101 01 imm12 n t LDR Dt, [Xn|SP + imm12 * 8]
5790 00 111 101 11 imm12 n t LDR Qt, [Xn|SP + imm12 * 16]
5792 00 111 101 00 imm12 n t STR Bt, [Xn|SP + imm12 * 1]
5793 01 111 101 00 imm12 n t STR Ht, [Xn|SP + imm12 * 2]
5794 10 111 101 00 imm12 n t STR St, [Xn|SP + imm12 * 4]
5795 11 111 101 00 imm12 n t STR Dt, [Xn|SP + imm12 * 8]
5796 00 111 101 10 imm12 n t STR Qt, [Xn|SP + imm12 * 16]
5798 if (INSN(29,24) == BITS6(1,1,1,1,0,1)
5799 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
5800 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5801 Bool isLD = INSN(22,22) == 1;
5802 UInt pimm12 = INSN(21,10) << szLg2;
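/* The 12-bit immediate is scaled by the transfer size, so e.g. a Q-register access (szLg2 == 4) can reach byte offsets up to 4095 * 16. */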
5803 UInt nn = INSN(9,5);
5804 UInt tt = INSN(4,0);
5805 IRTemp tEA = newTemp(Ity_I64);
5806 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5807 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12)));
5808 if (isLD) {
5809 if (szLg2 < 4) {
5810 putQReg128(tt, mkV128(0x0000));
5812 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5813 } else {
5814 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5816 DIP("%s %s, [%s, #%u]\n",
5817 isLD ? "ldr" : "str",
5818 nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12);
5819 return True;
5822 /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
5823 /* These are the Post-Index and Pre-Index variants.
5825 31 29 23 20 11 9 4
5826 (at-Rn-then-Rn=EA)
5827 00 111 100 01 0 imm9 01 n t LDR Bt, [Xn|SP], #simm
5828 01 111 100 01 0 imm9 01 n t LDR Ht, [Xn|SP], #simm
5829 10 111 100 01 0 imm9 01 n t LDR St, [Xn|SP], #simm
5830 11 111 100 01 0 imm9 01 n t LDR Dt, [Xn|SP], #simm
5831 00 111 100 11 0 imm9 01 n t LDR Qt, [Xn|SP], #simm
5833 (at-EA-then-Rn=EA)
5834 00 111 100 01 0 imm9 11 n t LDR Bt, [Xn|SP, #simm]!
5835 01 111 100 01 0 imm9 11 n t LDR Ht, [Xn|SP, #simm]!
5836 10 111 100 01 0 imm9 11 n t LDR St, [Xn|SP, #simm]!
5837 11 111 100 01 0 imm9 11 n t LDR Dt, [Xn|SP, #simm]!
5838 00 111 100 11 0 imm9 11 n t LDR Qt, [Xn|SP, #simm]!
5840 Stores are the same except with bit 22 set to 0.
5842 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5843 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5844 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5845 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5846 Bool isLD = INSN(22,22) == 1;
5847 UInt imm9 = INSN(20,12);
5848 Bool atRN = INSN(11,11) == 0;
5849 UInt nn = INSN(9,5);
5850 UInt tt = INSN(4,0);
5851 IRTemp tRN = newTemp(Ity_I64);
5852 IRTemp tEA = newTemp(Ity_I64);
5853 IRTemp tTA = IRTemp_INVALID;
5854 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5855 ULong simm9 = sx_to_64(imm9, 9);
5856 assign(tRN, getIReg64orSP(nn));
5857 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5858 tTA = atRN ? tRN : tEA;
5860 /* Do early writeback for the cases typified by
5861 str d8, [sp, #-32]!
5862 str d10, [sp, #-128]!
5863 str q1, [sp, #-32]!
5864 for the same reasons as described in a similar comment in the
5865 "LDP,STP (immediate, simm7) (FP&VEC)" case just above.
5867 Bool earlyWBack
5868 = !atRN && !isLD && (ty == Ity_F64 || ty == Ity_V128)
5869 && nn == 31 && ((Long)simm9) < 0;
5871 if (earlyWBack)
5872 putIReg64orSP(nn, mkexpr(tEA));
5874 if (isLD) {
5875 if (szLg2 < 4) {
5876 putQReg128(tt, mkV128(0x0000));
5878 putQRegLO(tt, loadLE(ty, mkexpr(tTA)));
5879 } else {
5880 storeLE(mkexpr(tTA), getQRegLO(tt, ty));
5883 if (!earlyWBack)
5884 putIReg64orSP(nn, mkexpr(tEA));
5886 DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
5887 isLD ? "ldr" : "str",
5888 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5889 return True;
5892 /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
5893 /* 31 29 23 20 11 9 4
5894 00 111 100 01 0 imm9 00 n t LDR Bt, [Xn|SP, #simm]
5895 01 111 100 01 0 imm9 00 n t LDR Ht, [Xn|SP, #simm]
5896 10 111 100 01 0 imm9 00 n t LDR St, [Xn|SP, #simm]
5897 11 111 100 01 0 imm9 00 n t LDR Dt, [Xn|SP, #simm]
5898 00 111 100 11 0 imm9 00 n t LDR Qt, [Xn|SP, #simm]
5900 00 111 100 00 0 imm9 00 n t STR Bt, [Xn|SP, #simm]
5901 01 111 100 00 0 imm9 00 n t STR Ht, [Xn|SP, #simm]
5902 10 111 100 00 0 imm9 00 n t STR St, [Xn|SP, #simm]
5903 11 111 100 00 0 imm9 00 n t STR Dt, [Xn|SP, #simm]
5904 00 111 100 10 0 imm9 00 n t STR Qt, [Xn|SP, #simm]
5906 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5907 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5908 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5909 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5910 Bool isLD = INSN(22,22) == 1;
5911 UInt imm9 = INSN(20,12);
5912 UInt nn = INSN(9,5);
5913 UInt tt = INSN(4,0);
5914 ULong simm9 = sx_to_64(imm9, 9);
5915 IRTemp tEA = newTemp(Ity_I64);
5916 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5917 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
5918 if (isLD) {
5919 if (szLg2 < 4) {
5920 putQReg128(tt, mkV128(0x0000));
5922 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5923 } else {
5924 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5926 DIP("%s %s, [%s, #%lld]\n",
5927 isLD ? "ldur" : "stur",
5928 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5929 return True;
5932 /* ---------------- LDR (literal, SIMD&FP) ---------------- */
5933 /* 31 29 23 4
5934 00 011 100 imm19 t LDR St, [PC + sxTo64(imm19 << 2)]
5935 01 011 100 imm19 t LDR Dt, [PC + sxTo64(imm19 << 2)]
5936 10 011 100 imm19 t LDR Qt, [PC + sxTo64(imm19 << 2)]
5938 if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
5939 UInt szB = 4 << INSN(31,30);
5940 UInt imm19 = INSN(23,5);
5941 UInt tt = INSN(4,0);
5942 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
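/* imm19 is a signed word offset, so the literal must lie within +/- 1MB of this instruction. */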
5943 IRType ty = preferredVectorSubTypeFromSize(szB);
5944 putQReg128(tt, mkV128(0x0000));
5945 putQRegLO(tt, loadLE(ty, mkU64(ea)));
5946 DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
5947 return True;
5950 /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg ------ */
5951 /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */
5952 /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */
5953 /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
5954 /* 31 29 26 22 21 20 15 11 9 4
5956 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
5957 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
5959 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
5960 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
5962 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
5963 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
5965 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
5966 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
5968 T = defined by Q and sz in the normal way
5969 step = if m == 11111 then transfer-size else Xm
5970 xx = case L of 1 -> LD ; 0 -> ST
5972 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
5973 && INSN(21,21) == 0) {
5974 Bool bitQ = INSN(30,30);
5975 Bool isPX = INSN(23,23) == 1;
5976 Bool isLD = INSN(22,22) == 1;
5977 UInt mm = INSN(20,16);
5978 UInt opc = INSN(15,12);
5979 UInt sz = INSN(11,10);
5980 UInt nn = INSN(9,5);
5981 UInt tt = INSN(4,0);
5982 Bool isQ = bitQ == 1;
5983 Bool is1d = sz == BITS2(1,1) && !isQ;
5984 UInt nRegs = 0;
5985 switch (opc) {
5986 case BITS4(0,0,0,0): nRegs = 4; break;
5987 case BITS4(0,1,0,0): nRegs = 3; break;
5988 case BITS4(1,0,0,0): nRegs = 2; break;
5989 case BITS4(0,1,1,1): nRegs = 1; break;
5990 default: break;
5993 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
5994 If we see it, set nRegs to 0 so as to cause the next conditional
5995 to fail. */
5996 if (!isPX && mm != 0)
5997 nRegs = 0;
5999 if (nRegs == 1 /* .1d is allowed */
6000 || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
6002 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
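/* Total bytes moved; e.g. ld4 {v0.16b .. v3.16b}, [x0] transfers 4 * 16 == 64 bytes. */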
6004 /* Generate the transfer address (TA) and if necessary the
6005 writeback address (WB) */
6006 IRTemp tTA = newTemp(Ity_I64);
6007 assign(tTA, getIReg64orSP(nn));
6008 if (nn == 31) { /* FIXME generate stack alignment check */ }
6009 IRTemp tWB = IRTemp_INVALID;
6010 if (isPX) {
6011 tWB = newTemp(Ity_I64);
6012 assign(tWB, binop(Iop_Add64,
6013 mkexpr(tTA),
6014 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6015 : getIReg64orZR(mm)));
6018 /* -- BEGIN generate the transfers -- */
6020 IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
6021 u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
6022 switch (nRegs) {
6023 case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
6024 case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
6025 case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
6026 case 1: u0 = newTempV128(); i0 = newTempV128(); break;
6027 default: vassert(0);
6030 /* -- Multiple 128 or 64 bit stores -- */
6031 if (!isLD) {
6032 switch (nRegs) {
6033 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
6034 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
6035 case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
6036 case 1: assign(u0, getQReg128((tt+0) % 32)); break;
6037 default: vassert(0);
6039 switch (nRegs) {
6040 case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
6041 (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
6042 break;
6043 case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
6044 (&i0, &i1, &i2, sz, u0, u1, u2);
6045 break;
6046 case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
6047 (&i0, &i1, sz, u0, u1);
6048 break;
6049 case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
6050 (&i0, sz, u0);
6051 break;
6052 default: vassert(0);
6054 # define MAYBE_NARROW_TO_64(_expr) \
6055 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6056 UInt step = isQ ? 16 : 8;
6057 switch (nRegs) {
6058 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
6059 MAYBE_NARROW_TO_64(mkexpr(i3)) );
6060 /* fallthru */
6061 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
6062 MAYBE_NARROW_TO_64(mkexpr(i2)) );
6063 /* fallthru */
6064 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
6065 MAYBE_NARROW_TO_64(mkexpr(i1)) );
6066 /* fallthru */
6067 case 1: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
6068 MAYBE_NARROW_TO_64(mkexpr(i0)) );
6069 break;
6070 default: vassert(0);
6072 # undef MAYBE_NARROW_TO_64
6075 /* -- Multiple 128 or 64 bit loads -- */
6076 else /* isLD */ {
6077 UInt step = isQ ? 16 : 8;
6078 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
6079 # define MAYBE_WIDEN_FROM_64(_expr) \
6080 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6081 switch (nRegs) {
6082 case 4:
6083 assign(i3, MAYBE_WIDEN_FROM_64(
6084 loadLE(loadTy,
6085 binop(Iop_Add64, mkexpr(tTA),
6086 mkU64(3 * step)))));
6087 /* fallthru */
6088 case 3:
6089 assign(i2, MAYBE_WIDEN_FROM_64(
6090 loadLE(loadTy,
6091 binop(Iop_Add64, mkexpr(tTA),
6092 mkU64(2 * step)))));
6093 /* fallthru */
6094 case 2:
6095 assign(i1, MAYBE_WIDEN_FROM_64(
6096 loadLE(loadTy,
6097 binop(Iop_Add64, mkexpr(tTA),
6098 mkU64(1 * step)))));
6099 /* fallthru */
6100 case 1:
6101 assign(i0, MAYBE_WIDEN_FROM_64(
6102 loadLE(loadTy,
6103 binop(Iop_Add64, mkexpr(tTA),
6104 mkU64(0 * step)))));
6105 break;
6106 default:
6107 vassert(0);
6109 # undef MAYBE_WIDEN_FROM_64
6110 switch (nRegs) {
6111 case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
6112 (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
6113 break;
6114 case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
6115 (&u0, &u1, &u2, sz, i0, i1, i2);
6116 break;
6117 case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
6118 (&u0, &u1, sz, i0, i1);
6119 break;
6120 case 1: (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
6121 (&u0, sz, i0);
6122 break;
6123 default: vassert(0);
6125 switch (nRegs) {
6126 case 4: putQReg128( (tt+3) % 32,
6127 math_MAYBE_ZERO_HI64(bitQ, u3));
6128 /* fallthru */
6129 case 3: putQReg128( (tt+2) % 32,
6130 math_MAYBE_ZERO_HI64(bitQ, u2));
6131 /* fallthru */
6132 case 2: putQReg128( (tt+1) % 32,
6133 math_MAYBE_ZERO_HI64(bitQ, u1));
6134 /* fallthru */
6135 case 1: putQReg128( (tt+0) % 32,
6136 math_MAYBE_ZERO_HI64(bitQ, u0));
6137 break;
6138 default: vassert(0);
6142 /* -- END generate the transfers -- */
6144 /* Do the writeback, if necessary */
6145 if (isPX) {
6146 putIReg64orSP(nn, mkexpr(tWB));
6149 HChar pxStr[20];
6150 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6151 if (isPX) {
6152 if (mm == BITS5(1,1,1,1,1))
6153 vex_sprintf(pxStr, ", #%u", xferSzB);
6154 else
6155 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6157 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6158 DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
6159 isLD ? "ld" : "st", nRegs,
6160 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6161 pxStr);
6163 if (nRegs >= 3) {
6164 dres->hint = Dis_HintVerbose;
6166 return True;
6168 /* else fall through */
6171 /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs ------ */
6172 /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs ------ */
6173 /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs ------ */
6174 /* 31 29 26 22 21 20 15 11 9 4
6176 0q 001 1000 L 0 00000 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP]
6177 0q 001 1001 L 0 m 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP], step
6179 0q 001 1000 L 0 00000 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP]
6180 0q 001 1001 L 0 m 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP], step
6182 0q 001 1000 L 0 00000 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP]
6183 0q 001 1001 L 0 m 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP], step
6185 T = defined by Q and sz in the normal way
6186 step = if m == 11111 then transfer-size else Xm
6187 xx = case L of 1 -> LD ; 0 -> ST
6189 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6190 && INSN(21,21) == 0) {
6191 Bool bitQ = INSN(30,30);
6192 Bool isPX = INSN(23,23) == 1;
6193 Bool isLD = INSN(22,22) == 1;
6194 UInt mm = INSN(20,16);
6195 UInt opc = INSN(15,12);
6196 UInt sz = INSN(11,10);
6197 UInt nn = INSN(9,5);
6198 UInt tt = INSN(4,0);
6199 Bool isQ = bitQ == 1;
6200 UInt nRegs = 0;
6201 switch (opc) {
6202 case BITS4(0,0,1,0): nRegs = 4; break;
6203 case BITS4(0,1,1,0): nRegs = 3; break;
6204 case BITS4(1,0,1,0): nRegs = 2; break;
6205 default: break;
6208 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6209 If we see it, set nRegs to 0 so as to cause the next conditional
6210 to fail. */
6211 if (!isPX && mm != 0)
6212 nRegs = 0;
6214 if (nRegs >= 2 && nRegs <= 4) {
6216 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
6218 /* Generate the transfer address (TA) and if necessary the
6219 writeback address (WB) */
6220 IRTemp tTA = newTemp(Ity_I64);
6221 assign(tTA, getIReg64orSP(nn));
6222 if (nn == 31) { /* FIXME generate stack alignment check */ }
6223 IRTemp tWB = IRTemp_INVALID;
6224 if (isPX) {
6225 tWB = newTemp(Ity_I64);
6226 assign(tWB, binop(Iop_Add64,
6227 mkexpr(tTA),
6228 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6229 : getIReg64orZR(mm)));
6232 /* -- BEGIN generate the transfers -- */
6234 IRTemp u0, u1, u2, u3;
6235 u0 = u1 = u2 = u3 = IRTemp_INVALID;
6236 switch (nRegs) {
6237 case 4: u3 = newTempV128(); /* fallthru */
6238 case 3: u2 = newTempV128(); /* fallthru */
6239 case 2: u1 = newTempV128();
6240 u0 = newTempV128(); break;
6241 default: vassert(0);
6244 /* -- Multiple 128 or 64 bit stores -- */
6245 if (!isLD) {
6246 switch (nRegs) {
6247 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
6248 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
6249 case 2: assign(u1, getQReg128((tt+1) % 32));
6250 assign(u0, getQReg128((tt+0) % 32)); break;
6251 default: vassert(0);
6253 # define MAYBE_NARROW_TO_64(_expr) \
6254 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6255 UInt step = isQ ? 16 : 8;
6256 switch (nRegs) {
6257 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
6258 MAYBE_NARROW_TO_64(mkexpr(u3)) );
6259 /* fallthru */
6260 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
6261 MAYBE_NARROW_TO_64(mkexpr(u2)) );
6262 /* fallthru */
6263 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
6264 MAYBE_NARROW_TO_64(mkexpr(u1)) );
6265 storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
6266 MAYBE_NARROW_TO_64(mkexpr(u0)) );
6267 break;
6268 default: vassert(0);
6270 # undef MAYBE_NARROW_TO_64
6273 /* -- Multiple 128 or 64 bit loads -- */
6274 else /* isLD */ {
6275 UInt step = isQ ? 16 : 8;
6276 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
6277 # define MAYBE_WIDEN_FROM_64(_expr) \
6278 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6279 switch (nRegs) {
6280 case 4:
6281 assign(u3, MAYBE_WIDEN_FROM_64(
6282 loadLE(loadTy,
6283 binop(Iop_Add64, mkexpr(tTA),
6284 mkU64(3 * step)))));
6285 /* fallthru */
6286 case 3:
6287 assign(u2, MAYBE_WIDEN_FROM_64(
6288 loadLE(loadTy,
6289 binop(Iop_Add64, mkexpr(tTA),
6290 mkU64(2 * step)))));
6291 /* fallthru */
6292 case 2:
6293 assign(u1, MAYBE_WIDEN_FROM_64(
6294 loadLE(loadTy,
6295 binop(Iop_Add64, mkexpr(tTA),
6296 mkU64(1 * step)))));
6297 assign(u0, MAYBE_WIDEN_FROM_64(
6298 loadLE(loadTy,
6299 binop(Iop_Add64, mkexpr(tTA),
6300 mkU64(0 * step)))));
6301 break;
6302 default:
6303 vassert(0);
6305 # undef MAYBE_WIDEN_FROM_64
6306 switch (nRegs) {
6307 case 4: putQReg128( (tt+3) % 32,
6308 math_MAYBE_ZERO_HI64(bitQ, u3));
6309 /* fallthru */
6310 case 3: putQReg128( (tt+2) % 32,
6311 math_MAYBE_ZERO_HI64(bitQ, u2));
6312 /* fallthru */
6313 case 2: putQReg128( (tt+1) % 32,
6314 math_MAYBE_ZERO_HI64(bitQ, u1));
6315 putQReg128( (tt+0) % 32,
6316 math_MAYBE_ZERO_HI64(bitQ, u0));
6317 break;
6318 default: vassert(0);
6322 /* -- END generate the transfers -- */
6324 /* Do the writeback, if necessary */
6325 if (isPX) {
6326 putIReg64orSP(nn, mkexpr(tWB));
6329 HChar pxStr[20];
6330 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6331 if (isPX) {
6332 if (mm == BITS5(1,1,1,1,1))
6333 vex_sprintf(pxStr, ", #%u", xferSzB);
6334 else
6335 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6337 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6338 DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
6339 isLD ? "ld" : "st",
6340 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6341 pxStr);
6343 return True;
6345 /* else fall through */
6348 /* ---------- LD1R (single structure, replicate) ---------- */
6349 /* ---------- LD2R (single structure, replicate) ---------- */
6350 /* ---------- LD3R (single structure, replicate) ---------- */
6351 /* ---------- LD4R (single structure, replicate) ---------- */
6352 /* 31 29 22 20 15 11 9 4
6353 0q 001 1010 10 00000 110 0 sz n t LD1R {Vt.T}, [Xn|SP]
6354 0q 001 1011 10 m 110 0 sz n t LD1R {Vt.T}, [Xn|SP], step
6356 0q 001 1010 11 00000 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP]
6357 0q 001 1011 11 m 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP], step
6359 0q 001 1010 10 00000 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP]
6360 0q 001 1011 10 m 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP], step
6362 0q 001 1010 11 00000 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP]
6363 0q 001 1011 11 m 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP], step
6365 step = if m == 11111 then transfer-size else Xm
6367 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
6368 && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
6369 && INSN(12,12) == 0) {
6370 UInt bitQ = INSN(30,30);
6371 Bool isPX = INSN(23,23) == 1;
6372 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6373 UInt mm = INSN(20,16);
6374 UInt sz = INSN(11,10);
6375 UInt nn = INSN(9,5);
6376 UInt tt = INSN(4,0);
6378 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6379 if (isPX || mm == 0) {
6381 IRType ty = integerIRTypeOfSize(1 << sz);
6383 UInt laneSzB = 1 << sz;
6384 UInt xferSzB = laneSzB * nRegs;
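/* e.g. ld4r with 64-bit lanes (sz == 3) reads 4 * 8 == 32 bytes and replicates each loaded element across its whole destination register. */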
6386 /* Generate the transfer address (TA) and if necessary the
6387 writeback address (WB) */
6388 IRTemp tTA = newTemp(Ity_I64);
6389 assign(tTA, getIReg64orSP(nn));
6390 if (nn == 31) { /* FIXME generate stack alignment check */ }
6391 IRTemp tWB = IRTemp_INVALID;
6392 if (isPX) {
6393 tWB = newTemp(Ity_I64);
6394 assign(tWB, binop(Iop_Add64,
6395 mkexpr(tTA),
6396 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6397 : getIReg64orZR(mm)));
6400 /* Do the writeback, if necessary */
6401 if (isPX) {
6402 putIReg64orSP(nn, mkexpr(tWB));
6405 IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
6406 e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
6407 switch (nRegs) {
6408 case 4:
6409 e3 = newTemp(ty);
6410 assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6411 mkU64(3 * laneSzB))));
6412 v3 = math_DUP_TO_V128(e3, ty);
6413 putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
6414 /* fallthrough */
6415 case 3:
6416 e2 = newTemp(ty);
6417 assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6418 mkU64(2 * laneSzB))));
6419 v2 = math_DUP_TO_V128(e2, ty);
6420 putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
6421 /* fallthrough */
6422 case 2:
6423 e1 = newTemp(ty);
6424 assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6425 mkU64(1 * laneSzB))));
6426 v1 = math_DUP_TO_V128(e1, ty);
6427 putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
6428 /* fallthrough */
6429 case 1:
6430 e0 = newTemp(ty);
6431 assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6432 mkU64(0 * laneSzB))));
6433 v0 = math_DUP_TO_V128(e0, ty);
6434 putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
6435 break;
6436 default:
6437 vassert(0);
6440 HChar pxStr[20];
6441 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6442 if (isPX) {
6443 if (mm == BITS5(1,1,1,1,1))
6444 vex_sprintf(pxStr, ", #%u", xferSzB);
6445 else
6446 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6448 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6449 DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
6450 nRegs,
6451 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6452 pxStr);
6454 return True;
6456 /* else fall through */
6459 /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
6460 /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
6461 /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
6462 /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
6463 /* 31 29 22 21 20 15 11 9 4
6464 0q 001 1010 L 0 00000 xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP]
6465 0q 001 1011 L 0 m xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP], step
6467 0q 001 1010 L 1 00000 xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP]
6468 0q 001 1011 L 1 m xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP], step
6470 0q 001 1010 L 0 00000 xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP]
6471 0q 001 1011 L 0 m xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP], step
6473 0q 001 1010 L 1 00000 xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP]
6474 0q 001 1011 L 1 m xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP], step
6476 step = if m == 11111 then transfer-size else Xm
6477 op = case L of 1 -> LD ; 0 -> ST
6479 laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
6480 01:b:b:b0 -> 2, bbb
6481 10:b:b:00 -> 4, bb
6482 10:b:0:01 -> 8, b
6484 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
6485 UInt bitQ = INSN(30,30);
6486 Bool isPX = INSN(23,23) == 1;
6487 Bool isLD = INSN(22,22) == 1;
6488 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6489 UInt mm = INSN(20,16);
6490 UInt xx = INSN(15,14);
6491 UInt bitS = INSN(12,12);
6492 UInt sz = INSN(11,10);
6493 UInt nn = INSN(9,5);
6494 UInt tt = INSN(4,0);
6496 Bool valid = True;
6498 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6499 if (!isPX && mm != 0)
6500 valid = False;
6502 UInt laneSzB = 0; /* invalid */
6503 UInt ix = 16; /* invalid */
6505 UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
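/* Pack the fields so the table above can be matched with one switch; e.g. xx==10, q==1, S==0, sz==01 packs to 0x29, an 8-byte lane with index 1. */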
6506 switch (xx_q_S_sz) {
6507 case 0x00: case 0x01: case 0x02: case 0x03:
6508 case 0x04: case 0x05: case 0x06: case 0x07:
6509 case 0x08: case 0x09: case 0x0A: case 0x0B:
6510 case 0x0C: case 0x0D: case 0x0E: case 0x0F:
6511 laneSzB = 1; ix = xx_q_S_sz & 0xF;
6512 break;
6513 case 0x10: case 0x12: case 0x14: case 0x16:
6514 case 0x18: case 0x1A: case 0x1C: case 0x1E:
6515 laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
6516 break;
6517 case 0x20: case 0x24: case 0x28: case 0x2C:
6518 laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
6519 break;
6520 case 0x21: case 0x29:
6521 laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
6522 break;
6523 default:
6524 break;
6527 if (valid && laneSzB != 0) {
6529 IRType ty = integerIRTypeOfSize(laneSzB);
6530 UInt xferSzB = laneSzB * nRegs;
6532 /* Generate the transfer address (TA) and if necessary the
6533 writeback address (WB) */
6534 IRTemp tTA = newTemp(Ity_I64);
6535 assign(tTA, getIReg64orSP(nn));
6536 if (nn == 31) { /* FIXME generate stack alignment check */ }
6537 IRTemp tWB = IRTemp_INVALID;
6538 if (isPX) {
6539 tWB = newTemp(Ity_I64);
6540 assign(tWB, binop(Iop_Add64,
6541 mkexpr(tTA),
6542 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6543 : getIReg64orZR(mm)));
6546 /* Do the writeback, if necessary */
6547 if (isPX) {
6548 putIReg64orSP(nn, mkexpr(tWB));
6551 switch (nRegs) {
6552 case 4: {
6553 IRExpr* addr
6554 = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
6555 if (isLD) {
6556 putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
6557 } else {
6558 storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
6561 /* fallthrough */
6562 case 3: {
6563 IRExpr* addr
6564 = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
6565 if (isLD) {
6566 putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
6567 } else {
6568 storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
6571 /* fallthrough */
6572 case 2: {
6573 IRExpr* addr
6574 = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
6575 if (isLD) {
6576 putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
6577 } else {
6578 storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
6581 /* fallthrough */
6582 case 1: {
6583 IRExpr* addr
6584 = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
6585 if (isLD) {
6586 putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
6587 } else {
6588 storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
6590 break;
6592 default:
6593 vassert(0);
6596 HChar pxStr[20];
6597 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6598 if (isPX) {
6599 if (mm == BITS5(1,1,1,1,1))
6600 vex_sprintf(pxStr, ", #%u", xferSzB);
6601 else
6602 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6604 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6605 DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
6606 isLD ? "ld" : "st", nRegs,
6607 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
6608 ix, nameIReg64orSP(nn), pxStr);
6610 return True;
6612 /* else fall through */
6615 /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
6616 /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
6617 /* 31 29 23 20 14 9 4
6618 sz 001000 010 11111 0 11111 n t LDX{R,RH,RB} Rt, [Xn|SP]
6619 sz 001000 010 11111 1 11111 n t LDAX{R,RH,RB} Rt, [Xn|SP]
6620 sz 001000 000 s 0 11111 n t STX{R,RH,RB} Ws, Rt, [Xn|SP]
6621 sz 001000 000 s 1 11111 n t STLX{R,RH,RB} Ws, Rt, [Xn|SP]
6623 /* For the "standard" implementation we pass through the LL and SC to
6624 the host. For the "fallback" implementation, for details see
6625 https://bugs.kde.org/show_bug.cgi?id=344524 and
6626 https://bugs.kde.org/show_bug.cgi?id=369459,
6627 but in short:
6629 LoadLinked(addr)
6630 gs.LLsize = load_size // 1, 2, 4 or 8
6631 gs.LLaddr = addr
6632 gs.LLdata = zeroExtend(*addr)
6634 StoreCond(addr, data)
6635 tmp_LLsize = gs.LLsize
6636 gs.LLsize = 0 // "no transaction"
6637 if tmp_LLsize != store_size -> fail
6638 if addr != gs.LLaddr -> fail
6639 if zeroExtend(*addr) != gs.LLdata -> fail
6640 cas_ok = CAS(store_size, addr, gs.LLdata -> data)
6641 if !cas_ok -> fail
6642 succeed
6644 When thread scheduled
6645 gs.LLsize = 0 // "no transaction"
6646 (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
6647 has to do this bit)
6649 if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
6650 && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
6651 && INSN(14,10) == BITS5(1,1,1,1,1)) {
6652 UInt szBlg2 = INSN(31,30);
6653 Bool isLD = INSN(22,22) == 1;
6654 Bool isAcqOrRel = INSN(15,15) == 1;
6655 UInt ss = INSN(20,16);
6656 UInt nn = INSN(9,5);
6657 UInt tt = INSN(4,0);
6659 vassert(szBlg2 < 4);
6660 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6661 IRType ty = integerIRTypeOfSize(szB);
6662 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6664 IRTemp ea = newTemp(Ity_I64);
6665 assign(ea, getIReg64orSP(nn));
6666 /* FIXME generate check that ea is szB-aligned */
6668 if (isLD && ss == BITS5(1,1,1,1,1)) {
6669 IRTemp res = newTemp(ty);
6670 if (abiinfo->guest__use_fallback_LLSC) {
6671 // Do the load first so we don't update any guest state
6672 // if it faults.
6673 IRTemp loaded_data64 = newTemp(Ity_I64);
6674 assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
6675 stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
6676 stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6677 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
6678 putIReg64orZR(tt, mkexpr(loaded_data64));
6679 } else {
6680 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
6681 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6683 if (isAcqOrRel) {
6684 stmt(IRStmt_MBE(Imbe_Fence));
6686 DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6687 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6688 abiinfo->guest__use_fallback_LLSC
6689 ? "(fallback implementation)" : "");
6690 return True;
6692 if (!isLD) {
6693 if (isAcqOrRel) {
6694 stmt(IRStmt_MBE(Imbe_Fence));
6696 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6697 if (abiinfo->guest__use_fallback_LLSC) {
6698 // This is really ugly, since we don't have any way to do
6699 // proper if-then-else. First, set up as if the SC failed,
6700 // and jump forwards if it really has failed.
6702 // Continuation address
6703 IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
6705 // "the SC failed". Any non-zero value means failure.
6706 putIReg64orZR(ss, mkU64(1));
6708 IRTemp tmp_LLsize = newTemp(Ity_I64);
6709 assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
6710 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
6712 // Fail if no or wrong-size transaction
6713 vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
6714 stmt( IRStmt_Exit(
6715 binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
6716 Ijk_Boring, nia, OFFB_PC
6718 // Fail if the address doesn't match the LL address
6719 stmt( IRStmt_Exit(
6720 binop(Iop_CmpNE64, mkexpr(ea),
6721 IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
6722 Ijk_Boring, nia, OFFB_PC
6724 // Fail if the data doesn't match the LL data
6725 IRTemp llsc_data64 = newTemp(Ity_I64);
6726 assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
6727 stmt( IRStmt_Exit(
6728 binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
6729 mkexpr(llsc_data64)),
6730 Ijk_Boring, nia, OFFB_PC
6732 // Try to CAS the new value in.
6733 IRTemp old = newTemp(ty);
6734 IRTemp expd = newTemp(ty);
6735 assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
6736 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6737 Iend_LE, mkexpr(ea),
6738 /*expdHi*/NULL, mkexpr(expd),
6739 /*dataHi*/NULL, data
6740 )));
6741 // Fail if the CAS failed (viz, old != expd)
6742 stmt( IRStmt_Exit(
6743 binop(Iop_CmpNE64,
6744 widenUto64(ty, mkexpr(old)),
6745 widenUto64(ty, mkexpr(expd))),
6746 Ijk_Boring, nia, OFFB_PC
6748 // Otherwise we succeeded (!)
6749 putIReg64orZR(ss, mkU64(0));
6750 } else {
6751 IRTemp res = newTemp(Ity_I1);
6752 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
6753 /* IR semantics: res is 1 if store succeeds, 0 if it fails.
6754 Need to set rS to 1 on failure, 0 on success. */
6755 putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
6756 mkU64(1)));
6758 DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6759 nameIRegOrZR(False, ss),
6760 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6761 abiinfo->guest__use_fallback_LLSC
6762 ? "(fallback implementation)" : "");
6763 return True;
6765 /* else fall through */
6768 /* ------------------ LDA{R,RH,RB} ------------------ */
6769 /* ------------------ STL{R,RH,RB} ------------------ */
6770 /* 31 29 23 20 14 9 4
6771 sz 001000 110 11111 1 11111 n t LDAR<sz> Rt, [Xn|SP]
6772 sz 001000 100 11111 1 11111 n t STLR<sz> Rt, [Xn|SP]
6774 if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
6775 && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
6776 UInt szBlg2 = INSN(31,30);
6777 Bool isLD = INSN(22,22) == 1;
6778 UInt nn = INSN(9,5);
6779 UInt tt = INSN(4,0);
6781 vassert(szBlg2 < 4);
6782 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6783 IRType ty = integerIRTypeOfSize(szB);
6784 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6786 IRTemp ea = newTemp(Ity_I64);
6787 assign(ea, getIReg64orSP(nn));
6788 /* FIXME generate check that ea is szB-aligned */
6790 if (isLD) {
6791 IRTemp res = newTemp(ty);
6792 assign(res, loadLE(ty, mkexpr(ea)));
6793 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6794 stmt(IRStmt_MBE(Imbe_Fence));
6795 DIP("lda%s %s, [%s]\n", suffix[szBlg2],
6796 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6797 } else {
6798 stmt(IRStmt_MBE(Imbe_Fence));
6799 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6800 storeLE(mkexpr(ea), data);
6801 DIP("stl%s %s, [%s]\n", suffix[szBlg2],
6802 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6804 return True;
6807 /* The PRFM cases that follow possibly allow Rt values (the
6808 prefetch operation) which are not allowed by the documentation.
6809 This should be looked into. */
6810 /* ------------------ PRFM (immediate) ------------------ */
6811 /* 31 21 9 4
6812 11 111 00110 imm12 n t PRFM prfop=Rt, [Xn|SP, #pimm]
6814 if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
6815 UInt imm12 = INSN(21,10);
6816 UInt nn = INSN(9,5);
6817 UInt tt = INSN(4,0);
6818 /* Generating any IR here is pointless, except for documentation
6819 purposes, as it will get optimised away later. */
6820 IRTemp ea = newTemp(Ity_I64);
6821 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
6822 DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
6823 return True;
6826 /* ------------------ PRFM (register) ------------------ */
6827 /* 31 29 22 20 15 12 11 9 4
6828 11 1110001 01 Rm opt S 10 Rn Rt PRFM prfop=Rt, [Xn|SP, R<m>{ext/sh}]
6830 if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
6831 && INSN(11,10) == BITS2(1,0)) {
6832 HChar dis_buf[64];
6833 UInt tt = INSN(4,0);
6834 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
6835 if (ea != IRTemp_INVALID) {
6836 /* No actual code to generate. */
6837 DIP("prfm prfop=%u, %s\n", tt, dis_buf);
6838 return True;
6842 /* ------------------ PRFM (unscaled offset) ------------------ */
6843 /* 31 29 22 20 11 9 4
6844 11 1110001 00 imm9 00 Rn Rt PRFM prfop=Rt, [Xn|SP, #simm]
6846 if (INSN(31,21) == BITS11(1,1, 1,1,1,0,0,0,1, 0,0)
6847 && INSN(11,10) == BITS2(0,0)) {
6848 ULong imm9 = INSN(20,12);
6849 UInt nn = INSN(9,5);
6850 UInt tt = INSN(4,0);
6851 ULong offset = sx_to_64(imm9, 9);
6852 IRTemp ea = newTemp(Ity_I64);
6853 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offset)));
6854 /* No actual code to generate. */
6855 DIP("prfum prfop=%u, [%s, #0x%llx]\n", tt, nameIReg64orSP(nn), offset);
6856 return True;
6859 /* ---------------- ARMv8.1-LSE: Atomic Memory Operations ---------------- */
6860 /* 31 29 23 22 21 20 15 11 9 4
6861 sz 111000 A R 1 s 0000 00 n t LDADD{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6862 sz 111000 A R 1 s 0001 00 n t LDCLR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6863 sz 111000 A R 1 s 0010 00 n t LDEOR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6864 sz 111000 A R 1 s 0011 00 n t LDSET{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6865 sz 111000 A R 1 s 0100 00 n t LDSMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6866 sz 111000 A R 1 s 0101 00 n t LDSMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6867 sz 111000 A R 1 s 0110 00 n t LDUMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6868 sz 111000 A R 1 s 0111 00 n t LDUMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6869 sz 111000 A R 1 s 1000 00 n t SWP{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6871 if (INSN(29,24) == BITS6(1,1,1,0,0,0)
6872 && INSN(21,21) == 1
6873 && (INSN(15,12) <= BITS4(1,0,0,0))
6874 && INSN(11,10) == BITS2(0,0)) {
6875 UInt szBlg2 = INSN(31,30);
6876 Bool isAcq = INSN(23,23) == 1;
6877 Bool isRel = INSN(22,22) == 1;
6878 UInt ss = INSN(20,16);
6879 UInt opc = INSN(15,12);
6880 UInt nn = INSN(9,5);
6881 UInt tt = INSN(4,0);
6883 const HChar* nm = NULL;
6884 const HChar* suffix[4] = { "b", "h", "", "" };
6886 vassert(szBlg2 < 4);
6887 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 bytes*/
6888 IRType ty = integerIRTypeOfSize(szB);
6889 Bool is64 = szB == 8;
6890 Bool isSigned = (opc == 4) || (opc == 5) /*smax || smin*/;
6892 // IR used to emulate these atomic memory ops:
6893 // 1) barrier
6894 // 2) load
6895 // 3) widen operands and do arithmetic/logic op
6896 // 4) cas to see if target memory updated
6897 // 5) barrier
6898 // 6) repeat from 1) if cas says target memory not updated
6899 // 7) update register
6901 IRTemp ea = newTemp(Ity_I64);
6902 assign(ea, getIReg64orSP(nn));
6904 // Insert barrier before loading for acquire and acquire-release variants:
6905 // A and AL.
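// (With Rt == XZR these are the ST<op> aliases, which presumably is why the barrier is skipped in that case.)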
6906 if (isAcq && (tt != 31))
6907 stmt(IRStmt_MBE(Imbe_Fence));
6909 // Load LHS from memory, RHS from register.
6910 IRTemp orig = newTemp(ty);
6911 assign(orig, loadLE(ty, mkexpr(ea)));
6912 IRExpr *lhs = mkexpr(orig);
6913 IRExpr *rhs = narrowFrom64(ty, getIReg64orZR(ss));
6914 IRExpr *res = NULL;
6916 lhs = isSigned ? widenSto64(ty, lhs) : widenUto64(ty, lhs);
6917 rhs = isSigned ? widenSto64(ty, rhs) : widenUto64(ty, rhs);
6919 // Perform the operation.
6920 switch (opc) {
6921 case 0:
6922 nm = "ldadd";
6923 res = binop(Iop_Add64, lhs, rhs);
6924 break;
6925 case 1:
6926 nm = "ldclr";
6927 res = binop(Iop_And64, lhs, unop(mkNOT(Ity_I64), rhs));
6928 break;
6929 case 2:
6930 nm = "ldeor";
6931 res = binop(Iop_Xor64, lhs, rhs);
6932 break;
6933 case 3:
6934 nm = "ldset";
6935 res = binop(Iop_Or64, lhs, rhs);
6936 break;
6937 case 4:
6938 nm = "ldsmax";
6939 res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), rhs, lhs);
6940 break;
6941 case 5:
6942 nm = "ldsmin";
6943 res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), lhs, rhs);
6944 break;
6945 case 6:
6946 nm = "ldumax";
6947 res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), rhs, lhs);
6948 break;
6949 case 7:
6950 nm = "ldumin";
6951 res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), lhs, rhs);
6952 break;
6953 case 8:
6954 nm = "swp";
6955 res = rhs;
6956 break;
6957 default:
6958 vassert(0);
6959 break;
6962 // Store the result back if LHS remains unchanged in memory.
6963 IRTemp old = newTemp(ty);
6964 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6965 Iend_LE, mkexpr(ea),
6966 /*expdHi*/NULL, mkexpr(orig),
6967 /*dataHi*/NULL, narrowFrom64(ty, res))) );
6969 // Insert barrier after storing for release and acquire-release variants:
6970 // L and AL.
6971 if (isRel)
6972 stmt(IRStmt_MBE(Imbe_Fence));
6974 // Retry if the CAS failed (i.e. when old != orig).
6975 IRConst* nia = IRConst_U64(guest_PC_curr_instr);
6976 stmt( IRStmt_Exit(
6977 binop(Iop_CasCmpNE64,
6978 widenUto64(ty, mkexpr(old)),
6979 widenUto64(ty, mkexpr(orig))),
6980 Ijk_Boring, nia, OFFB_PC ));
6981 // Otherwise we succeeded.
6982 putIReg64orZR(tt, widenUto64(ty, mkexpr(old)));
6984 DIP("%s%s%s%s %s, %s, [%s]\n", nm, isAcq ? "a" : "", isRel ? "l" : "",
6985 suffix[szBlg2], nameIRegOrZR(is64, ss), nameIRegOrZR(is64, tt),
6986 nameIReg64orSP(nn));
6987 return True;
6990 /* ------------------ ARMv8.1-LSE: Compare-and-Swap ------------------ */
6991 /* 31 29 22 21 20 15 14 9 4
6992 sz 0010001 A 1 s R 11111 n t CAS{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
6994 if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
6995 && INSN(21,21) == 1
6996 && INSN(14,10) == BITS5(1,1,1,1,1)) {
6997 UInt szBlg2 = INSN(31,30);
6998 Bool isAcq = INSN(22,22) == 1;
6999 Bool isRel = INSN(15,15) == 1;
7000 UInt ss = INSN(20,16);
7001 UInt nn = INSN(9,5);
7002 UInt tt = INSN(4,0);
7004 const HChar* suffix[4] = { "b", "h", "", "" };
7006 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
7007 IRType ty = integerIRTypeOfSize(szB);
7008 Bool is64 = szB == 8;
7010 IRExpr *exp = narrowFrom64(ty, getIReg64orZR(ss));
7011 IRExpr *new = narrowFrom64(ty, getIReg64orZR(tt));
7013 if (isAcq)
7014 stmt(IRStmt_MBE(Imbe_Fence));
7016 // Store the result back if LHS remains unchanged in memory.
7017 IRTemp old = newTemp(ty);
7018 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
7019 Iend_LE, getIReg64orSP(nn),
7020 /*expdHi*/NULL, exp,
7021 /*dataHi*/NULL, new)) );
7023 if (isRel)
7024 stmt(IRStmt_MBE(Imbe_Fence));
7026 putIReg64orZR(ss, widenUto64(ty, mkexpr(old)));
7027 DIP("cas%s%s%s %s, %s, [%s]\n",
7028 isAcq ? "a" : "", isRel ? "l" : "", suffix[szBlg2],
7029 nameIRegOrZR(is64, ss), nameIRegOrZR(is64, tt), nameIReg64orSP(nn));
7030 return True;
7033 /* ---------------- ARMv8.1-LSE: Compare-and-Swap Pair --------------- */
7034 /* 31 30 29 22 21 20 15 14 9 4
7035 0 sz 0010000 A 1 s R 11111 n t CASP{,A}{,L} <Rs>, <Rt>, [<Xn|SP>]
7037 if (INSN(31,31) == 0
7038 && INSN(29,23) == BITS7(0,0,1,0,0,0,0)
7039 && INSN(21,21) == 1
7040 && INSN(14,10) == BITS5(1,1,1,1,1)) {
7041 UInt is64 = INSN(30,30);
7042 Bool isAcq = INSN(22,22) == 1;
7043 Bool isRel = INSN(15,15) == 1;
7044 UInt ss = INSN(20,16);
7045 UInt nn = INSN(9,5);
7046 UInt tt = INSN(4,0);
7048 if ((ss & 0x1) || (tt & 0x1)) {
7049 /* undefined; fall through */
7050 } else {
7051 IRExpr *expLo = getIRegOrZR(is64, ss);
7052 IRExpr *expHi = getIRegOrZR(is64, ss + 1);
7053 IRExpr *newLo = getIRegOrZR(is64, tt);
7054 IRExpr *newHi = getIRegOrZR(is64, tt + 1);
7055 IRTemp oldLo = newTemp(is64 ? Ity_I64 : Ity_I32);
7056 IRTemp oldHi = newTemp(is64 ? Ity_I64 : Ity_I32);
7058 if (isAcq)
7059 stmt(IRStmt_MBE(Imbe_Fence));
7061 stmt( IRStmt_CAS(mkIRCAS(oldHi, oldLo,
7062 Iend_LE, getIReg64orSP(nn),
7063 expHi, expLo,
7064 newHi, newLo)) );
7066 if (isRel)
7067 stmt(IRStmt_MBE(Imbe_Fence));
7069 putIRegOrZR(is64, ss, mkexpr(oldLo));
7070 putIRegOrZR(is64, ss+1, mkexpr(oldHi));
7071 DIP("casp%s%s %s, %s, %s, %s, [%s]\n",
7072 isAcq ? "a" : "", isRel ? "l" : "",
7073 nameIRegOrZR(is64, ss), nameIRegOrZR(is64, ss+1),
7074 nameIRegOrZR(is64, tt), nameIRegOrZR(is64, tt+1),
7075 nameIReg64orSP(nn));
7076 return True;
7080 if (sigill_diag) {
7081 vex_printf("ARM64 front end: load_store\n");
7084 return False;
7085 # undef INSN
7089 /*------------------------------------------------------------*/
7090 /*--- Control flow and misc instructions ---*/
7091 /*------------------------------------------------------------*/
7093 static
7094 Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
7095 const VexArchInfo* archinfo,
7096 const VexAbiInfo* abiinfo, Bool sigill_diag)
7098 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
7100 /* ---------------------- B cond ----------------------- */
7101 /* 31 24 4 3
7102 0101010 0 imm19 0 cond */
7103 if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
7104 UInt cond = INSN(3,0);
7105 ULong uimm64 = INSN(23,5) << 2;
7106 Long simm64 = (Long)sx_to_64(uimm64, 21);
7107 vassert(dres->whatNext == Dis_Continue);
7108 vassert(dres->len == 4);
7109 vassert(dres->jk_StopHere == Ijk_INVALID);
7110 stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
7111 Ijk_Boring,
7112 IRConst_U64(guest_PC_curr_instr + simm64),
7113 OFFB_PC) );
7114 putPC(mkU64(guest_PC_curr_instr + 4));
7115 dres->whatNext = Dis_StopHere;
7116 dres->jk_StopHere = Ijk_Boring;
7117 DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
7118 return True;
7121 /* -------------------- B{L} uncond -------------------- */
7122 if (INSN(30,26) == BITS5(0,0,1,0,1)) {
7123 /* 000101 imm26 B (PC + sxTo64(imm26 << 2))
7124 100101 imm26 BL (PC + sxTo64(imm26 << 2))
7126 UInt bLink = INSN(31,31);
7127 ULong uimm64 = INSN(25,0) << 2;
7128 Long simm64 = (Long)sx_to_64(uimm64, 28);
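/* imm26 is a signed word offset, giving a branch range of +/- 128MB. */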
7129 if (bLink) {
7130 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
7132 putPC(mkU64(guest_PC_curr_instr + simm64));
7133 dres->whatNext = Dis_StopHere;
7134 dres->jk_StopHere = Ijk_Call;
7135 DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
7136 guest_PC_curr_instr + simm64);
7137 return True;
7140 /* --------------------- B{L} reg --------------------- */
7141 /* 31 24 22 20 15 9 4
7142 1101011 00 10 11111 000000 nn 00000 RET Rn
7143 1101011 00 01 11111 000000 nn 00000 CALL Rn
7144 1101011 00 00 11111 000000 nn 00000 JMP Rn
7146 if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
7147 && INSN(20,16) == BITS5(1,1,1,1,1)
7148 && INSN(15,10) == BITS6(0,0,0,0,0,0)
7149 && INSN(4,0) == BITS5(0,0,0,0,0)) {
7150 UInt branch_type = INSN(22,21);
7151 UInt nn = INSN(9,5);
7152 if (branch_type == BITS2(1,0) /* RET */) {
7153 putPC(getIReg64orZR(nn));
7154 dres->whatNext = Dis_StopHere;
7155 dres->jk_StopHere = Ijk_Ret;
7156 DIP("ret %s\n", nameIReg64orZR(nn));
7157 return True;
7159 if (branch_type == BITS2(0,1) /* CALL */) {
7160 IRTemp dst = newTemp(Ity_I64);
7161 assign(dst, getIReg64orZR(nn));
7162 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
7163 putPC(mkexpr(dst));
7164 dres->whatNext = Dis_StopHere;
7165 dres->jk_StopHere = Ijk_Call;
7166 DIP("blr %s\n", nameIReg64orZR(nn));
7167 return True;
7169 if (branch_type == BITS2(0,0) /* JMP */) {
7170 putPC(getIReg64orZR(nn));
7171 dres->whatNext = Dis_StopHere;
7172 dres->jk_StopHere = Ijk_Boring;
7173 DIP("jmp %s\n", nameIReg64orZR(nn));
7174 return True;
7178 /* -------------------- CB{N}Z -------------------- */
7179 /* sf 011 010 1 imm19 Rt CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
7180 sf 011 010 0 imm19 Rt CBZ Xt|Wt, (PC + sxTo64(imm19 << 2))
7182 if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
7183 Bool is64 = INSN(31,31) == 1;
7184 Bool bIfZ = INSN(24,24) == 0;
7185 ULong uimm64 = INSN(23,5) << 2;
7186 UInt rT = INSN(4,0);
7187 Long simm64 = (Long)sx_to_64(uimm64, 21);
7188 IRExpr* cond = NULL;
7189 if (is64) {
7190 cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
7191 getIReg64orZR(rT), mkU64(0));
7192 } else {
7193 cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
7194 getIReg32orZR(rT), mkU32(0));
7196 stmt( IRStmt_Exit(cond,
7197 Ijk_Boring,
7198 IRConst_U64(guest_PC_curr_instr + simm64),
7199 OFFB_PC) );
7200 putPC(mkU64(guest_PC_curr_instr + 4));
7201 dres->whatNext = Dis_StopHere;
7202 dres->jk_StopHere = Ijk_Boring;
7203 DIP("cb%sz %s, 0x%llx\n",
7204 bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
7205 guest_PC_curr_instr + simm64);
7206 return True;
7209 /* -------------------- TB{N}Z -------------------- */
7210 /* 31 30 24 23 18 5 4
7211 b5 011 011 1 b40 imm14 t TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
7212 b5 011 011 0 b40 imm14 t TBZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
7214 if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
7215 UInt b5 = INSN(31,31);
7216 Bool bIfZ = INSN(24,24) == 0;
7217 UInt b40 = INSN(23,19);
7218 UInt imm14 = INSN(18,5);
7219 UInt tt = INSN(4,0);
7220 UInt bitNo = (b5 << 5) | b40;
7221 ULong uimm64 = imm14 << 2;
7222 Long simm64 = sx_to_64(uimm64, 16);
7223 IRExpr* cond
7224 = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
7225 binop(Iop_And64,
7226 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
7227 mkU64(1)),
7228 mkU64(0));
7229 stmt( IRStmt_Exit(cond,
7230 Ijk_Boring,
7231 IRConst_U64(guest_PC_curr_instr + simm64),
7232 OFFB_PC) );
7233 putPC(mkU64(guest_PC_curr_instr + 4));
7234 dres->whatNext = Dis_StopHere;
7235 dres->jk_StopHere = Ijk_Boring;
7236 DIP("tb%sz %s, #%u, 0x%llx\n",
7237 bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
7238 guest_PC_curr_instr + simm64);
7239 return True;
7242 /* -------------------- SVC -------------------- */
7243 /* 11010100 000 imm16 000 01
7244 Don't bother with anything except the imm16==0 case.
7246 if (INSN(31,0) == 0xD4000001) {
7247 putPC(mkU64(guest_PC_curr_instr + 4));
7248 dres->whatNext = Dis_StopHere;
7249 dres->jk_StopHere = Ijk_Sys_syscall;
7250 DIP("svc #0\n");
7251 return True;
7254 /* ------------------ M{SR,RS} ------------------ */
7255 /* ---- Cases for TPIDR_EL0 ----
7256 0xD51BD0 010 Rt MSR tpidr_el0, rT
7257 0xD53BD0 010 Rt MRS rT, tpidr_el0
7259 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
7260 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
7261 Bool toSys = INSN(21,21) == 0;
7262 UInt tt = INSN(4,0);
7263 if (toSys) {
7264 stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) );
7265 DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt));
7266 } else {
7267 putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 ));
7268 DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt));
7270 return True;
7272 /* ---- Cases for FPCR ----
7273 0xD51B44 000 Rt MSR fpcr, rT
7274 0xD53B44 000 Rt MRS rT, fpcr
7276 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
7277 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
7278 Bool toSys = INSN(21,21) == 0;
7279 UInt tt = INSN(4,0);
7280 if (toSys) {
7281 stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) );
7282 DIP("msr fpcr, %s\n", nameIReg64orZR(tt));
7283 } else {
7284 putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32));
7285 DIP("mrs %s, fpcr\n", nameIReg64orZR(tt));
7287 return True;
7289 /* ---- Cases for FPSR ----
7290 0xD51B44 001 Rt MSR fpsr, rT
7291 0xD53B44 001 Rt MRS rT, fpsr
7292 The only part of this we model is FPSR.QC. All other bits
7293 are ignored when writing to it and RAZ when reading from it.
7295 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
7296 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
7297 Bool toSys = INSN(21,21) == 0;
7298 UInt tt = INSN(4,0);
7299 if (toSys) {
7300 /* Just deal with FPSR.QC. Make up a V128 value which is
7301 zero if Xt[27] is zero and any other value if Xt[27] is
7302 nonzero. */
7303 IRTemp qc64 = newTemp(Ity_I64);
7304 assign(qc64, binop(Iop_And64,
7305 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(27)),
7306 mkU64(1)));
7307 IRExpr* qcV128 = binop(Iop_64HLtoV128, mkexpr(qc64), mkexpr(qc64));
7308 stmt( IRStmt_Put( OFFB_QCFLAG, qcV128 ) );
7309 DIP("msr fpsr, %s\n", nameIReg64orZR(tt));
7310 } else {
7311 /* Generate a value which is all zeroes except for bit 27,
7312 which must be zero if QCFLAG is all zeroes and one otherwise. */
7313 IRTemp qcV128 = newTempV128();
7314 assign(qcV128, IRExpr_Get( OFFB_QCFLAG, Ity_V128 ));
7315 IRTemp qc64 = newTemp(Ity_I64);
7316 assign(qc64, binop(Iop_Or64, unop(Iop_V128HIto64, mkexpr(qcV128)),
7317 unop(Iop_V128to64, mkexpr(qcV128))));
7318 IRExpr* res = binop(Iop_Shl64,
7319 unop(Iop_1Uto64,
7320 binop(Iop_CmpNE64, mkexpr(qc64), mkU64(0))),
7321 mkU8(27));
7322 putIReg64orZR(tt, res);
7323 DIP("mrs %s, fpsr\n", nameIReg64orZR(tt));
7325 return True;
7327 /* ---- Cases for NZCV ----
7328 D51B42 000 Rt MSR nzcv, rT
7329 D53B42 000 Rt MRS rT, nzcv
7330 The only parts of NZCV that actually exist are bits 31:28, which
7331 are the N Z C and V bits themselves. Hence the flags thunk provides
7332 all the state we need.
7334 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
7335 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
7336 Bool toSys = INSN(21,21) == 0;
7337 UInt tt = INSN(4,0);
7338 if (toSys) {
7339 IRTemp t = newTemp(Ity_I64);
7340 assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL)));
7341 setFlags_COPY(t);
7342 DIP("msr nzcv, %s\n", nameIReg64orZR(tt));
7343 } else {
7344 IRTemp res = newTemp(Ity_I64);
7345 assign(res, mk_arm64g_calculate_flags_nzcv());
7346 putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res)));
7347 DIP("mrs %s, nzcv\n", nameIReg64orZR(tt));
7349 return True;
7351 /* ---- Cases for DCZID_EL0 ----
7352 Don't support arbitrary reads and writes to this register. Just
7353 return the value 16, which indicates that the DC ZVA instruction
7354 is not permitted, so we don't have to emulate it.
7355 D5 3B 00 111 Rt MRS rT, dczid_el0
7357 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
7358 UInt tt = INSN(4,0);
7359 putIReg64orZR(tt, mkU64(1<<4));
7360 DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt));
7361 return True;
7363 /* ---- Cases for CTR_EL0 ----
7364 We just handle reads, and make up a value from the D and I line
7365 sizes in the VexArchInfo we are given, and patch in the following
7366 fields that the Foundation model gives ("natively"):
7367 CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
7368 D5 3B 00 001 Rt MRS rT, ctr_el0
7370 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
7371 UInt tt = INSN(4,0);
7372 /* Need to generate a value from dMinLine_lg2_szB and
7373 iMinLine_lg2_szB. The value in the register is in 32-bit
7374 units, so need to subtract 2 from the values in the
7375 VexArchInfo. We can assume that the values here are valid --
7376 disInstr_ARM64 checks them -- so there's no need to deal with
7377 out-of-range cases. */
7378 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7379 && archinfo->arm64_dMinLine_lg2_szB <= 17
7380 && archinfo->arm64_iMinLine_lg2_szB >= 2
7381 && archinfo->arm64_iMinLine_lg2_szB <= 17);
7382 UInt val
7383 = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16)
7384 | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0);
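      /* Worked example (illustrative only): with 64-byte D- and I-cache
         lines, both *MinLine_lg2_szB values are 6, so both 4-bit
         line-size fields become 6 - 2 = 4 (the register counts 4-byte
         words), and val == 0x8444c004. */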
7385 putIReg64orZR(tt, mkU64(val));
7386 DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt));
7387 return True;
7389 /* ---- Cases for CNTVCT_EL0 ----
7390 This is the generic timer's virtual count register. Support reads of it only
7391 by passing through to the host.
7392 D5 3B E0 010 Rt MRS Xt, cntvct_el0
7394 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
7395 UInt tt = INSN(4,0);
7396 IRTemp val = newTemp(Ity_I64);
7397 IRExpr** args = mkIRExprVec_0();
7398 IRDirty* d = unsafeIRDirty_1_N (
7399 val,
7400 0/*regparms*/,
7401 "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
7402 &arm64g_dirtyhelper_MRS_CNTVCT_EL0,
7403 args
7405 /* execute the dirty call, dumping the result in val. */
7406 stmt( IRStmt_Dirty(d) );
7407 putIReg64orZR(tt, mkexpr(val));
7408 DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt));
7409 return True;
7411 /* ---- Cases for CNTFRQ_EL0 ----
7412 This is always RO at EL0, so it's safe to pass through to the host.
7413 D5 3B E0 000 Rt MRS Xt, cntfrq_el0
7415 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE000) {
7416 UInt tt = INSN(4,0);
7417 IRTemp val = newTemp(Ity_I64);
7418 IRExpr** args = mkIRExprVec_0();
7419 IRDirty* d = unsafeIRDirty_1_N (
7420 val,
7421 0/*regparms*/,
7422 "arm64g_dirtyhelper_MRS_CNTFRQ_EL0",
7423 &arm64g_dirtyhelper_MRS_CNTFRQ_EL0,
7424 args
7426 /* execute the dirty call, dumping the result in val. */
7427 stmt( IRStmt_Dirty(d) );
7428 putIReg64orZR(tt, mkexpr(val));
7429 DIP("mrs %s, cntfrq_el0\n", nameIReg64orZR(tt));
7430 return True;
7433 /* ------------------ IC_IVAU ------------------ */
7434 /* D5 0B 75 001 Rt ic ivau, rT
7436 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
7437 /* We will always be provided with a valid iMinLine value. */
7438 vassert(archinfo->arm64_iMinLine_lg2_szB >= 2
7439 && archinfo->arm64_iMinLine_lg2_szB <= 17);
7440 /* Round the requested address, in rT, down to the start of the
7441 containing block. */
7442 UInt tt = INSN(4,0);
7443 ULong lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB;
7444 IRTemp addr = newTemp(Ity_I64);
7445 assign( addr, binop( Iop_And64,
7446 getIReg64orZR(tt),
7447 mkU64(~(lineszB - 1))) );
7448 /* Set the invalidation range, request exit-and-invalidate, with
7449 continuation at the next instruction. */
7450 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7451 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
7452 /* be paranoid ... */
7453 stmt( IRStmt_MBE(Imbe_Fence) );
7454 putPC(mkU64( guest_PC_curr_instr + 4 ));
7455 dres->whatNext = Dis_StopHere;
7456 dres->jk_StopHere = Ijk_InvalICache;
7457 DIP("ic ivau, %s\n", nameIReg64orZR(tt));
7458 return True;
7461 /* ------------------ DC_CVAU ------------------ */
7462 /* D5 0B 7B 001 Rt dc cvau, rT
7463 D5 0B 7E 001 Rt dc civac, rT
7465 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20
7466 || (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7E20) {
7467 /* Exactly the same scheme as for IC IVAU, except we observe the
7468 dMinLine size, and request an Ijk_FlushDCache instead of
7469 Ijk_InvalICache. */
7470 /* We will always be provided with a valid dMinLine value. */
7471 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7472 && archinfo->arm64_dMinLine_lg2_szB <= 17);
7473 /* Round the requested address, in rT, down to the start of the
7474 containing block. */
7475 UInt tt = INSN(4,0);
7476 ULong lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB;
7477 IRTemp addr = newTemp(Ity_I64);
7478 assign( addr, binop( Iop_And64,
7479 getIReg64orZR(tt),
7480 mkU64(~(lineszB - 1))) );
7481 /* Set the flush range, request exit-and-flush, with
7482 continuation at the next instruction. */
7483 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7484 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
7485 /* be paranoid ... */
7486 stmt( IRStmt_MBE(Imbe_Fence) );
7487 putPC(mkU64( guest_PC_curr_instr + 4 ));
7488 dres->whatNext = Dis_StopHere;
7489 dres->jk_StopHere = Ijk_FlushDCache;
7490 DIP("dc cvau, %s\n", nameIReg64orZR(tt));
7491 return True;
7494 /* ------------------ ISB, DMB, DSB ------------------ */
7495 /* 31 21 11 7 6 4
7496 11010 10100 0 00 011 0011 CRm 1 01 11111 DMB opt
7497 11010 10100 0 00 011 0011 CRm 1 00 11111 DSB opt
7498 11010 10100 0 00 011 0011 CRm 1 10 11111 ISB opt
7500 if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
7501 && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
7502 && INSN(7,7) == 1
7503 && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
7504 UInt opc = INSN(6,5);
7505 UInt CRm = INSN(11,8);
7506 vassert(opc <= 2 && CRm <= 15);
7507 stmt(IRStmt_MBE(Imbe_Fence));
7508 const HChar* opNames[3]
7509 = { "dsb", "dmb", "isb" };
7510 const HChar* howNames[16]
7511 = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
7512 "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
7513 DIP("%s %s\n", opNames[opc], howNames[CRm]);
7514 return True;
7517 /* -------------------- NOP -------------------- */
7518 if (INSN(31,0) == 0xD503201F) {
7519 DIP("nop\n");
7520 return True;
7523 /* -------------------- BRK -------------------- */
7524 /* 31 23 20 4
7525 1101 0100 001 imm16 00000 BRK #imm16
7527 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
7528 && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
7529 UInt imm16 = INSN(20,5);
7530 /* Request SIGTRAP and then restart of this insn. */
7531 putPC(mkU64(guest_PC_curr_instr + 0));
7532 dres->whatNext = Dis_StopHere;
7533 dres->jk_StopHere = Ijk_SigTRAP;
7534 DIP("brk #%u\n", imm16);
7535 return True;
7538 /* ------------------- YIELD ------------------- */
7539 /* 31 23 15 7
7540 1101 0101 0000 0011 0010 0000 0011 1111
7542 if (INSN(31,0) == 0xD503203F) {
7543 /* Request yield followed by continuation at the next insn. */
7544 putPC(mkU64(guest_PC_curr_instr + 4));
7545 dres->whatNext = Dis_StopHere;
7546 dres->jk_StopHere = Ijk_Yield;
7547 DIP("yield\n");
7548 return True;
7551 /* -------------------- HINT ------------------- */
7552 /* 31 23 15 11 4 3
7553 1101 0101 0000 0011 0010 imm7 1 1111
7554 Catch otherwise unhandled HINT instructions - any
7555 like YIELD which are explicitly handled should go
7556 above this case.
7558 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,1)
7559 && INSN(23,16) == BITS8(0,0,0,0,0,0,1,1)
7560 && INSN(15,12) == BITS4(0,0,1,0)
7561 && INSN(4,0) == BITS5(1,1,1,1,1)) {
7562 UInt imm7 = INSN(11,5);
7563 DIP("hint #%u\n", imm7);
7564 return True;
7567 /* ------------------- CLREX ------------------ */
7568 /* 31 23 15 11 7
7569 1101 0101 0000 0011 0011 m 0101 1111 CLREX CRm
7570 CRm is apparently ignored.
7572 if ((INSN(31,0) & 0xFFFFF0FF) == 0xD503305F) {
7573 UInt mm = INSN(11,8);
7574 /* AFAICS, this simply cancels a (all?) reservations made by a
7575 (any?) preceding LDREX(es). Arrange to hand it through to
7576 the back end. */
7577 if (abiinfo->guest__use_fallback_LLSC) {
7578 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
7579 } else {
7580 stmt( IRStmt_MBE(Imbe_CancelReservation) );
7582 DIP("clrex #%u\n", mm);
7583 return True;
7586 if (sigill_diag) {
7587 vex_printf("ARM64 front end: branch_etc\n");
7589 return False;
7590 # undef INSN
7594 /*------------------------------------------------------------*/
7595 /*--- SIMD and FP instructions: helper functions ---*/
7596 /*------------------------------------------------------------*/
7598 /* Some constructors for interleave/deinterleave expressions. */
7600 static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7601 // returns a0 b0
7602 return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
7605 static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7606 // returns a1 b1
7607 return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
7610 static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7611 // returns a2 a0 b2 b0
7612 return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
7615 static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7616 // returns a3 a1 b3 b1
7617 return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
7620 static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
7621 // returns a1 b1 a0 b0
7622 return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
7625 static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
7626 // returns a3 b3 a2 b2
7627 return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
7630 static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7631 // returns a6 a4 a2 a0 b6 b4 b2 b0
7632 return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7635 static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7636 // returns a7 a5 a3 a1 b7 b5 b3 b1
7637 return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7640 static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7641 // returns a3 b3 a2 b2 a1 b1 a0 b0
7642 return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
7645 static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7646 // returns a7 b7 a6 b6 a5 b5 a4 b4
7647 return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
7650 static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
7651 IRTemp bFEDCBA9876543210 ) {
7652 // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
7653 return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
7654 mkexpr(bFEDCBA9876543210));
7657 static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
7658 IRTemp bFEDCBA9876543210 ) {
7659 // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
7660 return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
7661 mkexpr(bFEDCBA9876543210));
7664 static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
7665 IRTemp bFEDCBA9876543210 ) {
7666 // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
7667 return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
7668 mkexpr(bFEDCBA9876543210));
7671 static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
7672 IRTemp bFEDCBA9876543210 ) {
7673 // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
7674 return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
7675 mkexpr(bFEDCBA9876543210));
7678 /* Generate N copies of |bit| in the bottom of a ULong. */
7679 static ULong Replicate ( ULong bit, Int N )
7681 vassert(bit <= 1 && N >= 1 && N < 64);
7682 if (bit == 0) {
7683 return 0;
7684 } else {
7685 /* Careful. This won't work for N == 64. */
7686 return (1ULL << N) - 1;
7690 static ULong Replicate32x2 ( ULong bits32 )
7692 vassert(0 == (bits32 & ~0xFFFFFFFFULL));
7693 return (bits32 << 32) | bits32;
7696 static ULong Replicate16x4 ( ULong bits16 )
7698 vassert(0 == (bits16 & ~0xFFFFULL));
7699 return Replicate32x2((bits16 << 16) | bits16);
7702 static ULong Replicate8x8 ( ULong bits8 )
7704 vassert(0 == (bits8 & ~0xFFULL));
7705 return Replicate16x4((bits8 << 8) | bits8);
7708 /* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
7709 |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
7710 is 64. In the former case, the upper 32 bits of the returned value
7711 are guaranteed to be zero. */
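/* Worked example (illustrative, not part of the ARM pseudocode): for
   imm8 == 0x70 we have sign == 0, imm8<6> == 1 and imm8<5:0> == 0x30, so
   VFPExpandImm(0x70, 32) == 0x3F800000 (1.0f) and
   VFPExpandImm(0x70, 64) == 0x3FF0000000000000 (1.0). */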
7712 static ULong VFPExpandImm ( ULong imm8, Int N )
7714 vassert(imm8 <= 0xFF);
7715 vassert(N == 32 || N == 64);
7716 Int E = ((N == 32) ? 8 : 11) - 2; // -2: imm8<5:4> are treated as fraction bits below, unlike the ARM pseudocode.
7717 Int F = N - E - 1;
7718 ULong imm8_6 = (imm8 >> 6) & 1;
7719 /* sign: 1 bit */
7720 /* exp: E bits */
7721 /* frac: F bits */
7722 ULong sign = (imm8 >> 7) & 1;
7723 ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
7724 ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
7725 vassert(sign < (1ULL << 1));
7726 vassert(exp < (1ULL << E));
7727 vassert(frac < (1ULL << F));
7728 vassert(1 + E + F == N);
7729 ULong res = (sign << (E+F)) | (exp << F) | frac;
7730 return res;
7733 /* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
7734 This might fail, as indicated by the returned Bool. Page 2530 of
7735 the manual. */
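/* Worked example (illustrative only): op == 0, cmode == 0b0010,
   imm8 == 0xAB falls into the "cmode >> 1 == 1" case, giving
   imm64 == Replicate32x2(0xAB00) == 0x0000AB000000AB00 and a True
   return; the same cmode with imm8 == 0 fails the testimm8 check
   below and returns False. */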
7736 static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
7737 UInt op, UInt cmode, UInt imm8 )
7739 vassert(op <= 1);
7740 vassert(cmode <= 15);
7741 vassert(imm8 <= 255);
7743 *res = 0; /* will overwrite iff returning True */
7745 ULong imm64 = 0;
7746 Bool testimm8 = False;
7748 switch (cmode >> 1) {
7749 case 0:
7750 testimm8 = False; imm64 = Replicate32x2(imm8); break;
7751 case 1:
7752 testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break;
7753 case 2:
7754 testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break;
7755 case 3:
7756 testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break;
7757 case 4:
7758 testimm8 = False; imm64 = Replicate16x4(imm8); break;
7759 case 5:
7760 testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
7761 case 6:
7762 testimm8 = True;
7763 if ((cmode & 1) == 0)
7764 imm64 = Replicate32x2((imm8 << 8) | 0xFF);
7765 else
7766 imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
7767 break;
7768 case 7:
7769 testimm8 = False;
7770 if ((cmode & 1) == 0 && op == 0)
7771 imm64 = Replicate8x8(imm8);
7772 if ((cmode & 1) == 0 && op == 1) {
7773 imm64 = 0; imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
7774 imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
7775 imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
7776 imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
7777 imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
7778 imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
7779 imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
7780 imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
7782 if ((cmode & 1) == 1 && op == 0) {
7783 ULong imm8_7 = (imm8 >> 7) & 1;
7784 ULong imm8_6 = (imm8 >> 6) & 1;
7785 ULong imm8_50 = imm8 & 63;
7786 ULong imm32 = (imm8_7 << (1 + 5 + 6 + 19))
7787 | ((imm8_6 ^ 1) << (5 + 6 + 19))
7788 | (Replicate(imm8_6, 5) << (6 + 19))
7789 | (imm8_50 << 19);
7790 imm64 = Replicate32x2(imm32);
7792 if ((cmode & 1) == 1 && op == 1) {
7793 // imm64 = imm8<7>:NOT(imm8<6>)
7794 // :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
7795 ULong imm8_7 = (imm8 >> 7) & 1;
7796 ULong imm8_6 = (imm8 >> 6) & 1;
7797 ULong imm8_50 = imm8 & 63;
7798 imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
7799 | (Replicate(imm8_6, 8) << 54)
7800 | (imm8_50 << 48);
7802 break;
7803 default:
7804 vassert(0);
7807 if (testimm8 && imm8 == 0)
7808 return False;
7810 *res = imm64;
7811 return True;
7814 /* Help a bit for decoding laneage for vector operations that can be
7815 of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
7816 and SZ bits, typically for vector floating point. */
7817 static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI, /*OUT*/IRType* tyF,
7818 /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
7819 /*OUT*/const HChar** arrSpec,
7820 Bool bitQ, Bool bitSZ )
7822 vassert(bitQ == True || bitQ == False);
7823 vassert(bitSZ == True || bitSZ == False);
7824 if (bitQ && bitSZ) { // 2x64
7825 if (tyI) *tyI = Ity_I64;
7826 if (tyF) *tyF = Ity_F64;
7827 if (nLanes) *nLanes = 2;
7828 if (zeroUpper) *zeroUpper = False;
7829 if (arrSpec) *arrSpec = "2d";
7830 return True;
7832 if (bitQ && !bitSZ) { // 4x32
7833 if (tyI) *tyI = Ity_I32;
7834 if (tyF) *tyF = Ity_F32;
7835 if (nLanes) *nLanes = 4;
7836 if (zeroUpper) *zeroUpper = False;
7837 if (arrSpec) *arrSpec = "4s";
7838 return True;
7840 if (!bitQ && !bitSZ) { // 2x32
7841 if (tyI) *tyI = Ity_I32;
7842 if (tyF) *tyF = Ity_F32;
7843 if (nLanes) *nLanes = 2;
7844 if (zeroUpper) *zeroUpper = True;
7845 if (arrSpec) *arrSpec = "2s";
7846 return True;
7848 // Else impliedly 1x64, which isn't allowed.
7849 return False;
7852 /* Helper for decoding laneage for shift-style vector operations
7853 that involve an immediate shift amount. */
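/* Worked example (illustrative only): immh == 0b0010, immb == 0b011
   gives immhb == 19; the 0b001x pattern selects 16-bit lanes
   (szBlg2 == 1) and the shift amount is 32 - 19 == 13. */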
7854 static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
7855 UInt immh, UInt immb )
7857 vassert(immh < (1<<4));
7858 vassert(immb < (1<<3));
7859 UInt immhb = (immh << 3) | immb;
7860 if (immh & 8) {
7861 if (shift) *shift = 128 - immhb;
7862 if (szBlg2) *szBlg2 = 3;
7863 return True;
7865 if (immh & 4) {
7866 if (shift) *shift = 64 - immhb;
7867 if (szBlg2) *szBlg2 = 2;
7868 return True;
7870 if (immh & 2) {
7871 if (shift) *shift = 32 - immhb;
7872 if (szBlg2) *szBlg2 = 1;
7873 return True;
7875 if (immh & 1) {
7876 if (shift) *shift = 16 - immhb;
7877 if (szBlg2) *szBlg2 = 0;
7878 return True;
7880 return False;
7883 /* Generate IR to fold all lanes of the V128 value in 'src' as
7884 characterised by the operator 'op', and return the result in the
7885 bottom bits of a V128, with all other bits set to zero. */
7886 static IRTemp math_FOLDV ( IRTemp src, IROp op )
7888 /* The basic idea is to use repeated applications of Iop_CatEven*
7889 and Iop_CatOdd* operators to 'src' so as to clone each lane into
7890 a complete vector. Then fold all those vectors with 'op' and
7891 zero out all but the least significant lane. */
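   /* For instance, for Iop_Add32x4 on lanes [x3 x2 x1 x0] this computes
      (x3+x2) and (x1+x0) in every lane of two intermediate vectors, adds
      those, and then zeroes the upper 96 bits, leaving x3+x2+x1+x0 in
      lane 0 only. */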
7892 switch (op) {
7893 case Iop_Min8Sx16: case Iop_Min8Ux16:
7894 case Iop_Max8Sx16: case Iop_Max8Ux16: case Iop_Add8x16: {
7895 /* NB: temp naming here is misleading -- the naming is for 8
7896 lanes of 16 bit, whereas what is being operated on is 16
7897 lanes of 8 bits. */
7898 IRTemp x76543210 = src;
7899 IRTemp x76547654 = newTempV128();
7900 IRTemp x32103210 = newTempV128();
7901 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
7902 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
7903 IRTemp x76767676 = newTempV128();
7904 IRTemp x54545454 = newTempV128();
7905 IRTemp x32323232 = newTempV128();
7906 IRTemp x10101010 = newTempV128();
7907 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
7908 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
7909 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
7910 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
7911 IRTemp x77777777 = newTempV128();
7912 IRTemp x66666666 = newTempV128();
7913 IRTemp x55555555 = newTempV128();
7914 IRTemp x44444444 = newTempV128();
7915 IRTemp x33333333 = newTempV128();
7916 IRTemp x22222222 = newTempV128();
7917 IRTemp x11111111 = newTempV128();
7918 IRTemp x00000000 = newTempV128();
7919 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
7920 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
7921 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
7922 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
7923 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
7924 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
7925 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
7926 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
7927 /* Naming not misleading after here. */
7928 IRTemp xAllF = newTempV128();
7929 IRTemp xAllE = newTempV128();
7930 IRTemp xAllD = newTempV128();
7931 IRTemp xAllC = newTempV128();
7932 IRTemp xAllB = newTempV128();
7933 IRTemp xAllA = newTempV128();
7934 IRTemp xAll9 = newTempV128();
7935 IRTemp xAll8 = newTempV128();
7936 IRTemp xAll7 = newTempV128();
7937 IRTemp xAll6 = newTempV128();
7938 IRTemp xAll5 = newTempV128();
7939 IRTemp xAll4 = newTempV128();
7940 IRTemp xAll3 = newTempV128();
7941 IRTemp xAll2 = newTempV128();
7942 IRTemp xAll1 = newTempV128();
7943 IRTemp xAll0 = newTempV128();
7944 assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777));
7945 assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777));
7946 assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666));
7947 assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666));
7948 assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555));
7949 assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555));
7950 assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444));
7951 assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444));
7952 assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333));
7953 assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333));
7954 assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222));
7955 assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222));
7956 assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111));
7957 assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111));
7958 assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000));
7959 assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000));
7960 IRTemp maxFE = newTempV128();
7961 IRTemp maxDC = newTempV128();
7962 IRTemp maxBA = newTempV128();
7963 IRTemp max98 = newTempV128();
7964 IRTemp max76 = newTempV128();
7965 IRTemp max54 = newTempV128();
7966 IRTemp max32 = newTempV128();
7967 IRTemp max10 = newTempV128();
7968 assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE)));
7969 assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC)));
7970 assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA)));
7971 assign(max98, binop(op, mkexpr(xAll9), mkexpr(xAll8)));
7972 assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6)));
7973 assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4)));
7974 assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2)));
7975 assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0)));
7976 IRTemp maxFEDC = newTempV128();
7977 IRTemp maxBA98 = newTempV128();
7978 IRTemp max7654 = newTempV128();
7979 IRTemp max3210 = newTempV128();
7980 assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC)));
7981 assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98)));
7982 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
7983 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7984 IRTemp maxFEDCBA98 = newTempV128();
7985 IRTemp max76543210 = newTempV128();
7986 assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98)));
7987 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
7988 IRTemp maxAllLanes = newTempV128();
7989 assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98),
7990 mkexpr(max76543210)));
7991 IRTemp res = newTempV128();
7992 assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes)));
7993 return res;
7995 case Iop_Min16Sx8: case Iop_Min16Ux8:
7996 case Iop_Max16Sx8: case Iop_Max16Ux8: case Iop_Add16x8: {
7997 IRTemp x76543210 = src;
7998 IRTemp x76547654 = newTempV128();
7999 IRTemp x32103210 = newTempV128();
8000 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
8001 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
8002 IRTemp x76767676 = newTempV128();
8003 IRTemp x54545454 = newTempV128();
8004 IRTemp x32323232 = newTempV128();
8005 IRTemp x10101010 = newTempV128();
8006 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
8007 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
8008 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
8009 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
8010 IRTemp x77777777 = newTempV128();
8011 IRTemp x66666666 = newTempV128();
8012 IRTemp x55555555 = newTempV128();
8013 IRTemp x44444444 = newTempV128();
8014 IRTemp x33333333 = newTempV128();
8015 IRTemp x22222222 = newTempV128();
8016 IRTemp x11111111 = newTempV128();
8017 IRTemp x00000000 = newTempV128();
8018 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
8019 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
8020 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
8021 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
8022 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
8023 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
8024 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
8025 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
8026 IRTemp max76 = newTempV128();
8027 IRTemp max54 = newTempV128();
8028 IRTemp max32 = newTempV128();
8029 IRTemp max10 = newTempV128();
8030 assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666)));
8031 assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444)));
8032 assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222)));
8033 assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000)));
8034 IRTemp max7654 = newTempV128();
8035 IRTemp max3210 = newTempV128();
8036 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
8037 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
8038 IRTemp max76543210 = newTempV128();
8039 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
8040 IRTemp res = newTempV128();
8041 assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210)));
8042 return res;
8044 case Iop_Max32Fx4: case Iop_Min32Fx4:
8045 case Iop_Min32Sx4: case Iop_Min32Ux4:
8046 case Iop_Max32Sx4: case Iop_Max32Ux4: case Iop_Add32x4: {
8047 IRTemp x3210 = src;
8048 IRTemp x3232 = newTempV128();
8049 IRTemp x1010 = newTempV128();
8050 assign(x3232, mk_CatOddLanes64x2 (x3210, x3210));
8051 assign(x1010, mk_CatEvenLanes64x2(x3210, x3210));
8052 IRTemp x3333 = newTempV128();
8053 IRTemp x2222 = newTempV128();
8054 IRTemp x1111 = newTempV128();
8055 IRTemp x0000 = newTempV128();
8056 assign(x3333, mk_CatOddLanes32x4 (x3232, x3232));
8057 assign(x2222, mk_CatEvenLanes32x4(x3232, x3232));
8058 assign(x1111, mk_CatOddLanes32x4 (x1010, x1010));
8059 assign(x0000, mk_CatEvenLanes32x4(x1010, x1010));
8060 IRTemp max32 = newTempV128();
8061 IRTemp max10 = newTempV128();
8062 assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222)));
8063 assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000)));
8064 IRTemp max3210 = newTempV128();
8065 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
8066 IRTemp res = newTempV128();
8067 assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
8068 return res;
8070 case Iop_Add64x2: {
8071 IRTemp x10 = src;
8072 IRTemp x00 = newTempV128();
8073 IRTemp x11 = newTempV128();
8074 assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
8075 assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
8076 IRTemp max10 = newTempV128();
8077 assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
8078 IRTemp res = newTempV128();
8079 assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
8080 return res;
8082 default:
8083 vassert(0);
8088 /* Generate IR for TBL and TBX. This deals with the 128 bit case
8089 only. */
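/* Sketch of the semantics implemented below (for orientation only): with
   len == 0 there is a single table register, so a source index byte of 3
   selects byte 3 of tab[0], while any index byte >= 16 is out of range
   and contributes zero to the running result; such lanes are later
   replaced by the corresponding byte of |oor_values| (zero for TBL, the
   old destination for TBX, as supplied by the caller). */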
8090 static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
8091 IRTemp oor_values )
8093 vassert(len >= 0 && len <= 3);
8095 /* Generate some useful constants as concisely as possible. */
8096 IRTemp half15 = newTemp(Ity_I64);
8097 assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
8098 IRTemp half16 = newTemp(Ity_I64);
8099 assign(half16, mkU64(0x1010101010101010ULL));
8101 /* A zero vector */
8102 IRTemp allZero = newTempV128();
8103 assign(allZero, mkV128(0x0000));
8104 /* A vector containing 15 in each 8-bit lane */
8105 IRTemp all15 = newTempV128();
8106 assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
8107 /* A vector containing 16 in each 8-bit lane */
8108 IRTemp all16 = newTempV128();
8109 assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
8110 /* A vector containing 32 in each 8-bit lane */
8111 IRTemp all32 = newTempV128();
8112 assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
8113 /* A vector containing 48 in each 8-bit lane */
8114 IRTemp all48 = newTempV128();
8115 assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
8116 /* A vector containing 64 in each 8-bit lane */
8117 IRTemp all64 = newTempV128();
8118 assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));
8120 /* Group the 16/32/48/64 vectors so as to be indexable. */
8121 IRTemp allXX[4] = { all16, all32, all48, all64 };
8123 /* Compute the result for each table vector, with zeroes in places
8124 where the index values are out of range, and OR them into the
8125 running vector. */
8126 IRTemp running_result = newTempV128();
8127 assign(running_result, mkV128(0));
8129 UInt tabent;
8130 for (tabent = 0; tabent <= len; tabent++) {
8131 vassert(tabent >= 0 && tabent < 4);
8132 IRTemp bias = newTempV128();
8133 assign(bias,
8134 mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
8135 IRTemp biased_indices = newTempV128();
8136 assign(biased_indices,
8137 binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
8138 IRTemp valid_mask = newTempV128();
8139 assign(valid_mask,
8140 binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
8141 IRTemp safe_biased_indices = newTempV128();
8142 assign(safe_biased_indices,
8143 binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
8144 IRTemp results_or_junk = newTempV128();
8145 assign(results_or_junk,
8146 binop(Iop_Perm8x16, mkexpr(tab[tabent]),
8147 mkexpr(safe_biased_indices)));
8148 IRTemp results_or_zero = newTempV128();
8149 assign(results_or_zero,
8150 binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
8151 /* And OR that into the running result. */
8152 IRTemp tmp = newTempV128();
8153 assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
8154 mkexpr(running_result)));
8155 running_result = tmp;
8158 /* So now running_result holds the overall result where the indices
8159 are in range, and zero in out-of-range lanes. Now we need to
8160 compute an overall validity mask and use this to copy in the
8161 lanes in the oor_values for out of range indices. This is
8162 unnecessary for TBL but will get folded out by iropt, so we lean
8163 on that and generate the same code for TBL and TBX here. */
8164 IRTemp overall_valid_mask = newTempV128();
8165 assign(overall_valid_mask,
8166 binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
8167 IRTemp result = newTempV128();
8168 assign(result,
8169 binop(Iop_OrV128,
8170 mkexpr(running_result),
8171 binop(Iop_AndV128,
8172 mkexpr(oor_values),
8173 unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
8174 return result;
8178 /* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
8179 an op which takes two I64s and produces a V128. That is, a widening
8180 operator. Generate IR which applies |opI64x2toV128| to either the
8181 lower (if |is2| is False) or upper (if |is2| is True) halves of
8182 |argL| and |argR|, and return the value in a new IRTemp.
8184 static
8185 IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
8186 IRExpr* argL, IRExpr* argR )
8188 IRTemp res = newTempV128();
8189 IROp slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
8190 assign(res, binop(opI64x2toV128, unop(slice, argL),
8191 unop(slice, argR)));
8192 return res;
8196 /* Generate signed/unsigned absolute difference vector IR. */
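/* The mask-and-merge below computes |argL - argR| per lane; e.g. for
   signed 8-bit lanes holding 5 and 9 the compare mask is zero, so the
   lane result is (9 - 5) & 0xFF == 4, and with the operands swapped the
   mask is all-ones and the result is again 4. */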
8197 static
8198 IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
8200 vassert(size <= 3);
8201 IRTemp argL = newTempV128();
8202 IRTemp argR = newTempV128();
8203 IRTemp msk = newTempV128();
8204 IRTemp res = newTempV128();
8205 assign(argL, argLE);
8206 assign(argR, argRE);
8207 assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
8208 mkexpr(argL), mkexpr(argR)));
8209 assign(res,
8210 binop(Iop_OrV128,
8211 binop(Iop_AndV128,
8212 binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
8213 mkexpr(msk)),
8214 binop(Iop_AndV128,
8215 binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
8216 unop(Iop_NotV128, mkexpr(msk)))));
8217 return res;
8221 /* Generate IR that takes a V128 and sign- or zero-widens
8222 either the lower or upper set of lanes to twice-as-wide,
8223 resulting in a new V128 value. */
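/* For example (X01, lower half, sign-widening): a lower-half 16-bit lane
   holding 0x8000 is first interleaved with itself to form the 32-bit
   value 0x80008000, and the arithmetic right shift by 16 then yields
   0xFFFF8000; with zero-widening the logical shift gives 0x00008000. */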
8224 static
8225 IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
8226 UInt sizeNarrow, IRExpr* srcE )
8228 IRTemp src = newTempV128();
8229 IRTemp res = newTempV128();
8230 assign(src, srcE);
8231 switch (sizeNarrow) {
8232 case X10:
8233 assign(res,
8234 binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
8235 binop(fromUpperHalf ? Iop_InterleaveHI32x4
8236 : Iop_InterleaveLO32x4,
8237 mkexpr(src),
8238 mkexpr(src)),
8239 mkU8(32)));
8240 break;
8241 case X01:
8242 assign(res,
8243 binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
8244 binop(fromUpperHalf ? Iop_InterleaveHI16x8
8245 : Iop_InterleaveLO16x8,
8246 mkexpr(src),
8247 mkexpr(src)),
8248 mkU8(16)));
8249 break;
8250 case X00:
8251 assign(res,
8252 binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
8253 binop(fromUpperHalf ? Iop_InterleaveHI8x16
8254 : Iop_InterleaveLO8x16,
8255 mkexpr(src),
8256 mkexpr(src)),
8257 mkU8(8)));
8258 break;
8259 default:
8260 vassert(0);
8262 return res;
8266 /* Generate IR that takes a V128 and sign- or zero-widens
8267 either the even or odd lanes to twice-as-wide,
8268 resulting in a new V128 value. */
8269 static
8270 IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
8271 UInt sizeNarrow, IRExpr* srcE )
8273 IRTemp src = newTempV128();
8274 IRTemp res = newTempV128();
8275 IROp opSAR = mkVecSARN(sizeNarrow+1);
8276 IROp opSHR = mkVecSHRN(sizeNarrow+1);
8277 IROp opSHL = mkVecSHLN(sizeNarrow+1);
8278 IROp opSxR = zWiden ? opSHR : opSAR;
8279 UInt amt = 0;
8280 switch (sizeNarrow) {
8281 case X10: amt = 32; break;
8282 case X01: amt = 16; break;
8283 case X00: amt = 8; break;
8284 default: vassert(0);
8286 assign(src, srcE);
8287 if (fromOdd) {
8288 assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
8289 } else {
8290 assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
8291 mkU8(amt)));
8293 return res;
8297 /* Generate IR that takes two V128s and narrows (takes lower half)
8298 of each lane, producing a single V128 value. */
8299 static
8300 IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
8302 IRTemp res = newTempV128();
8303 assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
8304 mkexpr(argHi), mkexpr(argLo)));
8305 return res;
8309 /* Return a temp which holds the vector dup of the lane of width
8310 (1 << size) obtained from src[laneNo]. */
8311 static
8312 IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
8314 vassert(size <= 3);
8315 /* Normalise |laneNo| so it is of the form
8316 x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
8317 This puts the bits we want to inspect at constant offsets
8318 regardless of the value of |size|.
8320 UInt ix = laneNo << size;
8321 vassert(ix <= 15);
8322 IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
8323 switch (size) {
8324 case 0: /* B */
8325 ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
8326 /* fallthrough */
8327 case 1: /* H */
8328 ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
8329 /* fallthrough */
8330 case 2: /* S */
8331 ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
8332 /* fallthrough */
8333 case 3: /* D */
8334 ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
8335 break;
8336 default:
8337 vassert(0);
8339 IRTemp res = newTempV128();
8340 assign(res, src);
8341 Int i;
8342 for (i = 3; i >= 0; i--) {
8343 if (ops[i] == Iop_INVALID)
8344 break;
8345 IRTemp tmp = newTempV128();
8346 assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
8347 res = tmp;
8349 return res;
8353 /* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
8354 selector encoded as shown below. Return a new V128 holding the
8355 selected lane from |srcV| dup'd out to V128, and also return the
8356 lane number, log2 of the lane size in bytes, and width-character via
8357 *laneNo, *laneSzLg2 and *laneCh respectively. It may be that imm5
8358 is an invalid selector, in which case return
8359 IRTemp_INVALID, 0, 0 and '?' respectively.
8361 imm5 = xxxx1 signifies .b[xxxx]
8362 = xxx10 .h[xxx]
8363 = xx100 .s[xx]
8364 = x1000 .d[x]
8365 otherwise invalid
8367 static
8368 IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
8369 /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
8370 IRExpr* srcV, UInt imm5 )
8372 *laneNo = 0;
8373 *laneSzLg2 = 0;
8374 *laneCh = '?';
8376 if (imm5 & 1) {
8377 *laneNo = (imm5 >> 1) & 15;
8378 *laneSzLg2 = 0;
8379 *laneCh = 'b';
8381 else if (imm5 & 2) {
8382 *laneNo = (imm5 >> 2) & 7;
8383 *laneSzLg2 = 1;
8384 *laneCh = 'h';
8386 else if (imm5 & 4) {
8387 *laneNo = (imm5 >> 3) & 3;
8388 *laneSzLg2 = 2;
8389 *laneCh = 's';
8391 else if (imm5 & 8) {
8392 *laneNo = (imm5 >> 4) & 1;
8393 *laneSzLg2 = 3;
8394 *laneCh = 'd';
8396 else {
8397 /* invalid */
8398 return IRTemp_INVALID;
8401 return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
8405 /* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
8406 static
8407 IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
8409 IRType ty = Ity_INVALID;
8410 IRTemp rcS = IRTemp_INVALID;
8411 switch (size) {
8412 case X01:
8413 vassert(imm <= 0xFFFFULL);
8414 ty = Ity_I16;
8415 rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
8416 break;
8417 case X10:
8418 vassert(imm <= 0xFFFFFFFFULL);
8419 ty = Ity_I32;
8420 rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
8421 break;
8422 case X11:
8423 ty = Ity_I64;
8424 rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
8425 default:
8426 vassert(0);
8428 IRTemp rcV = math_DUP_TO_V128(rcS, ty);
8429 return rcV;
8433 /* Let |new64| be a V128 in which only the lower 64 bits are interesting,
8434 and the upper can contain any value -- it is ignored. If |is2| is False,
8435 generate IR to put |new64| in the lower half of vector reg |dd| and zero
8436 the upper half. If |is2| is True, generate IR to put |new64| in the upper
8437 half of vector reg |dd| and leave the lower half unchanged. This
8438 simulates the behaviour of the "foo/foo2" instructions in which the
8439 destination is half the width of sources, for example addhn/addhn2.
8441 static
8442 void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
8444 if (is2) {
8445 /* Keep the lower half of the old contents of Vdd, and OR
8446 |new64| into the upper half. */
8447 IRTemp t_zero_oldLO = newTempV128();
8448 assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
8449 IRTemp t_newHI_zero = newTempV128();
8450 assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
8451 mkV128(0x0000)));
8452 IRTemp res = newTempV128();
8453 assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
8454 mkexpr(t_newHI_zero)));
8455 putQReg128(dd, mkexpr(res));
8456 } else {
8457 /* This is simple. */
8458 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
8463 /* Compute vector SQABS at lane size |size| for |srcE|, returning
8464 the q result in |*qabs| and the normal result in |*nabs|. */
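/* Example of the q/normal split (illustrative only): for an 8-bit lane
   holding 0x80 (-128), the non-saturating result wraps back to 0x80
   while the saturating result is 0x7F; the caller detects saturation by
   comparing the two. */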
8465 static
8466 void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
8467 IRExpr* srcE, UInt size )
8469 IRTemp src, mask, maskn, nsub, qsub;
8470 src = mask = maskn = nsub = qsub = IRTemp_INVALID;
8471 newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
8472 assign(src, srcE);
8473 assign(mask, binop(mkVecCMPGTS(size), mkV128(0x0000), mkexpr(src)));
8474 assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
8475 assign(nsub, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
8476 assign(qsub, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
8477 assign(*nabs, binop(Iop_OrV128,
8478 binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
8479 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
8480 assign(*qabs, binop(Iop_OrV128,
8481 binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
8482 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
8486 /* Compute vector SQNEG at lane size |size| for |srcE|, returning
8487 the q result in |*qneg| and the normal result in |*nneg|. */
8488 static
8489 void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
8490 IRExpr* srcE, UInt size )
8492 IRTemp src = IRTemp_INVALID;
8493 newTempsV128_3(&src, nneg, qneg);
8494 assign(src, srcE);
8495 assign(*nneg, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
8496 assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
8500 /* Zero all except the least significant lane of |srcE|, where |size|
8501 indicates the lane size in the usual way. */
8502 static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
8504 vassert(size < 4);
8505 IRTemp t = newTempV128();
8506 assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
8507 return t;
8511 /* Generate IR to compute vector widening MULL from either the lower
8512 (is2==False) or upper (is2==True) halves of vecN and vecM. The
8513 widening multiplies are unsigned when isU==True and signed when
8514 isU==False. |size| is the narrow lane size indication. Optionally,
8515 the product may be added to or subtracted from vecD, at the wide lane
8516 size. This happens when |mas| is 'a' (add) or 's' (sub). When |mas|
8517 is 'm' (only multiply) then the accumulate part does not happen, and
8518 |vecD| is expected to == IRTemp_INVALID.
8520 Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
8521 are allowed. The result is placed in a new IRTemp, which is
8522 returned in *res. */
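/* For instance, size == 1 with isU == False, mas == 'a' and is2 == False
   computes, per lane, vecD.32 + (vecN.16 *s vecM.16) over the four lower
   16-bit lanes of vecN and vecM -- in effect the SMLAL Vd.4S, Vn.4H,
   Vm.4H pattern. */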
8523 static
8524 void math_MULL_ACC ( /*OUT*/IRTemp* res,
8525 Bool is2, Bool isU, UInt size, HChar mas,
8526 IRTemp vecN, IRTemp vecM, IRTemp vecD )
8528 vassert(res && *res == IRTemp_INVALID);
8529 vassert(size <= 2);
8530 vassert(mas == 'm' || mas == 'a' || mas == 's');
8531 if (mas == 'm') vassert(vecD == IRTemp_INVALID);
8532 IROp mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
8533 IROp accOp = (mas == 'a') ? mkVecADD(size+1)
8534 : (mas == 's' ? mkVecSUB(size+1)
8535 : Iop_INVALID);
8536 IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
8537 mkexpr(vecN), mkexpr(vecM));
8538 *res = newTempV128();
8539 assign(*res, mas == 'm' ? mkexpr(mul)
8540 : binop(accOp, mkexpr(vecD), mkexpr(mul)));
8544 /* Same as math_MULL_ACC, except the multiply is signed widening,
8545 the multiplied value is then doubled, before being added to or
8546 subtracted from the accumulated value. And everything is
8547 saturated. In all cases, saturation residuals are returned
8548 via (sat1q, sat1n), and in the accumulate cases,
8549 via (sat2q, sat2n) too. All results are returned in new temporaries.
8550 In the no-accumulate case, *sat2q and *sat2n are never instantiated,
8551 so the caller can tell this has happened. */
8552 static
8553 void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
8554 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8555 /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
8556 Bool is2, UInt size, HChar mas,
8557 IRTemp vecN, IRTemp vecM, IRTemp vecD )
8559 vassert(size <= 2);
8560 vassert(mas == 'm' || mas == 'a' || mas == 's');
8561 /* Compute
8562 sat1q = vecN.D[is2] *sq vecM.D[is2] *q 2
8563 sat1n = vecN.D[is2] *s vecM.D[is2] * 2
8564 IOW take either the low or high halves of vecN and vecM, signed widen,
8565 multiply, double that, and signedly saturate. Also compute the same
8566 but without saturation.
8568 vassert(sat2q && *sat2q == IRTemp_INVALID);
8569 vassert(sat2n && *sat2n == IRTemp_INVALID);
8570 newTempsV128_3(sat1q, sat1n, res);
8571 IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
8572 mkexpr(vecN), mkexpr(vecM));
8573 IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
8574 mkexpr(vecN), mkexpr(vecM));
8575 assign(*sat1q, mkexpr(tq));
8576 assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
8578 /* If there is no accumulation, the final result is sat1q,
8579 and there's no assignment to sat2q or sat2n. */
8580 if (mas == 'm') {
8581 assign(*res, mkexpr(*sat1q));
8582 return;
8585 /* Compute
8586 sat2q = vecD +sq/-sq sat1q
8587 sat2n = vecD +/- sat1n
8588 result = sat2q
8590 newTempsV128_2(sat2q, sat2n);
8591 assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
8592 mkexpr(vecD), mkexpr(*sat1q)));
8593 assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
8594 mkexpr(vecD), mkexpr(*sat1n)));
8595 assign(*res, mkexpr(*sat2q));
8599 /* Generate IR for widening signed vector multiplies. The operands
8600 have their lane width signedly widened, and they are then multiplied
8601 at the wider width, returning results in two new IRTemps. */
8602 static
8603 void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
8604 UInt sizeNarrow, IRTemp argL, IRTemp argR )
8606 vassert(sizeNarrow <= 2);
8607 newTempsV128_2(resHI, resLO);
8608 IRTemp argLhi = newTemp(Ity_I64);
8609 IRTemp argLlo = newTemp(Ity_I64);
8610 IRTemp argRhi = newTemp(Ity_I64);
8611 IRTemp argRlo = newTemp(Ity_I64);
8612 assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
8613 assign(argLlo, unop(Iop_V128to64, mkexpr(argL)));
8614 assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
8615 assign(argRlo, unop(Iop_V128to64, mkexpr(argR)));
8616 IROp opMulls = mkVecMULLS(sizeNarrow);
8617 assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
8618 assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
8622 /* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
8623 double that, possibly add a rounding constant (R variants), and take
8624 the high half. */
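/* Worked example for 16-bit lanes (illustrative only): SQDMULH of 0x4000
   by 0x4000 doubles the product to 0x20000000 and keeps the high half,
   0x2000; for 0x8000 * 0x8000 the doubled product would be 0x80000000,
   so the saturating result is 0x7FFF while the non-saturating
   computation gives 0x8000 -- that difference is what later sets QC. */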
8625 static
8626 void math_SQDMULH ( /*OUT*/IRTemp* res,
8627 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8628 Bool isR, UInt size, IRTemp vN, IRTemp vM )
8630 vassert(size == X01 || size == X10); /* s or h only */
8632 newTempsV128_3(res, sat1q, sat1n);
8634 IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
8635 math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
8637 IROp addWide = mkVecADD(size+1);
8639 if (isR) {
8640 assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8642 Int rcShift = size == X01 ? 15 : 31;
8643 IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
8644 assign(*sat1n,
8645 binop(mkVecCATODDLANES(size),
8646 binop(addWide,
8647 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8648 mkexpr(roundConst)),
8649 binop(addWide,
8650 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
8651 mkexpr(roundConst))));
8652 } else {
8653 assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8655 assign(*sat1n,
8656 binop(mkVecCATODDLANES(size),
8657 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8658 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
8661 assign(*res, mkexpr(*sat1q));
8664 /* Generate IR for SQRDMLAH and SQRDMLSH: signedly wideningly multiply,
8665 double, add a rounding constant, take the high half and accumulate. */
8666 static
8667 void math_SQRDMLAH ( /*OUT*/IRTemp* res, /*OUT*/IRTemp* res_nosat, Bool isAdd,
8668 UInt size, IRTemp vD, IRTemp vN, IRTemp vM )
8670 vassert(size == X01 || size == X10); /* s or h only */
8672 /* SQRDMLAH = SQADD(A, SQRDMULH(B, C)) */
8674 IRTemp mul, mul_nosat, dummy;
8675 mul = mul_nosat = dummy = IRTemp_INVALID;
8676 math_SQDMULH(&mul, &dummy, &mul_nosat, True/*R*/, size, vN, vM);
8678 IROp op = isAdd ? mkVecADD(size) : mkVecSUB(size);
8679 IROp qop = isAdd ? mkVecQADDS(size) : mkVecQSUBS(size);
8680 newTempsV128_2(res, res_nosat);
8681 assign(*res, binop(qop, mkexpr(vD), mkexpr(mul)));
8682 assign(*res_nosat, binop(op, mkexpr(vD), mkexpr(mul_nosat)));
8686 /* Generate IR for SQSHL, UQSHL, SQSHLU by imm. Put the result in
8687 a new temp in *res, and the Q difference pair in new temps in
8688 *qDiff1 and *qDiff2 respectively. |nm| denotes which of the
8689 three operations it is. */
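/* Example for "uqshl" on 8-bit lanes (illustrative only): shifting 0x40
   left by 2 cannot be represented, so *res saturates to 0xFF and
   *qDiff1 == 0x40 >> 6 == 1 (nonzero, so QC gets set); shifting 0x20 by
   2 gives 0x80 with *qDiff1 == 0, i.e. no saturation. */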
8690 static
8691 void math_QSHL_IMM ( /*OUT*/IRTemp* res,
8692 /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2,
8693 IRTemp src, UInt size, UInt shift, const HChar* nm )
8695 vassert(size <= 3);
8696 UInt laneBits = 8 << size;
8697 vassert(shift < laneBits);
8698 newTempsV128_3(res, qDiff1, qDiff2);
8699 IRTemp z128 = newTempV128();
8700 assign(z128, mkV128(0x0000));
8702 /* UQSHL */
8703 if (vex_streq(nm, "uqshl")) {
8704 IROp qop = mkVecQSHLNSATUU(size);
8705 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8706 if (shift == 0) {
8707 /* No shift means no saturation. */
8708 assign(*qDiff1, mkexpr(z128));
8709 assign(*qDiff2, mkexpr(z128));
8710 } else {
8711 /* Saturation has occurred if any of the shifted-out bits are
8712 nonzero. We get the shifted-out bits by right-shifting the
8713 original value. */
8714 UInt rshift = laneBits - shift;
8715 vassert(rshift >= 1 && rshift < laneBits);
8716 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8717 assign(*qDiff2, mkexpr(z128));
8719 return;
8722 /* SQSHL */
8723 if (vex_streq(nm, "sqshl")) {
8724 IROp qop = mkVecQSHLNSATSS(size);
8725 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8726 if (shift == 0) {
8727 /* No shift means no saturation. */
8728 assign(*qDiff1, mkexpr(z128));
8729 assign(*qDiff2, mkexpr(z128));
8730 } else {
8731 /* Saturation has occurred if any of the shifted-out bits are
8732 different from the top bit of the original value. */
8733 UInt rshift = laneBits - 1 - shift;
8734 vassert(rshift >= 0 && rshift < laneBits-1);
8735 /* qDiff1 is the shifted out bits, and the top bit of the original
8736 value, preceded by zeroes. */
8737 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8738 /* qDiff2 is the top bit of the original value, cloned the
8739 correct number of times. */
8740 assign(*qDiff2, binop(mkVecSHRN(size),
8741 binop(mkVecSARN(size), mkexpr(src),
8742 mkU8(laneBits-1)),
8743 mkU8(rshift)));
8744 /* This also succeeds in comparing the top bit of the original
8745 value to itself, which is a bit stupid, but not wrong. */
8747 return;
8750 /* SQSHLU */
8751 if (vex_streq(nm, "sqshlu")) {
8752 IROp qop = mkVecQSHLNSATSU(size);
8753 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8754 if (shift == 0) {
8755 /* If there's no shift, saturation depends on the top bit
8756 of the source. */
8757 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(laneBits-1)));
8758 assign(*qDiff2, mkexpr(z128));
8759 } else {
8760 /* Saturation has occurred if any of the shifted-out bits are
8761 nonzero. We get the shifted-out bits by right-shifting the
8762 original value. */
8763 UInt rshift = laneBits - shift;
8764 vassert(rshift >= 1 && rshift < laneBits);
8765 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8766 assign(*qDiff2, mkexpr(z128));
8768 return;
8771 vassert(0);
8775 /* Generate IR to do SRHADD and URHADD. */
8776 static
8777 IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
8779 /* Generate this:
8780       (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
8781    */
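   /* Worked example, for illustration only: with 8-bit lanes, A == 5 and
      B == 6 give (5>>1) + (6>>1) + ((1 + 0 + 1)>>1) == 2 + 3 + 1 == 6,
      which equals (5 + 6 + 1) >> 1 without ever needing a 9-bit
      intermediate value. */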
8782 vassert(size <= 3);
8783 IROp opSHR = isU ? mkVecSHRN(size) : mkVecSARN(size);
8784 IROp opADD = mkVecADD(size);
8785 /* The only tricky bit is to generate the correct vector 1 constant. */
8786 const ULong ones64[4]
8787 = { 0x0101010101010101ULL, 0x0001000100010001ULL,
8788 0x0000000100000001ULL, 0x0000000000000001ULL };
8789 IRTemp imm64 = newTemp(Ity_I64);
8790 assign(imm64, mkU64(ones64[size]));
8791 IRTemp vecOne = newTempV128();
8792 assign(vecOne, binop(Iop_64HLtoV128, mkexpr(imm64), mkexpr(imm64)));
8793 IRTemp scaOne = newTemp(Ity_I8);
8794 assign(scaOne, mkU8(1));
8795 IRTemp res = newTempV128();
8796 assign(res,
8797 binop(opADD,
8798 binop(opSHR, mkexpr(aa), mkexpr(scaOne)),
8799 binop(opADD,
8800 binop(opSHR, mkexpr(bb), mkexpr(scaOne)),
8801 binop(opSHR,
8802 binop(opADD,
8803 binop(opADD,
8804 binop(Iop_AndV128, mkexpr(aa),
8805 mkexpr(vecOne)),
8806 binop(Iop_AndV128, mkexpr(bb),
8807                                               mkexpr(vecOne))
8808                                   ),
8809                                   mkexpr(vecOne)
8810                             ),
8811                       mkexpr(scaOne)
8812                 )
8813           )
8814        )
8815      );
8816 return res;
8820 /* QCFLAG tracks the SIMD sticky saturation status. Update the status
8821 thusly: if, after application of |opZHI| to both |qres| and |nres|,
8822 they have the same value, leave QCFLAG unchanged. Otherwise, set it
8823 (implicitly) to 1. |opZHI| may only be one of the Iop_ZeroHIxxofV128
8824 operators, or Iop_INVALID, in which case |qres| and |nres| are used
8825    unmodified.  The presence of |opZHI| means this function can be used to
8826    generate QCFLAG update code for both scalar and vector SIMD operations.
8827 */
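/* For instance, for a saturating op on a scalar 32-bit lane, |opZHI| is
   Iop_ZeroHI96ofV128, so only bits 31:0 of (qres XOR nres) can make the
   sticky flag become nonzero; junk in the upper lanes is ignored. */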
8828 static
8829 void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
8831 IRTemp diff = newTempV128();
8832 IRTemp oldQCFLAG = newTempV128();
8833 IRTemp newQCFLAG = newTempV128();
8834 if (opZHI == Iop_INVALID) {
8835 assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
8836 } else {
8837 vassert(opZHI == Iop_ZeroHI64ofV128
8838 || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
8839 assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
8841 assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
8842 assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
8843 stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
8847 /* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
8848 are used unmodified, hence suitable for QCFLAG updates for whole-vector
8849 operations. */
8850 static
8851 void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
8853 updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
8857 /* Generate IR to rearrange two vector values in a way which is useful
8858 for doing S/D/H add-pair etc operations. There are 5 cases:
8860 2d: [m1 m0] [n1 n0] --> [m1 n1] [m0 n0]
8862 4s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [m3 m1 n3 n1] [m2 m0 n2 n0]
8864 8h: [m7 m6 m5 m4 m3 m2 m1 m0] [n7 n6 n5 n4 n3 n2 n1 n0] -->
8865 [m7 m5 n7 n5 m3 m1 n3 n1] [m6 m4 n6 n4 m2 m0 n2 n0]
8867 2s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [0 0 m1 n1] [0 0 m0 n0]
8869 4h: [m7 m6 m5 m4 m3 m2 m1 m0] [n7 n6 n5 n4 n3 n2 n1 n0] -->
8870              [ 0  0  0  0 m3 m1 n3 n1] [ 0  0  0  0 m2 m0 n2 n0]
8871 */
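/* For example, in the 4s case a lane-wise FP add of the two rearranged
   values, [m3 m1 n3 n1] + [m2 m0 n2 n0] == [m3+m2  m1+m0  n3+n2  n1+n0],
   is exactly the lane layout that FADDP Vd.4s, Vn.4s, Vm.4s requires;
   the floating-point pairwise cases below rely on this. */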
8872 static
8873 void math_REARRANGE_FOR_FLOATING_PAIRWISE (
8874 /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
8875 IRTemp vecM, IRTemp vecN, ARM64VecESize sz, UInt bitQ
8878 vassert(rearrL && *rearrL == IRTemp_INVALID);
8879 vassert(rearrR && *rearrR == IRTemp_INVALID);
8880 *rearrL = newTempV128();
8881 *rearrR = newTempV128();
8883 switch (sz) {
8884 case ARM64VSizeD:
8885 // 2d case
8886 vassert(bitQ == 1);
8887 assign(*rearrL, binop(Iop_InterleaveHI64x2, mkexpr(vecM), mkexpr(vecN)));
8888 assign(*rearrR, binop(Iop_InterleaveLO64x2, mkexpr(vecM), mkexpr(vecN)));
8889 break;
8891 case ARM64VSizeS:
8892 if (bitQ == 1) {
8893 // 4s case
8894 assign(*rearrL, binop(Iop_CatOddLanes32x4, mkexpr(vecM), mkexpr(vecN)));
8895 assign(*rearrR, binop(Iop_CatEvenLanes32x4, mkexpr(vecM), mkexpr(vecN)));
8896 } else {
8897 // 2s case
8898 IRTemp m1n1m0n0 = newTempV128();
8899 IRTemp m0n0m1n1 = newTempV128();
8900 assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
8901 mkexpr(vecM), mkexpr(vecN)));
8902 assign(m0n0m1n1, triop(Iop_SliceV128,
8903 mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8)));
8904 assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0)));
8905 assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1)));
8907 break;
8909 case ARM64VSizeH:
8910 if (bitQ == 1) {
8911 // 8h case
8912 assign(*rearrL, binop(Iop_CatOddLanes16x8, mkexpr(vecM), mkexpr(vecN)));
8913 assign(*rearrR, binop(Iop_CatEvenLanes16x8, mkexpr(vecM), mkexpr(vecN)));
8914 } else {
8915 // 4h case
8916 IRTemp m3m1n3n1 = newTempV128();
8917 IRTemp m2m0n2n0 = newTempV128();
8918 assign(m3m1n3n1, binop(Iop_CatOddLanes16x8, mkexpr(vecM), mkexpr(vecN)));
8919 assign(m2m0n2n0, binop(Iop_CatEvenLanes16x8, mkexpr(vecM), mkexpr(vecN)));
8920 assign(*rearrL, unop(Iop_ZeroHI64ofV128,
8921 binop(Iop_CatEvenLanes32x4, mkexpr(m3m1n3n1),
8922 mkexpr(m3m1n3n1))));
8923 assign(*rearrR, unop(Iop_ZeroHI64ofV128,
8924 binop(Iop_CatEvenLanes32x4, mkexpr(m2m0n2n0),
8925 mkexpr(m2m0n2n0))));
8927 break;
8929 default: vpanic("math_REARRANGE_FOR_FLOATING_PAIRWISE");
8934 /* Returns 2.0 ^ (-n) for n in 1 .. 64 */
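/* This works by recursively halving the exponent, e.g.
      two_to_the_minus(5) == two_to_the_minus(2) * two_to_the_minus(3)
                          == 0.25 * 0.125 == 0.03125 == 2^-5.
   Every such value down to 2^-64 is exactly representable as a Double,
   so the recursion introduces no rounding error. */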
8935 static Double two_to_the_minus ( Int n )
8937 if (n == 1) return 0.5;
8938 vassert(n >= 2 && n <= 64);
8939 Int half = n / 2;
8940 return two_to_the_minus(half) * two_to_the_minus(n - half);
8944 /* Returns 2.0 ^ n for n in 1 .. 64 */
8945 static Double two_to_the_plus ( Int n )
8947 if (n == 1) return 2.0;
8948 vassert(n >= 2 && n <= 64);
8949 Int half = n / 2;
8950 return two_to_the_plus(half) * two_to_the_plus(n - half);
8954 /*------------------------------------------------------------*/
8955 /*--- SIMD and FP instructions ---*/
8956 /*------------------------------------------------------------*/
8958 static
8959 Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
8961 /* 31 29 23 21 20 15 14 10 9 4
8962 0 q 101110 op2 0 m 0 imm4 0 n d
8963       Decode fields: op2
8964    */
8965 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8966 if (INSN(31,31) != 0
8967 || INSN(29,24) != BITS6(1,0,1,1,1,0)
8968 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
8969 return False;
8971 UInt bitQ = INSN(30,30);
8972 UInt op2 = INSN(23,22);
8973 UInt mm = INSN(20,16);
8974 UInt imm4 = INSN(14,11);
8975 UInt nn = INSN(9,5);
8976 UInt dd = INSN(4,0);
8978 if (op2 == BITS2(0,0)) {
8979 /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
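      /* For illustration: ext v0.16b, v1.16b, v2.16b, #3 yields bytes
         3..15 of v1 followed by bytes 0..2 of v2.  With sHi = Vm and
         sLo = Vn as below, Iop_SliceV128(sHi, sLo, 3) selects exactly
         that 16-byte window of the 256-bit value Vm:Vn. */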
8980 IRTemp sHi = newTempV128();
8981 IRTemp sLo = newTempV128();
8982 IRTemp res = newTempV128();
8983 assign(sHi, getQReg128(mm));
8984 assign(sLo, getQReg128(nn));
8985 if (bitQ == 1) {
8986 if (imm4 == 0) {
8987 assign(res, mkexpr(sLo));
8988 } else {
8989 vassert(imm4 >= 1 && imm4 <= 15);
8990 assign(res, triop(Iop_SliceV128,
8991 mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
8993 putQReg128(dd, mkexpr(res));
8994 DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
8995 } else {
8996 if (imm4 >= 8) return False;
8997 if (imm4 == 0) {
8998 assign(res, mkexpr(sLo));
8999 } else {
9000 vassert(imm4 >= 1 && imm4 <= 7);
9001 IRTemp hi64lo64 = newTempV128();
9002 assign(hi64lo64, binop(Iop_InterleaveLO64x2,
9003 mkexpr(sHi), mkexpr(sLo)));
9004 assign(res, triop(Iop_SliceV128,
9005 mkexpr(hi64lo64), mkexpr(hi64lo64), mkU8(imm4)));
9007 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9008 DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
9010 return True;
9013 return False;
9014 # undef INSN
9018 static
9019 Bool dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult* dres, UInt insn)
9021 /* 31 29 23 21 20 15 14 12 11 9 4
9022 0 q 001110 op2 0 m 0 len op 00 n d
9023       Decode fields: op2,len,op
9024    */
9025 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9026 if (INSN(31,31) != 0
9027 || INSN(29,24) != BITS6(0,0,1,1,1,0)
9028 || INSN(21,21) != 0
9029 || INSN(15,15) != 0
9030 || INSN(11,10) != BITS2(0,0)) {
9031 return False;
9033 UInt bitQ = INSN(30,30);
9034 UInt op2 = INSN(23,22);
9035 UInt mm = INSN(20,16);
9036 UInt len = INSN(14,13);
9037 UInt bitOP = INSN(12,12);
9038 UInt nn = INSN(9,5);
9039 UInt dd = INSN(4,0);
9041 if (op2 == X00) {
9042 /* -------- 00,xx,0 TBL, xx register table -------- */
9043 /* -------- 00,xx,1 TBX, xx register table -------- */
9044 /* 31 28 20 15 14 12 9 4
9045 0q0 01110 000 m 0 len 000 n d TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
9046 0q0 01110 000 m 0 len 100 n d TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
9047          where Ta = 16b(q=1) or 8b(q=0)
9048       */
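      /* For illustration: with len == 1 the table is the 32 bytes of
         {Vn, V(n+1)}.  An index byte in Vm of 32 or more is out of range
         and fetches the corresponding byte of |oor_values| below: zero
         for TBL, or the existing Vd byte for TBX. */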
9049 Bool isTBX = bitOP == 1;
9050 /* The out-of-range values to use. */
9051 IRTemp oor_values = newTempV128();
9052 assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
9053 /* src value */
9054 IRTemp src = newTempV128();
9055 assign(src, getQReg128(mm));
9056 /* The table values */
9057 IRTemp tab[4];
9058 UInt i;
9059 for (i = 0; i <= len; i++) {
9060 vassert(i < 4);
9061 tab[i] = newTempV128();
9062 assign(tab[i], getQReg128((nn + i) % 32));
9064 IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
9065 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9066 const HChar* Ta = bitQ ==1 ? "16b" : "8b";
9067 const HChar* nm = isTBX ? "tbx" : "tbl";
9068 DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
9069 nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
9070 return True;
9073 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9074 return False;
9075 # undef INSN
9079 static
9080 Bool dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult* dres, UInt insn)
9082 /* 31 29 23 21 20 15 14 11 9 4
9083 0 q 001110 size 0 m 0 opcode 10 n d
9084       Decode fields: opcode
9085    */
9086 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9087 if (INSN(31,31) != 0
9088 || INSN(29,24) != BITS6(0,0,1,1,1,0)
9089 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(11,10) != BITS2(1,0)) {
9090 return False;
9092 UInt bitQ = INSN(30,30);
9093 UInt size = INSN(23,22);
9094 UInt mm = INSN(20,16);
9095 UInt opcode = INSN(14,12);
9096 UInt nn = INSN(9,5);
9097 UInt dd = INSN(4,0);
9099 if (opcode == BITS3(0,0,1) || opcode == BITS3(1,0,1)) {
9100 /* -------- 001 UZP1 std7_std7_std7 -------- */
9101 /* -------- 101 UZP2 std7_std7_std7 -------- */
9102 if (bitQ == 0 && size == X11) return False; // implied 1d case
9103 Bool isUZP1 = opcode == BITS3(0,0,1);
9104 IROp op = isUZP1 ? mkVecCATEVENLANES(size)
9105 : mkVecCATODDLANES(size);
9106 IRTemp preL = newTempV128();
9107 IRTemp preR = newTempV128();
9108 IRTemp res = newTempV128();
9109 if (bitQ == 0) {
9110 assign(preL, binop(Iop_InterleaveLO64x2, getQReg128(mm),
9111 getQReg128(nn)));
9112 assign(preR, mkexpr(preL));
9113 } else {
9114 assign(preL, getQReg128(mm));
9115 assign(preR, getQReg128(nn));
9117 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
9118 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9119 const HChar* nm = isUZP1 ? "uzp1" : "uzp2";
9120 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9121 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
9122 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
9123 return True;
9126 if (opcode == BITS3(0,1,0) || opcode == BITS3(1,1,0)) {
9127 /* -------- 010 TRN1 std7_std7_std7 -------- */
9128 /* -------- 110 TRN2 std7_std7_std7 -------- */
9129 if (bitQ == 0 && size == X11) return False; // implied 1d case
9130 Bool isTRN1 = opcode == BITS3(0,1,0);
9131 IROp op1 = isTRN1 ? mkVecCATEVENLANES(size)
9132 : mkVecCATODDLANES(size);
9133 IROp op2 = mkVecINTERLEAVEHI(size);
9134 IRTemp srcM = newTempV128();
9135 IRTemp srcN = newTempV128();
9136 IRTemp res = newTempV128();
9137 assign(srcM, getQReg128(mm));
9138 assign(srcN, getQReg128(nn));
9139 assign(res, binop(op2, binop(op1, mkexpr(srcM), mkexpr(srcM)),
9140 binop(op1, mkexpr(srcN), mkexpr(srcN))));
9141 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9142 const HChar* nm = isTRN1 ? "trn1" : "trn2";
9143 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9144 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
9145 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
9146 return True;
9149 if (opcode == BITS3(0,1,1) || opcode == BITS3(1,1,1)) {
9150 /* -------- 011 ZIP1 std7_std7_std7 -------- */
9151 /* -------- 111 ZIP2 std7_std7_std7 -------- */
9152 if (bitQ == 0 && size == X11) return False; // implied 1d case
9153 Bool isZIP1 = opcode == BITS3(0,1,1);
9154 IROp op = isZIP1 ? mkVecINTERLEAVELO(size)
9155 : mkVecINTERLEAVEHI(size);
9156 IRTemp preL = newTempV128();
9157 IRTemp preR = newTempV128();
9158 IRTemp res = newTempV128();
9159 if (bitQ == 0 && !isZIP1) {
9160 IRTemp z128 = newTempV128();
9161 assign(z128, mkV128(0x0000));
9162 // preL = Vm shifted left 32 bits
9163 // preR = Vn shifted left 32 bits
9164 assign(preL, triop(Iop_SliceV128,
9165 getQReg128(mm), mkexpr(z128), mkU8(12)));
9166 assign(preR, triop(Iop_SliceV128,
9167 getQReg128(nn), mkexpr(z128), mkU8(12)));
9169 } else {
9170 assign(preL, getQReg128(mm));
9171 assign(preR, getQReg128(nn));
9173 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
9174 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9175 const HChar* nm = isZIP1 ? "zip1" : "zip2";
9176 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9177 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
9178 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
9179 return True;
9182 return False;
9183 # undef INSN
9187 static
9188 Bool dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult* dres, UInt insn)
9190 /* 31 28 23 21 16 11 9 4
9191 0 q u 01110 size 11000 opcode 10 n d
9192       Decode fields: u,size,opcode
9193    */
9194 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9195 if (INSN(31,31) != 0
9196 || INSN(28,24) != BITS5(0,1,1,1,0)
9197 || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
9198 return False;
9200 UInt bitQ = INSN(30,30);
9201 UInt bitU = INSN(29,29);
9202 UInt size = INSN(23,22);
9203 UInt opcode = INSN(16,12);
9204 UInt nn = INSN(9,5);
9205 UInt dd = INSN(4,0);
9207 if (opcode == BITS5(0,0,0,1,1)) {
9208 /* -------- 0,xx,00011 SADDLV -------- */
9209 /* -------- 1,xx,00011 UADDLV -------- */
9210 /* size is the narrow size */
9211 if (size == X11 || (size == X10 && bitQ == 0)) return False;
9212 Bool isU = bitU == 1;
9213 IRTemp src = newTempV128();
9214 assign(src, getQReg128(nn));
9215 /* The basic plan is to widen the lower half, and if Q = 1,
9216 the upper half too. Add them together (if Q = 1), and in
9217          either case fold with add at twice the lane width.
9218       */
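      /* For illustration: uaddlv h0, v1.16b widens the low 8 bytes and the
         high 8 bytes of v1 to two 8h vectors, adds them, and the fold below
         then sums the eight halfword lanes into lane 0.  The total of
         sixteen bytes is at most 16 * 255 == 4080, so the 16-bit lanes
         cannot overflow. */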
9219 IRExpr* widened
9220 = mkexpr(math_WIDEN_LO_OR_HI_LANES(
9221 isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
9222 if (bitQ == 1) {
9223 widened
9224 = binop(mkVecADD(size+1),
9225 widened,
9226 mkexpr(math_WIDEN_LO_OR_HI_LANES(
9227 isU, True/*fromUpperHalf*/, size, mkexpr(src)))
9230 /* Now fold. */
9231 IRTemp tWi = newTempV128();
9232 assign(tWi, widened);
9233 IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
9234 putQReg128(dd, mkexpr(res));
9235 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9236 const HChar ch = "bhsd"[size];
9237 DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
9238 nameQReg128(dd), ch, nameQReg128(nn), arr);
9239 return True;
9242 UInt ix = 0;
9243 /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
9244 else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
9245 else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
9246 /**/
9247 if (ix != 0) {
9248 /* -------- 0,xx,01010: SMAXV -------- (1) */
9249 /* -------- 1,xx,01010: UMAXV -------- (2) */
9250 /* -------- 0,xx,11010: SMINV -------- (3) */
9251 /* -------- 1,xx,11010: UMINV -------- (4) */
9252 /* -------- 0,xx,11011: ADDV -------- (5) */
9253 vassert(ix >= 1 && ix <= 5);
9254 if (size == X11) return False; // 1d,2d cases not allowed
9255 if (size == X10 && bitQ == 0) return False; // 2s case not allowed
9256 const IROp opMAXS[3]
9257 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
9258 const IROp opMAXU[3]
9259 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
9260 const IROp opMINS[3]
9261 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
9262 const IROp opMINU[3]
9263 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
9264 const IROp opADD[3]
9265 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4 };
9266 vassert(size < 3);
9267 IROp op = Iop_INVALID;
9268 const HChar* nm = NULL;
9269 switch (ix) {
9270 case 1: op = opMAXS[size]; nm = "smaxv"; break;
9271 case 2: op = opMAXU[size]; nm = "umaxv"; break;
9272 case 3: op = opMINS[size]; nm = "sminv"; break;
9273 case 4: op = opMINU[size]; nm = "uminv"; break;
9274 case 5: op = opADD[size]; nm = "addv"; break;
9275 default: vassert(0);
9277 vassert(op != Iop_INVALID && nm != NULL);
9278 IRTemp tN1 = newTempV128();
9279 assign(tN1, getQReg128(nn));
9280 /* If Q == 0, we're just folding lanes in the lower half of
9281 the value. In which case, copy the lower half of the
9282 source into the upper half, so we can then treat it the
9283 same as the full width case. Except for the addition case,
9284 in which we have to zero out the upper half. */
9285 IRTemp tN2 = newTempV128();
9286 assign(tN2, bitQ == 0
9287 ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
9288 : mk_CatEvenLanes64x2(tN1,tN1))
9289 : mkexpr(tN1));
9290 IRTemp res = math_FOLDV(tN2, op);
9291 if (res == IRTemp_INVALID)
9292 return False; /* means math_FOLDV
9293 doesn't handle this case yet */
9294 putQReg128(dd, mkexpr(res));
9295 const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
9296 IRType laneTy = tys[size];
9297 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9298 DIP("%s %s, %s.%s\n", nm,
9299 nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
9300 return True;
9303 if ((size == X00 || size == X10)
9304 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
9305       /* -------- 0,00,01100: FMAXNMV s_4s -------- */
9306       /* -------- 0,10,01100: FMINNMV s_4s -------- */
9307 /* -------- 1,00,01111: FMAXV s_4s -------- */
9308 /* -------- 1,10,01111: FMINV s_4s -------- */
9309 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
9310 if (bitQ == 0) return False; // Only 4s is allowed
9311 Bool isMIN = (size & 2) == 2;
9312 Bool isNM = opcode == BITS5(0,1,1,0,0);
9313 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
9314 IRTemp src = newTempV128();
9315 assign(src, getQReg128(nn));
9316 IRTemp res = math_FOLDV(src, opMXX);
9317 putQReg128(dd, mkexpr(res));
9318       DIP("%s%sv s%u, v%u.4s\n",
9319 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
9320 return True;
9323 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9324 return False;
9325 # undef INSN
9329 static
9330 Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
9332 /* 31 28 20 15 14 10 9 4
9333 0 q op 01110000 imm5 0 imm4 1 n d
9334       Decode fields: q,op,imm4
9335    */
9336 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9337 if (INSN(31,31) != 0
9338 || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
9339 || INSN(15,15) != 0 || INSN(10,10) != 1) {
9340 return False;
9342 UInt bitQ = INSN(30,30);
9343 UInt bitOP = INSN(29,29);
9344 UInt imm5 = INSN(20,16);
9345 UInt imm4 = INSN(14,11);
9346 UInt nn = INSN(9,5);
9347 UInt dd = INSN(4,0);
9349 /* -------- x,0,0000: DUP (element, vector) -------- */
9350 /* 31 28 20 15 9 4
9351       0q0 01110000 imm5 000001 n d   DUP Vd.T, Vn.Ts[index]
9352    */
9353 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
9354 UInt laneNo = 0;
9355 UInt laneSzLg2 = 0;
9356 HChar laneCh = '?';
9357 IRTemp res = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
9358 getQReg128(nn), imm5);
9359 if (res == IRTemp_INVALID)
9360 return False;
9361 if (bitQ == 0 && laneSzLg2 == X11)
9362 return False; /* .1d case */
9363 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9364 const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
9365 DIP("dup %s.%s, %s.%c[%u]\n",
9366 nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
9367 return True;
9370 /* -------- x,0,0001: DUP (general, vector) -------- */
9371 /* 31 28 20 15 9 4
9372 0q0 01110000 imm5 0 0001 1 n d DUP Vd.T, Rn
9373 Q=0 writes 64, Q=1 writes 128
9374 imm5: xxxx1 8B(q=0) or 16b(q=1), R=W
9375 xxx10 4H(q=0) or 8H(q=1), R=W
9376 xx100 2S(q=0) or 4S(q=1), R=W
9377 x1000 Invalid(q=0) or 2D(q=1), R=X
9378 x0000 Invalid(q=0) or Invalid(q=1)
9379       Require op=0, imm4=0001
9380    */
9381 if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
9382 Bool isQ = bitQ == 1;
9383 IRTemp w0 = newTemp(Ity_I64);
9384 const HChar* arT = "??";
9385 IRType laneTy = Ity_INVALID;
9386 if (imm5 & 1) {
9387 arT = isQ ? "16b" : "8b";
9388 laneTy = Ity_I8;
9389 assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
9391 else if (imm5 & 2) {
9392 arT = isQ ? "8h" : "4h";
9393 laneTy = Ity_I16;
9394 assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
9396 else if (imm5 & 4) {
9397 arT = isQ ? "4s" : "2s";
9398 laneTy = Ity_I32;
9399 assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
9401 else if ((imm5 & 8) && isQ) {
9402 arT = "2d";
9403 laneTy = Ity_I64;
9404 assign(w0, getIReg64orZR(nn));
9406 else {
9407 /* invalid; leave laneTy unchanged. */
9409 /* */
9410 if (laneTy != Ity_INVALID) {
9411 IRTemp w1 = math_DUP_TO_64(w0, laneTy);
9412 putQReg128(dd, binop(Iop_64HLtoV128,
9413 isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
9414 DIP("dup %s.%s, %s\n",
9415 nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
9416 return True;
9418 /* invalid */
9419 return False;
9422 /* -------- 1,0,0011: INS (general) -------- */
9423 /* 31 28 20 15 9 4
9424 010 01110000 imm5 000111 n d INS Vd.Ts[ix], Rn
9425 where Ts,ix = case imm5 of xxxx1 -> B, xxxx
9426 xxx10 -> H, xxx
9427 xx100 -> S, xx
9428                                  x1000 -> D, x
9429    */
9430 if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
9431 HChar ts = '?';
9432 UInt laneNo = 16;
9433 IRExpr* src = NULL;
9434 if (imm5 & 1) {
9435 src = unop(Iop_64to8, getIReg64orZR(nn));
9436 laneNo = (imm5 >> 1) & 15;
9437 ts = 'b';
9439 else if (imm5 & 2) {
9440 src = unop(Iop_64to16, getIReg64orZR(nn));
9441 laneNo = (imm5 >> 2) & 7;
9442 ts = 'h';
9444 else if (imm5 & 4) {
9445 src = unop(Iop_64to32, getIReg64orZR(nn));
9446 laneNo = (imm5 >> 3) & 3;
9447 ts = 's';
9449 else if (imm5 & 8) {
9450 src = getIReg64orZR(nn);
9451 laneNo = (imm5 >> 4) & 1;
9452 ts = 'd';
9454 /* */
9455 if (src) {
9456 vassert(laneNo < 16);
9457 putQRegLane(dd, laneNo, src);
9458 DIP("ins %s.%c[%u], %s\n",
9459 nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
9460 return True;
9462 /* invalid */
9463 return False;
9466 /* -------- x,0,0101: SMOV -------- */
9467 /* -------- x,0,0111: UMOV -------- */
9468 /* 31 28 20 15 9 4
9469 0q0 01110 000 imm5 001111 n d UMOV Xd/Wd, Vn.Ts[index]
9470 0q0 01110 000 imm5 001011 n d SMOV Xd/Wd, Vn.Ts[index]
9471 dest is Xd when q==1, Wd when q==0
9472 UMOV:
9473 Ts,index,ops = case q:imm5 of
9474 0:xxxx1 -> B, xxxx, 8Uto64
9475 1:xxxx1 -> invalid
9476 0:xxx10 -> H, xxx, 16Uto64
9477 1:xxx10 -> invalid
9478 0:xx100 -> S, xx, 32Uto64
9479 1:xx100 -> invalid
9480 1:x1000 -> D, x, copy64
9481 other -> invalid
9482 SMOV:
9483 Ts,index,ops = case q:imm5 of
9484 0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
9485 1:xxxx1 -> B, xxxx, 8Sto64
9486 0:xxx10 -> H, xxx, (32Uto64 . 16Sto32)
9487 1:xxx10 -> H, xxx, 16Sto64
9488 0:xx100 -> invalid
9489 1:xx100 -> S, xx, 32Sto64
9490 1:x1000 -> invalid
9491                      other  -> invalid
9492    */
9493 if (bitOP == 0 && (imm4 == BITS4(0,1,0,1) || imm4 == BITS4(0,1,1,1))) {
9494 Bool isU = (imm4 & 2) == 2;
9495 const HChar* arTs = "??";
9496 UInt laneNo = 16; /* invalid */
9497 // Setting 'res' to non-NULL determines valid/invalid
9498 IRExpr* res = NULL;
9499 if (!bitQ && (imm5 & 1)) { // 0:xxxx1
9500 laneNo = (imm5 >> 1) & 15;
9501 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9502 res = isU ? unop(Iop_8Uto64, lane)
9503 : unop(Iop_32Uto64, unop(Iop_8Sto32, lane));
9504 arTs = "b";
9506 else if (bitQ && (imm5 & 1)) { // 1:xxxx1
9507 laneNo = (imm5 >> 1) & 15;
9508 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9509 res = isU ? NULL
9510 : unop(Iop_8Sto64, lane);
9511 arTs = "b";
9513 else if (!bitQ && (imm5 & 2)) { // 0:xxx10
9514 laneNo = (imm5 >> 2) & 7;
9515 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9516 res = isU ? unop(Iop_16Uto64, lane)
9517 : unop(Iop_32Uto64, unop(Iop_16Sto32, lane));
9518 arTs = "h";
9520 else if (bitQ && (imm5 & 2)) { // 1:xxx10
9521 laneNo = (imm5 >> 2) & 7;
9522 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9523 res = isU ? NULL
9524 : unop(Iop_16Sto64, lane);
9525 arTs = "h";
9527 else if (!bitQ && (imm5 & 4)) { // 0:xx100
9528 laneNo = (imm5 >> 3) & 3;
9529 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9530 res = isU ? unop(Iop_32Uto64, lane)
9531 : NULL;
9532 arTs = "s";
9534       else if (bitQ && (imm5 & 4)) { // 1:xx100
9535 laneNo = (imm5 >> 3) & 3;
9536 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9537 res = isU ? NULL
9538 : unop(Iop_32Sto64, lane);
9539 arTs = "s";
9541 else if (bitQ && (imm5 & 8)) { // 1:x1000
9542 laneNo = (imm5 >> 4) & 1;
9543 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64);
9544 res = isU ? lane
9545 : NULL;
9546 arTs = "d";
9548 /* */
9549 if (res) {
9550 vassert(laneNo < 16);
9551 putIReg64orZR(dd, res);
9552 DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's',
9553 nameIRegOrZR(bitQ == 1, dd),
9554 nameQReg128(nn), arTs, laneNo);
9555 return True;
9557 /* invalid */
9558 return False;
9561 /* -------- 1,1,xxxx: INS (element) -------- */
9562 /* 31 28 20 14 9 4
9563 011 01110000 imm5 0 imm4 n d INS Vd.Ts[ix1], Vn.Ts[ix2]
9564 where Ts,ix1,ix2
9565 = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
9566 xxx10 -> H, xxx, imm4[3:1]
9567 xx100 -> S, xx, imm4[3:2]
9568                         x1000 -> D, x,    imm4[3:3]
9569    */
9570 if (bitQ == 1 && bitOP == 1) {
9571 HChar ts = '?';
9572 IRType ity = Ity_INVALID;
9573 UInt ix1 = 16;
9574 UInt ix2 = 16;
9575 if (imm5 & 1) {
9576 ts = 'b';
9577 ity = Ity_I8;
9578 ix1 = (imm5 >> 1) & 15;
9579 ix2 = (imm4 >> 0) & 15;
9581 else if (imm5 & 2) {
9582 ts = 'h';
9583 ity = Ity_I16;
9584 ix1 = (imm5 >> 2) & 7;
9585 ix2 = (imm4 >> 1) & 7;
9587 else if (imm5 & 4) {
9588 ts = 's';
9589 ity = Ity_I32;
9590 ix1 = (imm5 >> 3) & 3;
9591 ix2 = (imm4 >> 2) & 3;
9593 else if (imm5 & 8) {
9594 ts = 'd';
9595 ity = Ity_I64;
9596 ix1 = (imm5 >> 4) & 1;
9597 ix2 = (imm4 >> 3) & 1;
9599 /* */
9600 if (ity != Ity_INVALID) {
9601 vassert(ix1 < 16);
9602 vassert(ix2 < 16);
9603 putQRegLane(dd, ix1, getQRegLane(nn, ix2, ity));
9604 DIP("ins %s.%c[%u], %s.%c[%u]\n",
9605 nameQReg128(dd), ts, ix1, nameQReg128(nn), ts, ix2);
9606 return True;
9608 /* invalid */
9609 return False;
9612 return False;
9613 # undef INSN
9617 static
9618 Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
9620 /* 31 28 18 15 11 9 4
9621 0q op 01111 00000 abc cmode 01 defgh d
9622 Decode fields: q,op,cmode
9623       Bit 11 is really "o2", but it is always zero.
9624    */
9625 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9626 if (INSN(31,31) != 0
9627 || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
9628 || INSN(11,10) != BITS2(0,1)) {
9629 return False;
9631 UInt bitQ = INSN(30,30);
9632 UInt bitOP = INSN(29,29);
9633 UInt cmode = INSN(15,12);
9634 UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
9635 UInt dd = INSN(4,0);
9637 ULong imm64lo = 0;
9638 UInt op_cmode = (bitOP << 4) | cmode;
9639 Bool ok = False;
9640 Bool isORR = False;
9641 Bool isBIC = False;
9642 Bool isMOV = False;
9643 Bool isMVN = False;
9644 Bool isFMOV = False;
9645 switch (op_cmode) {
9646 /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
9647 /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
9648 /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
9649 /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
9650 case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
9651 case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
9652 ok = True; isMOV = True; break;
9654 /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
9655 /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
9656 /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
9657 /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
9658 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
9659 case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
9660 ok = True; isORR = True; break;
9662 /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
9663 /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
9664 case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
9665 ok = True; isMOV = True; break;
9667 /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
9668 /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
9669 case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
9670 ok = True; isORR = True; break;
9672 /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
9673 /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
9674 case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
9675 ok = True; isMOV = True; break;
9677 /* -------- x,0,1110 MOVI 8-bit -------- */
9678 case BITS5(0,1,1,1,0):
9679 ok = True; isMOV = True; break;
9681 /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
9682 case BITS5(0,1,1,1,1): // 0:1111
9683 ok = True; isFMOV = True; break;
9685 /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
9686 /* -------- x,1,0010 MVNI 32-bit shifted imm -------- */
9687 /* -------- x,1,0100 MVNI 32-bit shifted imm -------- */
9688 /* -------- x,1,0110 MVNI 32-bit shifted imm -------- */
9689 case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
9690 case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
9691 ok = True; isMVN = True; break;
9693 /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
9694 /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
9695 /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
9696 /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
9697 case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
9698 case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
9699 ok = True; isBIC = True; break;
9701 /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
9702 /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
9703 case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
9704 ok = True; isMVN = True; break;
9706 /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
9707 /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
9708 case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
9709 ok = True; isBIC = True; break;
9711 /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
9712 /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
9713 case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
9714 ok = True; isMVN = True; break;
9716 /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
9717 /* -------- 1,1,1110 MOVI 64-bit vector -------- */
9718 case BITS5(1,1,1,1,0):
9719 ok = True; isMOV = True; break;
9721 /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
9722 case BITS5(1,1,1,1,1): // 1:1111
9723 ok = bitQ == 1; isFMOV = True; break;
9725 default:
9726 break;
9728 if (ok) {
9729 vassert(1 == (isMOV ? 1 : 0) + (isMVN ? 1 : 0)
9730 + (isORR ? 1 : 0) + (isBIC ? 1 : 0) + (isFMOV ? 1 : 0));
9731 ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, abcdefgh);
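      /* For illustration: op == 0, cmode == 0010 (MOVI, 32-bit shifted
         immediate, LSL #8) with abcdefgh == 0xAB expands to the 64-bit
         replication 0x0000AB000000AB00, which then supplies one or both
         halves of the result below. */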
9733 if (ok) {
9734 if (isORR || isBIC) {
9735 ULong inv
9736 = isORR ? 0ULL : ~0ULL;
9737 IRExpr* immV128
9738 = binop(Iop_64HLtoV128, mkU64(inv ^ imm64lo), mkU64(inv ^ imm64lo));
9739 IRExpr* res
9740 = binop(isORR ? Iop_OrV128 : Iop_AndV128, getQReg128(dd), immV128);
9741 const HChar* nm = isORR ? "orr" : "bic";
9742 if (bitQ == 0) {
9743 putQReg128(dd, unop(Iop_ZeroHI64ofV128, res));
9744 DIP("%s %s.1d, %016llx\n", nm, nameQReg128(dd), imm64lo);
9745 } else {
9746 putQReg128(dd, res);
9747 DIP("%s %s.2d, #0x%016llx'%016llx\n", nm,
9748 nameQReg128(dd), imm64lo, imm64lo);
9751 else if (isMOV || isMVN || isFMOV) {
9752 if (isMVN) imm64lo = ~imm64lo;
9753 ULong imm64hi = bitQ == 0 ? 0 : imm64lo;
9754 IRExpr* immV128 = binop(Iop_64HLtoV128, mkU64(imm64hi),
9755 mkU64(imm64lo));
9756 putQReg128(dd, immV128);
9757 DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi, imm64lo);
9759 return True;
9761 /* else fall through */
9763 return False;
9764 # undef INSN
9768 static
9769 Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
9771 /* 31 28 20 15 14 10 9 4
9772 01 op 11110000 imm5 0 imm4 1 n d
9773       Decode fields: op,imm4
9774    */
9775 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9776 if (INSN(31,30) != BITS2(0,1)
9777 || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
9778 || INSN(15,15) != 0 || INSN(10,10) != 1) {
9779 return False;
9781 UInt bitOP = INSN(29,29);
9782 UInt imm5 = INSN(20,16);
9783 UInt imm4 = INSN(14,11);
9784 UInt nn = INSN(9,5);
9785 UInt dd = INSN(4,0);
9787 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
9788 /* -------- 0,0000 DUP (element, scalar) -------- */
9789 IRTemp w0 = newTemp(Ity_I64);
9790 const HChar* arTs = "??";
9791 IRType laneTy = Ity_INVALID;
9792 UInt laneNo = 16; /* invalid */
9793 if (imm5 & 1) {
9794 arTs = "b";
9795 laneNo = (imm5 >> 1) & 15;
9796 laneTy = Ity_I8;
9797 assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
9799 else if (imm5 & 2) {
9800 arTs = "h";
9801 laneNo = (imm5 >> 2) & 7;
9802 laneTy = Ity_I16;
9803 assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
9805 else if (imm5 & 4) {
9806 arTs = "s";
9807 laneNo = (imm5 >> 3) & 3;
9808 laneTy = Ity_I32;
9809 assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
9811 else if (imm5 & 8) {
9812 arTs = "d";
9813 laneNo = (imm5 >> 4) & 1;
9814 laneTy = Ity_I64;
9815 assign(w0, getQRegLane(nn, laneNo, laneTy));
9817 else {
9818 /* invalid; leave laneTy unchanged. */
9820 /* */
9821 if (laneTy != Ity_INVALID) {
9822 vassert(laneNo < 16);
9823 putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
9824 DIP("dup %s, %s.%s[%u]\n",
9825 nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
9826 return True;
9828 /* else fall through */
9831 return False;
9832 # undef INSN
9836 static
9837 Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn,
9838 const VexArchInfo* archinfo)
9840 /* 31 28 23 21 16 11 9 4
9841 01 u 11110 sz 11000 opcode 10 n d
9842       Decode fields: u,sz,opcode
9843    */
9844 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9845 if (INSN(31,30) != BITS2(0,1)
9846 || INSN(28,24) != BITS5(1,1,1,1,0)
9847 || INSN(21,17) != BITS5(1,1,0,0,0)
9848 || INSN(11,10) != BITS2(1,0)) {
9849 return False;
9851 UInt bitU = INSN(29,29);
9852 UInt sz = INSN(23,22);
9853 UInt opcode = INSN(16,12);
9854 UInt nn = INSN(9,5);
9855 UInt dd = INSN(4,0);
9857 if (bitU == 0 && sz == X11 && opcode == BITS5(1,1,0,1,1)) {
9858 /* -------- 0,11,11011 ADDP d_2d -------- */
9859 IRTemp xy = newTempV128();
9860 IRTemp xx = newTempV128();
9861 assign(xy, getQReg128(nn));
9862 assign(xx, binop(Iop_InterleaveHI64x2, mkexpr(xy), mkexpr(xy)));
9863 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
9864 binop(Iop_Add64x2, mkexpr(xy), mkexpr(xx))));
9865 DIP("addp d%u, %s.2d\n", dd, nameQReg128(nn));
9866 return True;
9869 if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
9870       /* -------- 1,00,01101 FADDP s_2s -------- */
9871       /* -------- 1,01,01101 FADDP d_2d -------- */
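      /* For illustration, in the s_2s case: |src| holds [.. .. s1 s0];
         argL becomes [0 0 0 s0], and Iop_SliceV128(src, src, 4) shifts the
         vector down by 4 bytes so that argR becomes [0 0 0 s1].  The FP
         add then leaves s1 + s0 in lane 0, which is the FADDP result. */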
9872 Bool isD = sz == X01;
9873 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9874 IROp opADD = mkVecADDF(isD ? 3 : 2);
9875 IRTemp src = newTempV128();
9876 IRTemp argL = newTempV128();
9877 IRTemp argR = newTempV128();
9878 assign(src, getQReg128(nn));
9879 assign(argL, unop(opZHI, mkexpr(src)));
9880 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9881 mkU8(isD ? 8 : 4))));
9882 putQReg128(dd, unop(opZHI,
9883 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
9884 mkexpr(argL), mkexpr(argR))));
9885 DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
9886 return True;
9889 /* Half-precision floating point ADDP (v8.2). */
9890 if (bitU == 0 && sz <= X00 && opcode == BITS5(0,1,1,0,1)) {
9891       /* -------- 0,00,01101 FADDP h_2h -------- */
9892 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
9893 return False;
9894 IROp opZHI = mkVecZEROHIxxOFV128(1);
9895 IROp opADD = mkVecADDF(1);
9896 IRTemp src = newTempV128();
9897 IRTemp argL = newTempV128();
9898 IRTemp argR = newTempV128();
9899 assign(src, getQReg128(nn));
9900 assign(argL, unop(opZHI, mkexpr(src)));
9901 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9902 mkU8(2))));
9903 putQReg128(dd, unop(opZHI,
9904 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
9905 mkexpr(argL), mkexpr(argR))));
9906 DIP("faddp h%u, v%u.2h\n", dd, nn);
9907 return True;
9910 if (bitU == 1
9911 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
9912 /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
9913 /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
9914 /* -------- 1,0x,01111 FMAXP d_2d, s_2s -------- */
9915 /* -------- 1,1x,01111 FMINP d_2d, s_2s -------- */
9916 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
9917 Bool isD = (sz & 1) == 1;
9918 Bool isMIN = (sz & 2) == 2;
9919 Bool isNM = opcode == BITS5(0,1,1,0,0);
9920 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9921 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
9922 IRTemp src = newTempV128();
9923 IRTemp argL = newTempV128();
9924 IRTemp argR = newTempV128();
9925 assign(src, getQReg128(nn));
9926 assign(argL, unop(opZHI, mkexpr(src)));
9927 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9928 mkU8(isD ? 8 : 4))));
9929 putQReg128(dd, unop(opZHI,
9930 binop(opMXX, mkexpr(argL), mkexpr(argR))));
9931 HChar c = isD ? 'd' : 's';
9932 DIP("%s%sp %c%u, v%u.2%c\n",
9933 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", c, dd, nn, c);
9934 return True;
9937 return False;
9938 # undef INSN
9942 static
9943 Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
9945 /* 31 28 22 18 15 10 9 4
9946 01 u 111110 immh immb opcode 1 n d
9947       Decode fields: u,immh,opcode
9948    */
9949 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9950 if (INSN(31,30) != BITS2(0,1)
9951 || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
9952 return False;
9954 UInt bitU = INSN(29,29);
9955 UInt immh = INSN(22,19);
9956 UInt immb = INSN(18,16);
9957 UInt opcode = INSN(15,11);
9958 UInt nn = INSN(9,5);
9959 UInt dd = INSN(4,0);
9960 UInt immhb = (immh << 3) | immb;
9962 if ((immh & 8) == 8
9963 && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
9964 /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
9965 /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
9966 /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
9967 /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
9968 Bool isU = bitU == 1;
9969 Bool isAcc = opcode == BITS5(0,0,0,1,0);
9970 UInt sh = 128 - immhb;
9971 vassert(sh >= 1 && sh <= 64);
9972 IROp op = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
9973 IRExpr* src = getQReg128(nn);
9974 IRTemp shf = newTempV128();
9975 IRTemp res = newTempV128();
9976 if (sh == 64 && isU) {
9977 assign(shf, mkV128(0x0000));
9978 } else {
9979 UInt nudge = 0;
9980 if (sh == 64) {
9981 vassert(!isU);
9982 nudge = 1;
9984 assign(shf, binop(op, src, mkU8(sh - nudge)));
9986 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
9987 : mkexpr(shf));
9988 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9989 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
9990 : (isU ? "ushr" : "sshr");
9991 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
9992 return True;
9995 if ((immh & 8) == 8
9996 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
9997 /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
9998 /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
9999 /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
10000 /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
10001 Bool isU = bitU == 1;
10002 Bool isAcc = opcode == BITS5(0,0,1,1,0);
10003 UInt sh = 128 - immhb;
10004 vassert(sh >= 1 && sh <= 64);
10005 IROp op = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
10006 vassert(sh >= 1 && sh <= 64);
10007 IRExpr* src = getQReg128(nn);
10008 IRTemp imm8 = newTemp(Ity_I8);
10009 assign(imm8, mkU8((UChar)(-sh)));
10010 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
10011 IRTemp shf = newTempV128();
10012 IRTemp res = newTempV128();
10013 assign(shf, binop(op, src, amt));
10014 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
10015 : mkexpr(shf));
10016 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10017 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
10018 : (isU ? "urshr" : "srshr");
10019 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
10020 return True;
10023 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
10024 /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
10025 UInt sh = 128 - immhb;
10026 vassert(sh >= 1 && sh <= 64);
10027 if (sh == 64) {
10028 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
10029 } else {
10030 /* sh is in range 1 .. 63 */
10031 ULong nmask = (ULong)(((Long)0x8000000000000000ULL) >> (sh-1));
10032 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
10033 IRTemp res = newTempV128();
10034 assign(res, binop(Iop_OrV128,
10035 binop(Iop_AndV128, getQReg128(dd), nmaskV),
10036 binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
10037 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10039 DIP("sri d%u, d%u, #%u\n", dd, nn, sh);
10040 return True;
10043 if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
10044 /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
10045 UInt sh = immhb - 64;
10046 vassert(sh >= 0 && sh < 64);
10047 putQReg128(dd,
10048 unop(Iop_ZeroHI64ofV128,
10049 sh == 0 ? getQReg128(nn)
10050 : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
10051 DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
10052 return True;
10055 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
10056 /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
10057 UInt sh = immhb - 64;
10058 vassert(sh >= 0 && sh < 64);
10059 if (sh == 0) {
10060 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(nn)));
10061 } else {
10062 /* sh is in range 1 .. 63 */
10063 ULong nmask = (1ULL << sh) - 1;
10064 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
10065 IRTemp res = newTempV128();
10066 assign(res, binop(Iop_OrV128,
10067 binop(Iop_AndV128, getQReg128(dd), nmaskV),
10068 binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
10069 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10071 DIP("sli d%u, d%u, #%u\n", dd, nn, sh);
10072 return True;
10075 if (opcode == BITS5(0,1,1,1,0)
10076 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
10077 /* -------- 0,01110 SQSHL #imm -------- */
10078 /* -------- 1,01110 UQSHL #imm -------- */
10079 /* -------- 1,01100 SQSHLU #imm -------- */
10080 UInt size = 0;
10081 UInt shift = 0;
10082 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10083 if (!ok) return False;
10084 vassert(size >= 0 && size <= 3);
10085 /* The shift encoding has opposite sign for the leftwards case.
10086 Adjust shift to compensate. */
10087 UInt lanebits = 8 << size;
10088 shift = lanebits - shift;
10089 vassert(shift >= 0 && shift < lanebits);
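      /* A concrete case, for illustration (relying on the right-shift-style
         value returned by getLaneInfo_IMMH_IMMB): immh:immb == 0001:010
         selects 8-bit lanes and gives shift == 16 - 10 == 6, so the left
         shift becomes lanebits - 6 == 2, matching the architectural
         encoding immh:immb - esize == 10 - 8 == 2. */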
10090 const HChar* nm = NULL;
10091 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
10092 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
10093 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
10094 else vassert(0);
10095 IRTemp qDiff1 = IRTemp_INVALID;
10096 IRTemp qDiff2 = IRTemp_INVALID;
10097 IRTemp res = IRTemp_INVALID;
10098 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
10099 /* This relies on the fact that the zeroed out lanes generate zeroed
10100 result lanes and don't saturate, so there's no point in trimming
10101 the resulting res, qDiff1 or qDiff2 values. */
10102 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
10103 putQReg128(dd, mkexpr(res));
10104 updateQCFLAGwithDifference(qDiff1, qDiff2);
10105 const HChar arr = "bhsd"[size];
10106 DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
10107 return True;
10110 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
10111 || (bitU == 1
10112 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
10113 /* -------- 0,10010 SQSHRN #imm -------- */
10114 /* -------- 1,10010 UQSHRN #imm -------- */
10115 /* -------- 0,10011 SQRSHRN #imm -------- */
10116 /* -------- 1,10011 UQRSHRN #imm -------- */
10117 /* -------- 1,10000 SQSHRUN #imm -------- */
10118 /* -------- 1,10001 SQRSHRUN #imm -------- */
10119 UInt size = 0;
10120 UInt shift = 0;
10121 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10122 if (!ok || size == X11) return False;
10123 vassert(size >= X00 && size <= X10);
10124 vassert(shift >= 1 && shift <= (8 << size));
10125 const HChar* nm = "??";
10126 IROp op = Iop_INVALID;
10127 /* Decide on the name and the operation. */
10128 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
10129 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
10131 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
10132 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
10134 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
10135 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
10137 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
10138 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
10140 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
10141 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
10143 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
10144 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
10146 else vassert(0);
10147 /* Compute the result (Q, shifted value) pair. */
10148 IRTemp src128 = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size+1, getQReg128(nn));
10149 IRTemp pair = newTempV128();
10150 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
10151 /* Update the result reg */
10152 IRTemp res64in128 = newTempV128();
10153 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
10154 putQReg128(dd, mkexpr(res64in128));
10155 /* Update the Q flag. */
10156 IRTemp q64q64 = newTempV128();
10157 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
10158 IRTemp z128 = newTempV128();
10159 assign(z128, mkV128(0x0000));
10160 updateQCFLAGwithDifference(q64q64, z128);
10161 /* */
10162 const HChar arrNarrow = "bhsd"[size];
10163 const HChar arrWide = "bhsd"[size+1];
10164 DIP("%s %c%u, %c%u, #%u\n", nm, arrNarrow, dd, arrWide, nn, shift);
10165 return True;
10168 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,0,0)) {
10169 /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
10170 /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
10171 UInt size = 0;
10172 UInt fbits = 0;
10173 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
10174 /* The following holds because immh is never zero. */
10175 vassert(ok);
10176 /* The following holds because immh >= 0100. */
10177 vassert(size == X10 || size == X11);
10178 Bool isD = size == X11;
10179 Bool isU = bitU == 1;
10180 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
10181 Double scale = two_to_the_minus(fbits);
10182 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
10183 : IRExpr_Const(IRConst_F32( (Float)scale ));
10184 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
10185 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
10186 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
10187 IRType tyF = isD ? Ity_F64 : Ity_F32;
10188 IRType tyI = isD ? Ity_I64 : Ity_I32;
10189 IRTemp src = newTemp(tyI);
10190 IRTemp res = newTemp(tyF);
10191 IRTemp rm = mk_get_IR_rounding_mode();
10192 assign(src, getQRegLane(nn, 0, tyI));
10193 assign(res, triop(opMUL, mkexpr(rm),
10194 binop(opCVT, mkexpr(rm), mkexpr(src)), scaleE));
10195 putQRegLane(dd, 0, mkexpr(res));
10196 if (!isD) {
10197 putQRegLane(dd, 1, mkU32(0));
10199 putQRegLane(dd, 1, mkU64(0));
10200 const HChar ch = isD ? 'd' : 's';
10201 DIP("%s %c%u, %c%u, #%u\n", isU ? "ucvtf" : "scvtf",
10202 ch, dd, ch, nn, fbits);
10203 return True;
10206 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,1,1)) {
10207 /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
10208 /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
10209 UInt size = 0;
10210 UInt fbits = 0;
10211 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
10212 /* The following holds because immh is never zero. */
10213 vassert(ok);
10214 /* The following holds because immh >= 0100. */
10215 vassert(size == X10 || size == X11);
10216 Bool isD = size == X11;
10217 Bool isU = bitU == 1;
10218 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
10219 Double scale = two_to_the_plus(fbits);
10220 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
10221 : IRExpr_Const(IRConst_F32( (Float)scale ));
10222 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
10223 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
10224 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
10225 IRType tyF = isD ? Ity_F64 : Ity_F32;
10226 IRType tyI = isD ? Ity_I64 : Ity_I32;
10227 IRTemp src = newTemp(tyF);
10228 IRTemp res = newTemp(tyI);
10229 IRTemp rm = newTemp(Ity_I32);
10230 assign(src, getQRegLane(nn, 0, tyF));
10231 assign(rm, mkU32(Irrm_ZERO));
10232 assign(res, binop(opCVT, mkexpr(rm),
10233 triop(opMUL, mkexpr(rm), mkexpr(src), scaleE)));
10234 putQRegLane(dd, 0, mkexpr(res));
10235 if (!isD) {
10236 putQRegLane(dd, 1, mkU32(0));
10238 putQRegLane(dd, 1, mkU64(0));
10239 const HChar ch = isD ? 'd' : 's';
10240 DIP("%s %c%u, %c%u, #%u\n", isU ? "fcvtzu" : "fcvtzs",
10241 ch, dd, ch, nn, fbits);
10242 return True;
10245 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10246 return False;
10247 # undef INSN
10251 static
10252 Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
10254 /* 31 29 28 23 21 20 15 11 9 4
10255 01 U 11110 size 1 m opcode 00 n d
10256       Decode fields: u,opcode
10257    */
10258 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10259 if (INSN(31,30) != BITS2(0,1)
10260 || INSN(28,24) != BITS5(1,1,1,1,0)
10261 || INSN(21,21) != 1
10262 || INSN(11,10) != BITS2(0,0)) {
10263 return False;
10265 UInt bitU = INSN(29,29);
10266 UInt size = INSN(23,22);
10267 UInt mm = INSN(20,16);
10268 UInt opcode = INSN(15,12);
10269 UInt nn = INSN(9,5);
10270 UInt dd = INSN(4,0);
10271 vassert(size < 4);
10273 if (bitU == 0
10274 && (opcode == BITS4(1,1,0,1)
10275 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
10276 /* -------- 0,1101 SQDMULL -------- */ // 0 (ks)
10277 /* -------- 0,1001 SQDMLAL -------- */ // 1
10278 /* -------- 0,1011 SQDMLSL -------- */ // 2
10279 /* Widens, and size refers to the narrowed lanes. */
10280 UInt ks = 3;
10281 switch (opcode) {
10282 case BITS4(1,1,0,1): ks = 0; break;
10283 case BITS4(1,0,0,1): ks = 1; break;
10284 case BITS4(1,0,1,1): ks = 2; break;
10285 default: vassert(0);
10287 vassert(ks >= 0 && ks <= 2);
10288 if (size == X00 || size == X11) return False;
10289 vassert(size <= 2);
10290 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
10291 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
10292 newTempsV128_3(&vecN, &vecM, &vecD);
10293 assign(vecN, getQReg128(nn));
10294 assign(vecM, getQReg128(mm));
10295 assign(vecD, getQReg128(dd));
10296 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
10297 False/*!is2*/, size, "mas"[ks],
10298 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
10299 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
10300 putQReg128(dd, unop(opZHI, mkexpr(res)));
10301 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
10302 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
10303 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
10304 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
10306 const HChar* nm = ks == 0 ? "sqdmull"
10307 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
10308 const HChar arrNarrow = "bhsd"[size];
10309 const HChar arrWide = "bhsd"[size+1];
10310 DIP("%s %c%u, %c%u, %c%u\n",
10311 nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
10312 return True;
10315 return False;
10316 # undef INSN
10320 static
10321 Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
10323 /* 31 29 28 23 21 20 15 10 9 4
10324 01 U 11110 size 1 m opcode 1 n d
10325       Decode fields: u,size,opcode
10326    */
10327 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10328 if (INSN(31,30) != BITS2(0,1)
10329 || INSN(28,24) != BITS5(1,1,1,1,0)
10330 || INSN(21,21) != 1
10331 || INSN(10,10) != 1) {
10332 return False;
10334 UInt bitU = INSN(29,29);
10335 UInt size = INSN(23,22);
10336 UInt mm = INSN(20,16);
10337 UInt opcode = INSN(15,11);
10338 UInt nn = INSN(9,5);
10339 UInt dd = INSN(4,0);
10340 vassert(size < 4);
10342 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
10343 /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
10344 /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
10345 /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
10346 /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
10347 Bool isADD = opcode == BITS5(0,0,0,0,1);
10348 Bool isU = bitU == 1;
10349 IROp qop = Iop_INVALID;
10350 IROp nop = Iop_INVALID;
10351 if (isADD) {
10352 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
10353 nop = mkVecADD(size);
10354 } else {
10355 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
10356 nop = mkVecSUB(size);
10358 IRTemp argL = newTempV128();
10359 IRTemp argR = newTempV128();
10360 IRTemp qres = newTempV128();
10361 IRTemp nres = newTempV128();
10362 assign(argL, getQReg128(nn));
10363 assign(argR, getQReg128(mm));
10364 assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10365 size, binop(qop, mkexpr(argL), mkexpr(argR)))));
10366 assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10367 size, binop(nop, mkexpr(argL), mkexpr(argR)))));
10368 putQReg128(dd, mkexpr(qres));
10369 updateQCFLAGwithDifference(qres, nres);
10370 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
10371 : (isU ? "uqsub" : "sqsub");
10372 const HChar arr = "bhsd"[size];
10373 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10374 return True;
10377 if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
10378 /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
10379 /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
10380 Bool isGT = bitU == 0;
10381 IRExpr* argL = getQReg128(nn);
10382 IRExpr* argR = getQReg128(mm);
10383 IRTemp res = newTempV128();
10384 assign(res,
10385 isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
10386 : binop(Iop_CmpGT64Ux2, argL, argR));
10387 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10388 DIP("%s %s, %s, %s\n",isGT ? "cmgt" : "cmhi",
10389 nameQRegLO(dd, Ity_I64),
10390 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10391 return True;
10394 if (size == X11 && opcode == BITS5(0,0,1,1,1)) {
10395 /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
10396 /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
10397 Bool isGE = bitU == 0;
10398 IRExpr* argL = getQReg128(nn);
10399 IRExpr* argR = getQReg128(mm);
10400 IRTemp res = newTempV128();
10401 assign(res,
10402 isGE ? unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL))
10403 : unop(Iop_NotV128, binop(Iop_CmpGT64Ux2, argR, argL)));
10404 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10405 DIP("%s %s, %s, %s\n", isGE ? "cmge" : "cmhs",
10406 nameQRegLO(dd, Ity_I64),
10407 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10408 return True;
10411 if (size == X11 && (opcode == BITS5(0,1,0,0,0)
10412 || opcode == BITS5(0,1,0,1,0))) {
10413 /* -------- 0,xx,01000 SSHL d_d_d -------- */
10414 /* -------- 0,xx,01010 SRSHL d_d_d -------- */
10415 /* -------- 1,xx,01000 USHL d_d_d -------- */
10416 /* -------- 1,xx,01010 URSHL d_d_d -------- */
10417 Bool isU = bitU == 1;
10418 Bool isR = opcode == BITS5(0,1,0,1,0);
10419 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
10420 : (isU ? mkVecSHU(size) : mkVecSHS(size));
10421 IRTemp res = newTempV128();
10422 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
10423 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10424 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
10425 : (isU ? "ushl" : "sshl");
10426 DIP("%s %s, %s, %s\n", nm,
10427 nameQRegLO(dd, Ity_I64),
10428 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10429 return True;
10432 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
10433 /* -------- 0,xx,01001 SQSHL std4_std4_std4 -------- */
10434 /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
10435 /* -------- 1,xx,01001 UQSHL std4_std4_std4 -------- */
10436 /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
10437 Bool isU = bitU == 1;
10438 Bool isR = opcode == BITS5(0,1,0,1,1);
10439 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
10440 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
10441 /* This is a bit tricky. Since we're only interested in the lowest
10442 lane of the result, we zero out all the rest in the operands, so
10443 as to ensure that other lanes don't pollute the returned Q value.
10444 This works because it means, for the lanes we don't care about, we
10445 are shifting zero by zero, which can never saturate. */
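      /* These ops return a 256-bit pair: the lower 128 bits (V256toV128_0)
         hold the shifted result, and the upper 128 bits (V256toV128_1) are
         nonzero in any lane that saturated.  Comparing that upper half
         against zero is what drives the QC update below. */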
10446 IRTemp res256 = newTemp(Ity_V256);
10447 IRTemp resSH = newTempV128();
10448 IRTemp resQ = newTempV128();
10449 IRTemp zero = newTempV128();
10450 assign(
10451 res256,
10452 binop(op,
10453 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
10454 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
10455 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
10456 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
10457 assign(zero, mkV128(0x0000));
10458 putQReg128(dd, mkexpr(resSH));
10459 updateQCFLAGwithDifference(resQ, zero);
10460 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
10461 : (isU ? "uqshl" : "sqshl");
10462 const HChar arr = "bhsd"[size];
10463 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10464 return True;
10467 if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
10468 /* -------- 0,11,10000 ADD d_d_d -------- */
10469 /* -------- 1,11,10000 SUB d_d_d -------- */
10470 Bool isSUB = bitU == 1;
10471 IRTemp res = newTemp(Ity_I64);
10472 assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64,
10473 getQRegLane(nn, 0, Ity_I64),
10474 getQRegLane(mm, 0, Ity_I64)));
10475 putQRegLane(dd, 0, mkexpr(res));
10476 putQRegLane(dd, 1, mkU64(0));
10477 DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add",
10478 nameQRegLO(dd, Ity_I64),
10479 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10480 return True;
10483 if (size == X11 && opcode == BITS5(1,0,0,0,1)) {
10484 /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
10485 /* -------- 1,11,10001 CMEQ d_d_d -------- */ // ==
10486 Bool isEQ = bitU == 1;
10487 IRExpr* argL = getQReg128(nn);
10488 IRExpr* argR = getQReg128(mm);
10489 IRTemp res = newTempV128();
10490 assign(res,
10491 isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
10492 : unop(Iop_NotV128, binop(Iop_CmpEQ64x2,
10493 binop(Iop_AndV128, argL, argR),
10494 mkV128(0x0000))));
10495 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10496 DIP("%s %s, %s, %s\n", isEQ ? "cmeq" : "cmtst",
10497 nameQRegLO(dd, Ity_I64),
10498 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10499 return True;
10502 if (opcode == BITS5(1,0,1,1,0)) {
10503 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
10504 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
10505 if (size == X00 || size == X11) return False;
10506 Bool isR = bitU == 1;
10507 IRTemp res, sat1q, sat1n, vN, vM;
10508 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
10509 newTempsV128_2(&vN, &vM);
10510 assign(vN, getQReg128(nn));
10511 assign(vM, getQReg128(mm));
10512 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
10513 putQReg128(dd,
10514 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
10515 updateQCFLAGwithDifference(
10516 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
10517 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
10518 const HChar arr = "bhsd"[size];
10519 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
10520 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10521 return True;
10524 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
10525 /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
10526 IRType ity = size == X11 ? Ity_F64 : Ity_F32;
10527 IRTemp res = newTemp(ity);
10528 assign(res, unop(mkABSF(ity),
10529 triop(mkSUBF(ity),
10530 mkexpr(mk_get_IR_rounding_mode()),
10531 getQRegLO(nn,ity), getQRegLO(mm,ity))));
10532 putQReg128(dd, mkV128(0x0000));
10533 putQRegLO(dd, mkexpr(res));
10534 DIP("fabd %s, %s, %s\n",
10535 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10536 return True;
10539 if (bitU == 0 && size <= X01 && opcode == BITS5(1,1,0,1,1)) {
10540 /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
10541 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
10542 IRType ity = size == X01 ? Ity_F64 : Ity_F32;
10543 IRTemp res = newTemp(ity);
10544 assign(res, triop(mkMULF(ity),
10545 mkexpr(mk_get_IR_rounding_mode()),
10546 getQRegLO(nn,ity), getQRegLO(mm,ity)));
10547 putQReg128(dd, mkV128(0x0000));
10548 putQRegLO(dd, mkexpr(res));
10549 DIP("fmulx %s, %s, %s\n",
10550 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10551 return True;
10554 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
10555 /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
10556 /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
10557 Bool isD = size == X01;
10558 IRType ity = isD ? Ity_F64 : Ity_F32;
10559 Bool isGE = bitU == 1;
10560 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
10561 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
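      /* The GE case is computed as (m <= n), that is, the LE comparison
         with the operands swapped; hence CmpLE is selected above and the
         arguments are swapped below. */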
10562 IRTemp res = newTempV128();
10563 assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
10564 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
10565 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10566 mkexpr(res))));
10567 DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
10568 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10569 return True;
10572 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
10573 /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
10574 Bool isD = size == X11;
10575 IRType ity = isD ? Ity_F64 : Ity_F32;
10576 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
10577 IRTemp res = newTempV128();
10578 assign(res, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
10579 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10580 mkexpr(res))));
10581 DIP("%s %s, %s, %s\n", "fcmgt",
10582 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10583 return True;
10586 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
10587 /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
10588 /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
10589 Bool isD = (size & 1) == 1;
10590 IRType ity = isD ? Ity_F64 : Ity_F32;
10591 Bool isGT = (size & 2) == 2;
10592 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
10593 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
10594 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
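      /* These compare absolute values: |n| > |m| (resp. |n| >= |m|) is
         computed as |m| < |n| (resp. |m| <= |n|), hence the abs of both
         operands and the operand swap below. */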
10595 IRTemp res = newTempV128();
10596 assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
10597 unop(opABS, getQReg128(nn)))); // swapd
10598 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10599 mkexpr(res))));
10600 DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
10601 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10602 return True;
10605 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
10606 /* -------- 0,0x,11111: FRECPS d_d_d, s_s_s -------- */
10607 /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
10608 Bool isSQRT = (size & 2) == 2;
10609 Bool isD = (size & 1) == 1;
10610 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
10611 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
10612 IRTemp res = newTempV128();
10613 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
10614 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10615 mkexpr(res))));
10616 HChar c = isD ? 'd' : 's';
10617 DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
10618 c, dd, c, nn, c, mm);
10619 return True;
10622 return False;
10623 # undef INSN
10626 static
10627 Bool dis_AdvSIMD_scalar_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
10629 /* 31 29 28 23 21 20 15 10 9 4
10630 01 U 11110 size 0 m opcode 1 n d
10631 Decode fields: u,size,opcode
10633 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10634 if (INSN(31,30) != BITS2(0,1)
10635 || INSN(28,24) != BITS5(1,1,1,1,0)
10636 || INSN(21,21) != 0
10637 || INSN(10,10) != 1) {
10638 return False;
10640 UInt bitU = INSN(29,29);
10641 UInt size = INSN(23,22);
10642 UInt mm = INSN(20,16);
10643 UInt opcode = INSN(15,11);
10644 UInt nn = INSN(9,5);
10645 UInt dd = INSN(4,0);
10646 vassert(size < 4);
10647 vassert(mm < 32 && nn < 32 && dd < 32);
10649 if (bitU == 1 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
10650 /* -------- xx,10000 SQRDMLAH s and h variants only -------- */
10651 /* -------- xx,10001 SQRDMLSH s and h variants only -------- */
10652 if (size == X00 || size == X11) return False;
10653 Bool isAdd = opcode == BITS5(1,0,0,0,0);
10655 IRTemp res, res_nosat, vD, vN, vM;
10656 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
10657 newTempsV128_3(&vD, &vN, &vM);
10658 assign(vD, getQReg128(dd));
10659 assign(vN, getQReg128(nn));
10660 assign(vM, getQReg128(mm));
10662 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
10663 putQReg128(dd,
10664 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
10665 updateQCFLAGwithDifference(
10666 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res)),
10667 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res_nosat)));
10670 const HChar arr = "bhsd"[size];
10670 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
10671 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10672 return True;
10675 return False;
10676 # undef INSN
10680 static
10681 Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
10683 /* 31 29 28 23 21 16 11 9 4
10684 01 U 11110 size 10000 opcode 10 n d
10685 Decode fields: u,size,opcode
10687 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10688 if (INSN(31,30) != BITS2(0,1)
10689 || INSN(28,24) != BITS5(1,1,1,1,0)
10690 || INSN(21,17) != BITS5(1,0,0,0,0)
10691 || INSN(11,10) != BITS2(1,0)) {
10692 return False;
10694 UInt bitU = INSN(29,29);
10695 UInt size = INSN(23,22);
10696 UInt opcode = INSN(16,12);
10697 UInt nn = INSN(9,5);
10698 UInt dd = INSN(4,0);
10699 vassert(size < 4);
10701 if (opcode == BITS5(0,0,0,1,1)) {
10702 /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
10703 /* -------- 1,xx,00011: USQADD std4_std4 -------- */
10704 /* These are a bit tricky (to say the least). See comments on
10705 the vector variants (in dis_AdvSIMD_two_reg_misc) below for
10706 details. */
10707 Bool isUSQADD = bitU == 1;
10708 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
10709 : mkVecQADDEXTUSSATSS(size);
10710 IROp nop = mkVecADD(size);
10711 IRTemp argL = newTempV128();
10712 IRTemp argR = newTempV128();
10713 assign(argL, getQReg128(nn));
10714 assign(argR, getQReg128(dd));
10715 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10716 size, binop(qop, mkexpr(argL), mkexpr(argR)));
10717 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10718 size, binop(nop, mkexpr(argL), mkexpr(argR)));
10719 putQReg128(dd, mkexpr(qres));
10720 updateQCFLAGwithDifference(qres, nres);
10721 const HChar arr = "bhsd"[size];
10722 DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
10723 return True;
10726 if (opcode == BITS5(0,0,1,1,1)) {
10727 /* -------- 0,xx,00111 SQABS std4_std4 -------- */
10728 /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
10729 Bool isNEG = bitU == 1;
10730 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
10731 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
10732 getQReg128(nn), size );
10733 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
10734 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
10735 putQReg128(dd, mkexpr(qres));
10736 updateQCFLAGwithDifference(qres, nres);
10737 const HChar arr = "bhsd"[size];
10738 DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
10739 return True;
10742 if (size == X11 && opcode == BITS5(0,1,0,0,0)) {
10743 /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
10744 /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
10745 Bool isGT = bitU == 0;
10746 IRExpr* argL = getQReg128(nn);
10747 IRExpr* argR = mkV128(0x0000);
10748 IRTemp res = newTempV128();
10749 assign(res, isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
10750 : unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL)));
10751 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10752 DIP("cm%s d%u, d%u, #0\n", isGT ? "gt" : "ge", dd, nn);
10753 return True;
10756 if (size == X11 && opcode == BITS5(0,1,0,0,1)) {
10757 /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
10758 /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
10759 Bool isEQ = bitU == 0;
10760 IRExpr* argL = getQReg128(nn);
10761 IRExpr* argR = mkV128(0x0000);
10762 IRTemp res = newTempV128();
10763 assign(res, isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
10764 : unop(Iop_NotV128,
10765 binop(Iop_CmpGT64Sx2, argL, argR)));
10766 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10767 DIP("cm%s d%u, d%u, #0\n", isEQ ? "eq" : "le", dd, nn);
10768 return True;
10771 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,0)) {
10772 /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
10773 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10774 binop(Iop_CmpGT64Sx2, mkV128(0x0000),
10775 getQReg128(nn))));
10776 DIP("cm%s d%u, d%u, #0\n", "lt", dd, nn);
10777 return True;
10780 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
10781 /* -------- 0,11,01011 ABS d_d -------- */
10782 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10783 unop(Iop_Abs64x2, getQReg128(nn))));
10784 DIP("abs d%u, d%u\n", dd, nn);
10785 return True;
10788 if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
10789 /* -------- 1,11,01011 NEG d_d -------- */
10790 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10791 binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
10792 DIP("neg d%u, d%u\n", dd, nn);
10793 return True;
10796 UInt ix = 0; /*INVALID*/
10797 if (size >= X10) {
10798 switch (opcode) {
10799 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
10800 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
10801 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
10802 default: break;
10805 if (ix > 0) {
10806 /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
10807 /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
10808 /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
10809 /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
10810 /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
10811 Bool isD = size == X11;
10812 IRType ity = isD ? Ity_F64 : Ity_F32;
10813 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
10814 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
10815 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
10816 IROp opCmp = Iop_INVALID;
10817 Bool swap = False;
10818 const HChar* nm = "??";
10819 switch (ix) {
10820 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
10821 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
10822 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
10823 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
10824 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
10825 default: vassert(0);
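      /* The greater-than/greater-or-equal forms are obtained by swapping
         the operands of the LT/LE comparison against zero: for example
         FCMGT n, #0.0 is computed as (0 < n). */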
10827 IRExpr* zero = mkV128(0x0000);
10828 IRTemp res = newTempV128();
10829 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
10830 : binop(opCmp, getQReg128(nn), zero));
10831 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10832 mkexpr(res))));
10834 DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
10835 return True;
10838 if (opcode == BITS5(1,0,1,0,0)
10839 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
10840 /* -------- 0,xx,10100: SQXTN -------- */
10841 /* -------- 1,xx,10100: UQXTN -------- */
10842 /* -------- 1,xx,10010: SQXTUN -------- */
10843 if (size == X11) return False;
10844 vassert(size < 3);
10845 IROp opN = Iop_INVALID;
10846 Bool zWiden = True;
10847 const HChar* nm = "??";
10848 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
10849 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
10851 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
10852 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
10854 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
10855 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
10857 else vassert(0);
10858 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10859 size+1, getQReg128(nn));
10860 IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10861 size, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
10862 putQReg128(dd, mkexpr(resN));
10863 /* Saturation is detected by re-widening the narrowed result and
10864 comparing it with the original source; the non-participating lanes
10865 are zero in both, so they make no contribution to the Q flag state. */
10866 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
10867 size, mkexpr(resN));
10868 updateQCFLAGwithDifference(src, resW);
10869 const HChar arrNarrow = "bhsd"[size];
10870 const HChar arrWide = "bhsd"[size+1];
10871 DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn);
10872 return True;
10875 if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
10876 /* -------- 1,01,10110 FCVTXN s_d -------- */
10877 /* Using Irrm_NEAREST here isn't right: FCVTXN requires "round to odd",
10878 which forces the result's low bit to 1 whenever the conversion is inexact, so that a later narrowing cannot double-round. */
10879 putQRegLO(dd,
10880 binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
10881 getQRegLO(nn, Ity_F64)));
10882 putQRegLane(dd, 1, mkU32(0));
10883 putQRegLane(dd, 1, mkU64(0));
10884 DIP("fcvtxn s%u, d%u\n", dd, nn);
10885 return True;
10888 ix = 0; /*INVALID*/
10889 switch (opcode) {
10890 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
10891 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
10892 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
10893 default: break;
10895 if (ix > 0) {
10896 /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
10897 /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
10898 /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
10899 /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
10900 /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
10901 /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
10902 /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
10903 /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
10904 /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
10905 /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
10906 Bool isD = (size & 1) == 1;
10907 IRType tyF = isD ? Ity_F64 : Ity_F32;
10908 IRType tyI = isD ? Ity_I64 : Ity_I32;
10909 IRRoundingMode irrm = 8; /*impossible*/
10910 HChar ch = '?';
10911 switch (ix) {
10912 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
10913 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
10914 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
10915 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
10916 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
10917 default: vassert(0);
10919 IROp cvt = Iop_INVALID;
10920 if (bitU == 1) {
10921 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
10922 } else {
10923 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
10925 IRTemp src = newTemp(tyF);
10926 IRTemp res = newTemp(tyI);
10927 assign(src, getQRegLane(nn, 0, tyF));
10928 assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
10929 putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
10930 if (!isD) {
10931 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
10933 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
10934 HChar sOrD = isD ? 'd' : 's';
10935 DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
10936 sOrD, dd, sOrD, nn);
10937 return True;
10940 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
10941 /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
10942 /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
10943 Bool isU = bitU == 1;
10944 Bool isD = (size & 1) == 1;
10945 IRType tyI = isD ? Ity_I64 : Ity_I32;
10946 IROp iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
10947 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
10948 IRTemp rm = mk_get_IR_rounding_mode();
10949 putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
10950 if (!isD) {
10951 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
10953 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
10954 HChar c = isD ? 'd' : 's';
10955 DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
10956 return True;
10959 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
10960 /* -------- 0,1x,11101: FRECPE d_d, s_s -------- */
10961 /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
10962 Bool isSQRT = bitU == 1;
10963 Bool isD = (size & 1) == 1;
10964 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
10965 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
10966 IRTemp resV = newTempV128();
10967 assign(resV, unop(op, getQReg128(nn)));
10968 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10969 mkexpr(resV))));
10970 HChar c = isD ? 'd' : 's';
10971 DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
10972 return True;
10975 if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
10976 /* -------- 0,1x,11111: FRECPX d_d, s_s -------- */
10977 Bool isD = (size & 1) == 1;
10978 IRType ty = isD ? Ity_F64 : Ity_F32;
10979 IROp op = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
10980 IRTemp res = newTemp(ty);
10981 IRTemp rm = mk_get_IR_rounding_mode();
10982 assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
10983 putQReg128(dd, mkV128(0x0000));
10984 putQRegLane(dd, 0, mkexpr(res));
10985 HChar c = isD ? 'd' : 's';
10986 DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
10987 return True;
10990 return False;
10991 # undef INSN
10995 static
10996 Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
10998 /* 31 28 23 21 20 19 15 11 9 4
10999 01 U 11111 size L M m opcode H 0 n d
11000 Decode fields are: u,size,opcode
11001 M is really part of the mm register number. Individual
11002 cases need to inspect L and H though.
11004 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11005 if (INSN(31,30) != BITS2(0,1)
11006 || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) != 0) {
11007 return False;
11009 UInt bitU = INSN(29,29);
11010 UInt size = INSN(23,22);
11011 UInt bitL = INSN(21,21);
11012 UInt bitM = INSN(20,20);
11013 UInt mmLO4 = INSN(19,16);
11014 UInt opcode = INSN(15,12);
11015 UInt bitH = INSN(11,11);
11016 UInt nn = INSN(9,5);
11017 UInt dd = INSN(4,0);
11018 vassert(size < 4);
11019 vassert(bitH < 2 && bitM < 2 && bitL < 2);
11021 if (bitU == 0 && size >= X10
11022 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
11023 /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
11024 /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
11025 Bool isD = (size & 1) == 1;
11026 Bool isSUB = opcode == BITS4(0,1,0,1);
11027 UInt index;
11028 if (!isD) index = (bitH << 1) | bitL;
11029 else if (isD && bitL == 0) index = bitH;
11030 else return False; // sz:L == x11 => unallocated encoding
11031 vassert(index < (isD ? 2 : 4));
11032 IRType ity = isD ? Ity_F64 : Ity_F32;
11033 IRTemp elem = newTemp(ity);
11034 UInt mm = (bitM << 4) | mmLO4;
11035 assign(elem, getQRegLane(mm, index, ity));
11036 IRTemp dupd = math_DUP_TO_V128(elem, ity);
11037 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
11038 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
11039 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
11040 IRTemp rm = mk_get_IR_rounding_mode();
11041 IRTemp t1 = newTempV128();
11042 IRTemp t2 = newTempV128();
11043 // FIXME: double rounding; use FMA primops instead
11044 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
11045 assign(t2, triop(isSUB ? opSUB : opADD,
11046 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
11047 putQReg128(dd,
11048 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
11049 mkexpr(t2))));
11050 const HChar c = isD ? 'd' : 's';
11051 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
11052 c, dd, c, nn, nameQReg128(mm), c, index);
11053 return True;
11056 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
11057 /* -------- 0,1x,1001 FMUL d_d_d[], s_s_s[] -------- */
11058 /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
11059 Bool isD = (size & 1) == 1;
11060 Bool isMULX = bitU == 1;
11061 UInt index;
11062 if (!isD) index = (bitH << 1) | bitL;
11063 else if (isD && bitL == 0) index = bitH;
11064 else return False; // sz:L == x11 => unallocated encoding
11065 vassert(index < (isD ? 2 : 4));
11066 IRType ity = isD ? Ity_F64 : Ity_F32;
11067 IRTemp elem = newTemp(ity);
11068 UInt mm = (bitM << 4) | mmLO4;
11069 assign(elem, getQRegLane(mm, index, ity));
11070 IRTemp dupd = math_DUP_TO_V128(elem, ity);
11071 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
11072 IRTemp rm = mk_get_IR_rounding_mode();
11073 IRTemp t1 = newTempV128();
11074 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
11075 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
11076 putQReg128(dd,
11077 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
11078 mkexpr(t1))));
11079 const HChar c = isD ? 'd' : 's';
11080 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
11081 c, dd, c, nn, nameQReg128(mm), c, index);
11082 return True;
11085 if (bitU == 0
11086 && (opcode == BITS4(1,0,1,1)
11087 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
11088 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
11089 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
11090 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
11091 /* Widens, and size refers to the narrowed lanes. */
11092 UInt ks = 3;
11093 switch (opcode) {
11094 case BITS4(1,0,1,1): ks = 0; break;
11095 case BITS4(0,0,1,1): ks = 1; break;
11096 case BITS4(0,1,1,1): ks = 2; break;
11097 default: vassert(0);
11099 vassert(ks >= 0 && ks <= 2);
11100 UInt mm = 32; // invalid
11101 UInt ix = 16; // invalid
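      /* Reconstruct the Vm register number and the element index from
         M, L, H and mmLO4.  For H-sized elements the index is H:L:M and
         only mmLO4 names the register; for S-sized elements the index is
         H:L and M supplies bit 4 of the register number. */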
11102 switch (size) {
11103 case X00:
11104 return False; // h_b_b[] case is not allowed
11105 case X01:
11106 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
11107 case X10:
11108 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
11109 case X11:
11110 return False; // q_d_d[] case is not allowed
11111 default:
11112 vassert(0);
11114 vassert(mm < 32 && ix < 16);
11115 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
11116 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
11117 newTempsV128_2(&vecN, &vecD);
11118 assign(vecN, getQReg128(nn));
11119 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
11120 assign(vecD, getQReg128(dd));
11121 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
11122 False/*!is2*/, size, "mas"[ks],
11123 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11124 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
11125 putQReg128(dd, unop(opZHI, mkexpr(res)));
11126 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
11127 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
11128 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
11129 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
11131 const HChar* nm = ks == 0 ? "sqdmull"
11132 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
11133 const HChar arrNarrow = "bhsd"[size];
11134 const HChar arrWide = "bhsd"[size+1];
11135 DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
11136 nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
11137 return True;
11140 if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
11141 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
11142 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
11143 UInt mm = 32; // invalid
11144 UInt ix = 16; // invalid
11145 switch (size) {
11146 case X00:
11147 return False; // b case is not allowed
11148 case X01:
11149 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
11150 case X10:
11151 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
11152 case X11:
11153 return False; // q case is not allowed
11154 default:
11155 vassert(0);
11157 vassert(mm < 32 && ix < 16);
11158 Bool isR = opcode == BITS4(1,1,0,1);
11159 IRTemp res, sat1q, sat1n, vN, vM;
11160 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
11161 vN = newTempV128();
11162 assign(vN, getQReg128(nn));
11163 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
11164 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
11165 IROp opZHI = mkVecZEROHIxxOFV128(size);
11166 putQReg128(dd, unop(opZHI, mkexpr(res)));
11167 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
11168 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
11169 HChar ch = size == X01 ? 'h' : 's';
11170 DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, ch, (Int)dd, ix);
11171 return True;
11174 if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
11175 /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
11176 /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
11177 UInt mm = 32; // invalid
11178 UInt ix = 16; // invalid
11179 switch (size) {
11180 case X00:
11181 return False; // b case is not allowed
11182 case X01:
11183 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
11184 case X10:
11185 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
11186 case X11:
11187 return False; // d case is not allowed
11188 default:
11189 vassert(0);
11191 vassert(size < 4);
11192 vassert(mm < 32 && ix < 16);
11193 Bool isAdd = opcode == BITS4(1,1,0,1);
11195 IRTemp res, res_nosat, vD, vN, vM;
11196 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
11197 newTempsV128_2(&vD, &vN);
11198 assign(vD, getQReg128(dd));
11199 assign(vN, getQReg128(nn));
11200 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
11202 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
11203 IROp opZHI = mkVecZEROHIxxOFV128(size);
11204 putQReg128(dd, unop(opZHI, mkexpr(res)));
11205 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
11207 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
11208 HChar ch = size == X01 ? 'h' : 's';
11209 DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, ch, (Int)dd, ix);
11210 return True;
11213 return False;
11214 # undef INSN
11218 static
11219 Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
11221 /* 31 28 22 18 15 10 9 4
11222 0 q u 011110 immh immb opcode 1 n d
11223 Decode fields: u,opcode
11225 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11226 if (INSN(31,31) != 0
11227 || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
11228 return False;
11230 UInt bitQ = INSN(30,30);
11231 UInt bitU = INSN(29,29);
11232 UInt immh = INSN(22,19);
11233 UInt immb = INSN(18,16);
11234 UInt opcode = INSN(15,11);
11235 UInt nn = INSN(9,5);
11236 UInt dd = INSN(4,0);
11238 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
11239 /* -------- 0,00000 SSHR std7_std7_#imm -------- */
11240 /* -------- 1,00000 USHR std7_std7_#imm -------- */
11241 /* -------- 0,00010 SSRA std7_std7_#imm -------- */
11242 /* -------- 1,00010 USRA std7_std7_#imm -------- */
11243 /* laneTy, shift = case immh:immb of
11244 0001:xxx -> B, SHR:8-xxx
11245 001x:xxx -> H, SHR:16-xxxx
11246 01xx:xxx -> S, SHR:32-xxxxx
11247 1xxx:xxx -> D, SHR:64-xxxxxx
11248 other -> invalid
11250 UInt size = 0;
11251 UInt shift = 0;
11252 Bool isQ = bitQ == 1;
11253 Bool isU = bitU == 1;
11254 Bool isAcc = opcode == BITS5(0,0,0,1,0);
11255 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11256 if (!ok || (bitQ == 0 && size == X11)) return False;
11257 vassert(size >= 0 && size <= 3);
11258 UInt lanebits = 8 << size;
11259 vassert(shift >= 1 && shift <= lanebits);
11260 IROp op = isU ? mkVecSHRN(size) : mkVecSARN(size);
11261 IRExpr* src = getQReg128(nn);
11262 IRTemp shf = newTempV128();
11263 IRTemp res = newTempV128();
11264 if (shift == lanebits && isU) {
11265 assign(shf, mkV128(0x0000));
11266 } else {
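            /* A signed (arithmetic) shift by the full lane width is not a
               valid IR shift amount, but shifting by lanebits-1 gives the
               same all-sign-bits result, so nudge the amount down by one
               in that case. */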
11267 UInt nudge = 0;
11268 if (shift == lanebits) {
11269 vassert(!isU);
11270 nudge = 1;
11272 assign(shf, binop(op, src, mkU8(shift - nudge)));
11274 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
11275 : mkexpr(shf));
11276 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11277 HChar laneCh = "bhsd"[size];
11278 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11279 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
11280 : (isU ? "ushr" : "sshr");
11281 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
11282 nameQReg128(dd), nLanes, laneCh,
11283 nameQReg128(nn), nLanes, laneCh, shift);
11284 return True;
11287 if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
11288 /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
11289 /* -------- 1,00100 URSHR std7_std7_#imm -------- */
11290 /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
11291 /* -------- 1,00110 URSRA std7_std7_#imm -------- */
11292 /* laneTy, shift = case immh:immb of
11293 0001:xxx -> B, SHR:8-xxx
11294 001x:xxx -> H, SHR:16-xxxx
11295 01xx:xxx -> S, SHR:32-xxxxx
11296 1xxx:xxx -> D, SHR:64-xxxxxx
11297 other -> invalid
11299 UInt size = 0;
11300 UInt shift = 0;
11301 Bool isQ = bitQ == 1;
11302 Bool isU = bitU == 1;
11303 Bool isAcc = opcode == BITS5(0,0,1,1,0);
11304 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11305 if (!ok || (bitQ == 0 && size == X11)) return False;
11306 vassert(size >= 0 && size <= 3);
11307 UInt lanebits = 8 << size;
11308 vassert(shift >= 1 && shift <= lanebits);
11309 IROp op = isU ? mkVecRSHU(size) : mkVecRSHS(size);
11310 IRExpr* src = getQReg128(nn);
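      /* The rounding-shift ops take a per-lane signed shift count, with a
         negative count meaning a shift right (as for ARM's SSHL/USHL
         family), so -shift is duplicated across the vector. */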
11311 IRTemp imm8 = newTemp(Ity_I8);
11312 assign(imm8, mkU8((UChar)(-shift)));
11313 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
11314 IRTemp shf = newTempV128();
11315 IRTemp res = newTempV128();
11316 assign(shf, binop(op, src, amt));
11317 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
11318 : mkexpr(shf));
11319 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11320 HChar laneCh = "bhsd"[size];
11321 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11322 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
11323 : (isU ? "urshr" : "srshr");
11324 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
11325 nameQReg128(dd), nLanes, laneCh,
11326 nameQReg128(nn), nLanes, laneCh, shift);
11327 return True;
11330 if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
11331 /* -------- 1,01000 SRI std7_std7_#imm -------- */
11332 /* laneTy, shift = case immh:immb of
11333 0001:xxx -> B, SHR:8-xxx
11334 001x:xxx -> H, SHR:16-xxxx
11335 01xx:xxx -> S, SHR:32-xxxxx
11336 1xxx:xxx -> D, SHR:64-xxxxxx
11337 other -> invalid
11339 UInt size = 0;
11340 UInt shift = 0;
11341 Bool isQ = bitQ == 1;
11342 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11343 if (!ok || (bitQ == 0 && size == X11)) return False;
11344 vassert(size >= 0 && size <= 3);
11345 UInt lanebits = 8 << size;
11346 vassert(shift >= 1 && shift <= lanebits);
11347 IRExpr* src = getQReg128(nn);
11348 IRTemp res = newTempV128();
11349 if (shift == lanebits) {
11350 assign(res, getQReg128(dd));
11351 } else {
11352 assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
11353 IRExpr* nmask = binop(mkVecSHLN(size),
11354 mkV128(0xFFFF), mkU8(lanebits - shift));
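         /* nmask selects the top 'shift' bits of each lane.  Those bits are
            kept from the existing destination, while the remaining bits come
            from the right-shifted source, giving the "shift right and
            insert" behaviour. */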
11355 IRTemp tmp = newTempV128();
11356 assign(tmp, binop(Iop_OrV128,
11357 mkexpr(res),
11358 binop(Iop_AndV128, getQReg128(dd), nmask)));
11359 res = tmp;
11361 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11362 HChar laneCh = "bhsd"[size];
11363 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11364 DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
11365 nameQReg128(dd), nLanes, laneCh,
11366 nameQReg128(nn), nLanes, laneCh, shift);
11367 return True;
11370 if (opcode == BITS5(0,1,0,1,0)) {
11371 /* -------- 0,01010 SHL std7_std7_#imm -------- */
11372 /* -------- 1,01010 SLI std7_std7_#imm -------- */
11373 /* laneTy, shift = case immh:immb of
11374 0001:xxx -> B, xxx
11375 001x:xxx -> H, xxxx
11376 01xx:xxx -> S, xxxxx
11377 1xxx:xxx -> D, xxxxxx
11378 other -> invalid
11380 UInt size = 0;
11381 UInt shift = 0;
11382 Bool isSLI = bitU == 1;
11383 Bool isQ = bitQ == 1;
11384 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11385 if (!ok || (bitQ == 0 && size == X11)) return False;
11386 vassert(size >= 0 && size <= 3);
11387 /* The shift encoding has opposite sign for the leftwards case.
11388 Adjust shift to compensate. */
11389 UInt lanebits = 8 << size;
11390 shift = lanebits - shift;
11391 vassert(shift >= 0 && shift < lanebits);
11392 IROp op = mkVecSHLN(size);
11393 IRExpr* src = getQReg128(nn);
11394 IRTemp res = newTempV128();
11395 if (shift == 0) {
11396 assign(res, src);
11397 } else {
11398 assign(res, binop(op, src, mkU8(shift)));
11399 if (isSLI) {
11400 IRExpr* nmask = binop(mkVecSHRN(size),
11401 mkV128(0xFFFF), mkU8(lanebits - shift));
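            /* nmask selects the low 'shift' bits of each lane, which are
               kept from the existing destination; the rest come from the
               left-shifted source, giving the "shift left and insert"
               behaviour. */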
11402 IRTemp tmp = newTempV128();
11403 assign(tmp, binop(Iop_OrV128,
11404 mkexpr(res),
11405 binop(Iop_AndV128, getQReg128(dd), nmask)));
11406 res = tmp;
11409 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11410 HChar laneCh = "bhsd"[size];
11411 UInt nLanes = (isQ ? 128 : 64) / lanebits;
11412 const HChar* nm = isSLI ? "sli" : "shl";
11413 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
11414 nameQReg128(dd), nLanes, laneCh,
11415 nameQReg128(nn), nLanes, laneCh, shift);
11416 return True;
11419 if (opcode == BITS5(0,1,1,1,0)
11420 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
11421 /* -------- 0,01110 SQSHL std7_std7_#imm -------- */
11422 /* -------- 1,01110 UQSHL std7_std7_#imm -------- */
11423 /* -------- 1,01100 SQSHLU std7_std7_#imm -------- */
11424 UInt size = 0;
11425 UInt shift = 0;
11426 Bool isQ = bitQ == 1;
11427 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11428 if (!ok || (bitQ == 0 && size == X11)) return False;
11429 vassert(size >= 0 && size <= 3);
11430 /* The shift encoding has opposite sign for the leftwards case.
11431 Adjust shift to compensate. */
11432 UInt lanebits = 8 << size;
11433 shift = lanebits - shift;
11434 vassert(shift >= 0 && shift < lanebits);
11435 const HChar* nm = NULL;
11436 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
11437 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
11438 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
11439 else vassert(0);
11440 IRTemp qDiff1 = IRTemp_INVALID;
11441 IRTemp qDiff2 = IRTemp_INVALID;
11442 IRTemp res = IRTemp_INVALID;
11443 IRTemp src = newTempV128();
11444 assign(src, getQReg128(nn));
11445 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
11446 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11447 updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
11448 isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
11449 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11450 DIP("%s %s.%s, %s.%s, #%u\n", nm,
11451 nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
11452 return True;
11455 if (bitU == 0
11456 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
11457 /* -------- 0,10000 SHRN{,2} #imm -------- */
11458 /* -------- 0,10001 RSHRN{,2} #imm -------- */
11459 /* Narrows, and size is the narrow size. */
11460 UInt size = 0;
11461 UInt shift = 0;
11462 Bool is2 = bitQ == 1;
11463 Bool isR = opcode == BITS5(1,0,0,0,1);
11464 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11465 if (!ok || size == X11) return False;
11466 vassert(shift >= 1);
11467 IRTemp t1 = newTempV128();
11468 IRTemp t2 = newTempV128();
11469 IRTemp t3 = newTempV128();
11470 assign(t1, getQReg128(nn));
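      /* For the rounding variant, add the rounding bias 1 << (shift-1),
         i.e. half the weight of the lowest retained bit, to each wide lane
         before shifting right. */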
11471 assign(t2, isR ? binop(mkVecADD(size+1),
11472 mkexpr(t1),
11473 mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
11474 : mkexpr(t1));
11475 assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
11476 IRTemp t4 = math_NARROW_LANES(t3, t3, size);
11477 putLO64andZUorPutHI64(is2, dd, t4);
11478 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11479 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11480 DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
11481 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
11482 return True;
11485 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
11486 || (bitU == 1
11487 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
11488 /* -------- 0,10010 SQSHRN{,2} #imm -------- */
11489 /* -------- 1,10010 UQSHRN{,2} #imm -------- */
11490 /* -------- 0,10011 SQRSHRN{,2} #imm -------- */
11491 /* -------- 1,10011 UQRSHRN{,2} #imm -------- */
11492 /* -------- 1,10000 SQSHRUN{,2} #imm -------- */
11493 /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
11494 UInt size = 0;
11495 UInt shift = 0;
11496 Bool is2 = bitQ == 1;
11497 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11498 if (!ok || size == X11) return False;
11499 vassert(shift >= 1 && shift <= (8 << size));
11500 const HChar* nm = "??";
11501 IROp op = Iop_INVALID;
11502 /* Decide on the name and the operation. */
11503 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
11504 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
11506 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
11507 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
11509 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
11510 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
11512 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
11513 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
11515 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
11516 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
11518 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
11519 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
11521 else vassert(0);
11522 /* Compute the result (Q, shifted value) pair. */
11523 IRTemp src128 = newTempV128();
11524 assign(src128, getQReg128(nn));
11525 IRTemp pair = newTempV128();
11526 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
11527 /* Update the result reg */
11528 IRTemp res64in128 = newTempV128();
11529 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
11530 putLO64andZUorPutHI64(is2, dd, res64in128);
11531 /* Update the Q flag. */
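      /* The saturating narrowing shift leaves its result in the lower 64
         bits of 'pair' and per-lane saturation indications in the upper 64
         bits; duplicating that upper half and comparing it with zero feeds
         the QC flag. */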
11532 IRTemp q64q64 = newTempV128();
11533 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
11534 IRTemp z128 = newTempV128();
11535 assign(z128, mkV128(0x0000));
11536 updateQCFLAGwithDifference(q64q64, z128);
11537 /* */
11538 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11539 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11540 DIP("%s %s.%s, %s.%s, #%u\n", nm,
11541 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
11542 return True;
11545 if (opcode == BITS5(1,0,1,0,0)) {
11546 /* -------- 0,10100 SSHLL{,2} #imm -------- */
11547 /* -------- 1,10100 USHLL{,2} #imm -------- */
11548 /* 31 28 22 18 15 9 4
11549 0q0 011110 immh immb 101001 n d SSHLL Vd.Ta, Vn.Tb, #sh
11550 0q1 011110 immh immb 101001 n d USHLL Vd.Ta, Vn.Tb, #sh
11551 where Ta,Tb,sh
11552 = case immh of 1xxx -> invalid
11553 01xx -> 2d, 2s(q0)/4s(q1), immh:immb - 32 (0..31)
11554 001x -> 4s, 4h(q0)/8h(q1), immh:immb - 16 (0..15)
11555 0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8 (0..7)
11556 0000 -> AdvSIMD modified immediate (???)
11558 Bool isQ = bitQ == 1;
11559 Bool isU = bitU == 1;
11560 UInt immhb = (immh << 3) | immb;
11561 IRTemp src = newTempV128();
11562 IRTemp zero = newTempV128();
11563 IRExpr* res = NULL;
11564 UInt sh = 0;
11565 const HChar* ta = "??";
11566 const HChar* tb = "??";
11567 assign(src, getQReg128(nn));
11568 assign(zero, mkV128(0x0000));
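      /* The widening left shift is done by interleaving the source with
         zero, which puts each narrow lane into the top half of a wide lane,
         and then shifting right (arithmetically for the signed case) by
         (narrow lane width minus sh). */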
11569 if (immh & 8) {
11570 /* invalid; don't assign to res */
11572 else if (immh & 4) {
11573 sh = immhb - 32;
11574 vassert(sh < 32); /* so 32-sh is 1..32 */
11575 ta = "2d";
11576 tb = isQ ? "4s" : "2s";
11577 IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
11578 : mk_InterleaveLO32x4(src, zero);
11579 res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
11581 else if (immh & 2) {
11582 sh = immhb - 16;
11583 vassert(sh < 16); /* so 16-sh is 1..16 */
11584 ta = "4s";
11585 tb = isQ ? "8h" : "4h";
11586 IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
11587 : mk_InterleaveLO16x8(src, zero);
11588 res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
11590 else if (immh & 1) {
11591 sh = immhb - 8;
11592 vassert(sh < 8); /* so 8-sh is 1..8 */
11593 ta = "8h";
11594 tb = isQ ? "16b" : "8b";
11595 IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
11596 : mk_InterleaveLO8x16(src, zero);
11597 res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
11598 } else {
11599 vassert(immh == 0);
11600 /* invalid; don't assign to res */
11602 /* */
11603 if (res) {
11604 putQReg128(dd, res);
11605 DIP("%cshll%s %s.%s, %s.%s, #%u\n",
11606 isU ? 'u' : 's', isQ ? "2" : "",
11607 nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
11608 return True;
11610 return False;
11613 if (opcode == BITS5(1,1,1,0,0)) {
11614 /* -------- 0,11100 SCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
11615 /* -------- 1,11100 UCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
11616 /* If immh is of the form 00xx, the insn is invalid. */
11617 if (immh < BITS4(0,1,0,0)) return False;
11618 UInt size = 0;
11619 UInt fbits = 0;
11620 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
11621 /* The following holds because immh is never zero. */
11622 vassert(ok);
11623 /* The following holds because immh >= 0100. */
11624 vassert(size == X10 || size == X11);
11625 Bool isD = size == X11;
11626 Bool isU = bitU == 1;
11627 Bool isQ = bitQ == 1;
11628 if (isD && !isQ) return False; /* reject .1d case */
11629 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
11630 Double scale = two_to_the_minus(fbits);
11631 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
11632 : IRExpr_Const(IRConst_F32( (Float)scale ));
11633 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
11634 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
11635 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
11636 IRType tyF = isD ? Ity_F64 : Ity_F32;
11637 IRType tyI = isD ? Ity_I64 : Ity_I32;
11638 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
11639 vassert(nLanes == 2 || nLanes == 4);
11640 for (UInt i = 0; i < nLanes; i++) {
11641 IRTemp src = newTemp(tyI);
11642 IRTemp res = newTemp(tyF);
11643 IRTemp rm = mk_get_IR_rounding_mode();
11644 assign(src, getQRegLane(nn, i, tyI));
11645 assign(res, triop(opMUL, mkexpr(rm),
11646 binop(opCVT, mkexpr(rm), mkexpr(src)),
11647 scaleE));
11648 putQRegLane(dd, i, mkexpr(res));
11650 if (!isQ) {
11651 putQRegLane(dd, 1, mkU64(0));
11653 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11654 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
11655 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
11656 return True;
11659 if (opcode == BITS5(1,1,1,1,1)) {
11660 /* -------- 0,11111 FCVTZS {2d_2d,4s_4s,2s_2s}_imm -------- */
11661 /* -------- 1,11111 FCVTZU {2d_2d,4s_4s,2s_2s}_imm -------- */
11662 /* If immh is of the form 00xx, the insn is invalid. */
11663 if (immh < BITS4(0,1,0,0)) return False;
11664 UInt size = 0;
11665 UInt fbits = 0;
11666 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
11667 /* The following holds because immh is never zero. */
11668 vassert(ok);
11669 /* The following holds because immh >= 0100. */
11670 vassert(size == X10 || size == X11);
11671 Bool isD = size == X11;
11672 Bool isU = bitU == 1;
11673 Bool isQ = bitQ == 1;
11674 if (isD && !isQ) return False; /* reject .1d case */
11675 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
11676 Double scale = two_to_the_plus(fbits);
11677 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
11678 : IRExpr_Const(IRConst_F32( (Float)scale ));
11679 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
11680 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
11681 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
11682 IRType tyF = isD ? Ity_F64 : Ity_F32;
11683 IRType tyI = isD ? Ity_I64 : Ity_I32;
11684 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
11685 vassert(nLanes == 2 || nLanes == 4);
11686 for (UInt i = 0; i < nLanes; i++) {
11687 IRTemp src = newTemp(tyF);
11688 IRTemp res = newTemp(tyI);
11689 IRTemp rm = newTemp(Ity_I32);
11690 assign(src, getQRegLane(nn, i, tyF));
11691 assign(rm, mkU32(Irrm_ZERO));
11692 assign(res, binop(opCVT, mkexpr(rm),
11693 triop(opMUL, mkexpr(rm),
11694 mkexpr(src), scaleE)));
11695 putQRegLane(dd, i, mkexpr(res));
11697 if (!isQ) {
11698 putQRegLane(dd, 1, mkU64(0));
11700 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11701 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
11702 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
11703 return True;
11707 return False;
11708 # undef INSN
11712 static
11713 Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
11715 /* 31 30 29 28 23 21 20 15 11 9 4
11716 0 Q U 01110 size 1 m opcode 00 n d
11717 Decode fields: u,opcode
11719 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11720 if (INSN(31,31) != 0
11721 || INSN(28,24) != BITS5(0,1,1,1,0)
11722 || INSN(21,21) != 1
11723 || INSN(11,10) != BITS2(0,0)) {
11724 return False;
11726 UInt bitQ = INSN(30,30);
11727 UInt bitU = INSN(29,29);
11728 UInt size = INSN(23,22);
11729 UInt mm = INSN(20,16);
11730 UInt opcode = INSN(15,12);
11731 UInt nn = INSN(9,5);
11732 UInt dd = INSN(4,0);
11733 vassert(size < 4);
11734 Bool is2 = bitQ == 1;
11736 if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
11737 /* -------- 0,0000 SADDL{2} -------- */
11738 /* -------- 1,0000 UADDL{2} -------- */
11739 /* -------- 0,0010 SSUBL{2} -------- */
11740 /* -------- 1,0010 USUBL{2} -------- */
11741 /* Widens, and size refers to the narrow lanes. */
11742 if (size == X11) return False;
11743 vassert(size <= 2);
11744 Bool isU = bitU == 1;
11745 Bool isADD = opcode == BITS4(0,0,0,0);
11746 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
11747 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11748 IRTemp res = newTempV128();
11749 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11750 mkexpr(argL), mkexpr(argR)));
11751 putQReg128(dd, mkexpr(res));
11752 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11753 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11754 const HChar* nm = isADD ? (isU ? "uaddl" : "saddl")
11755 : (isU ? "usubl" : "ssubl");
11756 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11757 nameQReg128(dd), arrWide,
11758 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11759 return True;
11762 if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
11763 /* -------- 0,0001 SADDW{2} -------- */
11764 /* -------- 1,0001 UADDW{2} -------- */
11765 /* -------- 0,0011 SSUBW{2} -------- */
11766 /* -------- 1,0011 USUBW{2} -------- */
11767 /* Widens, and size refers to the narrow lanes. */
11768 if (size == X11) return False;
11769 vassert(size <= 2);
11770 Bool isU = bitU == 1;
11771 Bool isADD = opcode == BITS4(0,0,0,1);
11772 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11773 IRTemp res = newTempV128();
11774 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11775 getQReg128(nn), mkexpr(argR)));
11776 putQReg128(dd, mkexpr(res));
11777 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11778 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11779 const HChar* nm = isADD ? (isU ? "uaddw" : "saddw")
11780 : (isU ? "usubw" : "ssubw");
11781 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11782 nameQReg128(dd), arrWide,
11783 nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
11784 return True;
11787 if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
11788 /* -------- 0,0100 ADDHN{2} -------- */
11789 /* -------- 1,0100 RADDHN{2} -------- */
11790 /* -------- 0,0110 SUBHN{2} -------- */
11791 /* -------- 1,0110 RSUBHN{2} -------- */
11792 /* Narrows, and size refers to the narrowed lanes. */
11793 if (size == X11) return False;
11794 vassert(size <= 2);
11795 const UInt shift[3] = { 8, 16, 32 };
11796 Bool isADD = opcode == BITS4(0,1,0,0);
11797 Bool isR = bitU == 1;
11798 /* Combined elements in wide lanes */
11799 IRTemp wide = newTempV128();
11800 IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11801 getQReg128(nn), getQReg128(mm));
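      /* For the rounding variants, the rounding bias 1 << (shift-1), i.e.
         half the weight of the lowest retained bit, is added in before the
         high halves are extracted. */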
11802 if (isR) {
11803 wideE = binop(mkVecADD(size+1),
11804 wideE,
11805 mkexpr(math_VEC_DUP_IMM(size+1,
11806 1ULL << (shift[size]-1))));
11808 assign(wide, wideE);
11809 /* Top halves of elements, still in wide lanes */
11810 IRTemp shrd = newTempV128();
11811 assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
11812 /* Elements now compacted into lower 64 bits */
11813 IRTemp new64 = newTempV128();
11814 assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
11815 putLO64andZUorPutHI64(is2, dd, new64);
11816 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11817 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11818 const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
11819 : (isR ? "rsubhn" : "subhn");
11820 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11821 nameQReg128(dd), arrNarrow,
11822 nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
11823 return True;
11826 if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
11827 /* -------- 0,0101 SABAL{2} -------- */
11828 /* -------- 1,0101 UABAL{2} -------- */
11829 /* -------- 0,0111 SABDL{2} -------- */
11830 /* -------- 1,0111 UABDL{2} -------- */
11831 /* Widens, and size refers to the narrow lanes. */
11832 if (size == X11) return False;
11833 vassert(size <= 2);
11834 Bool isU = bitU == 1;
11835 Bool isACC = opcode == BITS4(0,1,0,1);
11836 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
11837 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11838 IRTemp abd = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
11839 IRTemp res = newTempV128();
11840 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
11841 : mkexpr(abd));
11842 putQReg128(dd, mkexpr(res));
11843 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11844 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11845 const HChar* nm = isACC ? (isU ? "uabal" : "sabal")
11846 : (isU ? "uabdl" : "sabdl");
11847 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11848 nameQReg128(dd), arrWide,
11849 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11850 return True;
11853 if (opcode == BITS4(1,1,0,0)
11854 || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
11855 /* -------- 0,1100 SMULL{2} -------- */ // 0 (ks)
11856 /* -------- 1,1100 UMULL{2} -------- */ // 0
11857 /* -------- 0,1000 SMLAL{2} -------- */ // 1
11858 /* -------- 1,1000 UMLAL{2} -------- */ // 1
11859 /* -------- 0,1010 SMLSL{2} -------- */ // 2
11860 /* -------- 1,1010 UMLSL{2} -------- */ // 2
11861 /* Widens, and size refers to the narrow lanes. */
11862 UInt ks = 3;
11863 switch (opcode) {
11864 case BITS4(1,1,0,0): ks = 0; break;
11865 case BITS4(1,0,0,0): ks = 1; break;
11866 case BITS4(1,0,1,0): ks = 2; break;
11867 default: vassert(0);
11869 vassert(ks >= 0 && ks <= 2);
11870 if (size == X11) return False;
11871 vassert(size <= 2);
11872 Bool isU = bitU == 1;
11873 IRTemp vecN = newTempV128();
11874 IRTemp vecM = newTempV128();
11875 IRTemp vecD = newTempV128();
11876 assign(vecN, getQReg128(nn));
11877 assign(vecM, getQReg128(mm));
11878 assign(vecD, getQReg128(dd));
11879 IRTemp res = IRTemp_INVALID;
11880 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
11881 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11882 putQReg128(dd, mkexpr(res));
11883 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11884 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11885 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
11886 DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
11887 nameQReg128(dd), arrWide,
11888 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11889 return True;
11892 if (bitU == 0
11893 && (opcode == BITS4(1,1,0,1)
11894 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
11895 /* -------- 0,1101 SQDMULL{2} -------- */ // 0 (ks)
11896 /* -------- 0,1001 SQDMLAL{2} -------- */ // 1
11897 /* -------- 0,1011 SQDMLSL{2} -------- */ // 2
11898 /* Widens, and size refers to the narrow lanes. */
11899 UInt ks = 3;
11900 switch (opcode) {
11901 case BITS4(1,1,0,1): ks = 0; break;
11902 case BITS4(1,0,0,1): ks = 1; break;
11903 case BITS4(1,0,1,1): ks = 2; break;
11904 default: vassert(0);
11906 vassert(ks >= 0 && ks <= 2);
11907 if (size == X00 || size == X11) return False;
11908 vassert(size <= 2);
11909 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
11910 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
11911 newTempsV128_3(&vecN, &vecM, &vecD);
11912 assign(vecN, getQReg128(nn));
11913 assign(vecM, getQReg128(mm));
11914 assign(vecD, getQReg128(dd));
11915 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
11916 is2, size, "mas"[ks],
11917 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11918 putQReg128(dd, mkexpr(res));
11919 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
11920 updateQCFLAGwithDifference(sat1q, sat1n);
11921 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
11922 updateQCFLAGwithDifference(sat2q, sat2n);
11924 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11925 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11926 const HChar* nm = ks == 0 ? "sqdmull"
11927 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
11928 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11929 nameQReg128(dd), arrWide,
11930 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11931 return True;
11934 if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
11935 /* -------- 0,1110 PMULL{2} -------- */
11936 /* Widens, and size refers to the narrow lanes. */
11937 if (size != X00 && size != X11) return False;
11938 IRTemp res = IRTemp_INVALID;
11939 IRExpr* srcN = getQReg128(nn);
11940 IRExpr* srcM = getQReg128(mm);
11941 const HChar* arrNarrow = NULL;
11942 const HChar* arrWide = NULL;
11943 if (size == X00) {
11944 res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
11945 srcN, srcM);
11946 arrNarrow = nameArr_Q_SZ(bitQ, size);
11947 arrWide = nameArr_Q_SZ(1, size+1);
11948 } else {
11949 /* The same thing as the X00 case, except we have to call
11950 a helper to do it. */
11951 vassert(size == X11);
11952 res = newTemp(Ity_V128);
11953 IROp slice
11954 = is2 ? Iop_V128HIto64 : Iop_V128to64;
11955 IRExpr** args
11956 = mkIRExprVec_3( IRExpr_VECRET(),
11957 unop(slice, srcN), unop(slice, srcM));
11958 IRDirty* di
11959 = unsafeIRDirty_1_N( res, 0/*regparms*/,
11960 "arm64g_dirtyhelper_PMULLQ",
11961 &arm64g_dirtyhelper_PMULLQ, args);
11962 stmt(IRStmt_Dirty(di));
11963 /* We can't use nameArr_Q_SZ for this because it can't deal with
11964 Q-sized (128 bit) results. Hence do it by hand. */
11965 arrNarrow = bitQ == 0 ? "1d" : "2d";
11966 arrWide = "1q";
11968 putQReg128(dd, mkexpr(res));
11969 DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
11970 nameQReg128(dd), arrWide,
11971 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11972 return True;
11975 return False;
11976 # undef INSN
11980 static
11981 Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
11983 /* 31 30 29 28 23 21 20 15 10 9 4
11984 0 Q U 01110 size 1 m opcode 1 n d
11985 Decode fields: u,size,opcode
11987 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11988 if (INSN(31,31) != 0
11989 || INSN(28,24) != BITS5(0,1,1,1,0)
11990 || INSN(21,21) != 1
11991 || INSN(10,10) != 1) {
11992 return False;
11994 UInt bitQ = INSN(30,30);
11995 UInt bitU = INSN(29,29);
11996 UInt size = INSN(23,22);
11997 UInt mm = INSN(20,16);
11998 UInt opcode = INSN(15,11);
11999 UInt nn = INSN(9,5);
12000 UInt dd = INSN(4,0);
12001 vassert(size < 4);
12003 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
12004 /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
12005 /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
12006 /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
12007 /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
12008 if (size == X11) return False;
12009 Bool isADD = opcode == BITS5(0,0,0,0,0);
12010 Bool isU = bitU == 1;
12011 /* Widen both args out, do the math, narrow to final result. */
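/* Example: UHADD.8B on lanes 0xFF and 0x01 must produce (255+1) >> 1 = 0x80;
   an 8-bit add would overflow first, hence the widen, add-or-sub, shift right
   by one, then narrow sequence below. */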
12012 IRTemp argL = newTempV128();
12013 IRTemp argLhi = IRTemp_INVALID;
12014 IRTemp argLlo = IRTemp_INVALID;
12015 IRTemp argR = newTempV128();
12016 IRTemp argRhi = IRTemp_INVALID;
12017 IRTemp argRlo = IRTemp_INVALID;
12018 IRTemp resHi = newTempV128();
12019 IRTemp resLo = newTempV128();
12020 IRTemp res = IRTemp_INVALID;
12021 assign(argL, getQReg128(nn));
12022 argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
12023 argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argL));
12024 assign(argR, getQReg128(mm));
12025 argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
12026 argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argR));
12027 IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
12028 IROp opSxR = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
12029 assign(resHi, binop(opSxR,
12030 binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
12031 mkU8(1)));
12032 assign(resLo, binop(opSxR,
12033 binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
12034 mkU8(1)));
12035 res = math_NARROW_LANES ( resHi, resLo, size );
12036 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12037 const HChar* nm = isADD ? (isU ? "uhadd" : "shadd")
12038 : (isU ? "uhsub" : "shsub");
12039 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12040 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12041 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12042 return True;
12045 if (opcode == BITS5(0,0,0,1,0)) {
12046 /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
12047 /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
12048 if (bitQ == 0 && size == X11) return False; // implied 1d case
12049 Bool isU = bitU == 1;
12050 IRTemp argL = newTempV128();
12051 IRTemp argR = newTempV128();
12052 assign(argL, getQReg128(nn));
12053 assign(argR, getQReg128(mm));
12054 IRTemp res = math_RHADD(size, isU, argL, argR);
12055 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12056 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12057 DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
12058 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12059 return True;
12062 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
12063 /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
12064 /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
12065 /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
12066 /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
12067 if (bitQ == 0 && size == X11) return False; // implied 1d case
12068 Bool isADD = opcode == BITS5(0,0,0,0,1);
12069 Bool isU = bitU == 1;
12070 IROp qop = Iop_INVALID;
12071 IROp nop = Iop_INVALID;
12072 if (isADD) {
12073 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
12074 nop = mkVecADD(size);
12075 } else {
12076 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
12077 nop = mkVecSUB(size);
12079 IRTemp argL = newTempV128();
12080 IRTemp argR = newTempV128();
12081 IRTemp qres = newTempV128();
12082 IRTemp nres = newTempV128();
12083 assign(argL, getQReg128(nn));
12084 assign(argR, getQReg128(mm));
12085 assign(qres, math_MAYBE_ZERO_HI64_fromE(
12086 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
12087 assign(nres, math_MAYBE_ZERO_HI64_fromE(
12088 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
12089 putQReg128(dd, mkexpr(qres));
12090 updateQCFLAGwithDifference(qres, nres);
12091 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
12092 : (isU ? "uqsub" : "sqsub");
12093 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12094 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12095 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12096 return True;
12099 if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
12100 /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
12101 /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
12102 /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
12103 /* -------- 0,11,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
12104 Bool isORx = (size & 2) == 2;
12105 Bool invert = (size & 1) == 1;
12106 IRTemp res = newTempV128();
12107 assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
12108 getQReg128(nn),
12109 invert ? unop(Iop_NotV128, getQReg128(mm))
12110 : getQReg128(mm)));
12111 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12112 const HChar* names[4] = { "and", "bic", "orr", "orn" };
12113 const HChar* ar = bitQ == 1 ? "16b" : "8b";
12114 DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
12115 nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
12116 return True;
12119 if (bitU == 1 && opcode == BITS5(0,0,0,1,1)) {
12120 /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
12121 /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
12122 /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
12123 /* -------- 1,11,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
12124 IRTemp argD = newTempV128();
12125 IRTemp argN = newTempV128();
12126 IRTemp argM = newTempV128();
12127 assign(argD, getQReg128(dd));
12128 assign(argN, getQReg128(nn));
12129 assign(argM, getQReg128(mm));
12130 const IROp opXOR = Iop_XorV128;
12131 const IROp opAND = Iop_AndV128;
12132 const IROp opNOT = Iop_NotV128;
12133 IRTemp res = newTempV128();
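/* The three select forms below rely on the identity x ^ ((x ^ y) & m), which
   gives y where m's bits are 1 and x where they are 0.  Hence:
     BSL: take N where D==1, M where D==0 (D is both selector and result)
     BIT: take N where M==1, keep D where M==0
     BIF: take N where M==0, keep D where M==1 */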
12134 switch (size) {
12135 case BITS2(0,0): /* EOR */
12136 assign(res, binop(opXOR, mkexpr(argM), mkexpr(argN)));
12137 break;
12138 case BITS2(0,1): /* BSL */
12139 assign(res, binop(opXOR, mkexpr(argM),
12140 binop(opAND,
12141 binop(opXOR, mkexpr(argM), mkexpr(argN)),
12142 mkexpr(argD))));
12143 break;
12144 case BITS2(1,0): /* BIT */
12145 assign(res, binop(opXOR, mkexpr(argD),
12146 binop(opAND,
12147 binop(opXOR, mkexpr(argD), mkexpr(argN)),
12148 mkexpr(argM))));
12149 break;
12150 case BITS2(1,1): /* BIF */
12151 assign(res, binop(opXOR, mkexpr(argD),
12152 binop(opAND,
12153 binop(opXOR, mkexpr(argD), mkexpr(argN)),
12154 unop(opNOT, mkexpr(argM)))));
12155 break;
12156 default:
12157 vassert(0);
12159 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12160 const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
12161 const HChar* arr = bitQ == 1 ? "16b" : "8b";
12162 DIP("%s %s.%s, %s.%s, %s.%s\n", nms[size],
12163 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12164 return True;
12167 if (opcode == BITS5(0,0,1,1,0)) {
12168 /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
12169 /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
12170 if (bitQ == 0 && size == X11) return False; // implied 1d case
12171 Bool isGT = bitU == 0;
12172 IRExpr* argL = getQReg128(nn);
12173 IRExpr* argR = getQReg128(mm);
12174 IRTemp res = newTempV128();
12175 assign(res,
12176 isGT ? binop(mkVecCMPGTS(size), argL, argR)
12177 : binop(mkVecCMPGTU(size), argL, argR));
12178 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12179 const HChar* nm = isGT ? "cmgt" : "cmhi";
12180 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12181 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12182 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12183 return True;
12186 if (opcode == BITS5(0,0,1,1,1)) {
12187 /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
12188 /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
12189 if (bitQ == 0 && size == X11) return False; // implied 1d case
12190 Bool isGE = bitU == 0;
12191 IRExpr* argL = getQReg128(nn);
12192 IRExpr* argR = getQReg128(mm);
12193 IRTemp res = newTempV128();
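/* a >= b is computed as NOT(b > a), in both the signed and unsigned cases. */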
12194 assign(res,
12195 isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
12196 : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
12197 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12198 const HChar* nm = isGE ? "cmge" : "cmhs";
12199 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12200 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12201 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12202 return True;
12205 if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
12206 /* -------- 0,xx,01000 SSHL std7_std7_std7 -------- */
12207 /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
12208 /* -------- 1,xx,01000 USHL std7_std7_std7 -------- */
12209 /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
12210 if (bitQ == 0 && size == X11) return False; // implied 1d case
12211 Bool isU = bitU == 1;
12212 Bool isR = opcode == BITS5(0,1,0,1,0);
12213 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
12214 : (isU ? mkVecSHU(size) : mkVecSHS(size));
12215 IRTemp res = newTempV128();
12216 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
12217 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12218 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
12219 : (isU ? "ushl" : "sshl");
12220 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12221 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12222 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12223 return True;
12226 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
12227 /* -------- 0,xx,01001 SQSHL std7_std7_std7 -------- */
12228 /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
12229 /* -------- 1,xx,01001 UQSHL std7_std7_std7 -------- */
12230 /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
12231 if (bitQ == 0 && size == X11) return False; // implied 1d case
12232 Bool isU = bitU == 1;
12233 Bool isR = opcode == BITS5(0,1,0,1,1);
12234 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
12235 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
12236 /* This is a bit tricky. If we're only interested in the lowest 64 bits
12237 of the result (viz, bitQ == 0), then we must adjust the operands to
12238 ensure that the upper part of the result, that we don't care about,
12239 doesn't pollute the returned Q value. To do this, zero out the upper
12240 operand halves beforehand. This works because it means, for the
12241 lanes we don't care about, we are shifting zero by zero, which can
12242 never saturate. */
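/* The mkVecQAND*SH primops evidently return a 256-bit pair: Iop_V256toV128_0
   extracts the shifted result proper, and Iop_V256toV128_1 extracts a
   saturation indicator, any nonzero bit of which sets QC below. */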
12243 IRTemp res256 = newTemp(Ity_V256);
12244 IRTemp resSH = newTempV128();
12245 IRTemp resQ = newTempV128();
12246 IRTemp zero = newTempV128();
12247 assign(res256, binop(op,
12248 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
12249 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
12250 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
12251 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
12252 assign(zero, mkV128(0x0000));
12253 putQReg128(dd, mkexpr(resSH));
12254 updateQCFLAGwithDifference(resQ, zero);
12255 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
12256 : (isU ? "uqshl" : "sqshl");
12257 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12258 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12259 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12260 return True;
12263 if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
12264 /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
12265 /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
12266 /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
12267 /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
12268 if (bitQ == 0 && size == X11) return False; // implied 1d case
12269 Bool isU = bitU == 1;
12270 Bool isMAX = (opcode & 1) == 0;
12271 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
12272 : (isU ? mkVecMINU(size) : mkVecMINS(size));
12273 IRTemp t = newTempV128();
12274 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
12275 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
12276 const HChar* nm = isMAX ? (isU ? "umax" : "smax")
12277 : (isU ? "umin" : "smin");
12278 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12279 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12280 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12281 return True;
12284 if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
12285 /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
12286 /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
12287 /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
12288 /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
12289 if (size == X11) return False; // 1d/2d cases not allowed
12290 Bool isU = bitU == 1;
12291 Bool isACC = opcode == BITS5(0,1,1,1,1);
12292 vassert(size <= 2);
12293 IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
12294 IRTemp t2 = newTempV128();
12295 assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
12296 : mkexpr(t1));
12297 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12298 const HChar* nm = isACC ? (isU ? "uaba" : "saba")
12299 : (isU ? "uabd" : "sabd");
12300 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12301 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12302 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12303 return True;
12306 if (opcode == BITS5(1,0,0,0,0)) {
12307 /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
12308 /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
12309 if (bitQ == 0 && size == X11) return False; // implied 1d case
12310 Bool isSUB = bitU == 1;
12311 IROp op = isSUB ? mkVecSUB(size) : mkVecADD(size);
12312 IRTemp t = newTempV128();
12313 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
12314 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
12315 const HChar* nm = isSUB ? "sub" : "add";
12316 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12317 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12318 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12319 return True;
12322 if (opcode == BITS5(1,0,0,0,1)) {
12323 /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
12324 /* -------- 1,xx,10001 CMEQ std7_std7_std7 -------- */ // ==
12325 if (bitQ == 0 && size == X11) return False; // implied 1d case
12326 Bool isEQ = bitU == 1;
12327 IRExpr* argL = getQReg128(nn);
12328 IRExpr* argR = getQReg128(mm);
12329 IRTemp res = newTempV128();
12330 assign(res,
12331 isEQ ? binop(mkVecCMPEQ(size), argL, argR)
12332 : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
12333 binop(Iop_AndV128, argL, argR),
12334 mkV128(0x0000))));
12335 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12336 const HChar* nm = isEQ ? "cmeq" : "cmtst";
12337 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12338 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12339 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12340 return True;
12343 if (opcode == BITS5(1,0,0,1,0)) {
12344 /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
12345 /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
12346 if (bitQ == 0 && size == X11) return False; // implied 1d case
12347 Bool isMLS = bitU == 1;
12348 IROp opMUL = mkVecMUL(size);
12349 IROp opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
12350 IRTemp res = newTempV128();
12351 if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
12352 assign(res, binop(opADDSUB,
12353 getQReg128(dd),
12354 binop(opMUL, getQReg128(nn), getQReg128(mm))));
12355 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12356 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12357 DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
12358 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12359 return True;
12361 return False;
12364 if (opcode == BITS5(1,0,0,1,1)) {
12365 /* -------- 0,xx,10011 MUL std7_std7_std7 -------- */
12366 /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
12367 if (bitQ == 0 && size == X11) return False; // implied 1d case
12368 Bool isPMUL = bitU == 1;
12369 const IROp opsPMUL[4]
12370 = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID };
12371 IROp opMUL = isPMUL ? opsPMUL[size] : mkVecMUL(size);
12372 IRTemp res = newTempV128();
12373 if (opMUL != Iop_INVALID) {
12374 assign(res, binop(opMUL, getQReg128(nn), getQReg128(mm)));
12375 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12376 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12377 DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL ? "pmul" : "mul",
12378 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12379 return True;
12381 return False;
12384 if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
12385 /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
12386 /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
12387 /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
12388 /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
12389 if (size == X11) return False;
12390 Bool isU = bitU == 1;
12391 Bool isMAX = opcode == BITS5(1,0,1,0,0);
12392 IRTemp vN = newTempV128();
12393 IRTemp vM = newTempV128();
12394 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
12395 : (isU ? mkVecMINU(size) : mkVecMINS(size));
12396 assign(vN, getQReg128(nn));
12397 assign(vM, getQReg128(mm));
12398 IRTemp res128 = newTempV128();
12399 assign(res128,
12400 binop(op,
12401 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
12402 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
12403 /* In the half-width case, use CatEL32x4 to extract the half-width
12404 result from the full-width result. */
12405 IRExpr* res
12406 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
12407 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
12408 mkexpr(res128)))
12409 : mkexpr(res128);
12410 putQReg128(dd, res);
12411 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12412 const HChar* nm = isMAX ? (isU ? "umaxp" : "smaxp")
12413 : (isU ? "uminp" : "sminp");
12414 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12415 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12416 return True;
12419 if (opcode == BITS5(1,0,1,1,0)) {
12420 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
12421 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
12422 if (size == X00 || size == X11) return False;
12423 Bool isR = bitU == 1;
12424 IRTemp res, sat1q, sat1n, vN, vM;
12425 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
12426 newTempsV128_2(&vN, &vM);
12427 assign(vN, getQReg128(nn));
12428 assign(vM, getQReg128(mm));
12429 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
12430 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12431 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
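/* When Q == 0 only the lower 64 bits are architecturally written, so the
   saturation check is likewise restricted to the lower half by zeroing the
   upper halves of both comparands first. */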
12432 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
12433 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12434 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
12435 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12436 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12437 return True;
12440 if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
12441 /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
12442 if (bitQ == 0 && size == X11) return False; // implied 1d case
12443 IRTemp vN = newTempV128();
12444 IRTemp vM = newTempV128();
12445 assign(vN, getQReg128(nn));
12446 assign(vM, getQReg128(mm));
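/* Pairwise trick: CatEvenLanes and CatOddLanes split the concatenation Vm:Vn
   into its even- and odd-numbered lanes, lining each lane up with its
   neighbour, so a single vector ADD yields every pairwise sum at once.  For
   the 4s case the result is { m3+m2, m1+m0, n3+n2, n1+n0 } (high lane to
   low), which is exactly what ADDP requires. */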
12447 IRTemp res128 = newTempV128();
12448 assign(res128,
12449 binop(mkVecADD(size),
12450 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
12451 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
12452 /* In the half-width case, use CatEL32x4 to extract the half-width
12453 result from the full-width result. */
12454 IRExpr* res
12455 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
12456 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
12457 mkexpr(res128)))
12458 : mkexpr(res128);
12459 putQReg128(dd, res);
12460 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12461 DIP("addp %s.%s, %s.%s, %s.%s\n",
12462 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12463 return True;
12466 if (bitU == 0
12467 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
12468 /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12469 /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12470 /* -------- 0,0x,11110 FMAX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12471 /* -------- 0,1x,11110 FMIN 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12472 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
12473 Bool isD = (size & 1) == 1;
12474 if (bitQ == 0 && isD) return False; // implied 1d case
12475 Bool isMIN = (size & 2) == 2;
12476 Bool isNM = opcode == BITS5(1,1,0,0,0);
12477 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
12478 IRTemp res = newTempV128();
12479 assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
12480 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12481 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12482 DIP("%s%s %s.%s, %s.%s, %s.%s\n",
12483 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
12484 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12485 return True;
12488 if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
12489 /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12490 /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12491 Bool isD = (size & 1) == 1;
12492 Bool isSUB = (size & 2) == 2;
12493 if (bitQ == 0 && isD) return False; // implied 1d case
12494 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
12495 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
12496 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
12497 IRTemp rm = mk_get_IR_rounding_mode();
12498 IRTemp t1 = newTempV128();
12499 IRTemp t2 = newTempV128();
12500 // FIXME: double rounding; use FMA primops instead
12501 assign(t1, triop(opMUL,
12502 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12503 assign(t2, triop(isSUB ? opSUB : opADD,
12504 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
12505 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12506 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12507 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
12508 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12509 return True;
12512 if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
12513 /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12514 /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12515 Bool isD = (size & 1) == 1;
12516 Bool isSUB = (size & 2) == 2;
12517 if (bitQ == 0 && isD) return False; // implied 1d case
12518 const IROp ops[4]
12519 = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
12520 IROp op = ops[size];
12521 IRTemp rm = mk_get_IR_rounding_mode();
12522 IRTemp t1 = newTempV128();
12523 IRTemp t2 = newTempV128();
12524 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12525 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
12526 putQReg128(dd, mkexpr(t2));
12527 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12528 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
12529 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12530 return True;
12533 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
12534 /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12535 Bool isD = (size & 1) == 1;
12536 if (bitQ == 0 && isD) return False; // implied 1d case
12537 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
12538 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
12539 IRTemp rm = mk_get_IR_rounding_mode();
12540 IRTemp t1 = newTempV128();
12541 IRTemp t2 = newTempV128();
12542 // FIXME: use Abd primop instead?
12543 assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12544 assign(t2, unop(opABS, mkexpr(t1)));
12545 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12546 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12547 DIP("fabd %s.%s, %s.%s, %s.%s\n",
12548 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12549 return True;
12552 if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
12553 /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12554 /* -------- 1,0x,11011 FMUL 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12555 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
12556 Bool isD = (size & 1) == 1;
12557 Bool isMULX = bitU == 0;
12558 if (bitQ == 0 && isD) return False; // implied 1d case
12559 IRTemp rm = mk_get_IR_rounding_mode();
12560 IRTemp t1 = newTempV128();
12561 assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
12562 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12563 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12564 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12565 DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
12566 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12567 return True;
12570 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
12571 /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12572 /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12573 Bool isD = (size & 1) == 1;
12574 if (bitQ == 0 && isD) return False; // implied 1d case
12575 Bool isGE = bitU == 1;
12576 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
12577 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
12578 IRTemp t1 = newTempV128();
12579 assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
12580 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
12581 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12582 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12583 DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
12584 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12585 return True;
12588 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
12589 /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12590 Bool isD = (size & 1) == 1;
12591 if (bitQ == 0 && isD) return False; // implied 1d case
12592 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
12593 IRTemp t1 = newTempV128();
12594 assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
12595 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12596 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12597 DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
12598 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12599 return True;
12602 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
12603 /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12604 /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12605 Bool isD = (size & 1) == 1;
12606 Bool isGT = (size & 2) == 2;
12607 if (bitQ == 0 && isD) return False; // implied 1d case
12608 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
12609 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
12610 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
12611 IRTemp t1 = newTempV128();
12612 assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
12613 unop(opABS, getQReg128(nn)))); // swapd
12614 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12615 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12616 DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
12617 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12618 return True;
12621 if (bitU == 1
12622 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
12623 /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12624 /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12625 /* -------- 1,0x,11110 FMAXP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12626 /* -------- 1,1x,11110 FMINP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12627 /* FMAXNMP, FMINNMP: FIXME -- KLUDGED */
12628 Bool isD = (size & 1) == 1;
12629 if (bitQ == 0 && isD) return False; // implied 1d case
12630 Bool isMIN = (size & 2) == 2;
12631 Bool isNM = opcode == BITS5(1,1,0,0,0);
12632 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
12633 IRTemp srcN = newTempV128();
12634 IRTemp srcM = newTempV128();
12635 IRTemp preL = IRTemp_INVALID;
12636 IRTemp preR = IRTemp_INVALID;
12637 assign(srcN, getQReg128(nn));
12638 assign(srcM, getQReg128(mm));
12639 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
12640 isD ? ARM64VSizeD : ARM64VSizeS, bitQ);
12641 putQReg128(
12642 dd, math_MAYBE_ZERO_HI64_fromE(
12643 bitQ,
12644 binop(opMXX, mkexpr(preL), mkexpr(preR))));
12645 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12646 DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
12647 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
12648 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12649 return True;
12652 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
12653 /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12654 Bool isD = size == X01;
12655 if (bitQ == 0 && isD) return False; // implied 1d case
12656 IRTemp srcN = newTempV128();
12657 IRTemp srcM = newTempV128();
12658 IRTemp preL = IRTemp_INVALID;
12659 IRTemp preR = IRTemp_INVALID;
12660 assign(srcN, getQReg128(nn));
12661 assign(srcM, getQReg128(mm));
12662 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
12663 isD ? ARM64VSizeD : ARM64VSizeS, bitQ);
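/* math_REARRANGE_FOR_FLOATING_PAIRWISE evidently rearranges the elements of
   Vm:Vn so that each element of preL sits opposite its pairwise partner in
   preR; the single rounding-mode-aware vector FADD below then produces all
   the pairwise sums in one go. */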
12664 putQReg128(
12665 dd, math_MAYBE_ZERO_HI64_fromE(
12666 bitQ,
12667 triop(mkVecADDF(isD ? 3 : 2),
12668 mkexpr(mk_get_IR_rounding_mode()),
12669 mkexpr(preL), mkexpr(preR))));
12670 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12671 DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
12672 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12673 return True;
12676 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
12677 /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12678 Bool isD = (size & 1) == 1;
12679 if (bitQ == 0 && isD) return False; // implied 1d case
12680 vassert(size <= 1);
12681 const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
12682 IROp op = ops[size];
12683 IRTemp rm = mk_get_IR_rounding_mode();
12684 IRTemp t1 = newTempV128();
12685 IRTemp t2 = newTempV128();
12686 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12687 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
12688 putQReg128(dd, mkexpr(t2));
12689 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12690 DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
12691 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12692 return True;
12695 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
12696 /* -------- 0,0x,11111: FRECPS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12697 /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12698 Bool isSQRT = (size & 2) == 2;
12699 Bool isD = (size & 1) == 1;
12700 if (bitQ == 0 && isD) return False; // implied 1d case
12701 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
12702 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
12703 IRTemp res = newTempV128();
12704 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
12705 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12706 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12707 DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
12708 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12709 return True;
12712 return False;
12713 # undef INSN
12717 static
12718 Bool dis_AdvSIMD_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
12720 /* 31 30 29 28 23 21 20 15 14 10 9 4
12721 0 Q U 01110 size 0 m 1 opcode 1 n d
12722 Decode fields: u,size,opcode
12724 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12725 if (INSN(31,31) != 0
12726 || INSN(28,24) != BITS5(0,1,1,1,0)
12727 || INSN(21,21) != 0
12728 || INSN(15,15) != 1
12729 || INSN(10,10) != 1) {
12730 return False;
12732 UInt bitQ = INSN(30,30);
12733 UInt bitU = INSN(29,29);
12734 UInt size = INSN(23,22);
12735 UInt mm = INSN(20,16);
12736 UInt opcode = INSN(14,11);
12737 UInt nn = INSN(9,5);
12738 UInt dd = INSN(4,0);
12739 vassert(size < 4);
12740 vassert(mm < 32 && nn < 32 && dd < 32);
12742 if (bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,0,1))) {
12743 /* -------- 1,xx,0000 SQRDMLAH s and h variants only -------- */
12744 /* -------- 1,xx,0001 SQRDMLSH s and h variants only -------- */
12745 if (size == X00 || size == X11) return False;
12746 Bool isAdd = opcode == BITS4(0,0,0,0);
12748 IRTemp res, res_nosat, vD, vN, vM;
12749 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
12750 newTempsV128_3(&vD, &vN, &vM);
12751 assign(vD, getQReg128(dd));
12752 assign(vN, getQReg128(nn));
12753 assign(vM, getQReg128(mm));
12755 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
12756 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
12757 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
12758 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12760 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12761 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
12762 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12763 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12764 return True;
12767 return False;
12768 # undef INSN
12771 static
12772 Bool dis_AdvSIMD_three_same_fp16(/*MB_OUT*/DisResult* dres, UInt insn)
12774 /* 31 30 29 28 23 21 20 15 10 9 4
12775 0 Q U 01110 size 0 m opcode 1 n d
12776 Decode fields: u,size,opcode
12778 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12779 if (INSN(31,31) != 0
12780 || INSN(28,24) != BITS5(0,1,1,1,0)
12781 || INSN(21,21) != 0
12782 || INSN(10,10) != 1) {
12783 return False;
12785 UInt bitQ = INSN(30,30);
12786 UInt bitU = INSN(29,29);
12787 UInt size = INSN(23,22);
12788 UInt mm = INSN(20,16);
12789 UInt opcode = INSN(15,11);
12790 UInt nn = INSN(9,5);
12791 UInt dd = INSN(4,0);
12792 vassert(size < 4);
12793 vassert(mm < 32 && nn < 32 && dd < 32);
12795 if (bitU == 1 && size == X01 && opcode == BITS5(0,0,0,1,0)) {
12796 /* -------- 1,01,00010 FADDP 4h_4h_4h, 8h_8h_8h -------- */
12797 IROp opADD = mkVecADDF(1); /* 16-bit (half precision) lanes, both 4h and 8h */
12798 IRTemp srcN = newTempV128();
12799 IRTemp srcM = newTempV128();
12800 IRTemp preL = IRTemp_INVALID;
12801 IRTemp preR = IRTemp_INVALID;
12802 assign(srcN, getQReg128(nn));
12803 assign(srcM, getQReg128(mm));
12804 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
12805 ARM64VSizeH, bitQ);
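/* For example, FADDP v0.4h, v1.4h, v2.4h yields
     v0.h[0] = v1.h[1] + v1.h[0],   v0.h[1] = v1.h[3] + v1.h[2],
     v0.h[2] = v2.h[1] + v2.h[0],   v0.h[3] = v2.h[3] + v2.h[2],
   with the upper 64 bits of v0 cleared (the Q == 0 case). */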
12806 putQReg128(
12807 dd, math_MAYBE_ZERO_HI64_fromE(
12808 bitQ,
12809 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
12810 mkexpr(preL), mkexpr(preR))));
12811 const HChar* arr = bitQ == 0 ? "4h" : "8h";
12812 DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
12813 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12814 return True;
12817 return False;
12818 # undef INSN
12822 static
12823 Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
12825 /* 31 30 29 28 23 21 16 11 9 4
12826 0 Q U 01110 size 10000 opcode 10 n d
12827 Decode fields: U,size,opcode
12829 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12830 if (INSN(31,31) != 0
12831 || INSN(28,24) != BITS5(0,1,1,1,0)
12832 || INSN(21,17) != BITS5(1,0,0,0,0)
12833 || INSN(11,10) != BITS2(1,0)) {
12834 return False;
12836 UInt bitQ = INSN(30,30);
12837 UInt bitU = INSN(29,29);
12838 UInt size = INSN(23,22);
12839 UInt opcode = INSN(16,12);
12840 UInt nn = INSN(9,5);
12841 UInt dd = INSN(4,0);
12842 vassert(size < 4);
12844 if (bitU == 0 && size <= X10 && opcode == BITS5(0,0,0,0,0)) {
12845 /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
12846 /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
12847 /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
12848 const IROp iops[3] = { Iop_Reverse8sIn64_x2,
12849 Iop_Reverse16sIn64_x2, Iop_Reverse32sIn64_x2 };
12850 vassert(size <= 2);
12851 IRTemp res = newTempV128();
12852 assign(res, unop(iops[size], getQReg128(nn)));
12853 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12854 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12855 DIP("%s %s.%s, %s.%s\n", "rev64",
12856 nameQReg128(dd), arr, nameQReg128(nn), arr);
12857 return True;
12860 if (bitU == 1 && size <= X01 && opcode == BITS5(0,0,0,0,0)) {
12861 /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
12862 /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
12863 Bool isH = size == X01;
12864 IRTemp res = newTempV128();
12865 IROp iop = isH ? Iop_Reverse16sIn32_x4 : Iop_Reverse8sIn32_x4;
12866 assign(res, unop(iop, getQReg128(nn)));
12867 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12868 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12869 DIP("%s %s.%s, %s.%s\n", "rev32",
12870 nameQReg128(dd), arr, nameQReg128(nn), arr);
12871 return True;
12874 if (bitU == 0 && size == X00 && opcode == BITS5(0,0,0,0,1)) {
12875 /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
12876 IRTemp res = newTempV128();
12877 assign(res, unop(Iop_Reverse8sIn16_x8, getQReg128(nn)));
12878 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12879 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12880 DIP("%s %s.%s, %s.%s\n", "rev16",
12881 nameQReg128(dd), arr, nameQReg128(nn), arr);
12882 return True;
12885 if (opcode == BITS5(0,0,0,1,0) || opcode == BITS5(0,0,1,1,0)) {
12886 /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
12887 /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
12888 /* -------- 0,xx,00110: SADALP std6_std6 -------- */
12889 /* -------- 1,xx,00110: UADALP std6_std6 -------- */
12890 /* Widens, and size refers to the narrow size. */
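/* E.g. SADDLP Vd.4s, Vn.8h computes Vd.s[i] = sext(Vn.h[2i]) + sext(Vn.h[2i+1]);
   the xADALP forms additionally accumulate the sums into Vd. */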
12891 if (size == X11) return False; // no 1d or 2d cases
12892 Bool isU = bitU == 1;
12893 Bool isACC = opcode == BITS5(0,0,1,1,0);
12894 IRTemp src = newTempV128();
12895 IRTemp sum = newTempV128();
12896 IRTemp res = newTempV128();
12897 assign(src, getQReg128(nn));
12898 assign(sum,
12899 binop(mkVecADD(size+1),
12900 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12901 isU, True/*fromOdd*/, size, mkexpr(src))),
12902 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12903 isU, False/*!fromOdd*/, size, mkexpr(src)))));
12904 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
12905 : mkexpr(sum));
12906 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12907 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12908 const HChar* arrWide = nameArr_Q_SZ(bitQ, size+1);
12909 DIP("%s %s.%s, %s.%s\n", isACC ? (isU ? "uadalp" : "sadalp")
12910 : (isU ? "uaddlp" : "saddlp"),
12911 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
12912 return True;
12915 if (opcode == BITS5(0,0,0,1,1)) {
12916 /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
12917 /* -------- 1,xx,00011: USQADD std7_std7 -------- */
12918 if (bitQ == 0 && size == X11) return False; // implied 1d case
12919 Bool isUSQADD = bitU == 1;
12920 /* This is switched (in the US vs SU sense) deliberately.
12921 SUQADD corresponds to the ExtUSsatSS variants and
12922 USQADD corresponds to the ExtSUsatUU variants.
12923 See libvex_ir for more details. */
12924 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
12925 : mkVecQADDEXTUSSATSS(size);
12926 IROp nop = mkVecADD(size);
12927 IRTemp argL = newTempV128();
12928 IRTemp argR = newTempV128();
12929 IRTemp qres = newTempV128();
12930 IRTemp nres = newTempV128();
12931 /* Because the two arguments to the addition are implicitly
12932 extended differently (one signedly, the other unsignedly) it is
12933 important to present them to the primop in the correct order. */
12934 assign(argL, getQReg128(nn));
12935 assign(argR, getQReg128(dd));
12936 assign(qres, math_MAYBE_ZERO_HI64_fromE(
12937 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
12938 assign(nres, math_MAYBE_ZERO_HI64_fromE(
12939 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
12940 putQReg128(dd, mkexpr(qres));
12941 updateQCFLAGwithDifference(qres, nres);
12942 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12943 DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
12944 nameQReg128(dd), arr, nameQReg128(nn), arr);
12945 return True;
12948 if (opcode == BITS5(0,0,1,0,0)) {
12949 /* -------- 0,xx,00100: CLS std6_std6 -------- */
12950 /* -------- 1,xx,00100: CLZ std6_std6 -------- */
12951 if (size == X11) return False; // no 1d or 2d cases
12952 const IROp opsCLS[3] = { Iop_Cls8x16, Iop_Cls16x8, Iop_Cls32x4 };
12953 const IROp opsCLZ[3] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4 };
12954 Bool isCLZ = bitU == 1;
12955 IRTemp res = newTempV128();
12956 vassert(size <= 2);
12957 assign(res, unop(isCLZ ? opsCLZ[size] : opsCLS[size], getQReg128(nn)));
12958 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12959 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12960 DIP("%s %s.%s, %s.%s\n", isCLZ ? "clz" : "cls",
12961 nameQReg128(dd), arr, nameQReg128(nn), arr);
12962 return True;
12965 if (size == X00 && opcode == BITS5(0,0,1,0,1)) {
12966 /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
12967 /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
12968 IRTemp res = newTempV128();
12969 assign(res, unop(bitU == 0 ? Iop_Cnt8x16 : Iop_NotV128, getQReg128(nn)));
12970 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12971 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12972 DIP("%s %s.%s, %s.%s\n", bitU == 0 ? "cnt" : "not",
12973 nameQReg128(dd), arr, nameQReg128(nn), arr);
12974 return True;
12977 if (bitU == 1 && size == X01 && opcode == BITS5(0,0,1,0,1)) {
12978 /* -------- 1,01,00101 RBIT 16b_16b, 8b_8b -------- */
12979 IRTemp res = newTempV128();
12980 assign(res, unop(Iop_Reverse1sIn8_x16, getQReg128(nn)));
12981 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12982 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12983 DIP("%s %s.%s, %s.%s\n", "rbit",
12984 nameQReg128(dd), arr, nameQReg128(nn), arr);
12985 return True;
12988 if (opcode == BITS5(0,0,1,1,1)) {
12989 /* -------- 0,xx,00111 SQABS std7_std7 -------- */
12990 /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
12991 if (bitQ == 0 && size == X11) return False; // implied 1d case
12992 Bool isNEG = bitU == 1;
12993 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
12994 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
12995 getQReg128(nn), size );
12996 IRTemp qres = newTempV128(), nres = newTempV128();
12997 assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
12998 assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
12999 putQReg128(dd, mkexpr(qres));
13000 updateQCFLAGwithDifference(qres, nres);
13001 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13002 DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
13003 nameQReg128(dd), arr, nameQReg128(nn), arr);
13004 return True;
13007 if (opcode == BITS5(0,1,0,0,0)) {
13008 /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
13009 /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
13010 if (bitQ == 0 && size == X11) return False; // implied 1d case
13011 Bool isGT = bitU == 0;
13012 IRExpr* argL = getQReg128(nn);
13013 IRExpr* argR = mkV128(0x0000);
13014 IRTemp res = newTempV128();
13015 IROp opGTS = mkVecCMPGTS(size);
13016 assign(res, isGT ? binop(opGTS, argL, argR)
13017 : unop(Iop_NotV128, binop(opGTS, argR, argL)));
13018 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13019 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13020 DIP("cm%s %s.%s, %s.%s, #0\n", isGT ? "gt" : "ge",
13021 nameQReg128(dd), arr, nameQReg128(nn), arr);
13022 return True;
13025 if (opcode == BITS5(0,1,0,0,1)) {
13026 /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
13027 /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
13028 if (bitQ == 0 && size == X11) return False; // implied 1d case
13029 Bool isEQ = bitU == 0;
13030 IRExpr* argL = getQReg128(nn);
13031 IRExpr* argR = mkV128(0x0000);
13032 IRTemp res = newTempV128();
13033 assign(res, isEQ ? binop(mkVecCMPEQ(size), argL, argR)
13034 : unop(Iop_NotV128,
13035 binop(mkVecCMPGTS(size), argL, argR)));
13036 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13037 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13038 DIP("cm%s %s.%s, %s.%s, #0\n", isEQ ? "eq" : "le",
13039 nameQReg128(dd), arr, nameQReg128(nn), arr);
13040 return True;
13043 if (bitU == 0 && opcode == BITS5(0,1,0,1,0)) {
13044 /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
13045 if (bitQ == 0 && size == X11) return False; // implied 1d case
13046 IRExpr* argL = getQReg128(nn);
13047 IRExpr* argR = mkV128(0x0000);
13048 IRTemp res = newTempV128();
13049 assign(res, binop(mkVecCMPGTS(size), argR, argL));
13050 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13051 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13052 DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
13053 nameQReg128(dd), arr, nameQReg128(nn), arr);
13054 return True;
13057 if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
13058 /* -------- 0,xx,01011: ABS std7_std7 -------- */
13059 if (bitQ == 0 && size == X11) return False; // implied 1d case
13060 IRTemp res = newTempV128();
13061 assign(res, unop(mkVecABS(size), getQReg128(nn)));
13062 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13063 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13064 DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
13065 return True;
13068 if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
13069 /* -------- 1,xx,01011: NEG std7_std7 -------- */
13070 if (bitQ == 0 && size == X11) return False; // implied 1d case
13071 IRTemp res = newTempV128();
13072 assign(res, binop(mkVecSUB(size), mkV128(0x0000), getQReg128(nn)));
13073 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13074 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13075 DIP("neg %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
13076 return True;
13079 UInt ix = 0; /*INVALID*/
13080 if (size >= X10) {
13081 switch (opcode) {
13082 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
13083 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
13084 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
13085 default: break;
13088 if (ix > 0) {
13089 /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
13090 /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
13091 /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
13092 /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
13093 /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
13094 if (bitQ == 0 && size == X11) return False; // implied 1d case
13095 Bool isD = size == X11;
13096 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
13097 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
13098 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
13099 IROp opCmp = Iop_INVALID;
13100 Bool swap = False;
13101 const HChar* nm = "??";
13102 switch (ix) {
13103 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
13104 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
13105 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
13106 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
13107 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
13108 default: vassert(0);
13110 IRExpr* zero = mkV128(0x0000);
13111 IRTemp res = newTempV128();
13112 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
13113 : binop(opCmp, getQReg128(nn), zero));
13114 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13115 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13116 DIP("%s %s.%s, %s.%s, #0.0\n", nm,
13117 nameQReg128(dd), arr, nameQReg128(nn), arr);
13118 return True;
13121 if (size >= X10 && opcode == BITS5(0,1,1,1,1)) {
13122 /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
13123 /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
13124 if (bitQ == 0 && size == X11) return False; // implied 1d case
13125 Bool isFNEG = bitU == 1;
13126 IROp op = isFNEG ? (size == X10 ? Iop_Neg32Fx4 : Iop_Neg64Fx2)
13127 : (size == X10 ? Iop_Abs32Fx4 : Iop_Abs64Fx2);
13128 IRTemp res = newTempV128();
13129 assign(res, unop(op, getQReg128(nn)));
13130 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13131 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13132 DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
13133 nameQReg128(dd), arr, nameQReg128(nn), arr);
13134 return True;
13137 if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
13138 /* -------- 0,xx,10010: XTN{,2} -------- */
13139 if (size == X11) return False;
13140 vassert(size < 3);
13141 Bool is2 = bitQ == 1;
13142 IROp opN = mkVecNARROWUN(size);
13143 IRTemp resN = newTempV128();
13144 assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn))));
13145 putLO64andZUorPutHI64(is2, dd, resN);
13146 const HChar* nm = "xtn";
13147 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13148 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13149 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
13150 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
13151 return True;
13154 if (opcode == BITS5(1,0,1,0,0)
13155 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
13156 /* -------- 0,xx,10100: SQXTN{,2} -------- */
13157 /* -------- 1,xx,10100: UQXTN{,2} -------- */
13158 /* -------- 1,xx,10010: SQXTUN{,2} -------- */
13159 if (size == X11) return False;
13160 vassert(size < 3);
13161 Bool is2 = bitQ == 1;
13162 IROp opN = Iop_INVALID;
13163 Bool zWiden = True;
13164 const HChar* nm = "??";
13165 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
13166 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
13168 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
13169 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
13171 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
13172 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
13174 else vassert(0);
13175 IRTemp src = newTempV128();
13176 assign(src, getQReg128(nn));
13177 IRTemp resN = newTempV128();
13178 assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
13179 putLO64andZUorPutHI64(is2, dd, resN);
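/* Saturation detection: re-widen the just-narrowed result (zero- or
   sign-extending according to zWiden) and compare it with the original
   source.  Any lane that had to saturate re-widens to a different value, and
   that difference sets QC. */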
13180 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
13181 size, mkexpr(resN));
13182 updateQCFLAGwithDifference(src, resW);
13183 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13184 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13185 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
13186 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
13187 return True;
13190 if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
13191 /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
13192 /* Widens, and size is the narrow size. */
13193 if (size == X11) return False;
13194 Bool is2 = bitQ == 1;
13195 IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
13196 IROp opSHL = mkVecSHLN(size+1);
13197 IRTemp src = newTempV128();
13198 IRTemp res = newTempV128();
13199 assign(src, getQReg128(nn));
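/* SHLL widens each narrow lane and shifts it left by the narrow lane width.
   Interleaving the source with itself duplicates each lane x into a
   double-width slot holding (x << laneWidth) | x; the subsequent left shift
   by laneWidth then leaves exactly x << laneWidth in each widened lane. */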
13200 assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
13201 mkU8(8 << size)));
13202 putQReg128(dd, mkexpr(res));
13203 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13204 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13205 DIP("shll%s %s.%s, %s.%s, #%d\n", is2 ? "2" : "",
13206 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
13207 return True;
13210 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
13211 /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
13212 UInt nLanes = size == X00 ? 4 : 2;
13213 IRType srcTy = size == X00 ? Ity_F32 : Ity_F64;
13214 IROp opCvt = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
13215 IRTemp rm = mk_get_IR_rounding_mode();
13216 IRTemp src[nLanes];
13217 for (UInt i = 0; i < nLanes; i++) {
13218 src[i] = newTemp(srcTy);
13219 assign(src[i], getQRegLane(nn, i, srcTy));
13221 for (UInt i = 0; i < nLanes; i++) {
13222 putQRegLane(dd, nLanes * bitQ + i,
13223 binop(opCvt, mkexpr(rm), mkexpr(src[i])));
13225 if (bitQ == 0) {
13226 putQRegLane(dd, 1, mkU64(0));
13228 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
13229 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
13230 DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
13231 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
13232 return True;
13235 if (bitU == 1 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
13236 /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
13237 /* Using Irrm_NEAREST here isn't right. The docs say "round to
13238 odd" but I don't know what that really means. */
13239 IRType srcTy = Ity_F64;
13240 IROp opCvt = Iop_F64toF32;
13241 IRTemp src[2];
13242 for (UInt i = 0; i < 2; i++) {
13243 src[i] = newTemp(srcTy);
13244 assign(src[i], getQRegLane(nn, i, srcTy));
13246 for (UInt i = 0; i < 2; i++) {
13247 putQRegLane(dd, 2 * bitQ + i,
13248 binop(opCvt, mkU32(Irrm_NEAREST), mkexpr(src[i])));
13250 if (bitQ == 0) {
13251 putQRegLane(dd, 1, mkU64(0));
13253 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
13254 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
13255 DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
13256 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
13257 return True;
13260 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
13261 /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
13262 UInt nLanes = size == X00 ? 4 : 2;
13263 IRType srcTy = size == X00 ? Ity_F16 : Ity_F32;
13264 IROp opCvt = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
13265 IRTemp src[nLanes];
13266 for (UInt i = 0; i < nLanes; i++) {
13267 src[i] = newTemp(srcTy);
13268 assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
13270 for (UInt i = 0; i < nLanes; i++) {
13271 putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
13273 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
13274 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
13275 DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
13276 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
13277 return True;
13280 ix = 0;
13281 if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
13282 ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
13283 // = 1 + bitU[0]:size[1]:opcode[0]
13284 vassert(ix >= 1 && ix <= 8);
13285 if (ix == 7) ix = 0;
13287 if (ix > 0) {
13288 /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
13289 /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
13290 /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
13291 /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
13292 /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
13293 /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
13294 /* -------- 1,1x,11000 (apparently unassigned) (7) -------- */
13295 /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
13296 /* rm plan:
13297 FRINTN: tieeven -- !! FIXME KLUDGED !!
13298 FRINTM: -inf
13299 FRINTP: +inf
13300 FRINTZ: zero
13301 FRINTA: tieaway -- !! FIXME KLUDGED !!
13302 FRINTX: per FPCR + "exact = TRUE"
13303 FRINTI: per FPCR
13305 Bool isD = (size & 1) == 1;
13306 if (bitQ == 0 && isD) return False; // implied 1d case
13308 IRTemp irrmRM = mk_get_IR_rounding_mode();
13310 UChar ch = '?';
13311 IRTemp irrm = newTemp(Ity_I32);
13312 switch (ix) {
13313 case 1: ch = 'n'; assign(irrm, mkU32(Irrm_NEAREST)); break;
13314 case 2: ch = 'm'; assign(irrm, mkU32(Irrm_NegINF)); break;
13315 case 3: ch = 'p'; assign(irrm, mkU32(Irrm_PosINF)); break;
13316 case 4: ch = 'z'; assign(irrm, mkU32(Irrm_ZERO)); break;
13317 // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
13318 case 5: ch = 'a'; assign(irrm, mkU32(Irrm_NEAREST)); break;
13319 // I am unsure about the following, due to the "integral exact"
13320 // description in the manual. What does it mean? (frintx, that is)
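         // ("Integral exact" means FRINTX rounds using the FPCR rounding mode
         // and additionally signals the Inexact exception when the result is
         // not numerically equal to the input.  This translation doesn't
         // appear to track the FP exception flags, so rounding per FPCR is
         // probably the closest achievable behaviour.)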
13321 case 6: ch = 'x'; assign(irrm, mkexpr(irrmRM)); break;
13322 case 8: ch = 'i'; assign(irrm, mkexpr(irrmRM)); break;
13323 default: vassert(0);
13326 IROp opRND = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
13327 if (isD) {
13328 for (UInt i = 0; i < 2; i++) {
13329 putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
13330 getQRegLane(nn, i, Ity_F64)));
13332 } else {
13333 UInt n = bitQ==1 ? 4 : 2;
13334 for (UInt i = 0; i < n; i++) {
13335 putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
13336 getQRegLane(nn, i, Ity_F32)));
13338 if (bitQ == 0)
13339 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
13341 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13342 DIP("frint%c %s.%s, %s.%s\n", ch,
13343 nameQReg128(dd), arr, nameQReg128(nn), arr);
13344 return True;
13347 ix = 0; /*INVALID*/
13348 switch (opcode) {
13349 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
13350 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
13351 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
13352 default: break;
13354 if (ix > 0) {
13355 /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
13356 /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
13357 /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
13358 /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
13359 /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
13360       /* -------- 1,0x,11010 FCVTNU 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
13361       /* -------- 1,0x,11011 FCVTMU 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
13362       /* -------- 1,0x,11100 FCVTAU 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
13363       /* -------- 1,1x,11010 FCVTPU 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
13364       /* -------- 1,1x,11011 FCVTZU 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
13365 Bool isD = (size & 1) == 1;
13366 if (bitQ == 0 && isD) return False; // implied 1d case
13368 IRRoundingMode irrm = 8; /*impossible*/
13369 HChar ch = '?';
13370 switch (ix) {
13371 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
13372 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
13373 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
13374 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
13375 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
13376 default: vassert(0);
13378 IROp cvt = Iop_INVALID;
13379 if (bitU == 1) {
13380 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
13381 } else {
13382 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
13384 if (isD) {
13385 for (UInt i = 0; i < 2; i++) {
13386 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
13387 getQRegLane(nn, i, Ity_F64)));
13389 } else {
13390 UInt n = bitQ==1 ? 4 : 2;
13391 for (UInt i = 0; i < n; i++) {
13392 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
13393 getQRegLane(nn, i, Ity_F32)));
13395 if (bitQ == 0)
13396 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
13398 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13399 DIP("fcvt%c%c %s.%s, %s.%s\n", ch, bitU == 1 ? 'u' : 's',
13400 nameQReg128(dd), arr, nameQReg128(nn), arr);
13401 return True;
13404 if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
13405 /* -------- 0,10,11100: URECPE 4s_4s, 2s_2s -------- */
13406 /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
13407 Bool isREC = bitU == 0;
13408 IROp op = isREC ? Iop_RecipEst32Ux4 : Iop_RSqrtEst32Ux4;
13409 IRTemp res = newTempV128();
13410 assign(res, unop(op, getQReg128(nn)));
13411 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13412 const HChar* nm = isREC ? "urecpe" : "ursqrte";
13413 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13414 DIP("%s %s.%s, %s.%s\n", nm,
13415 nameQReg128(dd), arr, nameQReg128(nn), arr);
13416 return True;
13419 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
13420 /* -------- 0,0x,11101: SCVTF -------- */
13421 /* -------- 1,0x,11101: UCVTF -------- */
13422 /* 31 28 22 21 15 9 4
13423 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn
13424 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn
13425 with laneage:
13426 case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
13428 Bool isQ = bitQ == 1;
13429 Bool isU = bitU == 1;
13430 Bool isF64 = (size & 1) == 1;
13431 if (isQ || !isF64) {
13432 IRType tyF = Ity_INVALID, tyI = Ity_INVALID;
13433 UInt nLanes = 0;
13434 Bool zeroHI = False;
13435 const HChar* arrSpec = NULL;
13436 Bool ok = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec,
13437 isQ, isF64 );
13438 IROp iop = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32)
13439 : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32);
13440 IRTemp rm = mk_get_IR_rounding_mode();
13441 UInt i;
13442 vassert(ok); /* the 'if' above should ensure this */
13443 for (i = 0; i < nLanes; i++) {
13444 putQRegLane(dd, i,
13445 binop(iop, mkexpr(rm), getQRegLane(nn, i, tyI)));
13447 if (zeroHI) {
13448 putQRegLane(dd, 1, mkU64(0));
13450 DIP("%ccvtf %s.%s, %s.%s\n", isU ? 'u' : 's',
13451 nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
13452 return True;
13454 /* else fall through */
13457 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
13458 /* -------- 0,1x,11101: FRECPE 2d_2d, 4s_4s, 2s_2s -------- */
13459 /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
13460 Bool isSQRT = bitU == 1;
13461 Bool isD = (size & 1) == 1;
13462 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
13463 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
13464 if (bitQ == 0 && isD) return False; // implied 1d case
13465 IRTemp resV = newTempV128();
13466 assign(resV, unop(op, getQReg128(nn)));
13467 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
13468 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13469 DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
13470 nameQReg128(dd), arr, nameQReg128(nn), arr);
13471 return True;
13474 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
13475 /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
13476 Bool isD = (size & 1) == 1;
13477 IROp op = isD ? Iop_Sqrt64Fx2 : Iop_Sqrt32Fx4;
13478 if (bitQ == 0 && isD) return False; // implied 1d case
13479 IRTemp resV = newTempV128();
13480 assign(resV, binop(op, mkexpr(mk_get_IR_rounding_mode()),
13481 getQReg128(nn)));
13482 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
13483 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13484 DIP("%s %s.%s, %s.%s\n", "fsqrt",
13485 nameQReg128(dd), arr, nameQReg128(nn), arr);
13486 return True;
13489 return False;
13490 # undef INSN
13494 static
13495 Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
13497 /* 31 28 23 21 20 19 15 11 9 4
13498 0 Q U 01111 size L M m opcode H 0 n d
13499 Decode fields are: u,size,opcode
13500 M is really part of the mm register number. Individual
13501 cases need to inspect L and H though.
13503 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13504 if (INSN(31,31) != 0
13505 || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) !=0) {
13506 return False;
13508 UInt bitQ = INSN(30,30);
13509 UInt bitU = INSN(29,29);
13510 UInt size = INSN(23,22);
13511 UInt bitL = INSN(21,21);
13512 UInt bitM = INSN(20,20);
13513 UInt mmLO4 = INSN(19,16);
13514 UInt opcode = INSN(15,12);
13515 UInt bitH = INSN(11,11);
13516 UInt nn = INSN(9,5);
13517 UInt dd = INSN(4,0);
13518 vassert(size < 4);
13519 vassert(bitH < 2 && bitM < 2 && bitL < 2);
13521 if (bitU == 0 && size >= X10
13522 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
13523 /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13524 /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13525 if (bitQ == 0 && size == X11) return False; // implied 1d case
13526 Bool isD = (size & 1) == 1;
13527 Bool isSUB = opcode == BITS4(0,1,0,1);
13528 UInt index;
13529 if (!isD) index = (bitH << 1) | bitL;
13530 else if (isD && bitL == 0) index = bitH;
13531 else return False; // sz:L == x11 => unallocated encoding
13532 vassert(index < (isD ? 2 : 4));
13533 IRType ity = isD ? Ity_F64 : Ity_F32;
13534 IRTemp elem = newTemp(ity);
13535 UInt mm = (bitM << 4) | mmLO4;
13536 assign(elem, getQRegLane(mm, index, ity));
13537 IRTemp dupd = math_DUP_TO_V128(elem, ity);
13538 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
13539 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
13540 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
13541 IRTemp rm = mk_get_IR_rounding_mode();
13542 IRTemp t1 = newTempV128();
13543 IRTemp t2 = newTempV128();
13544 // FIXME: double rounding; use FMA primops instead
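13544       // A possible per-lane fix (sketch only, untested): reuse the scalar
13544       // fused primops that the FMADD/FMSUB decoder already uses, e.g. for
13544       // each active lane i
13544       //    putQRegLane(dd, i,
13544       //                qop(isD ? (isSUB ? Iop_MSubF64 : Iop_MAddF64)
13544       //                        : (isSUB ? Iop_MSubF32 : Iop_MAddF32),
13544       //                    mkexpr(rm), getQRegLane(nn, i, ity), mkexpr(elem),
13544       //                    getQRegLane(dd, i, ity)));
13544       // and zero the upper half afterwards when bitQ == 0, instead of the
13544       // separate multiply-then-add below.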
13545 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
13546 assign(t2, triop(isSUB ? opSUB : opADD,
13547 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
13548 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
13549 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13550 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
13551 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
13552 isD ? 'd' : 's', index);
13553 return True;
13556 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
13557 /* -------- 0,1x,1001 FMUL 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13558 /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
13559 if (bitQ == 0 && size == X11) return False; // implied 1d case
13560 Bool isD = (size & 1) == 1;
13561 Bool isMULX = bitU == 1;
13562 UInt index;
13563 if (!isD) index = (bitH << 1) | bitL;
13564 else if (isD && bitL == 0) index = bitH;
13565 else return False; // sz:L == x11 => unallocated encoding
13566 vassert(index < (isD ? 2 : 4));
13567 IRType ity = isD ? Ity_F64 : Ity_F32;
13568 IRTemp elem = newTemp(ity);
13569 UInt mm = (bitM << 4) | mmLO4;
13570 assign(elem, getQRegLane(mm, index, ity));
13571 IRTemp dupd = math_DUP_TO_V128(elem, ity);
13572 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
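      // (FMULX differs from FMUL only for 0 * inf and inf * 0, where it
      // returns +/-2.0, the sign being the XOR of the operands' signs, rather
      // than the default NaN; for all other inputs the two give identical
      // results, so the kludge is wrong only in those corner cases.)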
13573 IRTemp res = newTempV128();
13574 assign(res, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
13575 mkexpr(mk_get_IR_rounding_mode()),
13576 getQReg128(nn), mkexpr(dupd)));
13577 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13578 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13579 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
13580 isMULX ? "fmulx" : "fmul", nameQReg128(dd), arr,
13581 nameQReg128(nn), arr, nameQReg128(mm), isD ? 'd' : 's', index);
13582 return True;
13585 if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
13586 || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
13587 /* -------- 1,xx,0000 MLA s/h variants only -------- */
13588 /* -------- 1,xx,0100 MLS s/h variants only -------- */
13589 /* -------- 0,xx,1000 MUL s/h variants only -------- */
13590 Bool isMLA = opcode == BITS4(0,0,0,0);
13591 Bool isMLS = opcode == BITS4(0,1,0,0);
13592 UInt mm = 32; // invalid
13593 UInt ix = 16; // invalid
13594 switch (size) {
13595 case X00:
13596 return False; // b case is not allowed
13597 case X01:
13598 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13599 case X10:
13600 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13601 case X11:
13602 return False; // d case is not allowed
13603 default:
13604 vassert(0);
13606 vassert(mm < 32 && ix < 16);
13607 IROp opMUL = mkVecMUL(size);
13608 IROp opADD = mkVecADD(size);
13609 IROp opSUB = mkVecSUB(size);
13610 HChar ch = size == X01 ? 'h' : 's';
13611 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13612 IRTemp vecD = newTempV128();
13613 IRTemp vecN = newTempV128();
13614 IRTemp res = newTempV128();
13615 assign(vecD, getQReg128(dd));
13616 assign(vecN, getQReg128(nn));
13617 IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
13618 if (isMLA || isMLS) {
13619 assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
13620 } else {
13621 assign(res, prod);
13623 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13624 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13625 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
13626 : (isMLS ? "mls" : "mul"),
13627 nameQReg128(dd), arr,
13628           nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
13629 return True;
13632 if (opcode == BITS4(1,0,1,0)
13633 || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
13634 /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
13635 /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
13636 /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
13637 /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
13638 /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
13639       /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
13640 /* Widens, and size refers to the narrowed lanes. */
13641 UInt ks = 3;
13642 switch (opcode) {
13643 case BITS4(1,0,1,0): ks = 0; break;
13644 case BITS4(0,0,1,0): ks = 1; break;
13645 case BITS4(0,1,1,0): ks = 2; break;
13646 default: vassert(0);
13648 vassert(ks >= 0 && ks <= 2);
13649 Bool isU = bitU == 1;
13650 Bool is2 = bitQ == 1;
13651 UInt mm = 32; // invalid
13652 UInt ix = 16; // invalid
13653 switch (size) {
13654 case X00:
13655 return False; // h_b_b[] case is not allowed
13656 case X01:
13657 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13658 case X10:
13659 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13660 case X11:
13661 return False; // q_d_d[] case is not allowed
13662 default:
13663 vassert(0);
13665 vassert(mm < 32 && ix < 16);
13666 IRTemp vecN = newTempV128();
13667 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13668 IRTemp vecD = newTempV128();
13669 assign(vecN, getQReg128(nn));
13670 assign(vecD, getQReg128(dd));
13671 IRTemp res = IRTemp_INVALID;
13672 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
13673 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
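      // The "mas"[ks] argument above selects 'm', 'a' or 's', telling
      // math_MULL_ACC whether this is a plain mull, an accumulating mlal,
      // or a subtracting mlsl.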
13674 putQReg128(dd, mkexpr(res));
13675 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
13676 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13677 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13678 HChar ch = size == X01 ? 'h' : 's';
13679 DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13680 isU ? 'u' : 's', nm, is2 ? "2" : "",
13681 nameQReg128(dd), arrWide,
13682           nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
13683 return True;
13686 if (bitU == 0
13687 && (opcode == BITS4(1,0,1,1)
13688 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
13689 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
13690 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
13691 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
13692 /* Widens, and size refers to the narrowed lanes. */
13693 UInt ks = 3;
13694 switch (opcode) {
13695 case BITS4(1,0,1,1): ks = 0; break;
13696 case BITS4(0,0,1,1): ks = 1; break;
13697 case BITS4(0,1,1,1): ks = 2; break;
13698 default: vassert(0);
13700 vassert(ks >= 0 && ks <= 2);
13701 Bool is2 = bitQ == 1;
13702 UInt mm = 32; // invalid
13703 UInt ix = 16; // invalid
13704 switch (size) {
13705 case X00:
13706 return False; // h_b_b[] case is not allowed
13707 case X01:
13708 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13709 case X10:
13710 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13711 case X11:
13712 return False; // q_d_d[] case is not allowed
13713 default:
13714 vassert(0);
13716 vassert(mm < 32 && ix < 16);
13717 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
13718 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
13719 newTempsV128_2(&vecN, &vecD);
13720 assign(vecN, getQReg128(nn));
13721 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13722 assign(vecD, getQReg128(dd));
13723 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
13724 is2, size, "mas"[ks],
13725 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
13726 putQReg128(dd, mkexpr(res));
13727 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
13728 updateQCFLAGwithDifference(sat1q, sat1n);
13729 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
13730 updateQCFLAGwithDifference(sat2q, sat2n);
13732 const HChar* nm = ks == 0 ? "sqdmull"
13733 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
13734 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13735 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13736 HChar ch = size == X01 ? 'h' : 's';
13737 DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13738 nm, is2 ? "2" : "",
13739 nameQReg128(dd), arrWide,
13740           nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
13741 return True;
13744 if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
13745 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
13746 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
13747 UInt mm = 32; // invalid
13748 UInt ix = 16; // invalid
13749 switch (size) {
13750 case X00:
13751 return False; // b case is not allowed
13752 case X01:
13753 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13754 case X10:
13755 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13756 case X11:
13757 return False; // q case is not allowed
13758 default:
13759 vassert(0);
13761 vassert(mm < 32 && ix < 16);
13762 Bool isR = opcode == BITS4(1,1,0,1);
13763 IRTemp res, sat1q, sat1n, vN, vM;
13764 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
13765 vN = newTempV128();
13766 assign(vN, getQReg128(nn));
13767 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13768 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
13769 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13770 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13771 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
13772 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
13773 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13774 HChar ch = size == X01 ? 'h' : 's';
13775 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
13776           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
13777 return True;
13780 if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
13781       /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
13782       /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
13783 UInt mm = 32; // invalid
13784 UInt ix = 16; // invalid
13785 switch (size) {
13786 case X00:
13787 return False; // b case is not allowed
13788 case X01: // h
13789 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13790 case X10: // s
13791 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13792 case X11:
13793 return False; // d case is not allowed
13794 default:
13795 vassert(0);
13797 vassert(mm < 32 && ix < 16);
13799 IRTemp res, res_nosat, vD, vN, vM;
13800 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
13801 newTempsV128_2(&vD, &vN);
13802 assign(vD, getQReg128(dd));
13803 assign(vN, getQReg128(nn));
13805 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13806 Bool isAdd = opcode == BITS4(1,1,0,1);
13807 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
13808 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13809 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
13810 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13812 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13813 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
13814 HChar ch = size == X01 ? 'h' : 's';
13815 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
13816 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
13817 return True;
13820 return False;
13821 # undef INSN
13825 static
13826 Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
13828 /* 31 23 21 16 11 9 4
13829 0100 1110 size 10100 opcode 10 n d
13830 Decode fields are: size,opcode
13831 Size is always 00 in ARMv8, it appears.
13833 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13834 if (INSN(31,24) != BITS8(0,1,0,0,1,1,1,0)
13835 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
13836 return False;
13838 UInt size = INSN(23,22);
13839 UInt opcode = INSN(16,12);
13840 UInt nn = INSN(9,5);
13841 UInt dd = INSN(4,0);
13843 if (size == BITS2(0,0)
13844 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,0,1))) {
13845 /* -------- 00,00100: AESE Vd.16b, Vn.16b -------- */
13846 /* -------- 00,00101: AESD Vd.16b, Vn.16b -------- */
13847 Bool isD = opcode == BITS5(0,0,1,0,1);
13848 IRTemp op1 = newTemp(Ity_V128);
13849 IRTemp op2 = newTemp(Ity_V128);
13850 IRTemp xord = newTemp(Ity_V128);
13851 IRTemp res = newTemp(Ity_V128);
13852 void* helper = isD ? &arm64g_dirtyhelper_AESD
13853 : &arm64g_dirtyhelper_AESE;
13854 const HChar* hname = isD ? "arm64g_dirtyhelper_AESD"
13855 : "arm64g_dirtyhelper_AESE";
13856 assign(op1, getQReg128(dd));
13857 assign(op2, getQReg128(nn));
13858 assign(xord, binop(Iop_XorV128, mkexpr(op1), mkexpr(op2)));
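      /* The 128-bit AES state is handed to the helper as two 64-bit halves,
         presumably because dirty-call arguments need to be word-sized; the
         vector result comes back via the IRExpr_VECRET() slot. */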
13859 IRDirty* di
13860 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13861 mkIRExprVec_3(
13862 IRExpr_VECRET(),
13863 unop(Iop_V128HIto64, mkexpr(xord)),
13864 unop(Iop_V128to64, mkexpr(xord)) ) );
13865 stmt(IRStmt_Dirty(di));
13866 putQReg128(dd, mkexpr(res));
13867 DIP("aes%c %s.16b, %s.16b\n", isD ? 'd' : 'e',
13868 nameQReg128(dd), nameQReg128(nn));
13869 return True;
13872 if (size == BITS2(0,0)
13873 && (opcode == BITS5(0,0,1,1,0) || opcode == BITS5(0,0,1,1,1))) {
13874 /* -------- 00,00110: AESMC Vd.16b, Vn.16b -------- */
13875 /* -------- 00,00111: AESIMC Vd.16b, Vn.16b -------- */
13876 Bool isI = opcode == BITS5(0,0,1,1,1);
13877 IRTemp src = newTemp(Ity_V128);
13878 IRTemp res = newTemp(Ity_V128);
13879 void* helper = isI ? &arm64g_dirtyhelper_AESIMC
13880 : &arm64g_dirtyhelper_AESMC;
13881 const HChar* hname = isI ? "arm64g_dirtyhelper_AESIMC"
13882 : "arm64g_dirtyhelper_AESMC";
13883 assign(src, getQReg128(nn));
13884 IRDirty* di
13885 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13886 mkIRExprVec_3(
13887 IRExpr_VECRET(),
13888 unop(Iop_V128HIto64, mkexpr(src)),
13889 unop(Iop_V128to64, mkexpr(src)) ) );
13890 stmt(IRStmt_Dirty(di));
13891 putQReg128(dd, mkexpr(res));
13892 DIP("aes%s %s.16b, %s.16b\n", isI ? "imc" : "mc",
13893 nameQReg128(dd), nameQReg128(nn));
13894 return True;
13897 return False;
13898 # undef INSN
13902 static
13903 Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
13905 /* 31 28 23 21 20 15 14 11 9 4
13906 0101 1110 sz 0 m 0 opc 00 n d
13907 Decode fields are: sz,opc
13909 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13910 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0) || INSN(21,21) != 0
13911 || INSN(15,15) != 0 || INSN(11,10) != BITS2(0,0)) {
13912 return False;
13914 UInt sz = INSN(23,22);
13915 UInt mm = INSN(20,16);
13916 UInt opc = INSN(14,12);
13917 UInt nn = INSN(9,5);
13918 UInt dd = INSN(4,0);
13919 if (sz == BITS2(0,0) && opc <= BITS3(1,1,0)) {
13920 /* -------- 00,000 SHA1C Qd, Sn, Vm.4S -------- */
13921 /* -------- 00,001 SHA1P Qd, Sn, Vm.4S -------- */
13922 /* -------- 00,010 SHA1M Qd, Sn, Vm.4S -------- */
13923 /* -------- 00,011 SHA1SU0 Vd.4S, Vn.4S, Vm.4S -------- */
13924 /* -------- 00,100 SHA256H Qd, Qn, Vm.4S -------- */
13925 /* -------- 00,101 SHA256H2 Qd, Qn, Vm.4S -------- */
13926 /* -------- 00,110 SHA256SU1 Vd.4S, Vn.4S, Vm.4S -------- */
13927 vassert(opc < 7);
13928 const HChar* inames[7]
13929 = { "sha1c", "sha1p", "sha1m", "sha1su0",
13930 "sha256h", "sha256h2", "sha256su1" };
13931 void(*helpers[7])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
13932 = { &arm64g_dirtyhelper_SHA1C, &arm64g_dirtyhelper_SHA1P,
13933 &arm64g_dirtyhelper_SHA1M, &arm64g_dirtyhelper_SHA1SU0,
13934 &arm64g_dirtyhelper_SHA256H, &arm64g_dirtyhelper_SHA256H2,
13935 &arm64g_dirtyhelper_SHA256SU1 };
13936 const HChar* hnames[7]
13937 = { "arm64g_dirtyhelper_SHA1C", "arm64g_dirtyhelper_SHA1P",
13938 "arm64g_dirtyhelper_SHA1M", "arm64g_dirtyhelper_SHA1SU0",
13939 "arm64g_dirtyhelper_SHA256H", "arm64g_dirtyhelper_SHA256H2",
13940 "arm64g_dirtyhelper_SHA256SU1" };
13941 IRTemp vD = newTemp(Ity_V128);
13942 IRTemp vN = newTemp(Ity_V128);
13943 IRTemp vM = newTemp(Ity_V128);
13944 IRTemp vDhi = newTemp(Ity_I64);
13945 IRTemp vDlo = newTemp(Ity_I64);
13946 IRTemp vNhiPre = newTemp(Ity_I64);
13947 IRTemp vNloPre = newTemp(Ity_I64);
13948 IRTemp vNhi = newTemp(Ity_I64);
13949 IRTemp vNlo = newTemp(Ity_I64);
13950 IRTemp vMhi = newTemp(Ity_I64);
13951 IRTemp vMlo = newTemp(Ity_I64);
13952 assign(vD, getQReg128(dd));
13953 assign(vN, getQReg128(nn));
13954 assign(vM, getQReg128(mm));
13955 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
13956 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
13957 assign(vNhiPre, unop(Iop_V128HIto64, mkexpr(vN)));
13958 assign(vNloPre, unop(Iop_V128to64, mkexpr(vN)));
13959 assign(vMhi, unop(Iop_V128HIto64, mkexpr(vM)));
13960 assign(vMlo, unop(Iop_V128to64, mkexpr(vM)));
13961 /* Mask off any bits of the N register operand that aren't actually
13962 needed, so that Memcheck doesn't complain unnecessarily. */
13963 switch (opc) {
13964 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13965 assign(vNhi, mkU64(0));
13966 assign(vNlo, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(vNloPre))));
13967 break;
13968 case BITS3(0,1,1): case BITS3(1,0,0):
13969 case BITS3(1,0,1): case BITS3(1,1,0):
13970 assign(vNhi, mkexpr(vNhiPre));
13971 assign(vNlo, mkexpr(vNloPre));
13972 break;
13973 default:
13974 vassert(0);
13976 IRTemp res = newTemp(Ity_V128);
13977 IRDirty* di
13978 = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
13979 mkIRExprVec_7(
13980 IRExpr_VECRET(),
13981 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
13982 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
13983 stmt(IRStmt_Dirty(di));
13984 putQReg128(dd, mkexpr(res));
13985 switch (opc) {
13986 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13987 DIP("%s q%u, s%u, v%u.4s\n", inames[opc], dd, nn, mm);
13988 break;
13989 case BITS3(0,1,1): case BITS3(1,1,0):
13990 DIP("%s v%u.4s, v%u.4s, v%u.4s\n", inames[opc], dd, nn, mm);
13991 break;
13992 case BITS3(1,0,0): case BITS3(1,0,1):
13993 DIP("%s q%u, q%u, v%u.4s\n", inames[opc], dd, nn, mm);
13994 break;
13995 default:
13996 vassert(0);
13998 return True;
14001 return False;
14002 # undef INSN
14006 static
14007 Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
14009 /* 31 28 23 21 16 11 9 4
14010 0101 1110 sz 10100 opc 10 n d
14011 Decode fields are: sz,opc
14013 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14014 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0)
14015 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
14016 return False;
14018 UInt sz = INSN(23,22);
14019 UInt opc = INSN(16,12);
14020 UInt nn = INSN(9,5);
14021 UInt dd = INSN(4,0);
14022 if (sz == BITS2(0,0) && opc <= BITS5(0,0,0,1,0)) {
14023 /* -------- 00,00000 SHA1H Sd, Sn -------- */
14024 /* -------- 00,00001 SHA1SU1 Vd.4S, Vn.4S -------- */
14025 /* -------- 00,00010 SHA256SU0 Vd.4S, Vn.4S -------- */
14026 vassert(opc < 3);
14027 const HChar* inames[3] = { "sha1h", "sha1su1", "sha256su0" };
14028 IRTemp vD = newTemp(Ity_V128);
14029 IRTemp vN = newTemp(Ity_V128);
14030 IRTemp vDhi = newTemp(Ity_I64);
14031 IRTemp vDlo = newTemp(Ity_I64);
14032 IRTemp vNhi = newTemp(Ity_I64);
14033 IRTemp vNlo = newTemp(Ity_I64);
14034 assign(vD, getQReg128(dd));
14035 assign(vN, getQReg128(nn));
14036 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
14037 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
14038 assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
14039 assign(vNlo, unop(Iop_V128to64, mkexpr(vN)));
14040 /* Mask off any bits of the N register operand that aren't actually
14041 needed, so that Memcheck doesn't complain unnecessarily. Also
14042 construct the calls, given that the helper functions don't take
14043 the same number of arguments. */
14044 IRDirty* di = NULL;
14045 IRTemp res = newTemp(Ity_V128);
14046 switch (opc) {
14047 case BITS5(0,0,0,0,0): {
14048 IRExpr* vNloMasked = unop(Iop_32Uto64,
14049 unop(Iop_64to32, mkexpr(vNlo)));
14050 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
14051 "arm64g_dirtyhelper_SHA1H",
14052 &arm64g_dirtyhelper_SHA1H,
14053 mkIRExprVec_3(
14054 IRExpr_VECRET(),
14055 mkU64(0), vNloMasked) );
14056 break;
14058 case BITS5(0,0,0,0,1):
14059 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
14060 "arm64g_dirtyhelper_SHA1SU1",
14061 &arm64g_dirtyhelper_SHA1SU1,
14062 mkIRExprVec_5(
14063 IRExpr_VECRET(),
14064 mkexpr(vDhi), mkexpr(vDlo),
14065 mkexpr(vNhi), mkexpr(vNlo)) );
14066 break;
14067 case BITS5(0,0,0,1,0):
14068 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
14069 "arm64g_dirtyhelper_SHA256SU0",
14070 &arm64g_dirtyhelper_SHA256SU0,
14071 mkIRExprVec_5(
14072 IRExpr_VECRET(),
14073 mkexpr(vDhi), mkexpr(vDlo),
14074 mkexpr(vNhi), mkexpr(vNlo)) );
14075 break;
14076 default:
14077 vassert(0);
14079 stmt(IRStmt_Dirty(di));
14080 putQReg128(dd, mkexpr(res));
14081 switch (opc) {
14082 case BITS5(0,0,0,0,0):
14083 DIP("%s s%u, s%u\n", inames[opc], dd, nn);
14084 break;
14085 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,0):
14086 DIP("%s v%u.4s, v%u.4s\n", inames[opc], dd, nn);
14087 break;
14088 default:
14089 vassert(0);
14091 return True;
14094 return False;
14095 # undef INSN
14099 static
14100 Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
14102 /* 31 28 23 21 20 15 13 9 4
14103 000 11110 ty 1 m op 1000 n opcode2
14104 The first 3 bits are really "M 0 S", but M and S are always zero.
14105 Decode fields are: ty,op,opcode2
14107 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14108 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
14109 || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
14110 return False;
14112 UInt ty = INSN(23,22);
14113 UInt mm = INSN(20,16);
14114 UInt op = INSN(15,14);
14115 UInt nn = INSN(9,5);
14116 UInt opcode2 = INSN(4,0);
14117 vassert(ty < 4);
14119 if (ty <= X01 && op == X00
14120 && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
14121 /* -------- 0x,00,00000 FCMP d_d, s_s -------- */
14122 /* -------- 0x,00,01000 FCMP d_#0, s_#0 -------- */
14123 /* -------- 0x,00,10000 FCMPE d_d, s_s -------- */
14124 /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
14125 /* 31 23 20 15 9 4
14126 000 11110 01 1 m 00 1000 n 10 000 FCMPE Dn, Dm
14127 000 11110 01 1 00000 00 1000 n 11 000 FCMPE Dn, #0.0
14128 000 11110 01 1 m 00 1000 n 00 000 FCMP Dn, Dm
14129 000 11110 01 1 00000 00 1000 n 01 000 FCMP Dn, #0.0
14131 000 11110 00 1 m 00 1000 n 10 000 FCMPE Sn, Sm
14132 000 11110 00 1 00000 00 1000 n 11 000 FCMPE Sn, #0.0
14133 000 11110 00 1 m 00 1000 n 00 000 FCMP Sn, Sm
14134 000 11110 00 1 00000 00 1000 n 01 000 FCMP Sn, #0.0
14136 FCMPE generates Invalid Operation exn if either arg is any kind
14137 of NaN. FCMP generates Invalid Operation exn if either arg is a
14138 signalling NaN. We ignore this detail here and produce the same
14139 IR for both.
14141 Bool isD = (ty & 1) == 1;
14142 Bool isCMPE = (opcode2 & 16) == 16;
14143 Bool cmpZero = (opcode2 & 8) == 8;
14144 IRType ity = isD ? Ity_F64 : Ity_F32;
14145 Bool valid = True;
14146 if (cmpZero && mm != 0) valid = False;
14147 if (valid) {
14148 IRTemp argL = newTemp(ity);
14149 IRTemp argR = newTemp(ity);
14150 IRTemp irRes = newTemp(Ity_I32);
14151 assign(argL, getQRegLO(nn, ity));
14152 assign(argR,
14153 cmpZero
14154 ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
14155 : getQRegLO(mm, ity));
14156 assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
14157 mkexpr(argL), mkexpr(argR)));
14158 IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
14159 IRTemp nzcv_28x0 = newTemp(Ity_I64);
14160 assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
14161 setFlags_COPY(nzcv_28x0);
14162 DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
14163 cmpZero ? "#0.0" : nameQRegLO(mm, ity));
14164 return True;
14166 return False;
14169 return False;
14170 # undef INSN
14174 static
14175 Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn)
14177 /* 31 28 23 21 20 15 11 9 4 3
14178 000 11110 ty 1 m cond 01 n op nzcv
14179 The first 3 bits are really "M 0 S", but M and S are always zero.
14180 Decode fields are: ty,op
14182 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14183 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
14184 || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
14185 return False;
14187 UInt ty = INSN(23,22);
14188 UInt mm = INSN(20,16);
14189 UInt cond = INSN(15,12);
14190 UInt nn = INSN(9,5);
14191 UInt op = INSN(4,4);
14192 UInt nzcv = INSN(3,0);
14193 vassert(ty < 4 && op <= 1);
14195 if (ty <= BITS2(0,1)) {
14196 /* -------- 00,0 FCCMP s_s -------- */
14197 /* -------- 00,1 FCCMPE s_s -------- */
14198 /* -------- 01,0 FCCMP d_d -------- */
14199 /* -------- 01,1 FCCMPE d_d -------- */
14201 /* FCCMPE generates Invalid Operation exn if either arg is any kind
14202 of NaN. FCCMP generates Invalid Operation exn if either arg is a
14203 signalling NaN. We ignore this detail here and produce the same
14204 IR for both.
14206 Bool isD = (ty & 1) == 1;
14207 Bool isCMPE = op == 1;
14208 IRType ity = isD ? Ity_F64 : Ity_F32;
14209 IRTemp argL = newTemp(ity);
14210 IRTemp argR = newTemp(ity);
14211 IRTemp irRes = newTemp(Ity_I32);
14212 assign(argL, getQRegLO(nn, ity));
14213 assign(argR, getQRegLO(mm, ity));
14214 assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
14215 mkexpr(argL), mkexpr(argR)));
14216 IRTemp condT = newTemp(Ity_I1);
14217 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
14218 IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);
14220 IRTemp nzcvT_28x0 = newTemp(Ity_I64);
14221 assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));
14223 IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);
14225 IRTemp nzcv_28x0 = newTemp(Ity_I64);
14226 assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
14227 mkexpr(nzcvT_28x0), nzcvF_28x0));
14228 setFlags_COPY(nzcv_28x0);
14229 DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
14230 nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
14231 return True;
14234 return False;
14235 # undef INSN
14239 static
14240 Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
14242 /* 31 23 21 20 15 11 9 5
14243 000 11110 ty 1 m cond 11 n d
14244 The first 3 bits are really "M 0 S", but M and S are always zero.
14245 Decode fields: ty
14247 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14248 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
14249 || INSN(11,10) != BITS2(1,1)) {
14250 return False;
14252 UInt ty = INSN(23,22);
14253 UInt mm = INSN(20,16);
14254 UInt cond = INSN(15,12);
14255 UInt nn = INSN(9,5);
14256 UInt dd = INSN(4,0);
14257 if (ty <= X01) {
14258 /* -------- 00: FCSEL s_s -------- */
14259       /* -------- 01: FCSEL d_d -------- */
14260 IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
14261 IRTemp srcT = newTemp(ity);
14262 IRTemp srcF = newTemp(ity);
14263 IRTemp res = newTemp(ity);
14264 assign(srcT, getQRegLO(nn, ity));
14265 assign(srcF, getQRegLO(mm, ity));
14266 assign(res, IRExpr_ITE(
14267 unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
14268 mkexpr(srcT), mkexpr(srcF)));
14269 putQReg128(dd, mkV128(0x0000));
14270 putQRegLO(dd, mkexpr(res));
14271 DIP("fcsel %s, %s, %s, %s\n",
14272 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
14273 nameCC(cond));
14274 return True;
14276 return False;
14277 # undef INSN
14281 static
14282 Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
14284 /* 31 28 23 21 20 14 9 4
14285 000 11110 ty 1 opcode 10000 n d
14286 The first 3 bits are really "M 0 S", but M and S are always zero.
14287 Decode fields: ty,opcode
14289 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14290 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
14291 || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
14292 return False;
14294 UInt ty = INSN(23,22);
14295 UInt opcode = INSN(20,15);
14296 UInt nn = INSN(9,5);
14297 UInt dd = INSN(4,0);
14299 if (ty <= X01 && opcode <= BITS6(0,0,0,0,1,1)) {
14300 /* -------- 0x,000000: FMOV d_d, s_s -------- */
14301 /* -------- 0x,000001: FABS d_d, s_s -------- */
14302 /* -------- 0x,000010: FNEG d_d, s_s -------- */
14303 /* -------- 0x,000011: FSQRT d_d, s_s -------- */
14304 IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
14305 IRTemp src = newTemp(ity);
14306 IRTemp res = newTemp(ity);
14307 const HChar* nm = "??";
14308 assign(src, getQRegLO(nn, ity));
14309 switch (opcode) {
14310 case BITS6(0,0,0,0,0,0):
14311 nm = "fmov"; assign(res, mkexpr(src)); break;
14312 case BITS6(0,0,0,0,0,1):
14313 nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
14314 case BITS6(0,0,0,0,1,0):
14315             nm = "fneg"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
14316 case BITS6(0,0,0,0,1,1):
14317 nm = "fsqrt";
14318 assign(res, binop(mkSQRTF(ity),
14319 mkexpr(mk_get_IR_rounding_mode()),
14320 mkexpr(src))); break;
14321 default:
14322 vassert(0);
14324 putQReg128(dd, mkV128(0x0000));
14325 putQRegLO(dd, mkexpr(res));
14326 DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
14327 return True;
14330 if ( (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
14331 || opcode == BITS6(0,0,0,1,0,1)))
14332 || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
14333 || opcode == BITS6(0,0,0,1,0,1)))
14334 || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
14335 || opcode == BITS6(0,0,0,1,0,0)))) {
14336 /* -------- 11,000100: FCVT s_h -------- */
14337 /* -------- 11,000101: FCVT d_h -------- */
14338 /* -------- 00,000111: FCVT h_s -------- */
14339 /* -------- 00,000101: FCVT d_s -------- */
14340 /* -------- 01,000111: FCVT h_d -------- */
14341 /* -------- 01,000100: FCVT s_d -------- */
14342 /* 31 23 21 16 14 9 4
14343 000 11110 11 10001 00 10000 n d FCVT Sd, Hn
14344 --------- 11 ----- 01 --------- FCVT Dd, Hn
14345 --------- 00 ----- 11 --------- FCVT Hd, Sn
14346 --------- 00 ----- 01 --------- FCVT Dd, Sn
14347 --------- 01 ----- 11 --------- FCVT Hd, Dn
14348 --------- 01 ----- 00 --------- FCVT Sd, Dn
14349 Rounding, when dst is smaller than src, is per the FPCR.
14351 UInt b2322 = ty;
14352 UInt b1615 = opcode & BITS2(1,1);
14353 switch ((b2322 << 2) | b1615) {
14354 case BITS4(0,0,0,1): // S -> D
14355 case BITS4(1,1,0,1): { // H -> D
14356 Bool srcIsH = b2322 == BITS2(1,1);
14357 IRType srcTy = srcIsH ? Ity_F16 : Ity_F32;
14358 IRTemp res = newTemp(Ity_F64);
14359 assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
14360 getQRegLO(nn, srcTy)));
14361 putQReg128(dd, mkV128(0x0000));
14362 putQRegLO(dd, mkexpr(res));
14363 DIP("fcvt %s, %s\n",
14364 nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
14365 return True;
14367 case BITS4(0,1,0,0): // D -> S
14368 case BITS4(0,1,1,1): { // D -> H
14369 Bool dstIsH = b1615 == BITS2(1,1);
14370 IRType dstTy = dstIsH ? Ity_F16 : Ity_F32;
14371 IRTemp res = newTemp(dstTy);
14372 assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
14373 mkexpr(mk_get_IR_rounding_mode()),
14374 getQRegLO(nn, Ity_F64)));
14375 putQReg128(dd, mkV128(0x0000));
14376 putQRegLO(dd, mkexpr(res));
14377 DIP("fcvt %s, %s\n",
14378 nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
14379 return True;
14381 case BITS4(0,0,1,1): // S -> H
14382 case BITS4(1,1,0,0): { // H -> S
14383 Bool toH = b1615 == BITS2(1,1);
14384 IRType srcTy = toH ? Ity_F32 : Ity_F16;
14385 IRType dstTy = toH ? Ity_F16 : Ity_F32;
14386 IRTemp res = newTemp(dstTy);
14387 if (toH) {
14388 assign(res, binop(Iop_F32toF16,
14389 mkexpr(mk_get_IR_rounding_mode()),
14390 getQRegLO(nn, srcTy)));
14392 } else {
14393 assign(res, unop(Iop_F16toF32,
14394 getQRegLO(nn, srcTy)));
14396 putQReg128(dd, mkV128(0x0000));
14397 putQRegLO(dd, mkexpr(res));
14398 DIP("fcvt %s, %s\n",
14399 nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
14400 return True;
14402 default:
14403 break;
14405 /* else unhandled */
14406 return False;
14409 if (ty <= X01
14410 && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
14411 && opcode != BITS6(0,0,1,1,0,1)) {
14412 /* -------- 0x,001000 FRINTN d_d, s_s -------- */
14413 /* -------- 0x,001001 FRINTP d_d, s_s -------- */
14414 /* -------- 0x,001010 FRINTM d_d, s_s -------- */
14415 /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
14416 /* -------- 0x,001100 FRINTA d_d, s_s -------- */
14417 /* -------- 0x,001110 FRINTX d_d, s_s -------- */
14418 /* -------- 0x,001111 FRINTI d_d, s_s -------- */
14419 /* 31 23 21 17 14 9 4
14420 000 11110 0x 1001 111 10000 n d FRINTI Fd, Fm (round per FPCR)
14422 x==0 => S-registers, x==1 => D-registers
14423 rm (17:15) encodings:
14424 111 per FPCR (FRINTI)
14425 001 +inf (FRINTP)
14426 010 -inf (FRINTM)
14427 011 zero (FRINTZ)
14428 000 tieeven (FRINTN) -- !! FIXME KLUDGED !!
14429 100 tieaway (FRINTA) -- !! FIXME KLUDGED !!
14430 110 per FPCR + "exact = TRUE" (FRINTX)
14431 101 unallocated
14433 Bool isD = (ty & 1) == 1;
14434 UInt rm = opcode & BITS6(0,0,0,1,1,1);
14435 IRType ity = isD ? Ity_F64 : Ity_F32;
14436 IRExpr* irrmE = NULL;
14437 UChar ch = '?';
14438 switch (rm) {
14439 case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
14440 case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
14441 case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
14442 // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
14443 case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
14444 // I am unsure about the following, due to the "integral exact"
14445 // description in the manual. What does it mean? (frintx, that is)
14446 case BITS3(1,1,0):
14447 ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
14448 case BITS3(1,1,1):
14449 ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
14450 // The following is a kludge. There's no Irrm_ value to represent
14451 // this ("to nearest, with ties to even")
14452 case BITS3(0,0,0): ch = 'n'; irrmE = mkU32(Irrm_NEAREST); break;
14453 default: break;
14455 if (irrmE) {
14456 IRTemp src = newTemp(ity);
14457 IRTemp dst = newTemp(ity);
14458 assign(src, getQRegLO(nn, ity));
14459 assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
14460 irrmE, mkexpr(src)));
14461 putQReg128(dd, mkV128(0x0000));
14462 putQRegLO(dd, mkexpr(dst));
14463 DIP("frint%c %s, %s\n",
14464 ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
14465 return True;
14467 return False;
14470 return False;
14471 # undef INSN
14475 static
14476 Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn)
14478 /* 31 28 23 21 20 15 11 9 4
14479 000 11110 ty 1 m opcode 10 n d
14480 The first 3 bits are really "M 0 S", but M and S are always zero.
14481 Decode fields: ty, opcode
14483 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14484 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
14485 || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
14486 return False;
14488 UInt ty = INSN(23,22);
14489 UInt mm = INSN(20,16);
14490 UInt opcode = INSN(15,12);
14491 UInt nn = INSN(9,5);
14492 UInt dd = INSN(4,0);
14494 if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
14495 /* ------- 0x,0000: FMUL d_d, s_s ------- */
14496 /* ------- 0x,0001: FDIV d_d, s_s ------- */
14497 /* ------- 0x,0010: FADD d_d, s_s ------- */
14498 /* ------- 0x,0011: FSUB d_d, s_s ------- */
14499 /* ------- 0x,0100: FMAX d_d, s_s ------- */
14500 /* ------- 0x,0101: FMIN d_d, s_s ------- */
14501 /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
14502 /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
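      /* (What the kludge loses: FMAXNM/FMINNM return the numeric operand when
         exactly one operand is a quiet NaN, whereas FMAX/FMIN propagate the
         NaN.) */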
14503 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
14504 IROp iop = Iop_INVALID;
14505 const HChar* nm = "???";
14506 switch (opcode) {
14507 case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
14508 case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
14509 case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
14510 case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
14511 case BITS4(0,1,0,0): nm = "fmax"; iop = mkVecMAXF(ty+2); break;
14512 case BITS4(0,1,0,1): nm = "fmin"; iop = mkVecMINF(ty+2); break;
14513 case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
14514 case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
14515 default: vassert(0);
14517 if (opcode <= BITS4(0,0,1,1)) {
14518 // This is really not good code. TODO: avoid width-changing
14519 IRTemp res = newTemp(ity);
14520 assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
14521 getQRegLO(nn, ity), getQRegLO(mm, ity)));
14522 putQReg128(dd, mkV128(0));
14523 putQRegLO(dd, mkexpr(res));
14524 } else {
14525 putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
14526 binop(iop, getQReg128(nn), getQReg128(mm))));
14528 DIP("%s %s, %s, %s\n",
14529 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
14530 return True;
14533 if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
14534 /* ------- 0x,1000: FNMUL d_d, s_s ------- */
14535 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
14536 IROp iop = mkMULF(ity);
14537 IROp iopn = mkNEGF(ity);
14538 const HChar* nm = "fnmul";
14539 IRExpr* resE = unop(iopn,
14540 triop(iop, mkexpr(mk_get_IR_rounding_mode()),
14541 getQRegLO(nn, ity), getQRegLO(mm, ity)));
14542 IRTemp res = newTemp(ity);
14543 assign(res, resE);
14544 putQReg128(dd, mkV128(0));
14545 putQRegLO(dd, mkexpr(res));
14546 DIP("%s %s, %s, %s\n",
14547 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
14548 return True;
14551 return False;
14552 # undef INSN
14556 static
14557 Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
14559 /* 31 28 23 21 20 15 14 9 4
14560 000 11111 ty o1 m o0 a n d
14561 The first 3 bits are really "M 0 S", but M and S are always zero.
14562 Decode fields: ty,o1,o0
14564 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14565 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
14566 return False;
14568 UInt ty = INSN(23,22);
14569 UInt bitO1 = INSN(21,21);
14570 UInt mm = INSN(20,16);
14571 UInt bitO0 = INSN(15,15);
14572 UInt aa = INSN(14,10);
14573 UInt nn = INSN(9,5);
14574 UInt dd = INSN(4,0);
14575 vassert(ty < 4);
14577 if (ty <= X01) {
14578 /* -------- 0x,0,0 FMADD d_d_d_d, s_s_s_s -------- */
14579 /* -------- 0x,0,1 FMSUB d_d_d_d, s_s_s_s -------- */
14580 /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
14581 /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
14582 /* -------------------- F{N}M{ADD,SUB} -------------------- */
14583 /* 31 22 20 15 14 9 4 ix
14584 000 11111 0 sz 0 m 0 a n d 0 FMADD Fd,Fn,Fm,Fa
14585 000 11111 0 sz 0 m 1 a n d 1 FMSUB Fd,Fn,Fm,Fa
14586 000 11111 0 sz 1 m 0 a n d 2 FNMADD Fd,Fn,Fm,Fa
14587 000 11111 0 sz 1 m 1 a n d 3 FNMSUB Fd,Fn,Fm,Fa
14588 where Fx=Dx when sz=1, Fx=Sx when sz=0
14590 -----SPEC------ ----IMPL----
14591 fmadd a + n * m fmadd (a, n, m)
14592 fmsub a + (-n) * m fmsub (a, n, m)
14593 fnmadd (-a) + (-n) * m fmadd (-a, -n, m)
14594 fnmsub (-a) + n * m fmadd (-a, n, m)
14596 Note Iop_MAdd/SubF32/64 take arguments in the order: rm, N, M, A
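        As a quick sanity check of the mapping: FNMADD with a=1, n=2, m=3
        should give (-1) + (-2)*3 = -7; the IR below computes
        MAddF(rm, -2, 3, -1) = (-2)*3 + (-1) = -7, as required.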
14598 Bool isD = (ty & 1) == 1;
14599 UInt ix = (bitO1 << 1) | bitO0;
14600 IRType ity = isD ? Ity_F64 : Ity_F32;
14601 IROp opFMADD = mkFMADDF(ity);
14602 IROp opFMSUB = mkFMSUBF(ity);
14603 IROp opNEG = mkNEGF(ity);
14604 IRTemp res = newTemp(ity);
14605 IRExpr* eA = getQRegLO(aa, ity);
14606 IRExpr* eN = getQRegLO(nn, ity);
14607 IRExpr* eM = getQRegLO(mm, ity);
14608 IRExpr* rm = mkexpr(mk_get_IR_rounding_mode());
14609 switch (ix) {
14610 case 0: /* FMADD */
14611 assign(res, qop(opFMADD, rm, eN, eM, eA));
14612 break;
14613 case 1: /* FMSUB */
14614 assign(res, qop(opFMSUB, rm, eN, eM, eA));
14615 break;
14616 case 2: /* FNMADD */
14617 assign(res, qop(opFMADD, rm, unop(opNEG, eN), eM,
14618 unop(opNEG,eA)));
14619 break;
14620 case 3: /* FNMSUB */
14621 assign(res, qop(opFMADD, rm, eN, eM, unop(opNEG, eA)));
14622 break;
14623 default:
14624 vassert(0);
14626 putQReg128(dd, mkV128(0x0000));
14627 putQRegLO(dd, mkexpr(res));
14628 const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
14629 DIP("%s %s, %s, %s, %s\n",
14630 names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
14631 nameQRegLO(mm, ity), nameQRegLO(aa, ity));
14632 return True;
14635 return False;
14636 # undef INSN
14640 static
14641 Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
14643 /* 31 28 23 21 20 12 9 4
14644 000 11110 ty 1 imm8 100 imm5 d
14645 The first 3 bits are really "M 0 S", but M and S are always zero.
14647 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14648 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
14649 || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
14650 return False;
14652 UInt ty = INSN(23,22);
14653 UInt imm8 = INSN(20,13);
14654 UInt imm5 = INSN(9,5);
14655 UInt dd = INSN(4,0);
14657 /* ------- 00,00000: FMOV s_imm ------- */
14658 /* ------- 01,00000: FMOV d_imm ------- */
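   /* (VFPExpandImm decodes imm8 as, roughly, sign : 3-bit exponent : 4-bit
      fraction, yielding +/- n/16 * 2^r for n in 16..31 and r in -3..4, i.e.
      magnitudes from 0.125 to 31.0.  For example imm8 == 0x70 expands to
      1.0.) */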
14659 if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
14660 Bool isD = (ty & 1) == 1;
14661 ULong imm = VFPExpandImm(imm8, isD ? 64 : 32);
14662 if (!isD) {
14663 vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
14665 putQReg128(dd, mkV128(0));
14666 putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
14667 DIP("fmov %s, #0x%llx\n",
14668 nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
14669 return True;
14672 return False;
14673 # undef INSN
14677 static
14678 Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
14681 /* 31 30 29 28 23 21 20 18 15 9 4
14682 sf 0 0 11110 type 0 rmode opcode scale n d
14683 The first 3 bits are really "sf 0 S", but S is always zero.
14684 Decode fields: sf,type,rmode,opcode
14686 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14687 if (INSN(30,29) != BITS2(0,0)
14688 || INSN(28,24) != BITS5(1,1,1,1,0)
14689 || INSN(21,21) != 0) {
14690 return False;
14692 UInt bitSF = INSN(31,31);
14693 UInt ty = INSN(23,22); // type
14694 UInt rm = INSN(20,19); // rmode
14695 UInt op = INSN(18,16); // opcode
14696 UInt sc = INSN(15,10); // scale
14697 UInt nn = INSN(9,5);
14698 UInt dd = INSN(4,0);
14700 if (ty <= X01 && rm == X11
14701 && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
14702 /* -------- (ix) sf ty rm opc -------- */
14703 /* -------- 0 0 00 11 000: FCVTZS w_s_#fbits -------- */
14704 /* -------- 1 0 01 11 000: FCVTZS w_d_#fbits -------- */
14705 /* -------- 2 1 00 11 000: FCVTZS x_s_#fbits -------- */
14706 /* -------- 3 1 01 11 000: FCVTZS x_d_#fbits -------- */
14708 /* -------- 4 0 00 11 001: FCVTZU w_s_#fbits -------- */
14709 /* -------- 5 0 01 11 001: FCVTZU w_d_#fbits -------- */
14710 /* -------- 6 1 00 11 001: FCVTZU x_s_#fbits -------- */
14711 /* -------- 7 1 01 11 001: FCVTZU x_d_#fbits -------- */
14712 Bool isI64 = bitSF == 1;
14713 Bool isF64 = (ty & 1) == 1;
14714 Bool isU = (op & 1) == 1;
14715 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14717 Int fbits = 64 - sc;
14718 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
14720 Double scale = two_to_the_plus(fbits);
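      // e.g. FCVTZS Wd, Sn, #8: scale = 2^8, so Sn = 3.5 becomes 3.5 * 256
      // = 896.0, which the round-to-zero conversion below turns into Wd = 896.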
14721 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14722 : IRExpr_Const(IRConst_F32( (Float)scale ));
14723 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
14725 const IROp ops[8]
14726 = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
14727 Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
14728 IRTemp irrm = newTemp(Ity_I32);
14729 assign(irrm, mkU32(Irrm_ZERO));
14731 IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
14732 IRExpr* res = binop(ops[ix], mkexpr(irrm),
14733 triop(opMUL, mkexpr(irrm), src, scaleE));
14734 putIRegOrZR(isI64, dd, res);
14736 DIP("fcvtz%c %s, %s, #%d\n",
14737 isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
14738 nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
14739 return True;
14742 /* ------ sf,ty,rm,opc ------ */
14743 /* ------ x,0x,00,010 SCVTF s/d, w/x, #fbits ------ */
14744 /* ------ x,0x,00,011 UCVTF s/d, w/x, #fbits ------ */
14745 /* (ix) sf S 28 ty rm opc 15 9 4
14746 0 0 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Wn, #fbits
14747 1 0 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Wn, #fbits
14748 2 1 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Xn, #fbits
14749 3 1 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Xn, #fbits
14751 4 0 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Wn, #fbits
14752 5 0 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Wn, #fbits
14753 6 1 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Xn, #fbits
14754 7 1 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Xn, #fbits
14756       These are signed/unsigned conversions from integer registers to
14757 FP registers, all 4 32/64-bit combinations, rounded per FPCR,
14758 scaled per |scale|.
14760 if (ty <= X01 && rm == X00
14761 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
14762 && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
14763 Bool isI64 = bitSF == 1;
14764 Bool isF64 = (ty & 1) == 1;
14765 Bool isU = (op & 1) == 1;
14766 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14768 Int fbits = 64 - sc;
14769 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
14771 Double scale = two_to_the_minus(fbits);
14772 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14773 : IRExpr_Const(IRConst_F32( (Float)scale ));
14774 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
14776 const IROp ops[8]
14777 = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
14778 Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
14779 IRExpr* src = getIRegOrZR(isI64, nn);
14780 IRExpr* res = (isF64 && !isI64)
14781 ? unop(ops[ix], src)
14782 : binop(ops[ix],
14783 mkexpr(mk_get_IR_rounding_mode()), src);
14784 putQReg128(dd, mkV128(0));
14785 putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));
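      // (The Irrm_NEAREST for this scale multiply is mostly immaterial:
      // multiplying by an exact power of two is exact except on
      // underflow/overflow.)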
14787 DIP("%ccvtf %s, %s, #%d\n",
14788 isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
14789 nameIRegOrZR(isI64, nn), fbits);
14790 return True;
14793 return False;
14794 # undef INSN
14798 static
14799 Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
14801 /* 31 30 29 28 23 21 20 18 15 9 4
14802 sf 0 0 11110 type 1 rmode opcode 000000 n d
14803 The first 3 bits are really "sf 0 S", but S is always zero.
14804 Decode fields: sf,type,rmode,opcode
14806 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14807 if (INSN(30,29) != BITS2(0,0)
14808 || INSN(28,24) != BITS5(1,1,1,1,0)
14809 || INSN(21,21) != 1
14810 || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
14811 return False;
14813 UInt bitSF = INSN(31,31);
14814 UInt ty = INSN(23,22); // type
14815 UInt rm = INSN(20,19); // rmode
14816 UInt op = INSN(18,16); // opcode
14817 UInt nn = INSN(9,5);
14818 UInt dd = INSN(4,0);

   // op = 000, 001
   /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
   /*    30       23   20 18  15     9 4
      sf 00 11110 0x 1 00 000 000000 n d  FCVTNS Rd, Fn (round to
      sf 00 11110 0x 1 00 001 000000 n d  FCVTNU Rd, Fn  nearest)
      ---------------- 01 --------------  FCVTP-------- (round to +inf)
      ---------------- 10 --------------  FCVTM-------- (round to -inf)
      ---------------- 11 --------------  FCVTZ-------- (round to zero)
      ---------------- 00 100 ----------  FCVTAS------- (nearest, ties away)
      ---------------- 00 101 ----------  FCVTAU------- (nearest, ties away)

      Rd is Xd when sf==1, Wd when sf==0
      Fn is Dn when x==1, Sn when x==0
      20:19 carry the rounding mode, using the same encoding as FPCR
   */
   if (ty <= X01
       && (   ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
           || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
          )
      ) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      /* Decide on the IR rounding mode to use. */
      IRRoundingMode irrm = 8; /*impossible*/
      HChar ch = '?';
      if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
         switch (rm) {
            case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
            case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
            case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
            case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
            default: vassert(0);
         }
      } else {
         vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
         switch (rm) {
            case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
            default: vassert(0);
         }
      }
      vassert(irrm != 8);
      /* Decide on the conversion primop, based on the source size,
         dest size and signedness (8 possibilities).  Case coding:
            F32 ->s I32   0
            F32 ->u I32   1
            F32 ->s I64   2
            F32 ->u I64   3
            F64 ->s I32   4
            F64 ->u I32   5
            F64 ->s I64   6
            F64 ->u I64   7
      */
      UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);
      vassert(ix < 8);
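      /* Worked example (illustrative): FCVTMS Xd, Sn has sf == 1,
         ty == 00 and op == 000, so isF64 == 0, isI64 == 1, isU == 0,
         giving ix == 2 and hence Iop_F32toI64S below, with
         irrm == Irrm_NegINF chosen above. */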
      const IROp iops[8]
         = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
             Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
      IROp iop = iops[ix];
      // A bit of ATCery: bounce all cases we haven't seen an example of.
      if (/* F32toI32S */
             (iop == Iop_F32toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */
          /* F32toI32U */
          || (iop == Iop_F32toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */
          /* F32toI64S */
          || (iop == Iop_F32toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,S */
          /* F32toI64U */
          || (iop == Iop_F32toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,S */
          /* F64toI32S */
          || (iop == Iop_F64toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */
          /* F64toI32U */
          || (iop == Iop_F64toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,D */
          /* F64toI64S */
          || (iop == Iop_F64toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */
          /* F64toI64U */
          || (iop == Iop_F64toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,D */
         ) {
         /* validated */
      } else {
         return False;
      }
      IRType srcTy = isF64 ? Ity_F64 : Ity_F32;
      IRType dstTy = isI64 ? Ity_I64 : Ity_I32;
      IRTemp src = newTemp(srcTy);
      IRTemp dst = newTemp(dstTy);
      assign(src, getQRegLO(nn, srcTy));
      assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
      putIRegOrZR(isI64, dd, mkexpr(dst));
      DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
          nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
      return True;
   }
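
   /* Illustrative note on the rounding-mode choices above: for an
      input of -1.7, FCVTZS rounds toward zero and produces -1, while
      FCVTMS rounds toward minus infinity and produces -2.  The
      concrete values are examples only. */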

   // op = 010, 011
   /* -------------- {S,U}CVTF (scalar, integer) -------------- */
   /* (ix) sf  S 28    ty rm op  15     9 4
      0    0  0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Wn
      1    0  0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Wn
      2    1  0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Xn
      3    1  0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Xn

      4    0  0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Wn
      5    0  0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Wn
      6    1  0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Xn
      7    1  0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Xn

      These are signed/unsigned conversion from integer registers to
      FP registers, all 4 32/64-bit combinations, rounded per FPCR.
   */
   if (ty <= X01 && rm == X00 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
      const IROp ops[8]
         = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
             Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
      IRExpr* src = getIRegOrZR(isI64, nn);
      IRExpr* res = (isF64 && !isI64)
                       ? unop(ops[ix], src)
                       : binop(ops[ix],
                               mkexpr(mk_get_IR_rounding_mode()), src);
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, res);
      DIP("%ccvtf %s, %s\n",
          isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
          nameIRegOrZR(isI64, nn));
      return True;
   }
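
   /* Illustrative note: these are value conversions, not bit moves.
      With Wn holding 0xFFFFFFFF, UCVTF Dd, Wn yields 4294967295.0
      while SCVTF Dd, Wn yields -1.0.  The concrete input is an
      example only. */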

   // op = 110, 111
   /* -------- FMOV (general) -------- */
   /* case sf  S       ty   rm  op  15     9 4
       (1) 0  0 0 11110 00 1 00 111 000000 n d  FMOV Sd,      Wn
       (2) 1  0 0 11110 01 1 00 111 000000 n d  FMOV Dd,      Xn
       (3) 1  0 0 11110 10 1 01 111 000000 n d  FMOV Vd.D[1], Xn

       (4) 0  0 0 11110 00 1 00 110 000000 n d  FMOV Wd, Sn
       (5) 1  0 0 11110 01 1 00 110 000000 n d  FMOV Xd, Dn
       (6) 1  0 0 11110 10 1 01 110 000000 n d  FMOV Xd, Vn.D[1]
   */
   if (1) {
      UInt ix = 0; // case
      if (bitSF == 0) {
         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
            ix = 1;
         else
         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
            ix = 4;
      } else {
         vassert(bitSF == 1);
         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
            ix = 2;
         else
         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
            ix = 5;
         else
         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
            ix = 3;
         else
         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
            ix = 6;
      }
      if (ix > 0) {
         switch (ix) {
            case 1:
               putQReg128(dd, mkV128(0));
               putQRegLO(dd, getIReg32orZR(nn));
               DIP("fmov s%u, w%u\n", dd, nn);
               break;
            case 2:
               putQReg128(dd, mkV128(0));
               putQRegLO(dd, getIReg64orZR(nn));
               DIP("fmov d%u, x%u\n", dd, nn);
               break;
            case 3:
               putQRegHI64(dd, getIReg64orZR(nn));
               DIP("fmov v%u.d[1], x%u\n", dd, nn);
               break;
            case 4:
               putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
               DIP("fmov w%u, s%u\n", dd, nn);
               break;
            case 5:
               putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
               DIP("fmov x%u, d%u\n", dd, nn);
               break;
            case 6:
               putIReg64orZR(dd, getQRegHI64(nn));
               DIP("fmov x%u, v%u.d[1]\n", dd, nn);
               break;
            default:
               vassert(0);
         }
         return True;
      }
      /* undecodable; fall through */
   }

   return False;
#  undef INSN
}
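
/* Illustrative note on the FMOV (general) cases above: FMOV is a raw
   bit transfer, not a value conversion.  For example, FMOV Dd, Xn with
   Xn holding 0x3FF0000000000000 makes Dd read back as 1.0 when
   interpreted as an F64.  The concrete bit pattern is an example
   only. */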

static
Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn,
                           const VexArchInfo* archinfo)
{
   Bool ok;
   ok = dis_AdvSIMD_EXT(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_TBL_TBX(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_across_lanes(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_copy(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_modified_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_copy(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_pairwise(dres, insn, archinfo);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_different(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_same(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_same_extra(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_different(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same_extra(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same_fp16(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_two_reg_misc(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_aes(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_compare(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_conditional_compare(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
   if (UNLIKELY(ok)) return True;
   return False;
}
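
/* A minimal sketch of an alternative, table-driven form of the chain
   above, assuming all sub-decoders shared the (DisResult*, UInt)
   signature (dis_AdvSIMD_scalar_pairwise also needs archinfo, so it
   would require special-casing):

      typedef Bool (*AdvSIMDDecoder)(DisResult*, UInt);
      static const AdvSIMDDecoder decoders[]
         = { dis_AdvSIMD_EXT, dis_AdvSIMD_TBL_TBX, ... };
      UInt i;
      for (i = 0; i < sizeof(decoders)/sizeof(decoders[0]); i++)
         if (decoders[i](dres, insn)) return True;
      return False;

   This is illustrative only; the unrolled sequence above is what the
   file actually uses. */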

/*------------------------------------------------------------*/
/*--- Disassemble a single ARM64 instruction               ---*/
/*------------------------------------------------------------*/

/* Disassemble a single ARM64 instruction into IR.  The instruction
   is located at |guest_instr| and has guest IP of
   |guest_PC_curr_instr|, which will have been set before the call
   here.  Returns True iff the instruction was decoded, in which case
   *dres will be set accordingly, or False, in which case *dres should
   be ignored by the caller. */

static
Bool disInstr_ARM64_WRK (
        /*MB_OUT*/DisResult*    dres,
        const UChar*            guest_instr,
        const VexArchInfo*      archinfo,
        const VexAbiInfo*       abiinfo,
        Bool                    sigill_diag
     )
{
   // A macro to fish bits out of 'insn'.
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))

//ZZ    DisResult dres;
//ZZ    UInt      insn;
//ZZ    //Bool     allow_VFP = False;
//ZZ    //UInt     hwcaps = archinfo->hwcaps;
//ZZ    IRTemp    condT; /* :: Ity_I32 */
//ZZ    UInt      summary;
//ZZ    HChar     dis_buf[128];  // big enough to hold LDMIA etc text
//ZZ
//ZZ    /* What insn variants are we supporting today? */
//ZZ    //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
//ZZ    // etc etc

   /* Set result defaults. */
   dres->whatNext    = Dis_Continue;
   dres->len         = 4;
   dres->jk_StopHere = Ijk_INVALID;
   dres->hint        = Dis_HintNone;

   /* At least this is simple on ARM64: insns are all 4 bytes long, and
      4-aligned.  So just fish the whole thing out of memory right now
      and have done. */
   UInt insn = getUIntLittleEndianly( guest_instr );

   if (0) vex_printf("insn: 0x%x\n", insn);

   DIP("\t(arm64) 0x%llx: ", (ULong)guest_PC_curr_instr);

   vassert(0 == (guest_PC_curr_instr & 3ULL));

   /* ----------------------------------------------------------- */

   /* Spot "Special" instructions (see comment at top of file). */
   {
      const UChar* code = guest_instr;
      /* Spot the 16-byte preamble:
            93CC0D8C   ror x12, x12, #3
            93CC358C   ror x12, x12, #13
            93CCCD8C   ror x12, x12, #51
            93CCF58C   ror x12, x12, #61
      */
      UInt word1 = 0x93CC0D8C;
      UInt word2 = 0x93CC358C;
      UInt word3 = 0x93CCCD8C;
      UInt word4 = 0x93CCF58C;
      if (getUIntLittleEndianly(code+ 0) == word1 &&
          getUIntLittleEndianly(code+ 4) == word2 &&
          getUIntLittleEndianly(code+ 8) == word3 &&
          getUIntLittleEndianly(code+12) == word4) {
         /* Got a "Special" instruction preamble.  Which one is it? */
         if (getUIntLittleEndianly(code+16) == 0xAA0A014A
                                               /* orr x10,x10,x10 */) {
            /* X3 = client_request ( X4 ) */
            DIP("x3 = client_request ( x4 )\n");
            putPC(mkU64( guest_PC_curr_instr + 20 ));
            dres->jk_StopHere = Ijk_ClientReq;
            dres->whatNext    = Dis_StopHere;
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA0B016B
                                               /* orr x11,x11,x11 */) {
            /* X3 = guest_NRADDR */
            DIP("x3 = guest_NRADDR\n");
            dres->len = 20;
            putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA0C018C
                                               /* orr x12,x12,x12 */) {
            /* branch-and-link-to-noredir X8 */
            DIP("branch-and-link-to-noredir x8\n");
            putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20));
            putPC(getIReg64orZR(8));
            dres->jk_StopHere = Ijk_NoRedir;
            dres->whatNext    = Dis_StopHere;
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA090129
                                               /* orr x9,x9,x9 */) {
            /* IR injection */
            DIP("IR injection\n");
            vex_inject_ir(irsb, Iend_LE);
            // Invalidate the current insn. The reason is that the IRop we're
            // injecting here can change. In which case the translation has to
            // be redone. For ease of handling, we simply invalidate all the
            // time.
            stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr)));
            stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(20)));
            putPC(mkU64( guest_PC_curr_instr + 20 ));
            dres->whatNext    = Dis_StopHere;
            dres->jk_StopHere = Ijk_InvalICache;
            return True;
         }
         /* We don't know what it is. */
         return False;
         /*NOTREACHED*/
      }
   }
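
   /* Illustrative note: a complete client_request special sequence is
      therefore 20 bytes long -- the five little-endian words

         93CC0D8C 93CC358C 93CCCD8C 93CCF58C AA0A014A

      starting at the current guest PC, which is why the handlers above
      advance the PC (or set dres->len) by 20 for these cases. */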

   /* ----------------------------------------------------------- */

   /* Main ARM64 instruction decoder starts here. */

   Bool ok = False;

   /* insn[28:25] determines the top-level grouping, so let's start
      off with that.

      For all of these dis_ARM64_ functions, we pass *dres with the
      normal default results "insn OK, 4 bytes long, keep decoding" so
      they don't need to change it.  However, decodes of control-flow
      insns may cause *dres to change.
   */
   switch (INSN(28,25)) {
      case BITS4(1,0,0,0): case BITS4(1,0,0,1):
         // Data processing - immediate
         ok = dis_ARM64_data_processing_immediate(dres, insn, sigill_diag);
         break;
      case BITS4(1,0,1,0): case BITS4(1,0,1,1):
         // Branch, exception generation and system instructions
         ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo, sigill_diag);
         break;
      case BITS4(0,1,0,0): case BITS4(0,1,1,0):
      case BITS4(1,1,0,0): case BITS4(1,1,1,0):
         // Loads and stores
         ok = dis_ARM64_load_store(dres, insn, abiinfo, sigill_diag);
         break;
      case BITS4(0,1,0,1): case BITS4(1,1,0,1):
         // Data processing - register
         ok = dis_ARM64_data_processing_register(dres, insn, sigill_diag);
         break;
      case BITS4(0,1,1,1): case BITS4(1,1,1,1):
         // Data processing - SIMD and floating point
         ok = dis_ARM64_simd_and_fp(dres, insn, archinfo);
         break;
      case BITS4(0,0,0,0): case BITS4(0,0,0,1):
      case BITS4(0,0,1,0): case BITS4(0,0,1,1):
         // UNALLOCATED
         break;
      default:
         vassert(0); /* Can't happen */
   }
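
   /* Worked example (illustrative): for insn == 0x9E670020, which per
      the FMOV (general) table earlier in this file encodes
      FMOV D0, X1, INSN(28,25) is BITS4(1,1,1,1), so decoding is handed
      to dis_ARM64_simd_and_fp.  The concrete opcode value is an
      assumption derived from that table, not something this file
      states. */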

   /* If the next-level down decoders failed, make sure |dres| didn't
      get changed. */
   if (!ok) {
      vassert(dres->whatNext    == Dis_Continue);
      vassert(dres->len         == 4);
      vassert(dres->jk_StopHere == Ijk_INVALID);
   }

   return ok;

#  undef INSN
}


/*------------------------------------------------------------*/
/*--- Top-level fn                                         ---*/
/*------------------------------------------------------------*/

/* Disassemble a single instruction into IR.  The instruction
   is located in host memory at &guest_code[delta]. */

DisResult disInstr_ARM64 ( IRSB*                irsb_IN,
                           const UChar*         guest_code_IN,
                           Long                 delta_IN,
                           Addr                 guest_IP,
                           VexArch              guest_arch,
                           const VexArchInfo*   archinfo,
                           const VexAbiInfo*    abiinfo,
                           VexEndness           host_endness_IN,
                           Bool                 sigill_diag_IN )
{
   DisResult dres;
   vex_bzero(&dres, sizeof(dres));

   /* Set globals (see top of this file) */
   vassert(guest_arch == VexArchARM64);

   irsb                = irsb_IN;
   host_endness        = host_endness_IN;
   guest_PC_curr_instr = (Addr64)guest_IP;

   /* Sanity checks */
   /* (x::UInt - 2) <= 15   ===   x >= 2 && x <= 17 (I hope) */
   vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
   vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);

   /* Try to decode */
   Bool ok = disInstr_ARM64_WRK( &dres,
                                 &guest_code_IN[delta_IN],
                                 archinfo, abiinfo, sigill_diag_IN );
   if (ok) {
      /* All decode successes end up here. */
      vassert(dres.len == 4 || dres.len == 20);
      switch (dres.whatNext) {
         case Dis_Continue:
            putPC( mkU64(dres.len + guest_PC_curr_instr) );
            break;
         case Dis_StopHere:
            break;
         default:
            vassert(0);
      }
      DIP("\n");
   } else {
      /* All decode failures end up here. */
      if (sigill_diag_IN) {
         Int   i, j;
         UChar buf[64];
         UInt  insn
                  = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
         vex_bzero(buf, sizeof(buf));
         for (i = j = 0; i < 32; i++) {
            if (i > 0) {
               if ((i & 7) == 0) buf[j++] = ' ';
               else if ((i & 3) == 0) buf[j++] = '\'';
            }
            buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
         }
         vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
         vex_printf("disInstr(arm64): %s\n", buf);
      }
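
      /* The loop above renders the opcode MSB-first in groups of four
         bits; illustratively, 0x9E670020 would print as
         "1001'1110 0110'0111 0000'0000 0010'0000". */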

      /* Tell the dispatcher that this insn cannot be decoded, and so
         has not been executed, and (is currently) the next to be
         executed.  PC should be up-to-date since it is made so at the
         start of each insn, but nevertheless be paranoid and update
         it again right now. */
      putPC( mkU64(guest_PC_curr_instr) );
      dres.len         = 0;
      dres.whatNext    = Dis_StopHere;
      dres.jk_StopHere = Ijk_NoDecode;
   }
   return dres;
}


/*--------------------------------------------------------------------*/
/*--- end                                   guest_arm64_toIR.c    ---*/
/*--------------------------------------------------------------------*/