/* -*- mode: C; c-basic-offset: 3; -*- */

/*--------------------------------------------------------------------*/
/*--- begin                                      guest_arm64_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2013-2017 OpenWorks

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
*/
/* KNOWN LIMITATIONS 2014-Nov-16

   * Correctness: FMAXNM and FMINNM are implemented the same as
     FMAX/FMIN.

     Also, the FP comparison "unordered" case is implemented as a
     normal FP comparison.

     Both should be fixed.  They behave incorrectly in the presence of
     NaNs.

     FMULX is treated the same as FMUL.  That's also not correct.

   * Floating multiply-add (etc) insns.  Are split into a multiply and
     an add, and so suffer double rounding and hence sometimes the
     least significant mantissa bit is incorrect.  Fix: use the IR
     multiply-add IROps instead.

   * FRINTA, FRINTN are kludged: they just round to nearest.  No special
     handling for the "ties" case.  FRINTX might be dubious too.

   * Ditto FCVTXN.  "Round to odd" (which forces the result's LSB to 1
     when the conversion is inexact) is not implemented; this
     implementation just rounds to nearest.
*/
54 /* "Special" instructions.
56 This instruction decoder can decode four special instructions
57 which mean nothing natively (are no-ops as far as regs/mem are
58 concerned) but have meaning for supporting Valgrind. A special
59 instruction is flagged by a 16-byte preamble:
61 93CC0D8C 93CC358C 93CCCD8C 93CCF58C
62 (ror x12, x12, #3; ror x12, x12, #13
63 ror x12, x12, #51; ror x12, x12, #61)
65 Following that, one of the following 3 are allowed
66 (standard interpretation in parentheses):
68 AA0A014A (orr x10,x10,x10) X3 = client_request ( X4 )
69 AA0B016B (orr x11,x11,x11) X3 = guest_NRADDR
70 AA0C018C (orr x12,x12,x12) branch-and-link-to-noredir X8
71 AA090129 (orr x9,x9,x9) IR injection
73 Any other bytes following the 16-byte preamble are illegal and
74 constitute a failure in instruction decoding. This all assumes
75 that the preamble will never occur except in specific code
76 fragments designed for Valgrind to catch.
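/* Illustrative sketch (not part of the decoder): a client emits the
   preamble and then one marker insn via inline assembly.  The macro
   name below is hypothetical; real clients get the authoritative
   definitions from valgrind.h.  Only the encodings listed above are
   definitive. */
#if 0
#define SPECIAL_INSTRUCTION_PREAMBLE                 \
   "ror x12, x12, #3  ; ror x12, x12, #13 \n\t"      \
   "ror x12, x12, #51 ; ror x12, x12, #61 \n\t"
/* e.g. to read guest_NRADDR into X3, emit the preamble followed by
   "orr x11, x11, x11" and collect the result from x3. */
#endif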
/* Translates ARM64 code to IR. */

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_arm64.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_arm64_defs.h"
/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of an instruction, so
   that we don't have to pass them around endlessly.  CONST means does
   not change during translation of the instruction. */

/* CONST: what is the host's endianness?  We need to know this in
   order to do sub-register accesses to the SIMD/FP registers
   correctly. */
static VexEndness host_endness;

/* CONST: The guest address for the instruction currently being
   translated. */
static Addr64 guest_PC_curr_instr;

/* MOD: The IRSB* into which we're generating code. */
static IRSB* irsb;
/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)
/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- arm insn stream.                                     ---*/
/*------------------------------------------------------------*/

/* Do a little-endian load of a 32-bit word, regardless of the
   endianness of the underlying host. */
static inline UInt getUIntLittleEndianly ( const UChar* p )
{
   UInt w = 0;
   w = (w << 8) | p[3];
   w = (w << 8) | p[2];
   w = (w << 8) | p[1];
   w = (w << 8) | p[0];
   return w;
}
/* Sign extend a N-bit value up to 64 bits, by copying
   bit N-1 into all higher positions. */
static ULong sx_to_64 ( ULong x, UInt n )
{
   vassert(n > 1 && n < 64);
   /* Shift the value up so bit N-1 lands in the sign position, then
      arithmetically shift it back down. */
   x <<= (64 - n);
   Long r = (Long)x;
   r >>= (64 - n);
   return (ULong)r;
}
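/* Worked example: sx_to_64(0x1F, 6) == 0x1F, since bit 5 is zero,
   whereas sx_to_64(0x20, 6) == 0xFFFFFFFFFFFFFFE0, since bit 5 is
   copied into bits 63:6. */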
//ZZ /* Do a little-endian load of a 16-bit word, regardless of the
//ZZ    endianness of the underlying host. */
//ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
//ZZ {
//ZZ    UShort w = 0;
//ZZ    w = (w << 8) | p[1];
//ZZ    w = (w << 8) | p[0];
//ZZ    return w;
//ZZ }
//ZZ
//ZZ static UInt ROR32 ( UInt x, UInt sh ) {
//ZZ    vassert(sh >= 0 && sh < 32);
//ZZ    if (sh == 0)
//ZZ       return x;
//ZZ    else
//ZZ       return (x << (32-sh)) | (x >> sh);
//ZZ }
//ZZ
//ZZ static Int popcount32 ( UInt x )
//ZZ {
//ZZ    Int res = 0, i;
//ZZ    for (i = 0; i < 32; i++) {
//ZZ       res += (x & 1);
//ZZ       x >>= 1;
//ZZ    }
//ZZ    return res;
//ZZ }
//ZZ
//ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
//ZZ {
//ZZ    UInt mask = 1 << ix;
//ZZ    x &= ~mask;
//ZZ    x |= ((b << ix) & mask);
//ZZ    return x;
//ZZ }
#define BITS2(_b1,_b0)  \
   (((_b1) << 1) | (_b0))

#define BITS3(_b2,_b1,_b0)  \
   (((_b2) << 2) | ((_b1) << 1) | (_b0))

#define BITS4(_b3,_b2,_b1,_b0)  \
   (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))

#define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)   \
   ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4)        \
    | BITS4((_b3),(_b2),(_b1),(_b0)))

#define BITS5(_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
#define BITS6(_b5,_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
#define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b8) << 8)  \
    | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b9) << 9) | ((_b8) << 8)  \
    | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b10) << 10)  \
    | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))

#define BITS12(_b11,_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b11) << 11)  \
    | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))

#define X00 BITS2(0,0)
#define X01 BITS2(0,1)
#define X10 BITS2(1,0)
#define X11 BITS2(1,1)

// produces _uint[_bMax:_bMin]
#define SLICE_UInt(_uint,_bMax,_bMin)  \
   (( ((UInt)(_uint)) >> (_bMin))  \
    & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
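/* Worked example: for insn = 0xD2800540 (movz x0, #42),
   SLICE_UInt(insn, 31, 24) == 0xD2 and SLICE_UInt(insn, 4, 0) == 0
   (the Rd field): bits [bMax:bMin] are shifted down and masked.
   Likewise BITS4(1,0,1,1) == 0xB. */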
/*------------------------------------------------------------*/
/*--- Helper bits and pieces for creating IR fragments.    ---*/
/*------------------------------------------------------------*/
static IRExpr* mkV128 ( UShort w )
{
   return IRExpr_Const(IRConst_V128(w));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU32 ( UInt i )
{
   return IRExpr_Const(IRConst_U32(i));
}

static IRExpr* mkU16 ( UInt i )
{
   vassert(i < 65536);
   return IRExpr_Const(IRConst_U16(i));
}

static IRExpr* mkU8 ( UInt i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}
//ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
//ZZ {
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional */
//ZZ       storeLE(addr, data);
//ZZ    } else {
//ZZ       stmt( IRStmt_StoreG(Iend_LE, addr, data,
//ZZ                           binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
//ZZ    }
//ZZ }
//ZZ
//ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
//ZZ                             IRExpr* addr, IRExpr* alt,
//ZZ                             IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional */
//ZZ       IRExpr* loaded = NULL;
//ZZ       switch (cvt) {
//ZZ          case ILGop_Ident32:
//ZZ             loaded = loadLE(Ity_I32, addr); break;
//ZZ          case ILGop_8Uto32:
//ZZ             loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
//ZZ          case ILGop_8Sto32:
//ZZ             loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
//ZZ          case ILGop_16Uto32:
//ZZ             loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
//ZZ          case ILGop_16Sto32:
//ZZ             loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
//ZZ          default:
//ZZ             vassert(0);
//ZZ       }
//ZZ       vassert(loaded != NULL);
//ZZ       assign(dst, loaded);
//ZZ    } else {
//ZZ       /* Generate a guarded load into 'dst', but apply 'cvt' to the
//ZZ          loaded data before putting the data in 'dst'.  If the load
//ZZ          does not take place, 'alt' is placed directly in 'dst'. */
//ZZ       stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
//ZZ                          binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
//ZZ    }
//ZZ }
/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* This is used in many places, so the brevity is an advantage. */
static IRTemp newTempV128(void)
{
   return newTemp(Ity_V128);
}
/* Initialise V128 temporaries en masse. */
static
void newTempsV128_2(IRTemp* t1, IRTemp* t2)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
}

static
void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
}

static
void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   vassert(t4 && *t4 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
   *t4 = newTempV128();
}

static
void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
                    IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   vassert(t4 && *t4 == IRTemp_INVALID);
   vassert(t5 && *t5 == IRTemp_INVALID);
   vassert(t6 && *t6 == IRTemp_INVALID);
   vassert(t7 && *t7 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
   *t4 = newTempV128();
   *t5 = newTempV128();
   *t6 = newTempV128();
   *t7 = newTempV128();
}
//ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
//ZZ    IRRoundingMode. */
//ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
//ZZ {
//ZZ    return mkU32(Irrm_NEAREST);
//ZZ }
//ZZ
//ZZ /* Generate an expression for SRC rotated right by ROT. */
//ZZ static IRExpr* genROR32( IRTemp src, Int rot )
//ZZ {
//ZZ    vassert(rot >= 0 && rot < 32);
//ZZ    if (rot == 0)
//ZZ       return mkexpr(src);
//ZZ    return binop(Iop_Or32,
//ZZ                 binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
//ZZ                 binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
//ZZ }
//ZZ
//ZZ static IRExpr* mkU128 ( ULong i )
//ZZ {
//ZZ    return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
//ZZ }
//ZZ
//ZZ /* Generate a 4-aligned version of the given expression if
//ZZ    the given condition is true.  Else return it unchanged. */
//ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
//ZZ {
//ZZ    if (b)
//ZZ       return binop(Iop_And32, e, mkU32(~3));
//ZZ    else
//ZZ       return e;
//ZZ }
/* Other IR construction helpers. */
static IROp mkAND ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_And32;
      case Ity_I64: return Iop_And64;
      default: vpanic("mkAND");
   }
}

static IROp mkOR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Or32;
      case Ity_I64: return Iop_Or64;
      default: vpanic("mkOR");
   }
}

static IROp mkXOR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Xor32;
      case Ity_I64: return Iop_Xor64;
      default: vpanic("mkXOR");
   }
}

static IROp mkSHL ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Shl32;
      case Ity_I64: return Iop_Shl64;
      default: vpanic("mkSHL");
   }
}

static IROp mkSHR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Shr32;
      case Ity_I64: return Iop_Shr64;
      default: vpanic("mkSHR");
   }
}

static IROp mkSAR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Sar32;
      case Ity_I64: return Iop_Sar64;
      default: vpanic("mkSAR");
   }
}

static IROp mkNOT ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Not32;
      case Ity_I64: return Iop_Not64;
      default: vpanic("mkNOT");
   }
}

static IROp mkADD ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Add32;
      case Ity_I64: return Iop_Add64;
      default: vpanic("mkADD");
   }
}

static IROp mkSUB ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Sub32;
      case Ity_I64: return Iop_Sub64;
      default: vpanic("mkSUB");
   }
}

static IROp mkADDF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_AddF32;
      case Ity_F64: return Iop_AddF64;
      default: vpanic("mkADDF");
   }
}

static IROp mkSUBF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_SubF32;
      case Ity_F64: return Iop_SubF64;
      default: vpanic("mkSUBF");
   }
}

static IROp mkMULF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_MulF32;
      case Ity_F64: return Iop_MulF64;
      default: vpanic("mkMULF");
   }
}

static IROp mkDIVF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_DivF32;
      case Ity_F64: return Iop_DivF64;
      default: vpanic("mkDIVF");
   }
}

static IROp mkNEGF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_NegF32;
      case Ity_F64: return Iop_NegF64;
      default: vpanic("mkNEGF");
   }
}

static IROp mkABSF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_AbsF32;
      case Ity_F64: return Iop_AbsF64;
      default: vpanic("mkABSF");
   }
}

static IROp mkSQRTF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_SqrtF32;
      case Ity_F64: return Iop_SqrtF64;
      default: vpanic("mkSQRTF");
   }
}
static IROp mkVecADD ( UInt size ) {
   const IROp ops[4]
      = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
   vassert(size < 4);
   return ops[size];
}
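/* All the mkVec* helpers below follow this table-lookup pattern: the
   caller passes log2(lane size in bytes), so for example mkVecADD(0)
   == Iop_Add8x16 and mkVecADD(2) == Iop_Add32x4.  Entries that are
   Iop_INVALID mark lane sizes the ISA doesn't provide. */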
static IROp mkVecQADDU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
          Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
          Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSUB ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSUBU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSUBS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
   vassert(size < 4);
   return ops[size];
}
static IROp mkVecSARN ( UInt size ) {
   const IROp ops[4]
      = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHRN ( UInt size ) {
   const IROp ops[4]
      = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHLN ( UInt size ) {
   const IROp ops[4]
      = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCATEVENLANES ( UInt size ) {
   const IROp ops[4]
      = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
          Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCATODDLANES ( UInt size ) {
   const IROp ops[4]
      = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
          Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecINTERLEAVELO ( UInt size ) {
   const IROp ops[4]
      = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
          Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecINTERLEAVEHI ( UInt size ) {
   const IROp ops[4]
      = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
          Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
   vassert(size < 4);
   return ops[size];
}
static IROp mkVecMAXU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMUL ( UInt size ) {
   const IROp ops[4]
      = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
   vassert(size < 3);
   return ops[size];
}

static IROp mkVecMULLU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecMULLS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}
static IROp mkVecCMPEQ ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCMPGTU ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCMPGTS ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecABS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
   const IROp ops[4]
      = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
          Iop_ZeroHI96ofV128,  Iop_ZeroHI64ofV128 };
   vassert(size < 4);
   return ops[size];
}

static IRExpr* mkU ( IRType ty, ULong imm ) {
   switch (ty) {
      case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
      case Ity_I64: return mkU64(imm);
      default: vpanic("mkU");
   }
}
static IROp mkVecQDMULHIS ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQRDMULHIS ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDUQSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
          Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDSQSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
          Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDUQRSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
          Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDSQRSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
          Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecRSHU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecRSHS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
   vassert(size < 4);
   return ops[size];
}
static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
          Iop_NarrowUn64to32x2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Sto8Ux8,  Iop_QNarrowUn32Sto16Ux4,
          Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Sto8Sx8,  Iop_QNarrowUn32Sto16Sx4,
          Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Uto8Ux8,  Iop_QNarrowUn32Uto16Ux4,
          Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQShrNnarrow16Uto8Ux8,  Iop_QandQShrNnarrow32Uto16Ux4,
          Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQSarNnarrow16Sto8Sx8,  Iop_QandQSarNnarrow32Sto16Sx4,
          Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQSarNnarrow16Sto8Ux8,  Iop_QandQSarNnarrow32Sto16Ux4,
          Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRShrNnarrow16Uto8Ux8,  Iop_QandQRShrNnarrow32Uto16Ux4,
          Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRSarNnarrow16Sto8Sx8,  Iop_QandQRSarNnarrow32Sto16Sx4,
          Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRSarNnarrow16Sto8Ux8,  Iop_QandQRSarNnarrow32Sto16Ux4,
          Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}
static IROp mkVecQSHLNSATUU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
          Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSHLNSATSS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
          Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSHLNSATSU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
          Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecADDF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
   vassert(size < 4);
   return ops[size];
}
/* Generate IR to create 'arg rotated right by imm', for sane values
   of 'ty' and 'imm'. */
static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
{
   UInt w = 0;
   if (ty == Ity_I64) {
      w = 64;
   } else {
      vassert(ty == Ity_I32);
      w = 32;
   }
   vassert(w != 0);
   vassert(imm < w);
   if (imm == 0) {
      return arg;
   }
   IRTemp res = newTemp(ty);
   assign(res, binop(mkOR(ty),
                     binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
                     binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
   return res;
}
/* Generate IR to set the returned temp to either all-zeroes or
   all ones, as a copy of arg<imm>. */
static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
{
   UInt w = 0;
   if (ty == Ity_I64) {
      w = 64;
   } else {
      vassert(ty == Ity_I32);
      w = 32;
   }
   vassert(w != 0);
   vassert(imm < w);
   IRTemp res = newTemp(ty);
   assign(res, binop(mkSAR(ty),
                     binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
                     mkU8(w - 1)));
   return res;
}
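/* Worked example: with ty == Ity_I32, mathROR(ty, t, 8) turns
   0x11223344 into 0x44112233, and mathREPLICATE(ty, t, 7) yields
   0xFFFFFFFF when bit 7 of t is 1, else 0: the selected bit is
   shifted up to the sign position and arithmetically shifted back
   down. */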
/* S-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenSto64 ( IRType srcTy, IRExpr* e )
{
   switch (srcTy) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Sto64, e);
      case Ity_I16: return unop(Iop_16Sto64, e);
      case Ity_I8:  return unop(Iop_8Sto64, e);
      default: vpanic("widenSto64(arm64)");
   }
}

/* U-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
{
   switch (srcTy) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Uto64, e);
      case Ity_I16: return unop(Iop_16Uto64, e);
      case Ity_I8:  return unop(Iop_8Uto64, e);
      default: vpanic("widenUto64(arm64)");
   }
}

/* Narrow 64 bit int expr to 8/16/32/64.  Clearly only some
   of these combinations make sense. */
static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
{
   switch (dstTy) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_64to32, e);
      case Ity_I16: return unop(Iop_64to16, e);
      case Ity_I8:  return unop(Iop_64to8, e);
      default: vpanic("narrowFrom64(arm64)");
   }
}
/*------------------------------------------------------------*/
/*--- Helpers for accessing guest registers.               ---*/
/*------------------------------------------------------------*/

#define OFFB_X0       offsetof(VexGuestARM64State,guest_X0)
#define OFFB_X1       offsetof(VexGuestARM64State,guest_X1)
#define OFFB_X2       offsetof(VexGuestARM64State,guest_X2)
#define OFFB_X3       offsetof(VexGuestARM64State,guest_X3)
#define OFFB_X4       offsetof(VexGuestARM64State,guest_X4)
#define OFFB_X5       offsetof(VexGuestARM64State,guest_X5)
#define OFFB_X6       offsetof(VexGuestARM64State,guest_X6)
#define OFFB_X7       offsetof(VexGuestARM64State,guest_X7)
#define OFFB_X8       offsetof(VexGuestARM64State,guest_X8)
#define OFFB_X9       offsetof(VexGuestARM64State,guest_X9)
#define OFFB_X10      offsetof(VexGuestARM64State,guest_X10)
#define OFFB_X11      offsetof(VexGuestARM64State,guest_X11)
#define OFFB_X12      offsetof(VexGuestARM64State,guest_X12)
#define OFFB_X13      offsetof(VexGuestARM64State,guest_X13)
#define OFFB_X14      offsetof(VexGuestARM64State,guest_X14)
#define OFFB_X15      offsetof(VexGuestARM64State,guest_X15)
#define OFFB_X16      offsetof(VexGuestARM64State,guest_X16)
#define OFFB_X17      offsetof(VexGuestARM64State,guest_X17)
#define OFFB_X18      offsetof(VexGuestARM64State,guest_X18)
#define OFFB_X19      offsetof(VexGuestARM64State,guest_X19)
#define OFFB_X20      offsetof(VexGuestARM64State,guest_X20)
#define OFFB_X21      offsetof(VexGuestARM64State,guest_X21)
#define OFFB_X22      offsetof(VexGuestARM64State,guest_X22)
#define OFFB_X23      offsetof(VexGuestARM64State,guest_X23)
#define OFFB_X24      offsetof(VexGuestARM64State,guest_X24)
#define OFFB_X25      offsetof(VexGuestARM64State,guest_X25)
#define OFFB_X26      offsetof(VexGuestARM64State,guest_X26)
#define OFFB_X27      offsetof(VexGuestARM64State,guest_X27)
#define OFFB_X28      offsetof(VexGuestARM64State,guest_X28)
#define OFFB_X29      offsetof(VexGuestARM64State,guest_X29)
#define OFFB_X30      offsetof(VexGuestARM64State,guest_X30)

#define OFFB_XSP      offsetof(VexGuestARM64State,guest_XSP)
#define OFFB_PC       offsetof(VexGuestARM64State,guest_PC)

#define OFFB_CC_OP    offsetof(VexGuestARM64State,guest_CC_OP)
#define OFFB_CC_DEP1  offsetof(VexGuestARM64State,guest_CC_DEP1)
#define OFFB_CC_DEP2  offsetof(VexGuestARM64State,guest_CC_DEP2)
#define OFFB_CC_NDEP  offsetof(VexGuestARM64State,guest_CC_NDEP)

#define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
#define OFFB_NRADDR   offsetof(VexGuestARM64State,guest_NRADDR)

#define OFFB_Q0       offsetof(VexGuestARM64State,guest_Q0)
#define OFFB_Q1       offsetof(VexGuestARM64State,guest_Q1)
#define OFFB_Q2       offsetof(VexGuestARM64State,guest_Q2)
#define OFFB_Q3       offsetof(VexGuestARM64State,guest_Q3)
#define OFFB_Q4       offsetof(VexGuestARM64State,guest_Q4)
#define OFFB_Q5       offsetof(VexGuestARM64State,guest_Q5)
#define OFFB_Q6       offsetof(VexGuestARM64State,guest_Q6)
#define OFFB_Q7       offsetof(VexGuestARM64State,guest_Q7)
#define OFFB_Q8       offsetof(VexGuestARM64State,guest_Q8)
#define OFFB_Q9       offsetof(VexGuestARM64State,guest_Q9)
#define OFFB_Q10      offsetof(VexGuestARM64State,guest_Q10)
#define OFFB_Q11      offsetof(VexGuestARM64State,guest_Q11)
#define OFFB_Q12      offsetof(VexGuestARM64State,guest_Q12)
#define OFFB_Q13      offsetof(VexGuestARM64State,guest_Q13)
#define OFFB_Q14      offsetof(VexGuestARM64State,guest_Q14)
#define OFFB_Q15      offsetof(VexGuestARM64State,guest_Q15)
#define OFFB_Q16      offsetof(VexGuestARM64State,guest_Q16)
#define OFFB_Q17      offsetof(VexGuestARM64State,guest_Q17)
#define OFFB_Q18      offsetof(VexGuestARM64State,guest_Q18)
#define OFFB_Q19      offsetof(VexGuestARM64State,guest_Q19)
#define OFFB_Q20      offsetof(VexGuestARM64State,guest_Q20)
#define OFFB_Q21      offsetof(VexGuestARM64State,guest_Q21)
#define OFFB_Q22      offsetof(VexGuestARM64State,guest_Q22)
#define OFFB_Q23      offsetof(VexGuestARM64State,guest_Q23)
#define OFFB_Q24      offsetof(VexGuestARM64State,guest_Q24)
#define OFFB_Q25      offsetof(VexGuestARM64State,guest_Q25)
#define OFFB_Q26      offsetof(VexGuestARM64State,guest_Q26)
#define OFFB_Q27      offsetof(VexGuestARM64State,guest_Q27)
#define OFFB_Q28      offsetof(VexGuestARM64State,guest_Q28)
#define OFFB_Q29      offsetof(VexGuestARM64State,guest_Q29)
#define OFFB_Q30      offsetof(VexGuestARM64State,guest_Q30)
#define OFFB_Q31      offsetof(VexGuestARM64State,guest_Q31)

#define OFFB_FPCR     offsetof(VexGuestARM64State,guest_FPCR)
#define OFFB_QCFLAG   offsetof(VexGuestARM64State,guest_QCFLAG)

#define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
#define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)

#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
/* ---------------- Integer registers ---------------- */

static Int offsetIReg64 ( UInt iregNo )
{
   /* Do we care about endianness here?  We do if sub-parts of integer
      registers are accessed. */
   switch (iregNo) {
      case 0:  return OFFB_X0;
      case 1:  return OFFB_X1;
      case 2:  return OFFB_X2;
      case 3:  return OFFB_X3;
      case 4:  return OFFB_X4;
      case 5:  return OFFB_X5;
      case 6:  return OFFB_X6;
      case 7:  return OFFB_X7;
      case 8:  return OFFB_X8;
      case 9:  return OFFB_X9;
      case 10: return OFFB_X10;
      case 11: return OFFB_X11;
      case 12: return OFFB_X12;
      case 13: return OFFB_X13;
      case 14: return OFFB_X14;
      case 15: return OFFB_X15;
      case 16: return OFFB_X16;
      case 17: return OFFB_X17;
      case 18: return OFFB_X18;
      case 19: return OFFB_X19;
      case 20: return OFFB_X20;
      case 21: return OFFB_X21;
      case 22: return OFFB_X22;
      case 23: return OFFB_X23;
      case 24: return OFFB_X24;
      case 25: return OFFB_X25;
      case 26: return OFFB_X26;
      case 27: return OFFB_X27;
      case 28: return OFFB_X28;
      case 29: return OFFB_X29;
      case 30: return OFFB_X30;
      /* but not 31 */
      default: vassert(0);
   }
}
static Int offsetIReg64orSP ( UInt iregNo )
{
   return iregNo == 31  ? OFFB_XSP  : offsetIReg64(iregNo);
}

static const HChar* nameIReg64orZR ( UInt iregNo )
{
   vassert(iregNo < 32);
   static const HChar* names[32]
      = { "x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",
          "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
          "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
          "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
   return names[iregNo];
}

static const HChar* nameIReg64orSP ( UInt iregNo )
{
   if (iregNo == 31) {
      return "sp";
   }
   vassert(iregNo < 31);
   return nameIReg64orZR(iregNo);
}

static IRExpr* getIReg64orSP ( UInt iregNo )
{
   vassert(iregNo < 32);
   return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
}

static IRExpr* getIReg64orZR ( UInt iregNo )
{
   if (iregNo == 31) {
      return mkU64(0);
   }
   vassert(iregNo < 31);
   return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
}

static void putIReg64orSP ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
}

static void putIReg64orZR ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   if (iregNo == 31) {
      return;
   }
   vassert(iregNo < 31);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
}

static const HChar* nameIReg32orZR ( UInt iregNo )
{
   vassert(iregNo < 32);
   static const HChar* names[32]
      = { "w0",  "w1",  "w2",  "w3",  "w4",  "w5",  "w6",  "w7",
          "w8",  "w9",  "w10", "w11", "w12", "w13", "w14", "w15",
          "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
          "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
   return names[iregNo];
}

static const HChar* nameIReg32orSP ( UInt iregNo )
{
   if (iregNo == 31) {
      return "wsp";
   }
   vassert(iregNo < 31);
   return nameIReg32orZR(iregNo);
}

static IRExpr* getIReg32orSP ( UInt iregNo )
{
   vassert(iregNo < 32);
   return unop(Iop_64to32,
               IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
}

static IRExpr* getIReg32orZR ( UInt iregNo )
{
   if (iregNo == 31) {
      return mkU32(0);
   }
   vassert(iregNo < 31);
   return unop(Iop_64to32,
               IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
}

static void putIReg32orSP ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
}

static void putIReg32orZR ( UInt iregNo, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   if (iregNo == 31) {
      return;
   }
   vassert(iregNo < 31);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
}

static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
}

static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
}

static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
}

static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
{
   vassert(is64 == True || is64 == False);
   if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
}

static void putPC ( IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put(OFFB_PC, e) );
}
/* ---------------- Vector (Q) registers ---------------- */

static Int offsetQReg128 ( UInt qregNo )
{
   /* We don't care about endianness at this point.  It only becomes
      relevant when dealing with sections of these registers. */
   switch (qregNo) {
      case 0:  return OFFB_Q0;
      case 1:  return OFFB_Q1;
      case 2:  return OFFB_Q2;
      case 3:  return OFFB_Q3;
      case 4:  return OFFB_Q4;
      case 5:  return OFFB_Q5;
      case 6:  return OFFB_Q6;
      case 7:  return OFFB_Q7;
      case 8:  return OFFB_Q8;
      case 9:  return OFFB_Q9;
      case 10: return OFFB_Q10;
      case 11: return OFFB_Q11;
      case 12: return OFFB_Q12;
      case 13: return OFFB_Q13;
      case 14: return OFFB_Q14;
      case 15: return OFFB_Q15;
      case 16: return OFFB_Q16;
      case 17: return OFFB_Q17;
      case 18: return OFFB_Q18;
      case 19: return OFFB_Q19;
      case 20: return OFFB_Q20;
      case 21: return OFFB_Q21;
      case 22: return OFFB_Q22;
      case 23: return OFFB_Q23;
      case 24: return OFFB_Q24;
      case 25: return OFFB_Q25;
      case 26: return OFFB_Q26;
      case 27: return OFFB_Q27;
      case 28: return OFFB_Q28;
      case 29: return OFFB_Q29;
      case 30: return OFFB_Q30;
      case 31: return OFFB_Q31;
      default: vassert(0);
   }
}

/* Write to a complete Qreg. */
static void putQReg128 ( UInt qregNo, IRExpr* e )
{
   vassert(qregNo < 32);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
   stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
}

/* Read a complete Qreg. */
static IRExpr* getQReg128 ( UInt qregNo )
{
   vassert(qregNo < 32);
   return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
}
/* Produce the IR type for some sub-part of a vector.  For 32- and 64-
   bit sub-parts we can choose either integer or float types, and
   choose float on the basis that that is the common use case and so
   will give least interference with Put-to-Get forwarding later
   on. */
static IRType preferredVectorSubTypeFromSize ( UInt szB )
{
   switch (szB) {
      case 1:  return Ity_I8;
      case 2:  return Ity_I16;
      case 4:  return Ity_I32; //Ity_F32;
      case 8:  return Ity_F64;
      case 16: return Ity_V128;
      default: vassert(0);
   }
}
/* Find the offset of the laneNo'th lane of type laneTy in the given
   Qreg.  Since the host is little-endian, the least significant lane
   has the lowest offset. */
static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
{
   vassert(host_endness == VexEndnessLE);
   Int base = offsetQReg128(qregNo);
   /* Since the host is little-endian, the least significant lane
      will be at the lowest address. */
   /* Restrict this to known types, so as to avoid silently accepting
      stupid types. */
   UInt laneSzB = 0;
   switch (laneTy) {
      case Ity_I8:                 laneSzB = 1;  break;
      case Ity_F16: case Ity_I16:  laneSzB = 2;  break;
      case Ity_F32: case Ity_I32:  laneSzB = 4;  break;
      case Ity_F64: case Ity_I64:  laneSzB = 8;  break;
      case Ity_V128:               laneSzB = 16; break;
      default: break;
   }
   vassert(laneSzB > 0);
   UInt minOff = laneNo * laneSzB;
   UInt maxOff = minOff + laneSzB - 1;
   vassert(maxOff < 16);
   return base + minOff;
}
/* Put to the least significant lane of a Qreg. */
static void putQRegLO ( UInt qregNo, IRExpr* e )
{
   IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   Int    off = offsetQRegLane(qregNo, ty, 0);
   switch (ty) {
      case Ity_I8:  case Ity_I16: case Ity_I32: case Ity_I64:
      case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
         break;
      default:
         vassert(0); // Other cases are probably invalid
   }
   stmt(IRStmt_Put(off, e));
}

/* Get from the least significant lane of a Qreg. */
static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
{
   Int off = offsetQRegLane(qregNo, ty, 0);
   switch (ty) {
      case Ity_I8:
      case Ity_F16: case Ity_I16:
      case Ity_I32: case Ity_I64:
      case Ity_F32: case Ity_F64: case Ity_V128:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   return IRExpr_Get(off, ty);
}
static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
{
   static const HChar* namesQ[32]
      = { "q0",  "q1",  "q2",  "q3",  "q4",  "q5",  "q6",  "q7",
          "q8",  "q9",  "q10", "q11", "q12", "q13", "q14", "q15",
          "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
          "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
   static const HChar* namesD[32]
      = { "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
          "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15",
          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
   static const HChar* namesS[32]
      = { "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",
          "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15",
          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
          "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
   static const HChar* namesH[32]
      = { "h0",  "h1",  "h2",  "h3",  "h4",  "h5",  "h6",  "h7",
          "h8",  "h9",  "h10", "h11", "h12", "h13", "h14", "h15",
          "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
          "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
   static const HChar* namesB[32]
      = { "b0",  "b1",  "b2",  "b3",  "b4",  "b5",  "b6",  "b7",
          "b8",  "b9",  "b10", "b11", "b12", "b13", "b14", "b15",
          "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
          "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
   vassert(qregNo < 32);
   switch (sizeofIRType(laneTy)) {
      case 1:  return namesB[qregNo];
      case 2:  return namesH[qregNo];
      case 4:  return namesS[qregNo];
      case 8:  return namesD[qregNo];
      case 16: return namesQ[qregNo];
      default: vassert(0);
   }
   /*NOTREACHED*/
}

static const HChar* nameQReg128 ( UInt qregNo )
{
   return nameQRegLO(qregNo, Ity_V128);
}
/* Find the offset of the most significant half (8 bytes) of the given
   Qreg.  This requires knowing the endianness of the host. */
static Int offsetQRegHI64 ( UInt qregNo )
{
   return offsetQRegLane(qregNo, Ity_I64, 1);
}

static IRExpr* getQRegHI64 ( UInt qregNo )
{
   return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
}

static void putQRegHI64 ( UInt qregNo, IRExpr* e )
{
   IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   Int    off = offsetQRegHI64(qregNo);
   switch (ty) {
      case Ity_I64: case Ity_F64:
         break;
      default:
         vassert(0); // Other cases are plain wrong
   }
   stmt(IRStmt_Put(off, e));
}

/* Put to a specified lane of a Qreg. */
static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
{
   IRType laneTy  = typeOfIRExpr(irsb->tyenv, e);
   Int    off     = offsetQRegLane(qregNo, laneTy, laneNo);
   switch (laneTy) {
      case Ity_F64: case Ity_I64:
      case Ity_I32: case Ity_F32:
      case Ity_I16: case Ity_F16:
      case Ity_I8:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   stmt(IRStmt_Put(off, e));
}

/* Get from a specified lane of a Qreg. */
static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
{
   Int off = offsetQRegLane(qregNo, laneTy, laneNo);
   switch (laneTy) {
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
      case Ity_F64: case Ity_F32: case Ity_F16:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   return IRExpr_Get(off, laneTy);
}
//ZZ /* ---------------- Misc registers ---------------- */
//ZZ
//ZZ static void putMiscReg32 ( UInt    gsoffset,
//ZZ                            IRExpr* e, /* :: Ity_I32 */
//ZZ                            IRTemp  guardT /* :: Ity_I32, 0 or 1 */)
//ZZ {
//ZZ    switch (gsoffset) {
//ZZ       case OFFB_FPSCR:   break;
//ZZ       case OFFB_QFLAG32: break;
//ZZ       case OFFB_GEFLAG0: break;
//ZZ       case OFFB_GEFLAG1: break;
//ZZ       case OFFB_GEFLAG2: break;
//ZZ       case OFFB_GEFLAG3: break;
//ZZ       default: vassert(0); /* awaiting more cases */
//ZZ    }
//ZZ    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
//ZZ
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional write */
//ZZ       stmt(IRStmt_Put(gsoffset, e));
//ZZ    } else {
//ZZ       stmt(IRStmt_Put(
//ZZ          gsoffset,
//ZZ          IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
//ZZ                      e, IRExpr_Get(gsoffset, Ity_I32) )
//ZZ       ));
//ZZ    }
//ZZ }
//ZZ
//ZZ static IRTemp get_ITSTATE ( void )
//ZZ {
//ZZ    ASSERT_IS_THUMB;
//ZZ    IRTemp t = newTemp(Ity_I32);
//ZZ    assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
//ZZ    return t;
//ZZ }
//ZZ
//ZZ static void put_ITSTATE ( IRTemp t )
//ZZ {
//ZZ    ASSERT_IS_THUMB;
//ZZ    stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
//ZZ }
//ZZ
//ZZ static IRTemp get_QFLAG32 ( void )
//ZZ {
//ZZ    IRTemp t = newTemp(Ity_I32);
//ZZ    assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
//ZZ    return t;
//ZZ }
//ZZ
//ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
//ZZ {
//ZZ    putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
//ZZ }
//ZZ
//ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
//ZZ    Status Register) to indicate that overflow or saturation occurred.
//ZZ    Nb: t must be zero to denote no saturation, and any nonzero
//ZZ    value to indicate saturation. */
//ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
//ZZ {
//ZZ    IRTemp old = get_QFLAG32();
//ZZ    IRTemp nyu = newTemp(Ity_I32);
//ZZ    assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
//ZZ    put_QFLAG32(nyu, condT);
//ZZ }
/* ---------------- FPCR stuff ---------------- */

/* Generate IR to get hold of the rounding mode bits in FPCR, and
   convert them to IR format.  Bind the final result to the
   returned temp. */
static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
{
   /* The ARMvfp encoding for rounding mode bits is:
         00  to nearest
         01  to +infinity
         10  to -infinity
         11  to zero
      We need to convert that to the IR encoding:
         00  to nearest (the default)
         10  to +infinity
         01  to -infinity
         11  to zero
      Which can be done by swapping bits 0 and 1.
      The rmode bits are at 23:22 in FPCR.
   */
   IRTemp armEncd = newTemp(Ity_I32);
   IRTemp swapped = newTemp(Ity_I32);
   /* Fish FPCR[23:22] out, and slide to bottom.  Doesn't matter that
      we don't zero out bits 24 and above, since the assignment to
      'swapped' will mask them out anyway. */
   assign(armEncd,
          binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
   /* Now swap them. */
   assign(swapped,
          binop(Iop_Or32,
                binop(Iop_And32,
                      binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
                      mkU32(2)),
                binop(Iop_And32,
                      binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
                      mkU32(1))
         ));
   return swapped;
}
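/* Worked example of the bit swap: FPCR rounding field 01 (ARM "to
   +infinity") becomes IR encoding 10 (Irrm_PosINF), and 10 (ARM "to
   -infinity") becomes 01 (Irrm_NegINF); 00 and 11 map to themselves. */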
/*------------------------------------------------------------*/
/*--- Helpers for flag handling and conditional insns      ---*/
/*------------------------------------------------------------*/

static const HChar* nameARM64Condcode ( ARM64Condcode cond )
{
   switch (cond) {
      case ARM64CondEQ:  return "eq";
      case ARM64CondNE:  return "ne";
      case ARM64CondCS:  return "cs";  // or 'hs'
      case ARM64CondCC:  return "cc";  // or 'lo'
      case ARM64CondMI:  return "mi";
      case ARM64CondPL:  return "pl";
      case ARM64CondVS:  return "vs";
      case ARM64CondVC:  return "vc";
      case ARM64CondHI:  return "hi";
      case ARM64CondLS:  return "ls";
      case ARM64CondGE:  return "ge";
      case ARM64CondLT:  return "lt";
      case ARM64CondGT:  return "gt";
      case ARM64CondLE:  return "le";
      case ARM64CondAL:  return "al";
      case ARM64CondNV:  return "nv";
      default: vpanic("nameARM64Condcode");
   }
}

/* and a handy shorthand for it */
static const HChar* nameCC ( ARM64Condcode cond ) {
   return nameARM64Condcode(cond);
}
/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
   Ity_I64, suitable for narrowing.  Although the return type is
   Ity_I64, the returned value is either 0 or 1.  'cond' must be
   :: Ity_I64 and must denote the condition to compute in
   bits 7:4, and be zero everywhere else.
*/
static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
{
   vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
   /* And 'cond' had better produce a value in which only bits 7:4 are
      nonzero.  However, obviously we can't assert for that. */

   /* So what we're constructing for the first argument is
      "(cond << 4) | stored-operation".
      However, as per comments above, 'cond' must be supplied
      pre-shifted to this function.

      This pairing scheme requires that the ARM64_CC_OP_ values all fit
      in 4 bits.  Hence we are passing a (COND, OP) pair in the lowest
      8 bits of the first argument. */
   IRExpr** args
      = mkIRExprVec_4(
           binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
           IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
           IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
           IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
        );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "arm64g_calculate_condition", &arm64g_calculate_condition,
           args
        );

   /* Exclude the requested condition, OP and NDEP from definedness
      checking.  We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
   Ity_I64, suitable for narrowing.  Although the return type is
   Ity_I64, the returned value is either 0 or 1.
*/
static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
{
   /* First arg is "(cond << 4) | condition".  This requires that the
      ARM64_CC_OP_ values all fit in 4 bits.  Hence we are passing a
      (COND, OP) pair in the lowest 8 bits of the first argument. */
   vassert(cond >= 0 && cond <= 15);
   return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
}
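/* For instance, for a conditional branch on EQ the decoder calls
   mk_arm64g_calculate_condition(ARM64CondEQ): that passes
   (ARM64CondEQ << 4) as the COND half of the (COND, OP) pair, and the
   helper returns 1 exactly when the Z flag implied by the current
   thunk is set. */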
/* Build IR to calculate just the carry flag from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I64. */
static IRExpr* mk_arm64g_calculate_flag_c ( void )
{
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
//ZZ /* Build IR to calculate just the overflow flag from stored
//ZZ    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
//ZZ    Ity_I32. */
//ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
//ZZ {
//ZZ    IRExpr** args
//ZZ       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
//ZZ                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
//ZZ                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
//ZZ                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
//ZZ    IRExpr* call
//ZZ       = mkIRExprCCall(
//ZZ            Ity_I32,
//ZZ            0/*regparm*/,
//ZZ            "armg_calculate_flag_v", &armg_calculate_flag_v,
//ZZ            args
//ZZ         );
//ZZ    /* Exclude OP and NDEP from definedness checking.  We're only
//ZZ       interested in DEP1 and DEP2. */
//ZZ    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
//ZZ    return call;
//ZZ }
/* Build IR to calculate N Z C V in bits 31:28 of the
   returned word. */
static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
{
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
/* Build IR to set the flags thunk, in the most general case. */
static
void setFlags_D1_D2_ND ( UInt cc_op,
                         IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
{
   vassert(typeOfIRTemp(irsb->tyenv, t_dep1) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, t_dep2) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, t_ndep) == Ity_I64);
   vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(cc_op) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
}
/* Build IR to set the flags thunk after ADD or SUB. */
static
void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
{
   IRTemp argL64 = IRTemp_INVALID;
   IRTemp argR64 = IRTemp_INVALID;
   IRTemp z64    = newTemp(Ity_I64);
   if (is64) {
      argL64 = argL;
      argR64 = argR;
   } else {
      argL64 = newTemp(Ity_I64);
      argR64 = newTemp(Ity_I64);
      assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
      assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
   }
   assign(z64, mkU64(0));
   UInt cc_op = ARM64G_CC_OP_NUMBER;
   /**/ if ( isSUB &&  is64) { cc_op = ARM64G_CC_OP_SUB64; }
   else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
   else if (!isSUB &&  is64) { cc_op = ARM64G_CC_OP_ADD64; }
   else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
   else                      { vassert(0); }
   setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
}
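/* For example, decoding "adds w0, w1, w2" would call
   setFlags_ADD_SUB(False/*!is64*/, False/*!isSUB*/, argL, argR) with
   argL/argR holding the 32-bit operands; the thunk then records
   ARM64G_CC_OP_ADD32 plus both operands, and NZCV is computed lazily
   from them only when a later instruction actually reads the flags. */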
/* Build IR to set the flags thunk after ADC or SBC. */
static
void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
                        IRTemp argL, IRTemp argR, IRTemp oldC )
{
   IRTemp argL64 = IRTemp_INVALID;
   IRTemp argR64 = IRTemp_INVALID;
   IRTemp oldC64 = IRTemp_INVALID;
   if (is64) {
      argL64 = argL;
      argR64 = argR;
      oldC64 = oldC;
   } else {
      argL64 = newTemp(Ity_I64);
      argR64 = newTemp(Ity_I64);
      oldC64 = newTemp(Ity_I64);
      assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
      assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
      assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
   }
   UInt cc_op = ARM64G_CC_OP_NUMBER;
   /**/ if ( isSBC &&  is64) { cc_op = ARM64G_CC_OP_SBC64; }
   else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
   else if (!isSBC &&  is64) { cc_op = ARM64G_CC_OP_ADC64; }
   else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
   else                      { vassert(0); }
   setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
}
/* Build IR to set the flags thunk after ADD or SUB, if the given
   condition evaluates to True at run time.  If not, the flags are set
   to the specified NZCV value. */
static
void setFlags_ADD_SUB_conditionally (
        Bool is64, Bool isSUB,
        IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
     )
{
   /* Generate IR as follows:
        CC_OP   = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
        CC_DEP1 = ITE(cond, argL64, nzcv << 28)
        CC_DEP2 = ITE(cond, argR64, 0)
        CC_NDEP = 0
   */

   IRTemp z64 = newTemp(Ity_I64);
   assign(z64, mkU64(0));

   /* Establish the operation and operands for the True case. */
   IRTemp t_dep1 = IRTemp_INVALID;
   IRTemp t_dep2 = IRTemp_INVALID;
   UInt   t_op   = ARM64G_CC_OP_NUMBER;
   /**/ if ( isSUB &&  is64) { t_op = ARM64G_CC_OP_SUB64; }
   else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
   else if (!isSUB &&  is64) { t_op = ARM64G_CC_OP_ADD64; }
   else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
   else                      { vassert(0); }

   if (is64) {
      t_dep1 = argL;
      t_dep2 = argR;
   } else {
      t_dep1 = newTemp(Ity_I64);
      t_dep2 = newTemp(Ity_I64);
      assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
      assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
   }

   /* Establish the operation and operands for the False case. */
   IRTemp f_dep1 = newTemp(Ity_I64);
   IRTemp f_dep2 = z64;
   UInt   f_op   = ARM64G_CC_OP_COPY;
   assign(f_dep1, mkU64(nzcv << 28));

   /* Final thunk values */
   IRTemp dep1 = newTemp(Ity_I64);
   IRTemp dep2 = newTemp(Ity_I64);
   IRTemp op   = newTemp(Ity_I64);

   assign(op,   IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
   assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
   assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));

   /* finally .. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkexpr(op)   ));
   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64)  ));
}
/* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
static
void setFlags_LOGIC ( Bool is64, IRTemp res )
{
   IRTemp res64 = IRTemp_INVALID;
   IRTemp z64   = newTemp(Ity_I64);
   UInt   cc_op = ARM64G_CC_OP_NUMBER;
   if (is64) {
      res64 = res;
      cc_op = ARM64G_CC_OP_LOGIC64;
   } else {
      res64 = newTemp(Ity_I64);
      assign(res64, unop(Iop_32Uto64, mkexpr(res)));
      cc_op = ARM64G_CC_OP_LOGIC32;
   }
   assign(z64, mkU64(0));
   setFlags_D1_D2_ND(cc_op, res64, z64, z64);
}
/* Build IR to set the flags thunk to a given NZCV value.  NZCV is
   located in bits 31:28 of the supplied value. */
static
void setFlags_COPY ( IRTemp nzcv_28x0 )
{
   IRTemp z64 = newTemp(Ity_I64);
   assign(z64, mkU64(0));
   setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
}
//ZZ /* Minor variant of the above that sets NDEP to zero (if it
//ZZ    sets it at all) */
//ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
//ZZ                              IRTemp t_dep2,
//ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    IRTemp z32 = newTemp(Ity_I32);
//ZZ    assign( z32, mkU32(0) );
//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
//ZZ }
//ZZ
//ZZ /* Minor variant of the above that sets DEP2 to zero (if it
//ZZ    sets it at all) */
//ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
//ZZ                              IRTemp t_ndep,
//ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    IRTemp z32 = newTemp(Ity_I32);
//ZZ    assign( z32, mkU32(0) );
//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
//ZZ }
//ZZ
//ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
//ZZ    sets them at all) */
//ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
//ZZ                           IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    IRTemp z32 = newTemp(Ity_I32);
//ZZ    assign( z32, mkU32(0) );
//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
//ZZ }
/*------------------------------------------------------------*/
/*--- Misc math helpers                                    ---*/
/*------------------------------------------------------------*/

/* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
{
   IRTemp maskT = newTemp(Ity_I64);
   IRTemp res   = newTemp(Ity_I64);
   vassert(sh >= 1 && sh <= 63);
   assign(maskT, mkU64(mask));
   assign( res,
           binop(Iop_Or64,
                 binop(Iop_Shr64,
                       binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
                       mkU8(sh)),
                 binop(Iop_And64,
                       binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
                       mkexpr(maskT))
                 )
           );
   return res;
}
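/* Worked example: math_SWAPHELPER(x, 0xFF00FF00FF00FF00ULL, 8) moves
   every high byte of a 16-bit pair down and every low byte up, i.e. it
   swaps adjacent bytes: 0x1122334455667788 becomes
   0x2211443366558877. */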
/* Generates byte swaps within 32-bit lanes. */
static IRTemp math_UINTSWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
   return res;
}

/* Generates byte swaps within 16-bit lanes. */
static IRTemp math_USHORTSWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   return res;
}

/* Generates a 64-bit byte swap. */
static IRTemp math_BYTESWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
   res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
   return res;
}

/* Generates a 64-bit bit swap. */
static IRTemp math_BITSWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
   res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
   res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
   return math_BYTESWAP64(res);
}
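/* The bit swap is the classic divide-and-conquer bit reversal: swap
   adjacent bits, then adjacent 2-bit pairs, then adjacent nibbles, and
   finally reverse the byte order.  E.g. an input with only bit 0 set
   (0x1) produces an output with only bit 63 set
   (0x8000000000000000). */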
/* Duplicates the bits at the bottom of the given word to fill the
   whole word.  src :: Ity_I64 is assumed to have zeroes everywhere
   except for the bottom bits. */
static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
{
   if (srcTy == Ity_I8) {
      IRTemp t16 = newTemp(Ity_I64);
      assign(t16, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(8))));
      IRTemp t32 = newTemp(Ity_I64);
      assign(t32, binop(Iop_Or64, mkexpr(t16),
                                  binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(t32),
                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I16) {
      IRTemp t32 = newTemp(Ity_I64);
      assign(t32, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(16))));
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(t32),
                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I32) {
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I64) {
      return src;
   }
   vassert(0);
}
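/* Illustrative trace (not used by the code): for srcTy == Ity_I8 and
   src == 0x00000000000000AB, the successive or-with-shifted-self steps
   give 0x000000000000ABAB (t16), then 0x00000000ABABABAB (t32), and
   finally 0xABABABABABABABAB (t64). */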
/* Duplicates the src element exactly so as to fill a V128 value. */
static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
{
   IRTemp res = newTempV128();
   if (srcTy == Ity_F64) {
      IRTemp i64 = newTemp(Ity_I64);
      assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
      assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
      return res;
   }
   if (srcTy == Ity_F32) {
      IRTemp i64a = newTemp(Ity_I64);
      assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
      IRTemp i64b = newTemp(Ity_I64);
      assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
                                   mkexpr(i64a)));
      assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
      return res;
   }
   if (srcTy == Ity_I64) {
      assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
      return res;
   }
   if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
      IRTemp t1 = newTemp(Ity_I64);
      assign(t1, widenUto64(srcTy, mkexpr(src)));
      IRTemp t2 = math_DUP_TO_64(t1, srcTy);
      assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
      return res;
   }
   vassert(0);
}
/* |fullWidth| is a full V128 width result.  Depending on bitQ,
   zero out the upper half. */
static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
{
   if (bitQ == 1) return mkexpr(fullWidth);
   if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
   vassert(0);
}
/* The same, but from an expression instead. */
static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
{
   IRTemp fullWidthT = newTempV128();
   assign(fullWidthT, fullWidth);
   return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
}
/*------------------------------------------------------------*/
/*--- FP comparison helpers                                ---*/
/*------------------------------------------------------------*/

/* irRes :: Ity_I32 holds a floating point comparison result encoded
   as an IRCmpF64Result.  Generate code to convert it to an
   ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
   Assign a new temp to hold that value, and return the temp. */
static
IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
{
   IRTemp ix    = newTemp(Ity_I64);
   IRTemp termL = newTemp(Ity_I64);
   IRTemp termR = newTemp(Ity_I64);
   IRTemp nzcv  = newTemp(Ity_I64);
   IRTemp irRes = newTemp(Ity_I64);

   /* This is where the fun starts.  We have to convert 'irRes' from
      an IR-convention return result (IRCmpF64Result) to an
      ARM-encoded (N,Z,C,V) group.  The final result is in the bottom
      4 bits of 'nzcv'. */
   /* Map compare result from IR to ARM(nzcv) */
   /*
      FP cmp result | IR   | ARM(nzcv)
      --------------------------------
      UN              0x45   0011
      LT              0x01   1000
      GT              0x00   0010
      EQ              0x40   0110
   */
   /* Now since you're probably wondering WTF ..

      ix fishes the useful bits out of the IR value, bits 6 and 0, and
      places them side by side, giving a number which is 0, 1, 2 or 3.

      termL is a sequence cooked up by GNU superopt.  It converts ix
         into an almost correct value NZCV value (incredibly), except
         for the case of UN, where it produces 0100 instead of the
         required 0011.

      termR is therefore a correction term, also computed from ix.  It
         is 1 in the UN case and 0 for LT, GT and EQ.  Hence, to get
         the final correct value, we subtract termR from termL.

      Don't take my word for it.  There's a test program at the bottom
      of guest_arm_toIR.c, to try this out with.
   */
   assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));

   assign(
      ix,
      binop(Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
                  mkU64(3)),
            binop(Iop_And64, mkexpr(irRes), mkU64(1))));

   assign(
      termL,
      binop(Iop_Add64,
            binop(Iop_Shr64,
                  binop(Iop_Sub64,
                        binop(Iop_Shl64,
                              binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
                              mkU8(62)),
                        mkU64(1)),
                  mkU8(61)),
            mkU64(1)));

   assign(
      termR,
      binop(Iop_And64,
            binop(Iop_And64,
                  mkexpr(ix),
                  binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
            mkU64(1)));

   assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
   return nzcv;
}
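/* Worked instance of the above (not used by the code): ix takes the
   values 0 (GT), 1 (LT), 2 (EQ) and 3 (UN).  Evaluating
      termL = ((((ix ^ 1) << 62) - 1) >>u 61) + 1
   gives 2, 8, 6 and 4 respectively, and
      termR = ix & (ix >> 1) & 1
   gives 0, 0, 0 and 1, so termL - termR yields the required NZCV
   groups 0010 (GT), 1000 (LT), 0110 (EQ) and 0011 (UN). */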
/*------------------------------------------------------------*/
/*--- Data processing (immediate)                          ---*/
/*------------------------------------------------------------*/

/* Helper functions for supporting "DecodeBitMasks" */

static ULong dbm_ROR ( Int width, ULong x, Int rot )
{
   vassert(width > 0 && width <= 64);
   vassert(rot >= 0 && rot < width);
   if (rot == 0) return x;
   ULong res = x >> rot;
   res |= (x << (width - rot));
   if (width < 64)
      res &= ((1ULL << width) - 1);
   return res;
}
static ULong dbm_RepTo64( Int esize, ULong x )
{
   switch (esize) {
      case 64:
         return x;
      case 32:
         x &= 0xFFFFFFFF; x |= (x << 32);
         return x;
      case 16:
         x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
         return x;
      case 8:
         x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
         return x;
      case 4:
         x &= 0xF; x |= (x << 4); x |= (x << 8);
         x |= (x << 16); x |= (x << 32);
         return x;
      case 2:
         x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
         x |= (x << 16); x |= (x << 32);
         return x;
      default:
         break;
   }
   vpanic("dbm_RepTo64");
   /*NOTREACHED*/
   return 0;
}
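/* Illustrative (not used by the code): dbm_RepTo64(8, 0xC3) gives
   0xC3C3C3C3C3C3C3C3, and dbm_RepTo64(2, 0x1) gives
   0x5555555555555555. */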
static Int dbm_highestSetBit ( ULong x )
{
   Int i;
   for (i = 63; i >= 0; i--) {
      if (x & (1ULL << i))
         return i;
   }
   vassert(x == 0);
   return -1;
}
static
Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
                          ULong immN, ULong imms, ULong immr, Bool immediate,
                          UInt M /*32 or 64*/)
{
   vassert(immN < (1ULL << 1));
   vassert(imms < (1ULL << 6));
   vassert(immr < (1ULL << 6));
   vassert(immediate == False || immediate == True);
   vassert(M == 32 || M == 64);

   Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
   if (len < 1) { /* printf("fail1\n"); */ return False; }
   vassert(len <= 6);
   vassert(M >= (1 << len));

   vassert(len >= 1 && len <= 6);
   ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
                  (1 << len) - 1;
   vassert(levels >= 1 && levels <= 63);

   if (immediate && ((imms & levels) == levels)) {
      /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
      return False;
   }

   ULong S = imms & levels;
   ULong R = immr & levels;
   Int   diff = S - R;

   Int esize = 1 << len;
   vassert(2 <= esize && esize <= 64);

   /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
      same below with d.  S can be 63 in which case we have an out of
      range and hence undefined shift. */
   vassert(S >= 0 && S <= 63);
   vassert(esize >= (S+1));
   ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
                  //(1ULL << (S+1)) - 1;
                  ((1ULL << S) - 1) + (1ULL << S);

   Int d = // diff<len-1:0>
           diff & ((1 << len)-1);
   vassert(esize >= (d+1));
   vassert(d >= 0 && d <= 63);

   ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
                  //(1ULL << (d+1)) - 1;
                  ((1ULL << d) - 1) + (1ULL << d);

   if (esize != 64) vassert(elem_s < (1ULL << esize));
   if (esize != 64) vassert(elem_d < (1ULL << esize));

   if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
   if (tmask) *tmask = dbm_RepTo64(esize, elem_d);

   return True;
}
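/* Worked example (not used by the code): for the logical-immediate
   encoding N=0, immr=0b000000, imms=0b111100 with M=64, we get
   len = dbm_highestSetBit(0b0000011) = 1, so esize = 2, levels = 1,
   S = 0, R = 0 and elem_s = 0b01; replication then yields
   wmask = 0x5555555555555555 -- which is how AND/ORR/EOR (immediate)
   encode that constant. */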
static
Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
                                         UInt insn, Bool sigill_diag)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))

   /* insn[28:23]
      10000x PC-rel addressing
      10001x Add/subtract (immediate)
      100100 Logical (immediate)
      100101 Move Wide (immediate)
      100110 Bitfield
      100111 Extract
   */
   /* ------------------ ADD/SUB{,S} imm12 ------------------ */
   if (INSN(28,24) == BITS5(1,0,0,0,1)) {
      Bool is64   = INSN(31,31) == 1;
      Bool isSub  = INSN(30,30) == 1;
      Bool setCC  = INSN(29,29) == 1;
      UInt sh     = INSN(23,22);
      UInt uimm12 = INSN(21,10);
      UInt nn     = INSN(9,5);
      UInt dd     = INSN(4,0);
      const HChar* nm = isSub ? "sub" : "add";
      if (sh >= 2) {
         /* Invalid; fall through */
      } else {
         vassert(sh <= 1);
         uimm12 <<= (12 * sh);
         if (is64) {
            IRTemp argL = newTemp(Ity_I64);
            IRTemp argR = newTemp(Ity_I64);
            IRTemp res  = newTemp(Ity_I64);
            assign(argL, getIReg64orSP(nn));
            assign(argR, mkU64(uimm12));
            assign(res,  binop(isSub ? Iop_Sub64 : Iop_Add64,
                               mkexpr(argL), mkexpr(argR)));
            if (setCC) {
               putIReg64orZR(dd, mkexpr(res));
               setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
               DIP("%ss %s, %s, 0x%x\n",
                   nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
            } else {
               putIReg64orSP(dd, mkexpr(res));
               DIP("%s %s, %s, 0x%x\n",
                   nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
            }
         } else {
            IRTemp argL = newTemp(Ity_I32);
            IRTemp argR = newTemp(Ity_I32);
            IRTemp res  = newTemp(Ity_I32);
            assign(argL, getIReg32orSP(nn));
            assign(argR, mkU32(uimm12));
            assign(res,  binop(isSub ? Iop_Sub32 : Iop_Add32,
                               mkexpr(argL), mkexpr(argR)));
            if (setCC) {
               putIReg32orZR(dd, mkexpr(res));
               setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
               DIP("%ss %s, %s, 0x%x\n",
                   nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
            } else {
               putIReg32orSP(dd, mkexpr(res));
               DIP("%s %s, %s, 0x%x\n",
                   nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
            }
         }
         return True;
      }
   }
   /* -------------------- ADR/ADRP -------------------- */
   if (INSN(28,24) == BITS5(1,0,0,0,0)) {
      UInt  bP    = INSN(31,31);
      UInt  immLo = INSN(30,29);
      UInt  immHi = INSN(23,5);
      UInt  rD    = INSN(4,0);
      ULong uimm  = (immHi << 2) | immLo;
      ULong simm  = sx_to_64(uimm, 21);
      ULong val;
      if (bP) {
         val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
      } else {
         val = guest_PC_curr_instr + simm;
      }
      putIReg64orZR(rD, mkU64(val));
      DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
      return True;
   }
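   /* Illustrative (not used by the code): with guest_PC_curr_instr ==
      0x400123 and simm == 1, ADR computes 0x400123 + 1 == 0x400124,
      whereas ADRP computes (0x400123 & ~0xFFFULL) + (1 << 12)
      == 0x401000. */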
   /* -------------------- LOGIC(imm) -------------------- */
   if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
      /* 31 30 28     22 21   15   9  4
         sf op 100100 N  immr imms Rn Rd
         op=00: AND  Rd|SP, Rn, #imm
         op=01: ORR  Rd|SP, Rn, #imm
         op=10: EOR  Rd|SP, Rn, #imm
         op=11: ANDS Rd|ZR, Rn, #imm
      */
      Bool  is64 = INSN(31,31) == 1;
      UInt  op   = INSN(30,29);
      UInt  N    = INSN(22,22);
      UInt  immR = INSN(21,16);
      UInt  immS = INSN(15,10);
      UInt  nn   = INSN(9,5);
      UInt  dd   = INSN(4,0);
      ULong imm  = 0;
      Bool  ok;
      if (N == 1 && !is64)
         goto after_logic_imm; /* not allowed; fall through */
      ok = dbm_DecodeBitMasks(&imm, NULL,
                              N, immS, immR, True, is64 ? 64 : 32);
      if (!ok)
         goto after_logic_imm;
      const HChar* names[4] = { "and", "orr", "eor", "ands" };
      const IROp   ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
      const IROp   ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };
      vassert(op < 4);
      if (is64) {
         IRExpr* argL = getIReg64orZR(nn);
         IRExpr* argR = mkU64(imm);
         IRTemp  res  = newTemp(Ity_I64);
         assign(res, binop(ops64[op], argL, argR));
         if (op < 3) {
            putIReg64orSP(dd, mkexpr(res));
            DIP("%s %s, %s, 0x%llx\n", names[op],
                nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
         } else {
            putIReg64orZR(dd, mkexpr(res));
            setFlags_LOGIC(True/*is64*/, res);
            DIP("%s %s, %s, 0x%llx\n", names[op],
                nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
         }
      } else {
         IRExpr* argL = getIReg32orZR(nn);
         IRExpr* argR = mkU32((UInt)imm);
         IRTemp  res  = newTemp(Ity_I32);
         assign(res, binop(ops32[op], argL, argR));
         if (op < 3) {
            putIReg32orSP(dd, mkexpr(res));
            DIP("%s %s, %s, 0x%x\n", names[op],
                nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
         } else {
            putIReg32orZR(dd, mkexpr(res));
            setFlags_LOGIC(False/*!is64*/, res);
            DIP("%s %s, %s, 0x%x\n", names[op],
                nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
         }
      }
      return True;
   }
   after_logic_imm:
   /* -------------------- MOV{Z,N,K} -------------------- */
   if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
      /* 31 30 28      22 20    4
         sf 10 100 101 hw imm16 Rd   MOV(Z) Rd, (imm16 << (16*hw))
         sf 00 100 101 hw imm16 Rd   MOV(N) Rd, ~(imm16 << (16*hw))
         sf 11 100 101 hw imm16 Rd   MOV(K) Rd, (imm16 << (16*hw))
      */
      Bool is64   = INSN(31,31) == 1;
      UInt subopc = INSN(30,29);
      UInt hw     = INSN(22,21);
      UInt imm16  = INSN(20,5);
      UInt dd     = INSN(4,0);
      if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
         /* invalid; fall through */
      } else {
         ULong imm64 = ((ULong)imm16) << (16 * hw);
         if (!is64)
            vassert(imm64 < 0x100000000ULL);
         switch (subopc) {
            case BITS2(1,0): // MOVZ
               putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
               DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
               break;
            case BITS2(0,0): // MOVN
               imm64 = ~imm64;
               if (!is64)
                  imm64 &= 0xFFFFFFFFULL;
               putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
               DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
               break;
            case BITS2(1,1): // MOVK
               /* This is more complex.  We are inserting a slice into
                  the destination register, so we need to have the old
                  value of it. */
               if (is64) {
                  IRTemp old = newTemp(Ity_I64);
                  assign(old, getIReg64orZR(dd));
                  ULong mask = 0xFFFFULL << (16 * hw);
                  IRExpr* res
                     = binop(Iop_Or64,
                             binop(Iop_And64, mkexpr(old), mkU64(~mask)),
                             mkU64(imm64));
                  putIReg64orZR(dd, res);
                  DIP("movk %s, 0x%x, lsl %u\n",
                      nameIReg64orZR(dd), imm16, 16*hw);
               } else {
                  IRTemp old = newTemp(Ity_I32);
                  assign(old, getIReg32orZR(dd));
                  vassert(hw <= 1);
                  UInt mask = ((UInt)0xFFFF) << (16 * hw);
                  IRExpr* res
                     = binop(Iop_Or32,
                             binop(Iop_And32, mkexpr(old), mkU32(~mask)),
                             mkU32((UInt)imm64));
                  putIReg32orZR(dd, res);
                  DIP("movk %s, 0x%x, lsl %u\n",
                      nameIReg32orZR(dd), imm16, 16*hw);
               }
               break;
            default:
               vassert(0);
         }
         return True;
      }
   }
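   /* Illustrative (not used by the code): "movz x0, #0x1234, lsl 16"
      writes 0x12340000, and a following "movk x0, #0xABCD" (hw == 0)
      applies mask 0xFFFF, keeping the other lanes and producing
      0x1234ABCD, exactly per the and/or scheme above. */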
   /* -------------------- {U,S,}BFM -------------------- */
   /*    30 28     22 21   15   9  4

      sf 10 100110 N  immr imms nn dd
         UBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
         UBFM Xd, Xn, #immr, #imms   when sf=1, N=1

      sf 00 100110 N  immr imms nn dd
         SBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
         SBFM Xd, Xn, #immr, #imms   when sf=1, N=1

      sf 01 100110 N  immr imms nn dd
         BFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
         BFM Xd, Xn, #immr, #imms   when sf=1, N=1
   */
   if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
      UInt sf     = INSN(31,31);
      UInt opc    = INSN(30,29);
      UInt N      = INSN(22,22);
      UInt immR   = INSN(21,16);
      UInt immS   = INSN(15,10);
      UInt nn     = INSN(9,5);
      UInt dd     = INSN(4,0);
      Bool inZero = False;
      Bool extend = False;
      const HChar* nm = "???";
      /* skip invalid combinations */
      switch (opc) {
         case BITS2(0,0):
            inZero = True;  extend = True;  nm = "sbfm"; break;
         case BITS2(0,1):
            inZero = False; extend = False; nm = "bfm";  break;
         case BITS2(1,0):
            inZero = True;  extend = False; nm = "ubfm"; break;
         case BITS2(1,1):
            goto after_bfm; /* invalid */
         default:
            vassert(0);
      }
      if (sf == 1 && N != 1) goto after_bfm;
      if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
                      || ((immS >> 5) & 1) != 0)) goto after_bfm;
      ULong wmask = 0, tmask = 0;
      Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
                                   N, immS, immR, False, sf == 1 ? 64 : 32);
      if (!ok) goto after_bfm; /* hmmm */

      Bool   is64 = sf == 1;
      IRType ty   = is64 ? Ity_I64 : Ity_I32;

      IRTemp dst = newTemp(ty);
      IRTemp src = newTemp(ty);
      IRTemp bot = newTemp(ty);
      IRTemp top = newTemp(ty);
      IRTemp res = newTemp(ty);
      assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
      assign(src, getIRegOrZR(is64, nn));
      /* perform bitfield move on low bits */
      assign(bot, binop(mkOR(ty),
                        binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
                        binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
                                         mkU(ty, wmask))));
      /* determine extension bits (sign, zero or dest register) */
      assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
      /* combine extension bits and result bits */
      assign(res, binop(mkOR(ty),
                        binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
                        binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
      putIRegOrZR(is64, dd, mkexpr(res));
      DIP("%s %s, %s, immR=%u, immS=%u\n",
          nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
      return True;
   }
   after_bfm:
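   /* Illustrative (not used by the code): "lsr x0, x1, #sh" is the
      alias UBFM Xd, Xn, #sh, #63.  There wmask is all ones, so |bot|
      is just ROR(Xn, sh); inZero/extend make |top| zero; and tmask
      keeps the low 64-sh bits, which is precisely Xn >>u sh. */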
   /* ---------------------- EXTR ---------------------- */
   /*   30 28     22 20 15   9 4
      1 00 100111 10 m  imm6 n d  EXTR Xd, Xn, Xm, #imm6
      0 00 100111 00 m  imm6 n d  EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
   */
   if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
      Bool is64  = INSN(31,31) == 1;
      UInt mm    = INSN(20,16);
      UInt imm6  = INSN(15,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      Bool valid = True;
      if (INSN(31,31) != INSN(22,22))
         valid = False;
      if (!is64 && imm6 >= 32)
         valid = False;
      if (!valid) goto after_extr;
      IRType ty    = is64 ? Ity_I64 : Ity_I32;
      IRTemp srcHi = newTemp(ty);
      IRTemp srcLo = newTemp(ty);
      IRTemp res   = newTemp(ty);
      assign(srcHi, getIRegOrZR(is64, nn));
      assign(srcLo, getIRegOrZR(is64, mm));
      if (imm6 == 0) {
         assign(res, mkexpr(srcLo));
      } else {
         UInt szBits = 8 * sizeofIRType(ty);
         vassert(imm6 > 0 && imm6 < szBits);
         assign(res, binop(mkOR(ty),
                           binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
                           binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
      }
      putIRegOrZR(is64, dd, mkexpr(res));
      DIP("extr %s, %s, %s, #%u\n",
          nameIRegOrZR(is64,dd),
          nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
      return True;
   }
   after_extr:
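   /* Illustrative (not used by the code): when Rn == Rm, EXTR is the
      ROR alias -- (Xn << (64-imm6)) | (Xn >>u imm6) rotates Xn right
      by imm6 bit positions. */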
   if (sigill_diag) {
      vex_printf("ARM64 front end: data_processing_immediate\n");
   }
   return False;
#  undef INSN
}
/*------------------------------------------------------------*/
/*--- Data processing (register) instructions              ---*/
/*------------------------------------------------------------*/

static const HChar* nameSH ( UInt sh ) {
   switch (sh) {
      case 0: return "lsl";
      case 1: return "lsr";
      case 2: return "asr";
      case 3: return "ror";
      default: vassert(0);
   }
}
/* Generate IR to get a register value, possibly shifted by an
   immediate.  Returns either a 32- or 64-bit temporary holding the
   result.  After the shift, the value can optionally be NOT-ed
   too.

   sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR.  sh_amt may only be
   in the range 0 to (is64 ? 64 : 32)-1.  For some instructions, ROR
   isn't allowed, but it's the job of the caller to check that.
*/
static IRTemp getShiftedIRegOrZR ( Bool is64,
                                   UInt sh_how, UInt sh_amt, UInt regNo,
                                   Bool invert )
{
   vassert(sh_how < 4);
   vassert(sh_amt < (is64 ? 64 : 32));
   IRType ty = is64 ? Ity_I64 : Ity_I32;
   IRTemp t0 = newTemp(ty);
   assign(t0, getIRegOrZR(is64, regNo));
   IRTemp t1 = newTemp(ty);
   switch (sh_how) {
      case BITS2(0,0):
         assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
         break;
      case BITS2(0,1):
         assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
         break;
      case BITS2(1,0):
         assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
         break;
      case BITS2(1,1):
         assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
         break;
      default:
         vassert(0);
   }
   if (invert) {
      IRTemp t2 = newTemp(ty);
      assign(t2, unop(mkNOT(ty), mkexpr(t1)));
      return t2;
   }
   return t1;
}
static
Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
                                        UInt insn, Bool sigill_diag)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   /* ------------------- ADD/SUB(reg) ------------------- */
   /* x==0 => 32 bit op      x==1 => 64 bit op
      sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)

      31 30 29 28    23 21 20 15   9  4
      x  0  0  01011 sh 0  Rm imm6 Rn Rd   ADD  Rd,Rn, sh(Rm,imm6)
      x  0  1  01011 sh 0  Rm imm6 Rn Rd   ADDS Rd,Rn, sh(Rm,imm6)
      x  1  0  01011 sh 0  Rm imm6 Rn Rd   SUB  Rd,Rn, sh(Rm,imm6)
      x  1  1  01011 sh 0  Rm imm6 Rn Rd   SUBS Rd,Rn, sh(Rm,imm6)
   */
   if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
      UInt   bX    = INSN(31,31);
      UInt   bOP   = INSN(30,30); /* 0: ADD, 1: SUB */
      UInt   bS    = INSN(29,29); /* set flags? */
      UInt   sh    = INSN(23,22);
      UInt   rM    = INSN(20,16);
      UInt   imm6  = INSN(15,10);
      UInt   rN    = INSN(9,5);
      UInt   rD    = INSN(4,0);
      Bool   isSUB = bOP == 1;
      Bool   is64  = bX == 1;
      IRType ty    = is64 ? Ity_I64 : Ity_I32;
      if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
         /* invalid; fall through */
      } else {
         IRTemp argL = newTemp(ty);
         assign(argL, getIRegOrZR(is64, rN));
         IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
         IROp   op   = isSUB ? mkSUB(ty) : mkADD(ty);
         IRTemp res  = newTemp(ty);
         assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
         if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
         if (bS) {
            setFlags_ADD_SUB(is64, isSUB, argL, argR);
         }
         DIP("%s%s %s, %s, %s, %s #%u\n",
             bOP ? "sub" : "add", bS ? "s" : "",
             nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
             nameIRegOrZR(is64, rM), nameSH(sh), imm6);
         return True;
      }
   }
   /* ------------------- ADC/SBC(reg) ------------------- */
   /* x==0 => 32 bit op      x==1 => 64 bit op

      31 30 29 28    23 21 20 15     9  4
      x  0  0  11010 00 0  Rm 000000 Rn Rd   ADC  Rd,Rn,Rm
      x  0  1  11010 00 0  Rm 000000 Rn Rd   ADCS Rd,Rn,Rm
      x  1  0  11010 00 0  Rm 000000 Rn Rd   SBC  Rd,Rn,Rm
      x  1  1  11010 00 0  Rm 000000 Rn Rd   SBCS Rd,Rn,Rm
   */
   if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
      UInt   bX    = INSN(31,31);
      UInt   bOP   = INSN(30,30); /* 0: ADC, 1: SBC */
      UInt   bS    = INSN(29,29); /* set flags */
      UInt   rM    = INSN(20,16);
      UInt   rN    = INSN(9,5);
      UInt   rD    = INSN(4,0);

      Bool   isSUB = bOP == 1;
      Bool   is64  = bX == 1;
      IRType ty    = is64 ? Ity_I64 : Ity_I32;

      IRTemp oldC = newTemp(ty);
      assign(oldC,
             is64 ? mk_arm64g_calculate_flag_c()
                  : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );

      IRTemp argL = newTemp(ty);
      assign(argL, getIRegOrZR(is64, rN));
      IRTemp argR = newTemp(ty);
      assign(argR, getIRegOrZR(is64, rM));

      IROp   op  = isSUB ? mkSUB(ty) : mkADD(ty);
      IRTemp res = newTemp(ty);
      if (isSUB) {
         IRExpr* one   = is64 ? mkU64(1) : mkU32(1);
         IROp    xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
         assign(res,
                binop(op,
                      binop(op, mkexpr(argL), mkexpr(argR)),
                      binop(xorOp, mkexpr(oldC), one)));
      } else {
         assign(res,
                binop(op,
                      binop(op, mkexpr(argL), mkexpr(argR)),
                      mkexpr(oldC)));
      }

      if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));

      if (bS) {
         setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
      }

      DIP("%s%s %s, %s, %s\n",
          bOP ? "sbc" : "adc", bS ? "s" : "",
          nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
          nameIRegOrZR(is64, rM));
      return True;
   }
   /* -------------------- LOGIC(reg) -------------------- */
   /* x==0 => 32 bit op      x==1 => 64 bit op
      N==0 => inv? is no-op (no inversion)
      N==1 => inv? is NOT
      sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR

      31 30 28    23 21 20 15   9  4
      x  00 01010 sh N  Rm imm6 Rn Rd  AND  Rd,Rn, inv?(sh(Rm,imm6))
      x  01 01010 sh N  Rm imm6 Rn Rd  ORR  Rd,Rn, inv?(sh(Rm,imm6))
      x  10 01010 sh N  Rm imm6 Rn Rd  EOR  Rd,Rn, inv?(sh(Rm,imm6))
      x  11 01010 sh N  Rm imm6 Rn Rd  ANDS Rd,Rn, inv?(sh(Rm,imm6))
      With N=1, the names are: BIC ORN EON BICS
   */
   if (INSN(28,24) == BITS5(0,1,0,1,0)) {
      UInt   bX   = INSN(31,31);
      UInt   sh   = INSN(23,22);
      UInt   bN   = INSN(21,21);
      UInt   rM   = INSN(20,16);
      UInt   imm6 = INSN(15,10);
      UInt   rN   = INSN(9,5);
      UInt   rD   = INSN(4,0);
      Bool   is64 = bX == 1;
      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      if (!is64 && imm6 > 31) {
         /* invalid; fall though */
      } else {
         IRTemp argL = newTemp(ty);
         assign(argL, getIRegOrZR(is64, rN));
         IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
         IROp   op   = Iop_INVALID;
         switch (INSN(30,29)) {
            case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
            case BITS2(0,1):                  op = mkOR(ty);  break;
            case BITS2(1,0):                  op = mkXOR(ty); break;
            default: vassert(0);
         }
         IRTemp res = newTemp(ty);
         assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
         if (INSN(30,29) == BITS2(1,1)) {
            setFlags_LOGIC(is64, res);
         }
         putIRegOrZR(is64, rD, mkexpr(res));

         static const HChar* names_op[8]
            = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
         vassert(((bN << 2) | INSN(30,29)) < 8);
         const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
         /* Special-case the printing of "MOV" */
         if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
            DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
                                nameIRegOrZR(is64, rM));
         } else {
            DIP("%s %s, %s, %s, %s #%u\n", nm_op,
                nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
                nameIRegOrZR(is64, rM), nameSH(sh), imm6);
         }
         return True;
      }
   }
   /* -------------------- {U,S}MULH -------------------- */
   /* 31       23 22 20 15     9 4
      10011011 1  10 Rm 011111 Rn Rd   UMULH Xd,Xn,Xm
      10011011 0  10 Rm 011111 Rn Rd   SMULH Xd,Xn,Xm
   */
   if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
       && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
      Bool isU = INSN(23,23) == 1;
      UInt mm  = INSN(20,16);
      UInt nn  = INSN(9,5);
      UInt dd  = INSN(4,0);
      putIReg64orZR(dd, unop(Iop_128HIto64,
                             binop(isU ? Iop_MullU64 : Iop_MullS64,
                                   getIReg64orZR(nn), getIReg64orZR(mm))));
      DIP("%cmulh %s, %s, %s\n",
          isU ? 'u' : 's',
          nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
      return True;
   }
   /* -------------------- M{ADD,SUB} -------------------- */
   /* 31 30           20 15 14 9 4
      sf 00 11011 000 m  0  a  n r   MADD Rd,Rn,Rm,Ra   d = a+m*n
      sf 00 11011 000 m  1  a  n r   MSUB Rd,Rn,Rm,Ra   d = a-m*n
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
      Bool is64  = INSN(31,31) == 1;
      UInt mm    = INSN(20,16);
      Bool isAdd = INSN(15,15) == 0;
      UInt aa    = INSN(14,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      if (is64) {
         putIReg64orZR(
            dd,
            binop(isAdd ? Iop_Add64 : Iop_Sub64,
                  getIReg64orZR(aa),
                  binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
      } else {
         putIReg32orZR(
            dd,
            binop(isAdd ? Iop_Add32 : Iop_Sub32,
                  getIReg32orZR(aa),
                  binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
      }
      DIP("%s %s, %s, %s, %s\n",
          isAdd ? "madd" : "msub",
          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
          nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
      return True;
   }
   /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
   /* 31 30 28        20 15   11 9  4
      sf 00 1101 0100 mm cond 00 nn dd   CSEL  Rd,Rn,Rm
      sf 00 1101 0100 mm cond 01 nn dd   CSINC Rd,Rn,Rm
      sf 10 1101 0100 mm cond 00 nn dd   CSINV Rd,Rn,Rm
      sf 10 1101 0100 mm cond 01 nn dd   CSNEG Rd,Rn,Rm
      In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
   */
   if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
      Bool    is64 = INSN(31,31) == 1;
      UInt    b30  = INSN(30,30);
      UInt    mm   = INSN(20,16);
      UInt    cond = INSN(15,12);
      UInt    b10  = INSN(10,10);
      UInt    nn   = INSN(9,5);
      UInt    dd   = INSN(4,0);
      UInt    op   = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
      IRType  ty   = is64 ? Ity_I64 : Ity_I32;
      IRExpr* argL = getIRegOrZR(is64, nn);
      IRExpr* argR = getIRegOrZR(is64, mm);
      switch (op) {
         case BITS2(0,0):
            break;
         case BITS2(0,1):
            argR = binop(mkADD(ty), argR, mkU(ty,1));
            break;
         case BITS2(1,0):
            argR = unop(mkNOT(ty), argR);
            break;
         case BITS2(1,1):
            argR = binop(mkSUB(ty), mkU(ty,0), argR);
            break;
         default:
            vassert(0);
      }
      putIRegOrZR(
         is64, dd,
         IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
                    argL, argR)
      );
      const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
      DIP("%s %s, %s, %s, %s\n", op_nm[op],
          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
          nameIRegOrZR(is64, mm), nameCC(cond));
      return True;
   }
   /* -------------- ADD/SUB(extended reg) -------------- */
   /*     28        20 15  12   9 4
      000 01011 00 1 m  opt imm3 n d   ADD  Wd|SP, Wn|SP, Wm ext&lsld
      100 01011 00 1 m  opt imm3 n d   ADD  Xd|SP, Xn|SP, Rm ext&lsld

      001 01011 00 1 m  opt imm3 n d   ADDS Wd,    Wn|SP, Wm ext&lsld
      101 01011 00 1 m  opt imm3 n d   ADDS Xd,    Xn|SP, Rm ext&lsld

      010 01011 00 1 m  opt imm3 n d   SUB  Wd|SP, Wn|SP, Wm ext&lsld
      110 01011 00 1 m  opt imm3 n d   SUB  Xd|SP, Xn|SP, Rm ext&lsld

      011 01011 00 1 m  opt imm3 n d   SUBS Wd,    Wn|SP, Wm ext&lsld
      111 01011 00 1 m  opt imm3 n d   SUBS Xd,    Xn|SP, Rm ext&lsld

      The 'm' operand is extended per opt, thusly:

        000   Xm & 0xFF           UXTB
        001   Xm & 0xFFFF         UXTH
        010   Xm & (2^32)-1       UXTW
        011   Xm                  UXTX

        100   Xm sx from bit 7    SXTB
        101   Xm sx from bit 15   SXTH
        110   Xm sx from bit 31   SXTW
        111   Xm                  SXTX

      In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
      operation on Xm.  In the 32 bit case, UXTW, UXTX, SXTW and SXTX
      are the identity operation on Wm.

      After extension, the value is shifted left by imm3 bits, which
      may only be in the range 0 .. 4 inclusive.
   */
   if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSub = INSN(30,30) == 1;
      Bool setCC = INSN(29,29) == 1;
      UInt mm    = INSN(20,16);
      UInt opt   = INSN(15,13);
      UInt imm3  = INSN(12,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
                                  "sxtb", "sxth", "sxtw", "sxtx" };
      /* Do almost the same thing in the 32- and 64-bit cases. */
      IRTemp xN = newTemp(Ity_I64);
      IRTemp xM = newTemp(Ity_I64);
      assign(xN, getIReg64orSP(nn));
      assign(xM, getIReg64orZR(mm));
      IRExpr* xMw  = mkexpr(xM); /* "xM widened" */
      Int     shSX = 0;
      /* widen Xm .. */
      switch (opt) {
         case BITS3(0,0,0): // UXTB
            xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
         case BITS3(0,0,1): // UXTH
            xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
         case BITS3(0,1,0): // UXTW -- noop for the 32bit case
            if (is64) {
               xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
            }
            break;
         case BITS3(0,1,1): // UXTX -- always a noop
            break;
         case BITS3(1,0,0): // SXTB
            shSX = 56; goto sxTo64;
         case BITS3(1,0,1): // SXTH
            shSX = 48; goto sxTo64;
         case BITS3(1,1,0): // SXTW -- noop for the 32bit case
            if (is64) {
               shSX = 32; goto sxTo64;
            }
            break;
         case BITS3(1,1,1): // SXTX -- always a noop
            break;
         sxTo64:
            vassert(shSX >= 32);
            xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
                        mkU8(shSX));
            break;
         default:
            vassert(0);
      }
      /* and now shift */
      IRTemp argL = xN;
      IRTemp argR = newTemp(Ity_I64);
      assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
      IRTemp res = newTemp(Ity_I64);
      assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
                        mkexpr(argL), mkexpr(argR)));
      if (is64) {
         if (setCC) {
            putIReg64orZR(dd, mkexpr(res));
            setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
         } else {
            putIReg64orSP(dd, mkexpr(res));
         }
      } else {
         if (setCC) {
            IRTemp argL32 = newTemp(Ity_I32);
            IRTemp argR32 = newTemp(Ity_I32);
            putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
            assign(argL32, unop(Iop_64to32, mkexpr(argL)));
            assign(argR32, unop(Iop_64to32, mkexpr(argR)));
            setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
         } else {
            putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
         }
      }
      DIP("%s%s %s, %s, %s %s lsl %u\n",
          isSub ? "sub" : "add", setCC ? "s" : "",
          setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
          nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm),
          nameExt[opt], imm3);
      return True;
   }
   /* ---------------- CCMP/CCMN(imm) ---------------- */
   /* Bizarrely, these appear in the "data processing register"
      category, even though they are operations against an
      immediate. */
   /* 31 29         20   15   11 9  3
      sf 1 111010010 imm5 cond 10 Rn 0 nzcv   CCMP Rn, #imm5, #nzcv, cond
      sf 0 111010010 imm5 cond 10 Rn 0 nzcv   CCMN Rn, #imm5, #nzcv, cond

      Operation is:
         (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
         (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
   */
   if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
       && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSUB = INSN(30,30) == 1;
      UInt imm5  = INSN(20,16);
      UInt cond  = INSN(15,12);
      UInt nn    = INSN(9,5);
      UInt nzcv  = INSN(3,0);

      IRTemp condT = newTemp(Ity_I1);
      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));

      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp argL = newTemp(ty);
      IRTemp argR = newTemp(ty);

      if (is64) {
         assign(argL, getIReg64orZR(nn));
         assign(argR, mkU64(imm5));
      } else {
         assign(argL, getIReg32orZR(nn));
         assign(argR, mkU32(imm5));
      }
      setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);

      DIP("ccm%c %s, #%u, #%u, %s\n",
          isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
          imm5, nzcv, nameCC(cond));
      return True;
   }
   /* ---------------- CCMP/CCMN(reg) ---------------- */
   /* 31 29         20 15   11 9  3
      sf 1 111010010 Rm cond 00 Rn 0 nzcv   CCMP Rn, Rm, #nzcv, cond
      sf 0 111010010 Rm cond 00 Rn 0 nzcv   CCMN Rn, Rm, #nzcv, cond
      Operation is:
         (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
         (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
   */
   if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
       && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSUB = INSN(30,30) == 1;
      UInt mm    = INSN(20,16);
      UInt cond  = INSN(15,12);
      UInt nn    = INSN(9,5);
      UInt nzcv  = INSN(3,0);

      IRTemp condT = newTemp(Ity_I1);
      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));

      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp argL = newTemp(ty);
      IRTemp argR = newTemp(ty);

      if (is64) {
         assign(argL, getIReg64orZR(nn));
         assign(argR, getIReg64orZR(mm));
      } else {
         assign(argL, getIReg32orZR(nn));
         assign(argR, getIReg32orZR(mm));
      }
      setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);

      DIP("ccm%c %s, %s, #%u, %s\n",
          isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
          nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
      return True;
   }
   /* -------------- REV/REV16/REV32/RBIT -------------- */
   /* 31 30 28       20    15   11 9 4

      1  10 11010110 00000 0000 11 n d    (1) REV   Xd, Xn
      0  10 11010110 00000 0000 10 n d    (2) REV   Wd, Wn

      1  10 11010110 00000 0000 00 n d    (3) RBIT  Xd, Xn
      0  10 11010110 00000 0000 00 n d    (4) RBIT  Wd, Wn

      1  10 11010110 00000 0000 01 n d    (5) REV16 Xd, Xn
      0  10 11010110 00000 0000 01 n d    (6) REV16 Wd, Wn

      1  10 11010110 00000 0000 10 n d    (7) REV32 Xd, Xn
   */
   if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
       && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
      UInt b31 = INSN(31,31);
      UInt opc = INSN(11,10);

      UInt ix = 0;
      /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1;
      else if (b31 == 0 && opc == BITS2(1,0)) ix = 2;
      else if (b31 == 1 && opc == BITS2(0,0)) ix = 3;
      else if (b31 == 0 && opc == BITS2(0,0)) ix = 4;
      else if (b31 == 1 && opc == BITS2(0,1)) ix = 5;
      else if (b31 == 0 && opc == BITS2(0,1)) ix = 6;
      else if (b31 == 1 && opc == BITS2(1,0)) ix = 7;
      if (ix >= 1 && ix <= 7) {
         Bool   is64 = ix == 1 || ix == 3 || ix == 5 || ix == 7;
         UInt   nn   = INSN(9,5);
         UInt   dd   = INSN(4,0);
         IRTemp src  = newTemp(Ity_I64);
         IRTemp dst  = IRTemp_INVALID;
         IRTemp (*math)(IRTemp) = NULL;
         switch (ix) {
            case 1: case 2: math = math_BYTESWAP64;   break;
            case 3: case 4: math = math_BITSWAP64;    break;
            case 5: case 6: math = math_USHORTSWAP64; break;
            case 7:         math = math_UINTSWAP64;   break;
            default: vassert(0);
         }
         const HChar* names[7]
            = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
         const HChar* nm = names[ix-1];
         vassert(math);
         if (ix == 6) {
            /* This has to be special cased, since the logic below doesn't
               handle it correctly. */
            assign(src, getIReg64orZR(nn));
            dst = math(src);
            putIReg64orZR(dd,
                          unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
         } else if (is64) {
            assign(src, getIReg64orZR(nn));
            dst = math(src);
            putIReg64orZR(dd, mkexpr(dst));
         } else {
            assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
            dst = math(src);
            putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
         }
         DIP("%s %s, %s\n", nm,
             nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
         return True;
      }
      /* else fall through */
   }
   /* -------------------- CLZ/CLS -------------------- */
   /*    30 28   24   20    15      9 4
      sf 10 1101 0110 00000 00010 0 n d    CLZ Rd, Rn
      sf 10 1101 0110 00000 00010 1 n d    CLS Rd, Rn
   */
   if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
       && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
      Bool   is64  = INSN(31,31) == 1;
      Bool   isCLS = INSN(10,10) == 1;
      UInt   nn    = INSN(9,5);
      UInt   dd    = INSN(4,0);
      IRTemp src   = newTemp(Ity_I64);
      IRTemp srcZ  = newTemp(Ity_I64);
      IRTemp dst   = newTemp(Ity_I64);
      /* Get the argument, widened out to 64 bit */
      if (is64) {
         assign(src, getIReg64orZR(nn));
      } else {
         assign(src, binop(Iop_Shl64,
                           unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
      }
      /* If this is CLS, mash the arg around accordingly */
      if (isCLS) {
         IRExpr* one = mkU8(1);
         assign(srcZ,
                binop(Iop_Xor64,
                      binop(Iop_Shl64, mkexpr(src), one),
                      binop(Iop_Shl64,
                            binop(Iop_Shr64, mkexpr(src), one), one)));
      } else {
         assign(srcZ, mkexpr(src));
      }
      /* And compute CLZ. */
      if (is64) {
         assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
                                mkU64(isCLS ? 63 : 64),
                                unop(Iop_Clz64, mkexpr(srcZ))));
         putIReg64orZR(dd, mkexpr(dst));
      } else {
         assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
                                mkU64(isCLS ? 31 : 32),
                                unop(Iop_Clz64, mkexpr(srcZ))));
         putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
      }
      DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
      return True;
   }
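   /* Illustrative note (not used by the code): in the CLS mashing
      above, bit i of srcZ (for i >= 1) is src[i] ^ src[i-1], i.e. set
      exactly where adjacent bits of src differ.  Hence Clz64(srcZ)
      counts how many leading bits of src match the sign bit, not
      counting the sign bit itself -- which is what CLS returns (and
      63 for an all-equal value, via the ITE). */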
   /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
   /*    30 28        20 15   11 9  4
      sf 00 1101 0110 m  0010 00 n  d   LSLV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 01 n  d   LSRV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 10 n  d   ASRV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 11 n  d   RORV Rd,Rn,Rm
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,12) == BITS4(0,0,1,0)) {
      Bool   is64 = INSN(31,31) == 1;
      UInt   mm   = INSN(20,16);
      UInt   op   = INSN(11,10);
      UInt   nn   = INSN(9,5);
      UInt   dd   = INSN(4,0);
      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp srcL = newTemp(ty);
      IRTemp srcR = newTemp(Ity_I64);
      IRTemp res  = newTemp(ty);
      IROp   iop  = Iop_INVALID;
      assign(srcL, getIRegOrZR(is64, nn));
      assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
                                    mkU64(is64 ? 63 : 31)));
      if (op < 3) {
         // LSLV, LSRV, ASRV
         switch (op) {
            case BITS2(0,0): iop = mkSHL(ty); break;
            case BITS2(0,1): iop = mkSHR(ty); break;
            case BITS2(1,0): iop = mkSAR(ty); break;
            default: vassert(0);
         }
         assign(res, binop(iop, mkexpr(srcL),
                                unop(Iop_64to8, mkexpr(srcR))));
      } else {
         // RORV
         IROp opSHL = mkSHL(ty);
         IROp opSHR = mkSHR(ty);
         IROp opOR  = mkOR(ty);
         IRExpr* width = mkU64(is64 ? 64: 32);
         assign(
            res,
            IRExpr_ITE(
               binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
               mkexpr(srcL),
               binop(opOR,
                     binop(opSHL,
                           mkexpr(srcL),
                           unop(Iop_64to8, binop(Iop_Sub64, width,
                                                 mkexpr(srcR)))),
                     binop(opSHR,
                           mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
         ));
      }
      putIRegOrZR(is64, dd, mkexpr(res));
      vassert(op < 4);
      const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
      DIP("%s %s, %s, %s\n",
          names[op], nameIRegOrZR(is64,dd),
          nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
      return True;
   }
   /* -------------------- SDIV/UDIV -------------------- */
   /*    30 28        20 15    10 9 4
      sf 00 1101 0110 m  00001  1 n d  SDIV Rd,Rn,Rm
      sf 00 1101 0110 m  00001  0 n d  UDIV Rd,Rn,Rm
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,11) == BITS5(0,0,0,0,1)) {
      Bool is64 = INSN(31,31) == 1;
      UInt mm   = INSN(20,16);
      Bool isS  = INSN(10,10) == 1;
      UInt nn   = INSN(9,5);
      UInt dd   = INSN(4,0);
      if (isS) {
         putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
                                     getIRegOrZR(is64, nn),
                                     getIRegOrZR(is64, mm)));
      } else {
         putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
                                     getIRegOrZR(is64, nn),
                                     getIRegOrZR(is64, mm)));
      }
      DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
          nameIRegOrZR(is64, dd),
          nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
      return True;
   }
   /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
   /* 31        23  20 15 14 9 4
      1001 1011 101 m  0  a  n d   UMADDL Xd,Wn,Wm,Xa
      1001 1011 001 m  0  a  n d   SMADDL Xd,Wn,Wm,Xa
      1001 1011 101 m  1  a  n d   UMSUBL Xd,Wn,Wm,Xa
      1001 1011 001 m  1  a  n d   SMSUBL Xd,Wn,Wm,Xa
      with operation
         Xd = Xa +/- (Wn *u/s Wm)
   */
   if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
      Bool   isU   = INSN(23,23) == 1;
      UInt   mm    = INSN(20,16);
      Bool   isAdd = INSN(15,15) == 0;
      UInt   aa    = INSN(14,10);
      UInt   nn    = INSN(9,5);
      UInt   dd    = INSN(4,0);
      IRTemp wN    = newTemp(Ity_I32);
      IRTemp wM    = newTemp(Ity_I32);
      IRTemp xA    = newTemp(Ity_I64);
      IRTemp muld  = newTemp(Ity_I64);
      IRTemp res   = newTemp(Ity_I64);
      assign(wN, getIReg32orZR(nn));
      assign(wM, getIReg32orZR(mm));
      assign(xA, getIReg64orZR(aa));
      assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
                         mkexpr(wN), mkexpr(wM)));
      assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
                        mkexpr(xA), mkexpr(muld)));
      putIReg64orZR(dd, mkexpr(res));
      DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
          nameIReg64orZR(dd), nameIReg32orZR(nn),
          nameIReg32orZR(mm), nameIReg64orZR(aa));
      return True;
   }
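   /* Illustrative (not used by the code): "umull x0, w1, w2" is the
      alias UMADDL Xd, Wn, Wm, XZR -- with aa == 31 the addend reads as
      zero, so the result is just the widening unsigned 32x32->64
      product. */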
   /* -------------------- CRC32/CRC32C -------------------- */
   /* 31 30           20 15   11 9 4
      sf 00 1101 0110 m  0100 sz n d   CRC32<sz>  Wd, Wn, Wm|Xm
      sf 00 1101 0110 m  0101 sz n d   CRC32C<sz> Wd, Wn, Wm|Xm
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,13) == BITS3(0,1,0)) {
      UInt bitSF = INSN(31,31);
      UInt mm    = INSN(20,16);
      UInt bitC  = INSN(12,12);
      UInt sz    = INSN(11,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      vassert(sz >= 0 && sz <= 3);
      if ((bitSF == 0 && sz <= BITS2(1,0))
          || (bitSF == 1 && sz == BITS2(1,1))) {
         UInt ix = (bitC == 1 ? 4 : 0) | sz;
         void* helpers[8]
            = { &arm64g_calc_crc32b,  &arm64g_calc_crc32h,
                &arm64g_calc_crc32w,  &arm64g_calc_crc32x,
                &arm64g_calc_crc32cb, &arm64g_calc_crc32ch,
                &arm64g_calc_crc32cw, &arm64g_calc_crc32cx };
         const HChar* hNames[8]
            = { "arm64g_calc_crc32b",  "arm64g_calc_crc32h",
                "arm64g_calc_crc32w",  "arm64g_calc_crc32x",
                "arm64g_calc_crc32cb", "arm64g_calc_crc32ch",
                "arm64g_calc_crc32cw", "arm64g_calc_crc32cx" };
         const HChar* iNames[8]
            = { "crc32b",  "crc32h",  "crc32w",  "crc32x",
                "crc32cb", "crc32ch", "crc32cw", "crc32cx" };

         IRTemp srcN = newTemp(Ity_I64);
         assign(srcN, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));

         IRTemp  srcM = newTemp(Ity_I64);
         IRExpr* at64 = getIReg64orZR(mm);
         switch (sz) {
            case BITS2(0,0):
               assign(srcM, binop(Iop_And64, at64, mkU64(0xFF))); break;
            case BITS2(0,1):
               assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFF))); break;
            case BITS2(1,0):
               assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFFFFFF))); break;
            case BITS2(1,1):
               assign(srcM, at64); break;
            default:
               vassert(0);
         }

         vassert(ix >= 0 && ix <= 7);

         putIReg64orZR(
            dd,
            unop(Iop_32Uto64,
                 unop(Iop_64to32,
                      mkIRExprCCall(Ity_I64, 0/*regparm*/,
                                    hNames[ix], helpers[ix],
                                    mkIRExprVec_2(mkexpr(srcN),
                                                  mkexpr(srcM))))));

         DIP("%s %s, %s, %s\n", iNames[ix],
             nameIReg32orZR(dd),
             nameIReg32orZR(nn), nameIRegOrZR(bitSF == 1, mm));
         return True;
      }
      /* fall through */
   }

   if (sigill_diag) {
      vex_printf("ARM64 front end: data_processing_register\n");
   }
   return False;
#  undef INSN
}
/*------------------------------------------------------------*/
/*--- Math helpers for vector interleave/deinterleave      ---*/
/*------------------------------------------------------------*/

#define EX(_tmp) \
           mkexpr(_tmp)
#define SL(_hi128,_lo128,_nbytes) \
           ( (_nbytes) == 0 \
              ? (_lo128) \
              : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
#define ROR(_v128,_nbytes) \
           SL((_v128),(_v128),(_nbytes))
#define ROL(_v128,_nbytes) \
           SL((_v128),(_v128),16-(_nbytes))
#define SHR(_v128,_nbytes) \
           binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
#define SHL(_v128,_nbytes) \
           binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
#define ILO64x2(_argL,_argR) \
           binop(Iop_InterleaveLO64x2,(_argL),(_argR))
#define IHI64x2(_argL,_argR) \
           binop(Iop_InterleaveHI64x2,(_argL),(_argR))
#define ILO32x4(_argL,_argR) \
           binop(Iop_InterleaveLO32x4,(_argL),(_argR))
#define IHI32x4(_argL,_argR) \
           binop(Iop_InterleaveHI32x4,(_argL),(_argR))
#define ILO16x8(_argL,_argR) \
           binop(Iop_InterleaveLO16x8,(_argL),(_argR))
#define IHI16x8(_argL,_argR) \
           binop(Iop_InterleaveHI16x8,(_argL),(_argR))
#define ILO8x16(_argL,_argR) \
           binop(Iop_InterleaveLO8x16,(_argL),(_argR))
#define IHI8x16(_argL,_argR) \
           binop(Iop_InterleaveHI8x16,(_argL),(_argR))
#define CEV32x4(_argL,_argR) \
           binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
#define COD32x4(_argL,_argR) \
           binop(Iop_CatOddLanes32x4,(_argL),(_argR))
#define COD16x8(_argL,_argR) \
           binop(Iop_CatOddLanes16x8,(_argL),(_argR))
#define COD8x16(_argL,_argR) \
           binop(Iop_CatOddLanes8x16,(_argL),(_argR))
#define CEV8x16(_argL,_argR) \
           binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
#define AND(_arg1,_arg2) \
           binop(Iop_AndV128,(_arg1),(_arg2))
#define OR2(_arg1,_arg2) \
           binop(Iop_OrV128,(_arg1),(_arg2))
#define OR3(_arg1,_arg2,_arg3) \
           binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
#define OR4(_arg1,_arg2,_arg3,_arg4) \
           binop(Iop_OrV128, \
                 binop(Iop_OrV128,(_arg1),(_arg2)), \
                 binop(Iop_OrV128,(_arg3),(_arg4)))
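/* Illustrative (not used by the code): SL(hi,lo,n) yields bytes
   n+15 .. n of the 32-byte concatenation hi:lo.  Hence ROR(v,8) swaps
   the two 64-bit halves of v, and ROL(v,4) rotates v left by four
   byte positions. */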
/* Do interleaving for 1 128 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
                           UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}
/* Do interleaving for 2 128 bit vectors, for ST2 insns. */
static
void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
                           UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
{
   /* This is pretty easy, since we have primitives directly to
      hand. */
   if (laneSzBlg2 == 3) {
      // 64x2
      // u1 == B1 B0, u0 == A1 A0
      // i1 == B1 A1, i0 == B0 A0
      assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
      // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
      assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // u1 == B{7..0}, u0 == A{7..0}
      // i0 == B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == B7 A7 B6 A6 B5 A5 B4 A4
      assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // u1 == B{f..0}, u0 == A{f..0}
      // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
      assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
      return;
   }
   vassert(0);
}
/* Do interleaving for 3 128 bit vectors, for ST3 insns. */
static
void math_INTERLEAVE3_128(
        /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
      // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
      assign(*i2, IHI64x2( EX(u2), EX(u1) ));
      assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
      assign(*i0, ILO64x2( EX(u1), EX(u0) ));
      return;
   }

   if (laneSzBlg2 == 2) {
      // 32x4
      // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
      // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
      IRTemp p0    = newTempV128();
      IRTemp p1    = newTempV128();
      IRTemp p2    = newTempV128();
      IRTemp c1100 = newTempV128();
      IRTemp c0011 = newTempV128();
      IRTemp c0110 = newTempV128();
      assign(c1100, mkV128(0xFF00));
      assign(c0011, mkV128(0x00FF));
      assign(c0110, mkV128(0x0FF0));
      // First interleave them at 64x2 granularity,
      // generating partial ("p") values.
      math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
      // And more shuffling around for the final answer
      assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
                       AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
      assign(*i1, OR3( SHL(EX(p2),12),
                       AND(EX(p1),EX(c0110)),
                       SHR(EX(p0),12) ));
      assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
                       AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
      return;
   }

   if (laneSzBlg2 == 1) {
      // 16x8
      // u2 == C7 C6 C5 C4 C3 C2 C1 C0
      // u1 == B7 B6 B5 B4 B3 B2 B1 B0
      // u0 == A7 A6 A5 A4 A3 A2 A1 A0
      //
      // p2 == C7 C6 B7 B6 A7 A6 C5 C4
      // p1 == B5 B4 A5 A4 C3 C2 B3 B2
      // p0 == A3 A2 C1 C0 B1 B0 A1 A0
      //
      // i2 == C7 B7 A7 C6 B6 A6 C5 B5
      // i1 == A5 C4 B4 A4 C3 B3 A3 C2
      // i0 == B2 A2 C1 B1 A1 C0 B0 A0
      IRTemp p0    = newTempV128();
      IRTemp p1    = newTempV128();
      IRTemp p2    = newTempV128();
      IRTemp c1000 = newTempV128();
      IRTemp c0100 = newTempV128();
      IRTemp c0010 = newTempV128();
      IRTemp c0001 = newTempV128();
      assign(c1000, mkV128(0xF000));
      assign(c0100, mkV128(0x0F00));
      assign(c0010, mkV128(0x00F0));
      assign(c0001, mkV128(0x000F));
      // First interleave them at 32x4 granularity,
      // generating partial ("p") values.
      math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
      // And more shuffling around for the final answer
      assign(*i2,
             OR4( AND( IHI16x8( EX(p2),        ROL(EX(p2),4) ), EX(c1000) ),
                  AND( IHI16x8( ROL(EX(p2),6), EX(p2)        ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
                  AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
             ));
      assign(*i1,
             OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
                  AND( IHI16x8( EX(p1),        ROL(EX(p1),4) ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
                  AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
             ));
      assign(*i0,
             OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
                  AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
                  AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
             ));
      return;
   }

   if (laneSzBlg2 == 0) {
      // 8x16.  It doesn't seem worth the hassle of first doing a
      // 16x8 interleave, so just generate all 24 partial results
      // directly :-(
      // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
      // i2 == Cf Bf Af Ce .. Bb Ab Ca
      // i1 == Ba Aa C9 B9 .. A6 C5 B5
      // i0 == A5 C4 B4 A4 .. C0 B0 A0

      IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
      IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
      IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
      IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
      IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
      IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
      IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
      IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
      IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();

      // eg XXXX(qqq, CC, 0xF, BB, 0xA)) sets qqq to be a vector
      // of the form 14 bytes junk : CC[0xF] : BB[0xA]
      //
#     define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
         IRTemp t_##_tempName = newTempV128(); \
         assign(t_##_tempName, \
                ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
                         ROR(EX(_srcVec2),(_srcShift2)) ) )

      // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
      IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;

      // The slicing and reassembly are done as interleavedly as possible,
      // so as to minimise the demand for registers in the back end, which
      // was observed to be a problem in testing.

      XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
      XXXX(AfCe, AA, 0xf, CC, 0xe);
      assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));

      XXXX(BeAe, BB, 0xe, AA, 0xe);
      XXXX(CdBd, CC, 0xd, BB, 0xd);
      assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
      assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));

      XXXX(AdCc, AA, 0xd, CC, 0xc);
      XXXX(BcAc, BB, 0xc, AA, 0xc);
      assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));

      XXXX(CbBb, CC, 0xb, BB, 0xb);
      XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
      assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
      assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
      assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));

      XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
      XXXX(C9B9, CC, 0x9, BB, 0x9);
      assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));

      XXXX(A9C8, AA, 0x9, CC, 0x8);
      XXXX(B8A8, BB, 0x8, AA, 0x8);
      assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
      assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));

      XXXX(C7B7, CC, 0x7, BB, 0x7);
      XXXX(A7C6, AA, 0x7, CC, 0x6);
      assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));

      XXXX(B6A6, BB, 0x6, AA, 0x6);
      XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
      assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
      assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
      assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));

      XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
      XXXX(B4A4, BB, 0x4, AA, 0x4);
      assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));

      XXXX(C3B3, CC, 0x3, BB, 0x3);
      XXXX(A3C2, AA, 0x3, CC, 0x2);
      assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
      assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));

      XXXX(B2A2, BB, 0x2, AA, 0x2);
      XXXX(C1B1, CC, 0x1, BB, 0x1);
      assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));

      XXXX(A1C0, AA, 0x1, CC, 0x0);
      XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
      assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
      assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
      assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));

#     undef XXXX
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Do interleaving for 4 128 bit vectors, for ST4 insns. */
static
void math_INTERLEAVE4_128(
        /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*i0, ILO64x2(EX(u1), EX(u0)));
      assign(*i1, ILO64x2(EX(u3), EX(u2)));
      assign(*i2, IHI64x2(EX(u1), EX(u0)));
      assign(*i3, IHI64x2(EX(u3), EX(u2)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // First, interleave at the 64-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
      // And interleave (cat) at the 32 bit size.
      assign(*i0, CEV32x4(EX(p1), EX(p0)));
      assign(*i1, COD32x4(EX(p1), EX(p0)));
      assign(*i2, CEV32x4(EX(p3), EX(p2)));
      assign(*i3, COD32x4(EX(p3), EX(p2)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // First, interleave at the 32-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 16 bit lanes.
      assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
      assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
      assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
      assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // First, interleave at the 16-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 8 bit lanes.
      assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
      assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
      assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
      assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
static
void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
                             UInt laneSzBlg2, IRTemp i0 )
{
   assign(*u0, mkexpr(i0));
}
/* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
static
void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
                             UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
{
   /* This is pretty easy, since we have primitives directly to
      hand. */
   if (laneSzBlg2 == 3) {
      // 64x2
      // i1 == B1 A1, i0 == B0 A0
      // u1 == B1 B0, u0 == A1 A0
      assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
      // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
      assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // i0 == B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == B7 A7 B6 A6 B5 A5 B4 A4
      // u1 == B{7..0}, u0 == A{7..0}
      assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
      // u1 == B{f..0}, u0 == A{f..0}
      assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0)));
      return;
   }
   vassert(0);
}
/* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
static
void math_DEINTERLEAVE3_128( 
        /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2 )
{
   if (laneSzBlg2 == 3) {
      // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
      // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
      assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1)        ));
      assign(*u1, ILO64x2( EX(i2),        ROL(EX(i0),8) ));
      assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0)        ));
      return;
   }

   if (laneSzBlg2 == 2) {
      // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
      // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
      // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      IRTemp t_a1c0b0a0 = newTempV128();
      IRTemp t_a2c1b1a1 = newTempV128();
      IRTemp t_a3c2b2a2 = newTempV128();
      IRTemp t_a0c3b3a3 = newTempV128();
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      // Compute some intermediate values.
      assign(t_a1c0b0a0, EX(i0));
      assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
      assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
      assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
      // First deinterleave into lane-pairs
      assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
      assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
                         IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
      assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
      // Then deinterleave at 64x2 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
      return;
   }
   if (laneSzBlg2 == 1) {
      // u2 == C7 C6 C5 C4 C3 C2 C1 C0
      // u1 == B7 B6 B5 B4 B3 B2 B1 B0
      // u0 == A7 A6 A5 A4 A3 A2 A1 A0
      //
      // i2 == C7 B7 A7 C6 B6 A6 C5 B5
      // i1 == A5 C4 B4 A4 C3 B3 A3 C2
      // i0 == B2 A2 C1 B1 A1 C0 B0 A0
      //
      // p2 == C7 C6 B7 B6 A7 A6 C5 C4
      // p1 == B5 B4 A5 A4 C3 C2 B3 B2
      // p0 == A3 A2 C1 C0 B1 B0 A1 A0
      //
      IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
      s0 = s1 = s2 = s3
         = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&p0, &p1, &p2, &c00111111);

      // s0 == b2a2 c1b1a1 c0b0a0
      // s1 == b4a4 c3b3a3 c2b2a2
      // s2 == b6a6 c5b5a5 c4b4a4
      // s3 == b0a0 c7b7a7 c6b6a6
      assign(s0, EX(i0));
      assign(s1, SL(EX(i1),EX(i0),6*2));
      assign(s2, SL(EX(i2),EX(i1),4*2));
      assign(s3, SL(EX(i0),EX(i2),2*2));

      // t0 == 0 0 c1c0 b1b0 a1a0
      // t1 == 0 0 c3c2 b3b2 a3a2
      // t2 == 0 0 c5c4 b5b4 a5a4
      // t3 == 0 0 c7c6 b7b6 a7a6
      assign(c00111111, mkV128(0x0FFF));
      assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
      assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
      assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
      assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));

      assign(p0, OR2(EX(t0),          SHL(EX(t1),6*2)));
      assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
      assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));

      // Then deinterleave at 32x4 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16.  This is the same scheme as for 16x8, with twice the
      // number of intermediate values.
      //
      // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
      // i1 ==  BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
      // i0 ==   A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      //
      // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
      // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
      // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
      //
      IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
             t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
      s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
         = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
         = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&s4, &s5, &s6, &s7);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&t4, &t5, &t6, &t7);
      newTempsV128_4(&p0, &p1, &p2, &cMASK);

      // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
      // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
      // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
      // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
      // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
      // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
      // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
      assign(s0, SL(EX(i1),EX(i0), 0));
      assign(s1, SL(EX(i1),EX(i0), 6));
      assign(s2, SL(EX(i1),EX(i0),12));
      assign(s3, SL(EX(i2),EX(i1), 2));
      assign(s4, SL(EX(i2),EX(i1), 8));
      assign(s5, SL(EX(i2),EX(i1),14));
      assign(s6, SL(EX(i0),EX(i2), 4));
      assign(s7, SL(EX(i0),EX(i2),10));

      // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
      // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
      // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
      // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
      // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
      // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
      // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
      // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
      assign(cMASK, mkV128(0x003F));
      assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
      assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
      assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
      assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
      assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
      assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
      assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
      assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));

      assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
      assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
                      SHL(EX(t3),2),  SHR(EX(t2),4) ));
      assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));

      // Then deinterleave at 16x8 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
static
void math_DEINTERLEAVE4_128( 
        /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*u0, ILO64x2(EX(i2), EX(i0)));
      assign(*u1, IHI64x2(EX(i2), EX(i0)));
      assign(*u2, ILO64x2(EX(i3), EX(i1)));
      assign(*u3, IHI64x2(EX(i3), EX(i1)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      IRTemp p0 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, ILO32x4(EX(i1), EX(i0)));
      assign(p1, IHI32x4(EX(i1), EX(i0)));
      assign(p2, ILO32x4(EX(i3), EX(i2)));
      assign(p3, IHI32x4(EX(i3), EX(i2)));
      // And now do what we did for the 64-bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // Deinterleave into 32-bit chunks, then do as the 32-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
      assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
      assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
      assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
      // From here on is like the 32 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // Deinterleave into 16-bit chunks, then do as the 16-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
                          ILO8x16(EX(i0),ROL(EX(i0),4)) ));
      assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
                          ILO8x16(EX(i1),ROL(EX(i1),4)) ));
      assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
                          ILO8x16(EX(i2),ROL(EX(i2),4)) ));
      assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
                          ILO8x16(EX(i3),ROL(EX(i3),4)) ));
      // From here on is like the 16 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}
/* Wrappers that use the full-width (de)interleavers to do half-width
   (de)interleaving.  The scheme is to clone each input lane in the
   lower half of each incoming value, do a full width (de)interleave
   at the next lane size up, and remove every other lane of the
   result.  The returned values may have any old junk in the upper
   64 bits -- the caller must ignore that. */
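/* Illustrative walk-through (not part of the original source), for a
   half-width ST2 of 32-bit lanes with inputs u0 == ?:?:A1:A0 and
   u1 == ?:?:B1:B0.  The doubler (InterleaveLO32x4) clones each
   low-half lane, giving du0 == A1:A1:A0:A0 and du1 == B1:B1:B0:B0.
   A full-width interleave at the 64-bit size then yields
   di0 == B0B0:A0A0, and the halver (CatEvenLanes32x4) drops the
   duplicates, so the low 64 bits of the result are B0:A0 -- the
   required interleaved memory image.  The upper 64 bits are junk, as
   warned above. */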
/* Helper function -- get doubling and narrowing operations. */
static
void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
                                   /*OUT*/IROp* halver,
                                   UInt laneSzBlg2 )
{
   switch (laneSzBlg2) {
      case 2:
         *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
         break;
      case 1:
         *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
         break;
      case 0:
         *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
         break;
      default:
         vassert(0);
   }
}
/* Do interleaving for 1 64 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
                          UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}
/* Do interleaving for 2 64 bit vectors, for ST2 insns. */
static
void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
                          UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
}
/* Do interleaving for 3 64 bit vectors, for ST3 insns. */
static
void math_INTERLEAVE3_64( 
        /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      assign(*i2, EX(u2));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   assign(du2, binop(doubler, EX(u2), EX(u2)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
   assign(*i2, binop(halver, EX(di2), EX(di2)));
}
/* Do interleaving for 4 64 bit vectors, for ST4 insns. */
static
void math_INTERLEAVE4_64( 
        /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      assign(*i2, EX(u2));
      assign(*i3, EX(u3));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   IRTemp du3 = newTempV128();
   assign(du0, binop(doubler, EX(u0), EX(u0)));
   assign(du1, binop(doubler, EX(u1), EX(u1)));
   assign(du2, binop(doubler, EX(u2), EX(u2)));
   assign(du3, binop(doubler, EX(u3), EX(u3)));
   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   IRTemp di3 = newTempV128();
   math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
                        laneSzBlg2 + 1, du0, du1, du2, du3);
   assign(*i0, binop(halver, EX(di0), EX(di0)));
   assign(*i1, binop(halver, EX(di1), EX(di1)));
   assign(*i2, binop(halver, EX(di2), EX(di2)));
   assign(*i3, binop(halver, EX(di3), EX(di3)));
}
/* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
static
void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
                            UInt laneSzBlg2, IRTemp i0 )
{
   assign(*u0, mkexpr(i0));
}
/* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
static
void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
                            UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));

   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
}
/* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
static
void math_DEINTERLEAVE3_64( 
        /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      assign(*u2, EX(i2));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));
   assign(di2, binop(doubler, EX(i2), EX(i2)));
   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
   assign(*u2, binop(halver, EX(du2), EX(du2)));
}
/* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
static
void math_DEINTERLEAVE4_64( 
        /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*u0, EX(i0));
      assign(*u1, EX(i1));
      assign(*u2, EX(i2));
      assign(*u3, EX(i3));
      return;
   }

   vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
   IROp doubler = Iop_INVALID, halver = Iop_INVALID;
   math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);

   IRTemp di0 = newTempV128();
   IRTemp di1 = newTempV128();
   IRTemp di2 = newTempV128();
   IRTemp di3 = newTempV128();
   assign(di0, binop(doubler, EX(i0), EX(i0)));
   assign(di1, binop(doubler, EX(i1), EX(i1)));
   assign(di2, binop(doubler, EX(i2), EX(i2)));
   assign(di3, binop(doubler, EX(i3), EX(i3)));
   IRTemp du0 = newTempV128();
   IRTemp du1 = newTempV128();
   IRTemp du2 = newTempV128();
   IRTemp du3 = newTempV128();
   math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
                          laneSzBlg2 + 1, di0, di1, di2, di3);
   assign(*u0, binop(halver, EX(du0), EX(du0)));
   assign(*u1, binop(halver, EX(du1), EX(du1)));
   assign(*u2, binop(halver, EX(du2), EX(du2)));
   assign(*u3, binop(halver, EX(du3), EX(du3)));
}
/*------------------------------------------------------------*/
/*--- Load and Store instructions                          ---*/
/*------------------------------------------------------------*/

/* Generate the EA for a "reg + reg" style amode.  This is done from
   parts of the insn, but for sanity checking sake it takes the whole
   insn.  This appears to depend on insn[15:12], with opt=insn[15:13]
   and S=insn[12].

   The possible forms, along with their opt:S values, are:
      011:0   Xn|SP + Xm
      011:1   Xn|SP + Xm * transfer_szB
      111:1   Xn|SP + Xm * transfer_szB
      010:0   Xn|SP + 32Uto64(Wm)
      010:1   Xn|SP + 32Uto64(Wm) * transfer_szB
      110:0   Xn|SP + 32Sto64(Wm)
      110:1   Xn|SP + 32Sto64(Wm) * transfer_szB

   Rm is insn[20:16].  Rn is insn[9:5].  Rt is insn[4:0].  Log2 of
   the transfer size is insn[23,31,30].  For integer loads/stores,
   insn[23] is zero, hence szLg2 can be at most 3 in such cases.

   If the decoding fails, it returns IRTemp_INVALID.

   isInt is True iff this decoding is for transfers to/from integer
   registers.  If False it is for transfers to/from vector registers.
*/
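/* Decode example (illustrative, not from the original source): with
   opt:S == 011:1 and szLg2 == 3 the generated EA is
   Xn|SP + (Xm << 3), i.e. the 64-bit index register scaled by the
   8-byte transfer size; with opt:S == 110:0 it is
   Xn|SP + 32Sto64(Wm), a sign-extended but unscaled 32-bit index. */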
static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
{
   UInt    optS  = SLICE_UInt(insn, 15, 12);
   UInt    mm    = SLICE_UInt(insn, 20, 16);
   UInt    nn    = SLICE_UInt(insn, 9, 5);
   UInt    szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
                   | SLICE_UInt(insn, 31, 30); // Log2 of the size

   buf[0] = 0;

   /* Sanity checks, that this really is a load/store insn. */
   if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
      goto fail;

   if (isInt
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
       && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
      goto fail;

   if (!isInt
       && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
      goto fail;

   /* Throw out non-verified but possibly valid cases. */
   switch (szLg2) {
      case BITS3(0,0,0): break; //  8 bit, valid for both int and vec
      case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
      case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
      case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
      case BITS3(1,0,0): // can only ever be valid for the vector case
                         if (isInt) goto fail; else break;
      case BITS3(1,0,1): // these sizes are never valid
      case BITS3(1,1,0):
      case BITS3(1,1,1): goto fail;

      default: vassert(0);
   }

   IRExpr* rhs = NULL;
   switch (optS) {
      case BITS4(1,1,1,0): goto fail; //ATC
      case BITS4(0,1,1,0):
         rhs = getIReg64orZR(mm);
         vex_sprintf(buf, "[%s, %s]",
                     nameIReg64orZR(nn), nameIReg64orZR(mm));
         break;
      case BITS4(1,1,1,1): goto fail; //ATC
      case BITS4(0,1,1,1):
         rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
         vex_sprintf(buf, "[%s, %s lsl %u]",
                     nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
         break;
      case BITS4(0,1,0,0):
         rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
         vex_sprintf(buf, "[%s, %s uxtx]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm));
         break;
      case BITS4(0,1,0,1):
         rhs = binop(Iop_Shl64,
                     unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
         vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
         break;
      case BITS4(1,1,0,0):
         rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
         vex_sprintf(buf, "[%s, %s sxtx]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm));
         break;
      case BITS4(1,1,0,1):
         rhs = binop(Iop_Shl64,
                     unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
         vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
                     nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
         break;
      default:
         /* The rest appear to be genuinely invalid */
         goto fail;
   }

   IRTemp res = newTemp(Ity_I64);
   assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
   return res;

  fail:
   if (0 /*really, sigill_diag, but that causes too much plumbing*/) {
      vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
   }
   return IRTemp_INVALID;
}
/* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
   bits of DATAE :: Ity_I64. */
static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
{
   IRExpr* addrE = mkexpr(addr);
   switch (szB) {
      case 8:
         storeLE(addrE, dataE);
         break;
      case 4:
         storeLE(addrE, unop(Iop_64to32, dataE));
         break;
      case 2:
         storeLE(addrE, unop(Iop_64to16, dataE));
         break;
      case 1:
         storeLE(addrE, unop(Iop_64to8, dataE));
         break;
      default:
         vassert(0);
   }
}
/* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
   placing the result in an Ity_I64 temporary. */
static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
{
   IRTemp  res   = newTemp(Ity_I64);
   IRExpr* addrE = mkexpr(addr);
   switch (szB) {
      case 8:
         assign(res, loadLE(Ity_I64,addrE));
         break;
      case 4:
         assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
         break;
      case 2:
         assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
         break;
      case 1:
         assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
         break;
      default:
         vassert(0);
   }
   return res;
}
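/* Usage sketch (illustrative; it mirrors the LDR/STR (uimm12) case
   below): all integer loads and stores are funnelled through a single
   Ity_I64 value, e.g.

      if (isLD)
         putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
      else
         gen_narrowing_store(szB, ta, getIReg64orZR(tt));

   so the sub-64-bit transfer sizes are handled once, by this pair of
   helpers, rather than by per-size cases at every decode site. */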
/* Generate a "standard 7" name, from bitQ and size.  But also
   allow ".1d" since that's occasionally useful. */
static
const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
{
   vassert(bitQ <= 1 && size <= 3);
   const HChar* nms[8]
      = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
   UInt ix = (bitQ << 2) | size;
   vassert(ix < 8);
   return nms[ix];
}
static
Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
                          const VexAbiInfo* abiinfo, Bool sigill_diag)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   /* ------------ LDR,STR (immediate, uimm12) ----------- */
   /* uimm12 is scaled by the transfer size

      31 29  26    21    9  4
      11 111 00100 imm12 nn tt    STR  Xt, [Xn|SP, #imm12 * 8]
      11 111 00101 imm12 nn tt    LDR  Xt, [Xn|SP, #imm12 * 8]

      10 111 00100 imm12 nn tt    STR  Wt, [Xn|SP, #imm12 * 4]
      10 111 00101 imm12 nn tt    LDR  Wt, [Xn|SP, #imm12 * 4]

      01 111 00100 imm12 nn tt    STRH Wt, [Xn|SP, #imm12 * 2]
      01 111 00101 imm12 nn tt    LDRH Wt, [Xn|SP, #imm12 * 2]

      00 111 00100 imm12 nn tt    STRB Wt, [Xn|SP, #imm12 * 1]
      00 111 00101 imm12 nn tt    LDRB Wt, [Xn|SP, #imm12 * 1]
   */
   if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
      UInt   szLg2 = INSN(31,30);
      UInt   szB   = 1 << szLg2;
      Bool   isLD  = INSN(22,22) == 1;
      UInt   offs  = INSN(21,10) * szB;
      UInt   nn    = INSN(9,5);
      UInt   tt    = INSN(4,0);
      IRTemp ta    = newTemp(Ity_I64);
      assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
      if (nn == 31) { /* FIXME generate stack alignment check */ }
      vassert(szLg2 < 4);
      if (isLD) {
         putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
      } else {
         gen_narrowing_store(szB, ta, getIReg64orZR(tt));
      }
      const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
      const HChar* st_name[4] = { "strb", "strh", "str", "str" };
      DIP("%s %s, [%s, #%u]\n",
          (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
          nameIReg64orSP(nn), offs);
      return True;
   }
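   /* Worked example (illustrative, not from the original source):
      "ldr x1, [x2, #24]" has sz == 11, so szLg2 == 3 and szB == 8;
      imm12 == 3 gives offs == 3 * 8 == 24, hence ta == X2 + 24.  The
      same imm12 with sz == 10 (szB == 4) would address X2 + 12. */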
   /* ------------ LDUR,STUR (immediate, simm9) ----------- */
   /*
      31 29  26      20   11 9  4
      (at-Rn-then-Rn=EA)
      sz 111 00000 0 imm9 01 Rn Rt   STR Rt, [Xn|SP], #simm9
      sz 111 00001 0 imm9 01 Rn Rt   LDR Rt, [Xn|SP], #simm9

      (at-EA-then-Rn=EA)
      sz 111 00000 0 imm9 11 Rn Rt   STR Rt, [Xn|SP, #simm9]!
      sz 111 00001 0 imm9 11 Rn Rt   LDR Rt, [Xn|SP, #simm9]!

      (at-Rn)
      sz 111 00000 0 imm9 00 Rn Rt   STR Rt, [Xn|SP, #simm9]
      sz 111 00001 0 imm9 00 Rn Rt   LDR Rt, [Xn|SP, #simm9]

      The case 'wback && Rn == Rt && Rt != 31' is disallowed.  In the
      load case this is because it would create two competing values
      for Rt.  In the store case the reason is unclear, but the spec
      disallows it anyway.

      Stores are narrowing, loads are unsigned widening.  sz encodes
      the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
   */
   if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
       == BITS9(1,1,1, 0,0,0,0,0, 0)) {
      UInt szLg2  = INSN(31,30);
      UInt szB    = 1 << szLg2;
      Bool isLoad = INSN(22,22) == 1;
      UInt imm9   = INSN(20,12);
      UInt nn     = INSN(9,5);
      UInt tt     = INSN(4,0);
      Bool wBack  = INSN(10,10) == 1;
      UInt how    = INSN(11,10);
      if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
         /* undecodable; fall through */
      } else {
         if (nn == 31) { /* FIXME generate stack alignment check */ }

         // Compute the transfer address TA and the writeback address WA.
         IRTemp tRN = newTemp(Ity_I64);
         assign(tRN, getIReg64orSP(nn));
         IRTemp tEA = newTemp(Ity_I64);
         Long simm9 = (Long)sx_to_64(imm9, 9);
         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));

         IRTemp tTA = newTemp(Ity_I64);
         IRTemp tWA = newTemp(Ity_I64);
         switch (how) {
            case BITS2(0,1):
               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,1):
               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(0,0):
               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
            default:
               vassert(0); /* NOTREACHED */
         }

         /* Normally rN would be updated after the transfer.  However, in
            the special cases typified by
               str x30, [sp,#-16]!
               str w1,  [sp,#-32]!
            it is necessary to update SP before the transfer, (1)
            because Memcheck will otherwise complain about a write
            below the stack pointer, and (2) because the segfault
            stack extension mechanism will otherwise extend the stack
            only down to SP before the instruction, which might not be
            far enough, if the -16/-32 bit takes the actual access
            address to the next page.
         */
         Bool earlyWBack
            = wBack && simm9 < 0 && (szB == 8 || szB == 4)
              && how == BITS2(1,1) && nn == 31 && !isLoad;

         if (wBack && earlyWBack)
            putIReg64orSP(nn, mkexpr(tEA));

         if (isLoad) {
            putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
         } else {
            gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
         }

         if (wBack && !earlyWBack)
            putIReg64orSP(nn, mkexpr(tEA));

         const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
         const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
         const HChar* fmt_str = NULL;
         switch (how) {
            case BITS2(0,1):
               fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
               break;
            case BITS2(1,1):
               fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
               break;
            case BITS2(0,0):
               fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
               break;
            default:
               vassert(0);
         }
         DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
                      nameIRegOrZR(szB == 8, tt),
                      nameIReg64orSP(nn), simm9);
         return True;
      }
   }
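   /* Worked example (illustrative, not from the original source):
      "str x0, [sp, #-16]!" has how == 11 and imm9 == 0x1F0, so
      simm9 == sx_to_64(0x1F0, 9) == -16.  Since wBack holds,
      simm9 < 0, szB == 8, nn == 31 and this is a store, earlyWBack is
      taken: SP is dropped to SP-16 before the store, for the
      Memcheck/stack-extension reasons given above. */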
   /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
   /* L==1    => mm==LD
      L==0    => mm==ST
      x==0    => 32 bit transfers, and zero extended loads
      x==1    => 64 bit transfers
      simm7 is scaled by the (single-register) transfer size

      (at-Rn-then-Rn=EA)
      x0 101 0001 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP], #imm

      (at-EA-then-Rn=EA)
      x0 101 0011 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP, #imm]!

      (at-Rn)
      x0 101 0010 L imm7 Rt2 Rn Rt1  mmP Rt1,Rt2, [Xn|SP, #imm]
   */
   UInt insn_30_23 = INSN(30,23);
   if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1)
       || insn_30_23 == BITS8(0,1,0,1,0,0,1,1)
       || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) {
      UInt bL     = INSN(22,22);
      UInt bX     = INSN(31,31);
      UInt bWBack = INSN(23,23);
      UInt rT1    = INSN(4,0);
      UInt rN     = INSN(9,5);
      UInt rT2    = INSN(14,10);
      Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
      if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
          || (bL && rT1 == rT2)) {
         /* undecodable; fall through */
      } else {
         if (rN == 31) { /* FIXME generate stack alignment check */ }

         // Compute the transfer address TA and the writeback address WA.
         IRTemp tRN = newTemp(Ity_I64);
         assign(tRN, getIReg64orSP(rN));
         IRTemp tEA = newTemp(Ity_I64);
         simm7 = (bX ? 8 : 4) * simm7;
         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));

         IRTemp tTA = newTemp(Ity_I64);
         IRTemp tWA = newTemp(Ity_I64);
         switch (INSN(24,23)) {
            case BITS2(0,1):
               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,1):
               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,0):
               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
            default:
               vassert(0); /* NOTREACHED */
         }

         /* Normally rN would be updated after the transfer.  However, in
            the special case typified by
               stp x29, x30, [sp,#-112]!
            it is necessary to update SP before the transfer, (1)
            because Memcheck will otherwise complain about a write
            below the stack pointer, and (2) because the segfault
            stack extension mechanism will otherwise extend the stack
            only down to SP before the instruction, which might not be
            far enough, if the -112 bit takes the actual access
            address to the next page.
         */
         Bool earlyWBack
            = bWBack && simm7 < 0
              && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0;

         if (bWBack && earlyWBack)
            putIReg64orSP(rN, mkexpr(tEA));

         /**/ if (bL == 1 && bX == 1) {
            // 64 bit load
            putIReg64orZR(rT1, loadLE(Ity_I64,
                                      binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
            putIReg64orZR(rT2, loadLE(Ity_I64,
                                      binop(Iop_Add64,mkexpr(tTA),mkU64(8))));
         } else if (bL == 1 && bX == 0) {
            // 32 bit load
            putIReg32orZR(rT1, loadLE(Ity_I32,
                                      binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
            putIReg32orZR(rT2, loadLE(Ity_I32,
                                      binop(Iop_Add64,mkexpr(tTA),mkU64(4))));
         } else if (bL == 0 && bX == 1) {
            // 64 bit store
            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
                    getIReg64orZR(rT1));
            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)),
                    getIReg64orZR(rT2));
         } else {
            vassert(bL == 0 && bX == 0);
            // 32 bit store
            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
                    getIReg32orZR(rT1));
            storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)),
                    getIReg32orZR(rT2));
         }

         if (bWBack && !earlyWBack)
            putIReg64orSP(rN, mkexpr(tEA));

         const HChar* fmt_str = NULL;
         switch (INSN(24,23)) {
            case BITS2(0,1):
               fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
               break;
            case BITS2(1,1):
               fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
               break;
            case BITS2(1,0):
               fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
               break;
            default:
               vassert(0);
         }
         DIP(fmt_str, bL == 0 ? "st" : "ld",
                      nameIRegOrZR(bX == 1, rT1),
                      nameIRegOrZR(bX == 1, rT2),
                      nameIReg64orSP(rN), simm7);
         return True;
      }
   }
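   /* Worked example (illustrative, not from the original source):
      "stp x29, x30, [sp, #-112]!" encodes bX == 1 and imm7 == -14;
      scaling by the 8-byte register size gives simm7 == -112, so the
      two stores go to SP-112 and SP-104.  This is also the shape that
      triggers the early writeback above. */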
   /* -------- LDPSW (immediate, simm7) (INT REGS) -------- */
   /* Does 32 bit transfers which are sign extended to 64 bits.
      simm7 is scaled by the (single-register) transfer size

      (at-Rn-then-Rn=EA)
      01 101 0001 1 imm7 Rt2 Rn Rt1  LDPSW Rt1,Rt2, [Xn|SP], #imm

      (at-EA-then-Rn=EA)
      01 101 0011 1 imm7 Rt2 Rn Rt1  LDPSW Rt1,Rt2, [Xn|SP, #imm]!

      (at-Rn)
      01 101 0010 1 imm7 Rt2 Rn Rt1  LDPSW Rt1,Rt2, [Xn|SP, #imm]
   */
   UInt insn_31_22 = INSN(31,22);
   if (insn_31_22 == BITS10(0,1,1,0,1,0,0,0,1,1)
       || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,1,1)
       || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,0,1)) {
      UInt bWBack = INSN(23,23);
      UInt rT1    = INSN(4,0);
      UInt rN     = INSN(9,5);
      UInt rT2    = INSN(14,10);
      Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
      if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
          || (rT1 == rT2)) {
         /* undecodable; fall through */
      } else {
         if (rN == 31) { /* FIXME generate stack alignment check */ }

         // Compute the transfer address TA and the writeback address WA.
         IRTemp tRN = newTemp(Ity_I64);
         assign(tRN, getIReg64orSP(rN));
         IRTemp tEA = newTemp(Ity_I64);
         simm7 = 4 * simm7;
         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));

         IRTemp tTA = newTemp(Ity_I64);
         IRTemp tWA = newTemp(Ity_I64);
         switch (INSN(24,23)) {
            case BITS2(0,1):
               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,1):
               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,0):
               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
            default:
               vassert(0); /* NOTREACHED */
         }

         // 32 bit load, sign extended to 64 bits
         putIReg64orZR(rT1, unop(Iop_32Sto64,
                                 loadLE(Ity_I32, binop(Iop_Add64,
                                                       mkexpr(tTA),
                                                       mkU64(0)))));
         putIReg64orZR(rT2, unop(Iop_32Sto64,
                                 loadLE(Ity_I32, binop(Iop_Add64,
                                                       mkexpr(tTA),
                                                       mkU64(4)))));
         if (bWBack)
            putIReg64orSP(rN, mkexpr(tEA));

         const HChar* fmt_str = NULL;
         switch (INSN(24,23)) {
            case BITS2(0,1):
               fmt_str = "ldpsw %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
               break;
            case BITS2(1,1):
               fmt_str = "ldpsw %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
               break;
            case BITS2(1,0):
               fmt_str = "ldpsw %s, %s, [%s, #%lld] (at-Rn)\n";
               break;
            default:
               vassert(0);
         }
         DIP(fmt_str, nameIReg64orZR(rT1),
                      nameIReg64orZR(rT2),
                      nameIReg64orSP(rN), simm7);
         return True;
      }
   }
   /* ---------------- LDR (literal, int reg) ---------------- */
   /* 31 29      23    4
      00 011 000 imm19 Rt   LDR   Wt, [PC + sxTo64(imm19 << 2)]
      01 011 000 imm19 Rt   LDR   Xt, [PC + sxTo64(imm19 << 2)]
      10 011 000 imm19 Rt   LDRSW Xt, [PC + sxTo64(imm19 << 2)]
      11 011 000 imm19 Rt   prefetch  [PC + sxTo64(imm19 << 2)]
      Just handles the first two cases for now.
   */
   if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
      UInt  imm19 = INSN(23,5);
      UInt  rT    = INSN(4,0);
      UInt  bX    = INSN(30,30);
      ULong ea    = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
      if (bX) {
         putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea)));
      } else {
         putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea)));
      }
      DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea);
      return True;
   }
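   /* Worked example (illustrative, not from the original source): for
      an "ldr x5, <label>" whose target lies 1024 bytes past the
      instruction, imm19 == 256, so
      ea == guest_PC_curr_instr + sx_to_64(256 << 2, 21) == PC + 1024,
      and the access becomes a 64-bit loadLE from that constant
      address. */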
   /* -------------- {LD,ST}R (integer register) --------------- */
   /* 31 29        20 15     12 11 9  4
      |  |         |  |      |  |  |  |
      11 111000011 Rm option S  10 Rn Rt  LDR  Xt, [Xn|SP, R<m>{ext/sh}]
      10 111000011 Rm option S  10 Rn Rt  LDR  Wt, [Xn|SP, R<m>{ext/sh}]
      01 111000011 Rm option S  10 Rn Rt  LDRH Wt, [Xn|SP, R<m>{ext/sh}]
      00 111000011 Rm option S  10 Rn Rt  LDRB Wt, [Xn|SP, R<m>{ext/sh}]

      11 111000001 Rm option S  10 Rn Rt  STR  Xt, [Xn|SP, R<m>{ext/sh}]
      10 111000001 Rm option S  10 Rn Rt  STR  Wt, [Xn|SP, R<m>{ext/sh}]
      01 111000001 Rm option S  10 Rn Rt  STRH Wt, [Xn|SP, R<m>{ext/sh}]
      00 111000001 Rm option S  10 Rn Rt  STRB Wt, [Xn|SP, R<m>{ext/sh}]
   */
   if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
       && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
      HChar  dis_buf[64];
      UInt   szLg2 = INSN(31,30);
      Bool   isLD  = INSN(22,22) == 1;
      UInt   tt    = INSN(4,0);
      IRTemp ea    = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
      if (ea != IRTemp_INVALID) {
         switch (szLg2) {
            case 3: /* 64 bit */
               if (isLD) {
                  putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea)));
                  DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf);
               } else {
                  storeLE(mkexpr(ea), getIReg64orZR(tt));
                  DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf);
               }
               break;
            case 2: /* 32 bit */
               if (isLD) {
                  putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea)));
                  DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf);
               } else {
                  storeLE(mkexpr(ea), getIReg32orZR(tt));
                  DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf);
               }
               break;
            case 1: /* 16 bit */
               if (isLD) {
                  putIReg64orZR(tt, unop(Iop_16Uto64,
                                         loadLE(Ity_I16, mkexpr(ea))));
                  DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf);
               } else {
                  storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt)));
                  DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf);
               }
               break;
            case 0: /* 8 bit */
               if (isLD) {
                  putIReg64orZR(tt, unop(Iop_8Uto64,
                                         loadLE(Ity_I8, mkexpr(ea))));
                  DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf);
               } else {
                  storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt)));
                  DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf);
               }
               break;
            default:
               vassert(0);
         }
         return True;
      }
      /* else fall through */
   }
   /* -------------- LDRS{B,H,W} (uimm12) -------------- */
   /* 31 29  26  23 21    9 4
      10 111 001 10 imm12 n t   LDRSW Xt, [Xn|SP, #pimm12 * 4]
      01 111 001 1x imm12 n t   LDRSH Rt, [Xn|SP, #pimm12 * 2]
      00 111 001 1x imm12 n t   LDRSB Rt, [Xn|SP, #pimm12 * 1]
      where
         Rt is Wt when x==1, Xt when x==0
   */
   if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
      /* Further checks on bits 31:30 and 22 */
      Bool valid = False;
      switch ((INSN(31,30) << 1) | INSN(22,22)) {
         case BITS3(1,0,0):
         case BITS3(0,1,0): case BITS3(0,1,1):
         case BITS3(0,0,0): case BITS3(0,0,1):
            valid = True;
            break;
      }
      if (valid) {
         UInt    szLg2 = INSN(31,30);
         UInt    bitX  = INSN(22,22);
         UInt    imm12 = INSN(21,10);
         UInt    nn    = INSN(9,5);
         UInt    tt    = INSN(4,0);
         UInt    szB   = 1 << szLg2;
         IRExpr* ea    = binop(Iop_Add64,
                               getIReg64orSP(nn), mkU64(imm12 * szB));
         switch (szLg2) {
            case 2: /* 32 bit */
               vassert(bitX == 0);
               putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea)));
               DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt),
                   nameIReg64orSP(nn), imm12 * szB);
               break;
            case 1: /* 16 bit */
               if (bitX == 1) {
                  putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea)));
               } else {
                  putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea)));
               }
               DIP("ldrsh %s, [%s, #%u]\n",
                   nameIRegOrZR(bitX == 0, tt),
                   nameIReg64orSP(nn), imm12 * szB);
               break;
            case 0: /* 8 bit */
               if (bitX == 1) {
                  putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea)));
               } else {
                  putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea)));
               }
               DIP("ldrsb %s, [%s, #%u]\n",
                   nameIRegOrZR(bitX == 0, tt),
                   nameIReg64orSP(nn), imm12 * szB);
               break;
            default:
               vassert(0);
         }
         return True;
      }
      /* else fall through */
   }
   /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
   /* (at-Rn-then-Rn=EA)
      31 29      23 21 20   11 9 4
      00 111 000 1x 0  imm9 01 n t  LDRSB Rt, [Xn|SP], #simm9
      01 111 000 1x 0  imm9 01 n t  LDRSH Rt, [Xn|SP], #simm9
      10 111 000 10 0  imm9 01 n t  LDRSW Xt, [Xn|SP], #simm9

      (at-EA-then-Rn=EA)
      00 111 000 1x 0  imm9 11 n t  LDRSB Rt, [Xn|SP, #simm9]!
      01 111 000 1x 0  imm9 11 n t  LDRSH Rt, [Xn|SP, #simm9]!
      10 111 000 10 0  imm9 11 n t  LDRSW Xt, [Xn|SP, #simm9]!
      where
         Rt is Wt when x==1, Xt when x==0
         transfer-at-Rn when [11]==0, at EA when [11]==1
   */
   if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
       && INSN(21,21) == 0 && INSN(10,10) == 1) {
      /* Further checks on bits 31:30 and 22 */
      Bool valid = False;
      switch ((INSN(31,30) << 1) | INSN(22,22)) {
         case BITS3(1,0,0):                    // LDRSW Xt
         case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
         case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
            valid = True;
            break;
      }
      if (valid) {
         UInt   szLg2 = INSN(31,30);
         UInt   imm9  = INSN(20,12);
         Bool   atRN  = INSN(11,11) == 0;
         UInt   nn    = INSN(9,5);
         UInt   tt    = INSN(4,0);
         IRTemp tRN   = newTemp(Ity_I64);
         IRTemp tEA   = newTemp(Ity_I64);
         IRTemp tTA   = IRTemp_INVALID;
         ULong  simm9 = sx_to_64(imm9, 9);
         Bool   is64  = INSN(22,22) == 0;
         assign(tRN, getIReg64orSP(nn));
         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
         tTA = atRN ? tRN : tEA;
         HChar ch = '?';
         /* There are 5 cases:
               byte     load,           SX to 64
               byte     load, SX to 32, ZX to 64
               halfword load,           SX to 64
               halfword load, SX to 32, ZX to 64
               word     load,           SX to 64
            The ifs below handle them in the listed order.
         */
         if (szLg2 == 0) {
            ch = 'b';
            if (is64) {
               putIReg64orZR(tt, unop(Iop_8Sto64,
                                      loadLE(Ity_I8, mkexpr(tTA))));
            } else {
               putIReg32orZR(tt, unop(Iop_8Sto32,
                                      loadLE(Ity_I8, mkexpr(tTA))));
            }
         }
         else if (szLg2 == 1) {
            ch = 'h';
            if (is64) {
               putIReg64orZR(tt, unop(Iop_16Sto64,
                                      loadLE(Ity_I16, mkexpr(tTA))));
            } else {
               putIReg32orZR(tt, unop(Iop_16Sto32,
                                      loadLE(Ity_I16, mkexpr(tTA))));
            }
         }
         else if (szLg2 == 2 && is64) {
            ch = 'w';
            putIReg64orZR(tt, unop(Iop_32Sto64,
                                   loadLE(Ity_I32, mkexpr(tTA))));
         }
         else vassert(0);
         putIReg64orSP(nn, mkexpr(tEA));
         DIP(atRN ? "ldrs%c %s, [%s], #%llu\n" : "ldrs%c %s, [%s, #%llu]!\n",
             ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9);
         return True;
      }
      /* else fall through */
   }
   /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */
   /* 31 29      23 21 20   11 9 4
      00 111 000 1x 0  imm9 00 n t  LDURSB Rt, [Xn|SP, #simm9]
      01 111 000 1x 0  imm9 00 n t  LDURSH Rt, [Xn|SP, #simm9]
      10 111 000 10 0  imm9 00 n t  LDURSW Xt, [Xn|SP, #simm9]
      where
         Rt is Wt when x==1, Xt when x==0
   */
   if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
       && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
      /* Further checks on bits 31:30 and 22 */
      Bool valid = False;
      switch ((INSN(31,30) << 1) | INSN(22,22)) {
         case BITS3(1,0,0):                    // LDURSW Xt
         case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
         case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
            valid = True;
            break;
      }
      if (valid) {
         UInt   szLg2 = INSN(31,30);
         UInt   imm9  = INSN(20,12);
         UInt   nn    = INSN(9,5);
         UInt   tt    = INSN(4,0);
         IRTemp tRN   = newTemp(Ity_I64);
         IRTemp tEA   = newTemp(Ity_I64);
         ULong  simm9 = sx_to_64(imm9, 9);
         Bool   is64  = INSN(22,22) == 0;
         assign(tRN, getIReg64orSP(nn));
         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
         HChar ch = '?';
         /* There are 5 cases:
               byte     load,           SX to 64
               byte     load, SX to 32, ZX to 64
               halfword load,           SX to 64
               halfword load, SX to 32, ZX to 64
               word     load,           SX to 64
            The ifs below handle them in the listed order.
         */
         if (szLg2 == 0) {
            ch = 'b';
            if (is64) {
               putIReg64orZR(tt, unop(Iop_8Sto64,
                                      loadLE(Ity_I8, mkexpr(tEA))));
            } else {
               putIReg32orZR(tt, unop(Iop_8Sto32,
                                      loadLE(Ity_I8, mkexpr(tEA))));
            }
         }
         else if (szLg2 == 1) {
            ch = 'h';
            if (is64) {
               putIReg64orZR(tt, unop(Iop_16Sto64,
                                      loadLE(Ity_I16, mkexpr(tEA))));
            } else {
               putIReg32orZR(tt, unop(Iop_16Sto32,
                                      loadLE(Ity_I16, mkexpr(tEA))));
            }
         }
         else if (szLg2 == 2 && is64) {
            ch = 'w';
            putIReg64orZR(tt, unop(Iop_32Sto64,
                                   loadLE(Ity_I32, mkexpr(tEA))));
         }
         else vassert(0);
         DIP("ldurs%c %s, [%s, #%lld]\n",
             ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
         return True;
      }
      /* else fall through */
   }
   /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
   /* L==1    => mm==LD
      L==0    => mm==ST
      sz==00  => 32 bit (S) transfers
      sz==01  => 64 bit (D) transfers
      sz==10  => 128 bit (Q) transfers
      sz==11  isn't allowed
      simm7 is scaled by the (single-register) transfer size

      31 29  26   22 21   14 9 4

      sz 101 1000 L  imm7 t2 n t1   mmNP SDQt1, SDQt2, [Xn|SP, #imm]
                                    (at-EA, with nontemporal hint)

      sz 101 1001 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP], #imm
                                    (at-Rn-then-Rn=EA)

      sz 101 1010 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP, #imm]
                                    (at-EA)

      sz 101 1011 L  imm7 t2 n t1   mmP SDQt1, SDQt2, [Xn|SP, #imm]!
                                    (at-EA-then-Rn=EA)
   */
   if (INSN(29,25) == BITS5(1,0,1,1,0)) {
      UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
      Bool isLD   = INSN(22,22) == 1;
      Bool wBack  = INSN(23,23) == 1;
      Long simm7  = (Long)sx_to_64(INSN(21,15), 7);
      UInt tt2    = INSN(14,10);
      UInt nn     = INSN(9,5);
      UInt tt1    = INSN(4,0);
      if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) {
         /* undecodable; fall through */
      } else {
         if (nn == 31) { /* FIXME generate stack alignment check */ }

         // Compute the transfer address TA and the writeback address WA.
         UInt   szB = 4 << szSlg2; /* szB is the per-register size */
         IRTemp tRN = newTemp(Ity_I64);
         assign(tRN, getIReg64orSP(nn));
         IRTemp tEA = newTemp(Ity_I64);
         simm7 = szB * simm7;
         assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));

         IRTemp tTA = newTemp(Ity_I64);
         IRTemp tWA = newTemp(Ity_I64);
         switch (INSN(24,23)) {
            case BITS2(0,1):
               assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,1):
               assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
            case BITS2(1,0):
            case BITS2(0,0):
               assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
            default:
               vassert(0); /* NOTREACHED */
         }

         IRType ty = Ity_INVALID;
         switch (szB) {
            case 4:  ty = Ity_F32;  break;
            case 8:  ty = Ity_F64;  break;
            case 16: ty = Ity_V128; break;
            default: vassert(0);
         }

         /* Normally rN would be updated after the transfer.  However, in
            the special cases typified by
               stp q0, q1, [sp,#-512]!
               stp d0, d1, [sp,#-512]!
               stp s0, s1, [sp,#-512]!
            it is necessary to update SP before the transfer, (1)
            because Memcheck will otherwise complain about a write
            below the stack pointer, and (2) because the segfault
            stack extension mechanism will otherwise extend the stack
            only down to SP before the instruction, which might not be
            far enough, if the -512 bit takes the actual access
            address to the next page.
         */
         Bool earlyWBack
            = wBack && simm7 < 0
              && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD;

         if (wBack && earlyWBack)
            putIReg64orSP(nn, mkexpr(tEA));

         if (isLD) {
            if (szB < 16) {
               putQReg128(tt1, mkV128(0x0000));
            }
            putQRegLO(tt1,
                      loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0))));
            if (szB < 16) {
               putQReg128(tt2, mkV128(0x0000));
            }
            putQRegLO(tt2,
                      loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB))));
         } else {
            storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)),
                    getQRegLO(tt1, ty));
            storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)),
                    getQRegLO(tt2, ty));
         }

         if (wBack && !earlyWBack)
            putIReg64orSP(nn, mkexpr(tEA));

         const HChar* fmt_str = NULL;
         switch (INSN(24,23)) {
            case BITS2(0,1):
               fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
               break;
            case BITS2(1,1):
               fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
               break;
            case BITS2(1,0):
               fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
               break;
            case BITS2(0,0):
               fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
               break;
            default:
               vassert(0);
         }
         DIP(fmt_str, isLD ? "ld" : "st",
                      nameQRegLO(tt1, ty), nameQRegLO(tt2, ty),
                      nameIReg64orSP(nn), simm7);
         return True;
      }
   }
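   /* Worked example (illustrative, not from the original source):
      "ldp q0, q1, [sp], #32" has szSlg2 == 10, so szB == 4 << 2 == 16
      and ty == Ity_V128; the loads read [SP+0] and [SP+16], and the
      post-index writeback adds imm7 == 2 scaled by szB, i.e. SP += 32. */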
   /* -------------- {LD,ST}R (vector register) --------------- */
   /* 31 29     23  20 15     12 11 9  4
      |  |      |   |  |      |  |  |  |
      00 111100 011 Rm option S  10 Rn Rt  LDR Bt, [Xn|SP, R<m>{ext/sh}]
      01 111100 011 Rm option S  10 Rn Rt  LDR Ht, [Xn|SP, R<m>{ext/sh}]
      10 111100 011 Rm option S  10 Rn Rt  LDR St, [Xn|SP, R<m>{ext/sh}]
      11 111100 011 Rm option S  10 Rn Rt  LDR Dt, [Xn|SP, R<m>{ext/sh}]
      00 111100 111 Rm option S  10 Rn Rt  LDR Qt, [Xn|SP, R<m>{ext/sh}]

      00 111100 001 Rm option S  10 Rn Rt  STR Bt, [Xn|SP, R<m>{ext/sh}]
      01 111100 001 Rm option S  10 Rn Rt  STR Ht, [Xn|SP, R<m>{ext/sh}]
      10 111100 001 Rm option S  10 Rn Rt  STR St, [Xn|SP, R<m>{ext/sh}]
      11 111100 001 Rm option S  10 Rn Rt  STR Dt, [Xn|SP, R<m>{ext/sh}]
      00 111100 101 Rm option S  10 Rn Rt  STR Qt, [Xn|SP, R<m>{ext/sh}]
   */
   if (INSN(29,24) == BITS6(1,1,1,1,0,0)
       && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
      HChar  dis_buf[64];
      UInt   szLg2 = (INSN(23,23) << 2) | INSN(31,30);
      Bool   isLD  = INSN(22,22) == 1;
      UInt   tt    = INSN(4,0);
      if (szLg2 > 4) goto after_LDR_STR_vector_register;
      IRTemp ea    = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
      if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
      switch (szLg2) {
         case 0: /* 8 bit */
            if (isLD) {
               putQReg128(tt, mkV128(0x0000));
               putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea)));
               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
            } else {
               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8));
               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
            }
            break;
         case 1: /* 16 bit */
            if (isLD) {
               putQReg128(tt, mkV128(0x0000));
               putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea)));
               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
            } else {
               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16));
               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
            }
            break;
         case 2: /* 32 bit */
            if (isLD) {
               putQReg128(tt, mkV128(0x0000));
               putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea)));
               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
            } else {
               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32));
               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
            }
            break;
         case 3: /* 64 bit */
            if (isLD) {
               putQReg128(tt, mkV128(0x0000));
               putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea)));
               DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
            } else {
               storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64));
               DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
            }
            break;
         case 4: /* 128 bit */
            if (isLD) {
               putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
               DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
            } else {
               storeLE(mkexpr(ea), getQReg128(tt));
               DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
            }
            break;
         default:
            vassert(0);
      }
      return True;
   }
  after_LDR_STR_vector_register:
   /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
   /* 31 29      22 20 15  12 11 9  4
      |  |       |  |  |   |  |  |  |
      10 1110001 01 Rm opt S 10 Rn Rt  LDRSW Xt, [Xn|SP, R<m>{ext/sh}]

      01 1110001 01 Rm opt S 10 Rn Rt  LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
      01 1110001 11 Rm opt S 10 Rn Rt  LDRSH Wt, [Xn|SP, R<m>{ext/sh}]

      00 1110001 01 Rm opt S 10 Rn Rt  LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
      00 1110001 11 Rm opt S 10 Rn Rt  LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
   */
   if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
       && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
      HChar  dis_buf[64];
      UInt   szLg2  = INSN(31,30);
      Bool   sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64
      UInt   tt     = INSN(4,0);
      if (szLg2 == 3) goto after_LDRS_integer_register;
      IRTemp ea     = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
      if (ea == IRTemp_INVALID) goto after_LDRS_integer_register;
      /* Enumerate the 5 variants explicitly. */
      if (szLg2 == 2/*32 bit*/ && sxTo64) {
         putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea))));
         DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf);
         return True;
      }
      else
      if (szLg2 == 1/*16 bit*/) {
         if (sxTo64) {
            putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea))));
            DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf);
         } else {
            putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea))));
            DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf);
         }
         return True;
      }
      else
      if (szLg2 == 0/*8 bit*/) {
         if (sxTo64) {
            putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea))));
            DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf);
         } else {
            putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea))));
            DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf);
         }
         return True;
      }
      /* else it's an invalid combination */
   }
  after_LDRS_integer_register:
   /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
   /* This is the Unsigned offset variant only.  The Post-Index and
      Pre-Index variants are below.

      31 29      23 21    9 4
      00 111 101 01 imm12 n t   LDR Bt, [Xn|SP + imm12 * 1]
      01 111 101 01 imm12 n t   LDR Ht, [Xn|SP + imm12 * 2]
      10 111 101 01 imm12 n t   LDR St, [Xn|SP + imm12 * 4]
      11 111 101 01 imm12 n t   LDR Dt, [Xn|SP + imm12 * 8]
      00 111 101 11 imm12 n t   LDR Qt, [Xn|SP + imm12 * 16]

      00 111 101 00 imm12 n t   STR Bt, [Xn|SP + imm12 * 1]
      01 111 101 00 imm12 n t   STR Ht, [Xn|SP + imm12 * 2]
      10 111 101 00 imm12 n t   STR St, [Xn|SP + imm12 * 4]
      11 111 101 00 imm12 n t   STR Dt, [Xn|SP + imm12 * 8]
      00 111 101 10 imm12 n t   STR Qt, [Xn|SP + imm12 * 16]
   */
   if (INSN(29,24) == BITS6(1,1,1,1,0,1)
       && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
      UInt   szLg2  = (INSN(23,23) << 2) | INSN(31,30);
      Bool   isLD   = INSN(22,22) == 1;
      UInt   pimm12 = INSN(21,10) << szLg2;
      UInt   nn     = INSN(9,5);
      UInt   tt     = INSN(4,0);
      IRTemp tEA    = newTemp(Ity_I64);
      IRType ty     = preferredVectorSubTypeFromSize(1 << szLg2);
      assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12)));
      if (isLD) {
         if (szLg2 < 4) {
            putQReg128(tt, mkV128(0x0000));
         }
         putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
      } else {
         storeLE(mkexpr(tEA), getQRegLO(tt, ty));
      }
      DIP("%s %s, [%s, #%u]\n",
          isLD ? "ldr" : "str",
          nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12);
      return True;
   }
   /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
   /* These are the Post-Index and Pre-Index variants.

      31 29      23 21   11 9 4
      (at-Rn-then-Rn=EA)
      00 111 100 01 0 imm9 01 n t   LDR Bt, [Xn|SP], #simm
      01 111 100 01 0 imm9 01 n t   LDR Ht, [Xn|SP], #simm
      10 111 100 01 0 imm9 01 n t   LDR St, [Xn|SP], #simm
      11 111 100 01 0 imm9 01 n t   LDR Dt, [Xn|SP], #simm
      00 111 100 11 0 imm9 01 n t   LDR Qt, [Xn|SP], #simm

      (at-EA-then-Rn=EA)
      00 111 100 01 0 imm9 11 n t   LDR Bt, [Xn|SP, #simm]!
      01 111 100 01 0 imm9 11 n t   LDR Ht, [Xn|SP, #simm]!
      10 111 100 01 0 imm9 11 n t   LDR St, [Xn|SP, #simm]!
      11 111 100 01 0 imm9 11 n t   LDR Dt, [Xn|SP, #simm]!
      00 111 100 11 0 imm9 11 n t   LDR Qt, [Xn|SP, #simm]!

      Stores are the same except with bit 22 set to 0.
   */
   if (INSN(29,24) == BITS6(1,1,1,1,0,0)
       && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
       && INSN(21,21) == 0 && INSN(10,10) == 1) {
      UInt   szLg2 = (INSN(23,23) << 2) | INSN(31,30);
      Bool   isLD  = INSN(22,22) == 1;
      UInt   imm9  = INSN(20,12);
      Bool   atRN  = INSN(11,11) == 0;
      UInt   nn    = INSN(9,5);
      UInt   tt    = INSN(4,0);
      IRTemp tRN   = newTemp(Ity_I64);
      IRTemp tEA   = newTemp(Ity_I64);
      IRTemp tTA   = IRTemp_INVALID;
      IRType ty    = preferredVectorSubTypeFromSize(1 << szLg2);
      ULong  simm9 = sx_to_64(imm9, 9);
      assign(tRN, getIReg64orSP(nn));
      assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
      tTA = atRN ? tRN : tEA;

      /* Do early writeback for the cases typified by
            str d10, [sp, #-128]!
         for the same reasons as described in a similar comment in the
         "LDP,STP (immediate, simm7) (FP&VEC)" case just above.
      */
      Bool earlyWBack
         = !atRN && !isLD && (ty == Ity_F64 || ty == Ity_V128)
           && nn == 31 && ((Long)simm9) < 0;

      if (earlyWBack)
         putIReg64orSP(nn, mkexpr(tEA));

      if (isLD) {
         if (szLg2 < 4) {
            putQReg128(tt, mkV128(0x0000));
         }
         putQRegLO(tt, loadLE(ty, mkexpr(tTA)));
      } else {
         storeLE(mkexpr(tTA), getQRegLO(tt, ty));
      }

      if (!earlyWBack)
         putIReg64orSP(nn, mkexpr(tEA));

      DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
          isLD ? "ldr" : "str",
          nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
      return True;
   }
   /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
   /* 31 29      23 21   11 9 4
      00 111 100 01 0 imm9 00 n t   LDR Bt, [Xn|SP, #simm]
      01 111 100 01 0 imm9 00 n t   LDR Ht, [Xn|SP, #simm]
      10 111 100 01 0 imm9 00 n t   LDR St, [Xn|SP, #simm]
      11 111 100 01 0 imm9 00 n t   LDR Dt, [Xn|SP, #simm]
      00 111 100 11 0 imm9 00 n t   LDR Qt, [Xn|SP, #simm]

      00 111 100 00 0 imm9 00 n t   STR Bt, [Xn|SP, #simm]
      01 111 100 00 0 imm9 00 n t   STR Ht, [Xn|SP, #simm]
      10 111 100 00 0 imm9 00 n t   STR St, [Xn|SP, #simm]
      11 111 100 00 0 imm9 00 n t   STR Dt, [Xn|SP, #simm]
      00 111 100 10 0 imm9 00 n t   STR Qt, [Xn|SP, #simm]
   */
   if (INSN(29,24) == BITS6(1,1,1,1,0,0)
       && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
       && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
      UInt   szLg2 = (INSN(23,23) << 2) | INSN(31,30);
      Bool   isLD  = INSN(22,22) == 1;
      UInt   imm9  = INSN(20,12);
      UInt   nn    = INSN(9,5);
      UInt   tt    = INSN(4,0);
      ULong  simm9 = sx_to_64(imm9, 9);
      IRTemp tEA   = newTemp(Ity_I64);
      IRType ty    = preferredVectorSubTypeFromSize(1 << szLg2);
      assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
      if (isLD) {
         if (szLg2 < 4) {
            putQReg128(tt, mkV128(0x0000));
         }
         putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
      } else {
         storeLE(mkexpr(tEA), getQRegLO(tt, ty));
      }
      DIP("%s %s, [%s, #%lld]\n",
          isLD ? "ldur" : "stur",
          nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
      return True;
   }
   /* ---------------- LDR (literal, SIMD&FP) ---------------- */
   /* 31 29      23    4
      00 011 100 imm19 t    LDR St, [PC + sxTo64(imm19 << 2)]
      01 011 100 imm19 t    LDR Dt, [PC + sxTo64(imm19 << 2)]
      10 011 100 imm19 t    LDR Qt, [PC + sxTo64(imm19 << 2)]
   */
   if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
      UInt   szB   = 4 << INSN(31,30);
      UInt   imm19 = INSN(23,5);
      UInt   tt    = INSN(4,0);
      ULong  ea    = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
      IRType ty    = preferredVectorSubTypeFromSize(szB);
      putQReg128(tt, mkV128(0x0000));
      putQRegLO(tt, loadLE(ty, mkU64(ea)));
      DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
      return True;
   }
   /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg  ------ */
   /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */
   /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */
   /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
   /* 31 29  26   22 21 20    15   11 9 4

      0q 001 1000 L  0  00000 0000 sz n t  xx4 {Vt..t+3.T}, [Xn|SP]
      0q 001 1001 L  0  m     0000 sz n t  xx4 {Vt..t+3.T}, [Xn|SP], step

      0q 001 1000 L  0  00000 0100 sz n t  xx3 {Vt..t+2.T}, [Xn|SP]
      0q 001 1001 L  0  m     0100 sz n t  xx3 {Vt..t+2.T}, [Xn|SP], step

      0q 001 1000 L  0  00000 1000 sz n t  xx2 {Vt..t+1.T}, [Xn|SP]
      0q 001 1001 L  0  m     1000 sz n t  xx2 {Vt..t+1.T}, [Xn|SP], step

      0q 001 1000 L  0  00000 0111 sz n t  xx1 {Vt.T},      [Xn|SP]
      0q 001 1001 L  0  m     0111 sz n t  xx1 {Vt.T},      [Xn|SP], step

      T    = defined by Q and sz in the normal way
      step = if m == 11111 then transfer-size else Xm
      xx   = case L of 1 -> LD ; 0 -> ST
   */
   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
       && INSN(21,21) == 0) {
      Bool bitQ  = INSN(30,30);
      Bool isPX  = INSN(23,23) == 1;
      Bool isLD  = INSN(22,22) == 1;
      UInt mm    = INSN(20,16);
      UInt opc   = INSN(15,12);
      UInt sz    = INSN(11,10);
      UInt nn    = INSN(9,5);
      UInt tt    = INSN(4,0);
      Bool isQ   = bitQ == 1;
      Bool is1d  = sz == BITS2(1,1) && !isQ;
      UInt nRegs = 0;
      switch (opc) {
         case BITS4(0,0,0,0): nRegs = 4; break;
         case BITS4(0,1,0,0): nRegs = 3; break;
         case BITS4(1,0,0,0): nRegs = 2; break;
         case BITS4(0,1,1,1): nRegs = 1; break;
         default: break;
      }

      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
         If we see it, set nRegs to 0 so as to cause the next conditional
         to fail. */
      if (!isPX && mm != 0)
         nRegs = 0;

      if (nRegs == 1                             /* .1d is allowed */
          || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {

         UInt xferSzB = (isQ ? 16 : 8) * nRegs;

         /* Generate the transfer address (TA) and if necessary the
            writeback address (WB) */
         IRTemp tTA = newTemp(Ity_I64);
         assign(tTA, getIReg64orSP(nn));
         if (nn == 31) { /* FIXME generate stack alignment check */ }
         IRTemp tWB = IRTemp_INVALID;
         if (isPX) {
            tWB = newTemp(Ity_I64);
            assign(tWB, binop(Iop_Add64,
                              mkexpr(tTA),
                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
                                                     : getIReg64orZR(mm)));
         }

         /* -- BEGIN generate the transfers -- */

         IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
         u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
         switch (nRegs) {
            case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
            case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
            case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
            case 1: u0 = newTempV128(); i0 = newTempV128(); break;
            default: vassert(0);
         }

         /* -- Multiple 128 or 64 bit stores -- */
         if (!isLD) {
            switch (nRegs) {
               case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
               case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
               case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
               case 1: assign(u0, getQReg128((tt+0) % 32)); break;
               default: vassert(0);
            }
            switch (nRegs) {
               case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
                          (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
                       break;
               case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
                          (&i0, &i1, &i2, sz, u0, u1, u2);
                       break;
               case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
                          (&i0, &i1, sz, u0, u1);
                       break;
               case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
                          (&i0, sz, u0);
                       break;
               default: vassert(0);
            }
#           define MAYBE_NARROW_TO_64(_expr) \
                      (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
            UInt step = isQ ? 16 : 8;
            switch (nRegs) {
               case 4:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(i3)) );
                        /* fallthru */
               case 3:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(i2)) );
                        /* fallthru */
               case 2:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(i1)) );
                        /* fallthru */
               case 1:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(i0)) );
                        break;
               default: vassert(0);
            }
#           undef MAYBE_NARROW_TO_64
         }

         /* -- Multiple 128 or 64 bit loads -- */
         else /* isLD */ {
            UInt   step   = isQ ? 16 : 8;
            IRType loadTy = isQ ? Ity_V128 : Ity_I64;
#           define MAYBE_WIDEN_FROM_64(_expr) \
                      (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
            switch (nRegs) {
               case 4:
                  assign(i3, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(3 * step)))));
                  /* fallthru */
               case 3:
                  assign(i2, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(2 * step)))));
                  /* fallthru */
               case 2:
                  assign(i1, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(1 * step)))));
                  /* fallthru */
               case 1:
                  assign(i0, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(0 * step)))));
                  break;
               default:
                  vassert(0);
            }
#           undef MAYBE_WIDEN_FROM_64
            switch (nRegs) {
               case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
                          (&u0, &u1, &u2, &u3, sz, i0, i1, i2, i3);
                       break;
               case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
                          (&u0, &u1, &u2, sz, i0, i1, i2);
                       break;
               case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
                          (&u0, &u1, sz, i0, i1);
                       break;
               case 1: (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
                          (&u0, sz, i0);
                       break;
               default: vassert(0);
            }
            switch (nRegs) {
               case 4: putQReg128( (tt+3) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u3));
                       /* fallthru */
               case 3: putQReg128( (tt+2) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u2));
                       /* fallthru */
               case 2: putQReg128( (tt+1) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u1));
                       /* fallthru */
               case 1: putQReg128( (tt+0) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u0));
                       break;
               default: vassert(0);
            }
         }

         /* -- END generate the transfers -- */

         /* Do the writeback, if necessary */
         if (isPX) {
            putIReg64orSP(nn, mkexpr(tWB));
         }

         HChar pxStr[20];
         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
         if (isPX) {
            if (mm == BITS5(1,1,1,1,1))
               vex_sprintf(pxStr, ", #%u", xferSzB);
            else
               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
         }
         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
         DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
             isLD ? "ld" : "st", nRegs,
             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
             pxStr);

         return True;
      }
      /* else fall through */
   }
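   /* Editor's note -- a worked example of the (de)interleaving above,
      not from the original source.  For LD2 {v0.4s, v1.4s}, [x0], with
      memory at x0 holding the 32-bit items s0 s1 s2 s3 s4 s5 s6 s7, the
      two 128-bit loads give i0 = s3:s2:s1:s0 and i1 = s7:s6:s5:s4, and
      math_DEINTERLEAVE2_128 then produces
         v0 = s6:s4:s2:s0   (element 0 of each pair)
         v1 = s7:s5:s3:s1   (element 1 of each pair)
      ST2 applies the inverse transform (math_INTERLEAVE2_128) before
      storing. */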
   /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs  ------ */
   /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs  ------ */
   /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs  ------ */
   /* 31 29  26   22 21 20    15   11 9 4

      0q 001 1000 L  0  00000 0010 sz n t  xx1 {Vt..t+3.T}, [Xn|SP]
      0q 001 1001 L  0  m     0010 sz n t  xx1 {Vt..t+3.T}, [Xn|SP], step

      0q 001 1000 L  0  00000 0110 sz n t  xx1 {Vt..t+2.T}, [Xn|SP]
      0q 001 1001 L  0  m     0110 sz n t  xx1 {Vt..t+2.T}, [Xn|SP], step

      0q 001 1000 L  0  00000 1010 sz n t  xx1 {Vt..t+1.T}, [Xn|SP]
      0q 001 1001 L  0  m     1010 sz n t  xx1 {Vt..t+1.T}, [Xn|SP], step

      T    = defined by Q and sz in the normal way
      step = if m == 11111 then transfer-size else Xm
      xx   = case L of 1 -> LD ; 0 -> ST
   */
   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
       && INSN(21,21) == 0) {
      Bool bitQ  = INSN(30,30);
      Bool isPX  = INSN(23,23) == 1;
      Bool isLD  = INSN(22,22) == 1;
      UInt mm    = INSN(20,16);
      UInt opc   = INSN(15,12);
      UInt sz    = INSN(11,10);
      UInt nn    = INSN(9,5);
      UInt tt    = INSN(4,0);
      Bool isQ   = bitQ == 1;
      UInt nRegs = 0;
      switch (opc) {
         case BITS4(0,0,1,0): nRegs = 4; break;
         case BITS4(0,1,1,0): nRegs = 3; break;
         case BITS4(1,0,1,0): nRegs = 2; break;
         default: break;
      }

      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
         If we see it, set nRegs to 0 so as to cause the next conditional
         to fail. */
      if (!isPX && mm != 0)
         nRegs = 0;

      if (nRegs >= 2 && nRegs <= 4) {

         UInt xferSzB = (isQ ? 16 : 8) * nRegs;

         /* Generate the transfer address (TA) and if necessary the
            writeback address (WB) */
         IRTemp tTA = newTemp(Ity_I64);
         assign(tTA, getIReg64orSP(nn));
         if (nn == 31) { /* FIXME generate stack alignment check */ }
         IRTemp tWB = IRTemp_INVALID;
         if (isPX) {
            tWB = newTemp(Ity_I64);
            assign(tWB, binop(Iop_Add64,
                              mkexpr(tTA),
                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
                                                     : getIReg64orZR(mm)));
         }

         /* -- BEGIN generate the transfers -- */

         IRTemp u0, u1, u2, u3;
         u0 = u1 = u2 = u3 = IRTemp_INVALID;
         switch (nRegs) {
            case 4: u3 = newTempV128(); /* fallthru */
            case 3: u2 = newTempV128(); /* fallthru */
            case 2: u1 = newTempV128();
                    u0 = newTempV128(); break;
            default: vassert(0);
         }

         /* -- Multiple 128 or 64 bit stores -- */
         if (!isLD) {
            switch (nRegs) {
               case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
               case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
               case 2: assign(u1, getQReg128((tt+1) % 32));
                       assign(u0, getQReg128((tt+0) % 32)); break;
               default: vassert(0);
            }
#           define MAYBE_NARROW_TO_64(_expr) \
                      (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
            UInt step = isQ ? 16 : 8;
            switch (nRegs) {
               case 4:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(u3)) );
                        /* fallthru */
               case 3:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(u2)) );
                        /* fallthru */
               case 2:  storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(u1)) );
                        storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
                                 MAYBE_NARROW_TO_64(mkexpr(u0)) );
                        break;
               default: vassert(0);
            }
#           undef MAYBE_NARROW_TO_64
         }

         /* -- Multiple 128 or 64 bit loads -- */
         else /* isLD */ {
            UInt   step   = isQ ? 16 : 8;
            IRType loadTy = isQ ? Ity_V128 : Ity_I64;
#           define MAYBE_WIDEN_FROM_64(_expr) \
                      (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
            switch (nRegs) {
               case 4:
                  assign(u3, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(3 * step)))));
                  /* fallthru */
               case 3:
                  assign(u2, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(2 * step)))));
                  /* fallthru */
               case 2:
                  assign(u1, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(1 * step)))));
                  assign(u0, MAYBE_WIDEN_FROM_64(
                                loadLE(loadTy,
                                       binop(Iop_Add64, mkexpr(tTA),
                                                        mkU64(0 * step)))));
                  break;
               default:
                  vassert(0);
            }
#           undef MAYBE_WIDEN_FROM_64
            switch (nRegs) {
               case 4: putQReg128( (tt+3) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u3));
                       /* fallthru */
               case 3: putQReg128( (tt+2) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u2));
                       /* fallthru */
               case 2: putQReg128( (tt+1) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u1));
                       putQReg128( (tt+0) % 32,
                                   math_MAYBE_ZERO_HI64(bitQ, u0));
                       break;
               default: vassert(0);
            }
         }

         /* -- END generate the transfers -- */

         /* Do the writeback, if necessary */
         if (isPX) {
            putIReg64orSP(nn, mkexpr(tWB));
         }

         HChar pxStr[20];
         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
         if (isPX) {
            if (mm == BITS5(1,1,1,1,1))
               vex_sprintf(pxStr, ", #%u", xferSzB);
            else
               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
         }
         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
         DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
             isLD ? "ld" : "st",
             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
             pxStr);

         return True;
      }
      /* else fall through */
   }
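   /* Editor's note, not from the original source: unlike the LD2/LD3/LD4
      cases above, these multi-register LD1/ST1 forms move each register's
      bytes to/from memory unchanged -- there is no lane (de)interleaving
      -- which is why the transfer loops above contain no
      math_INTERLEAVE/math_DEINTERLEAVE calls. */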
   /* ---------- LD1R (single structure, replicate) ---------- */
   /* ---------- LD2R (single structure, replicate) ---------- */
   /* ---------- LD3R (single structure, replicate) ---------- */
   /* ---------- LD4R (single structure, replicate) ---------- */
   /* 31 29       22 20    15    11 9 4
      0q 001 1010 10 00000 110 0 sz n t  LD1R {Vt.T}, [Xn|SP]
      0q 001 1011 10 m     110 0 sz n t  LD1R {Vt.T}, [Xn|SP], step

      0q 001 1010 11 00000 110 0 sz n t  LD2R {Vt..t+1.T}, [Xn|SP]
      0q 001 1011 11 m     110 0 sz n t  LD2R {Vt..t+1.T}, [Xn|SP], step

      0q 001 1010 10 00000 111 0 sz n t  LD3R {Vt..t+2.T}, [Xn|SP]
      0q 001 1011 10 m     111 0 sz n t  LD3R {Vt..t+2.T}, [Xn|SP], step

      0q 001 1010 11 00000 111 0 sz n t  LD4R {Vt..t+3.T}, [Xn|SP]
      0q 001 1011 11 m     111 0 sz n t  LD4R {Vt..t+3.T}, [Xn|SP], step

      step = if m == 11111 then transfer-size else Xm
   */
   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
       && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
       && INSN(12,12) == 0) {
      UInt   bitQ  = INSN(30,30);
      Bool   isPX  = INSN(23,23) == 1;
      UInt   nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
      UInt   mm    = INSN(20,16);
      UInt   sz    = INSN(11,10);
      UInt   nn    = INSN(9,5);
      UInt   tt    = INSN(4,0);

      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
      if (isPX || mm == 0) {

         IRType ty = integerIRTypeOfSize(1 << sz);

         UInt laneSzB = 1 << sz;
         UInt xferSzB = laneSzB * nRegs;

         /* Generate the transfer address (TA) and if necessary the
            writeback address (WB) */
         IRTemp tTA = newTemp(Ity_I64);
         assign(tTA, getIReg64orSP(nn));
         if (nn == 31) { /* FIXME generate stack alignment check */ }
         IRTemp tWB = IRTemp_INVALID;
         if (isPX) {
            tWB = newTemp(Ity_I64);
            assign(tWB, binop(Iop_Add64,
                              mkexpr(tTA),
                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
                                                     : getIReg64orZR(mm)));
         }

         /* Do the writeback, if necessary */
         if (isPX) {
            putIReg64orSP(nn, mkexpr(tWB));
         }

         IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
         e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
         switch (nRegs) {
            case 4:
               e3 = newTemp(ty);
               assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
                                                      mkU64(3 * laneSzB))));
               v3 = math_DUP_TO_V128(e3, ty);
               putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
               /* fallthrough */
            case 3:
               e2 = newTemp(ty);
               assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
                                                      mkU64(2 * laneSzB))));
               v2 = math_DUP_TO_V128(e2, ty);
               putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
               /* fallthrough */
            case 2:
               e1 = newTemp(ty);
               assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
                                                      mkU64(1 * laneSzB))));
               v1 = math_DUP_TO_V128(e1, ty);
               putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
               /* fallthrough */
            case 1:
               e0 = newTemp(ty);
               assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
                                                      mkU64(0 * laneSzB))));
               v0 = math_DUP_TO_V128(e0, ty);
               putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
               break;
            default:
               vassert(0);
         }

         HChar pxStr[20];
         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
         if (isPX) {
            if (mm == BITS5(1,1,1,1,1))
               vex_sprintf(pxStr, ", #%u", xferSzB);
            else
               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
         }
         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
         DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
             nRegs,
             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
             pxStr);

         return True;
      }
      /* else fall through */
   }
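   /* Editor's note -- worked example, not from the original source.
      For LD1R {v0.4s}, [x0]: one 32-bit element e is loaded from [x0],
      math_DUP_TO_V128 broadcasts it to every lane (v0 = e:e:e:e), and
      with Q=1 math_MAYBE_ZERO_HI64 leaves the upper half as generated.
      For the D-register form LD1R {v0.2s}, [x0], Q=0 and the upper 64
      bits of v0 are zeroed instead. */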
   /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
   /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
   /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
   /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
   /* 31 29       22 21 20    15    11 9 4
      0q 001 1010 L  0  00000 xx0 S sz n t  op1 {Vt.T}[ix], [Xn|SP]
      0q 001 1011 L  0  m     xx0 S sz n t  op1 {Vt.T}[ix], [Xn|SP], step

      0q 001 1010 L  1  00000 xx0 S sz n t  op2 {Vt..t+1.T}[ix], [Xn|SP]
      0q 001 1011 L  1  m     xx0 S sz n t  op2 {Vt..t+1.T}[ix], [Xn|SP], step

      0q 001 1010 L  0  00000 xx1 S sz n t  op3 {Vt..t+2.T}[ix], [Xn|SP]
      0q 001 1011 L  0  m     xx1 S sz n t  op3 {Vt..t+2.T}[ix], [Xn|SP], step

      0q 001 1010 L  1  00000 xx1 S sz n t  op4 {Vt..t+3.T}[ix], [Xn|SP]
      0q 001 1011 L  1  m     xx1 S sz n t  op4 {Vt..t+3.T}[ix], [Xn|SP], step

      step = if m == 11111 then transfer-size else Xm
      op   = case L of 1 -> LD ; 0 -> ST

      laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
                                     01:b:b:b0 -> 2, bbb
                                     10:b:b:00 -> 4, bb
                                     10:b:0:01 -> 8, b
                                     _         -> invalid
   */
   if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
      UInt   bitQ  = INSN(30,30);
      Bool   isPX  = INSN(23,23) == 1;
      Bool   isLD  = INSN(22,22) == 1;
      UInt   nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
      UInt   mm    = INSN(20,16);
      UInt   xx    = INSN(15,14);
      UInt   bitS  = INSN(12,12);
      UInt   sz    = INSN(11,10);
      UInt   nn    = INSN(9,5);
      UInt   tt    = INSN(4,0);

      Bool valid = True;

      /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
      if (!isPX && mm != 0)
         valid = False;

      UInt laneSzB = 0;  /* invalid */
      UInt ix      = 16; /* invalid */

      UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
      switch (xx_q_S_sz) {
         case 0x00: case 0x01: case 0x02: case 0x03:
         case 0x04: case 0x05: case 0x06: case 0x07:
         case 0x08: case 0x09: case 0x0A: case 0x0B:
         case 0x0C: case 0x0D: case 0x0E: case 0x0F:
            laneSzB = 1; ix = xx_q_S_sz & 0xF;
            break;
         case 0x10: case 0x12: case 0x14: case 0x16:
         case 0x18: case 0x1A: case 0x1C: case 0x1E:
            laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
            break;
         case 0x20: case 0x24: case 0x28: case 0x2C:
            laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
            break;
         case 0x21: case 0x29:
            laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
            break;
         default:
            break;
      }

      if (valid && laneSzB != 0) {

         IRType ty      = integerIRTypeOfSize(laneSzB);
         UInt   xferSzB = laneSzB * nRegs;

         /* Generate the transfer address (TA) and if necessary the
            writeback address (WB) */
         IRTemp tTA = newTemp(Ity_I64);
         assign(tTA, getIReg64orSP(nn));
         if (nn == 31) { /* FIXME generate stack alignment check */ }
         IRTemp tWB = IRTemp_INVALID;
         if (isPX) {
            tWB = newTemp(Ity_I64);
            assign(tWB, binop(Iop_Add64,
                              mkexpr(tTA),
                              mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
                                                     : getIReg64orZR(mm)));
         }

         /* Do the writeback, if necessary */
         if (isPX) {
            putIReg64orSP(nn, mkexpr(tWB));
         }

         switch (nRegs) {
            case 4: {
               IRExpr* addr
                  = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
               if (isLD) {
                  putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
               } else {
                  storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
               }
            }
            /* fallthrough */
            case 3: {
               IRExpr* addr
                  = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
               if (isLD) {
                  putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
               } else {
                  storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
               }
            }
            /* fallthrough */
            case 2: {
               IRExpr* addr
                  = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
               if (isLD) {
                  putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
               } else {
                  storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
               }
            }
            /* fallthrough */
            case 1: {
               IRExpr* addr
                  = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
               if (isLD) {
                  putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
               } else {
                  storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
               }
               break;
            }
            default:
               vassert(0);
         }

         HChar pxStr[20];
         pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
         if (isPX) {
            if (mm == BITS5(1,1,1,1,1))
               vex_sprintf(pxStr, ", #%u", xferSzB);
            else
               vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
         }
         const HChar* arr = nameArr_Q_SZ(bitQ, sz);
         DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
             isLD ? "ld" : "st", nRegs,
             (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
             ix, nameIReg64orSP(nn), pxStr);

         return True;
      }
      /* else fall through */
   }
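   /* Editor's note -- worked example, not from the original source.
      For LD3 {v4.b, v5.b, v6.b}[7], [x2]: laneSzB = 1 and nRegs = 3, so
      three consecutive bytes at x2, x2+1, x2+2 are loaded into lane 7 of
      v4, v5 and v6 respectively; all other lanes of those registers are
      left unchanged, hence putQRegLane rather than putQReg128 above. */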
   /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
   /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
   /* 31 29     23  20      14    9 4
      sz 001000 010 11111 0 11111 n t   LDX{R,RH,RB}  Rt, [Xn|SP]
      sz 001000 010 11111 1 11111 n t   LDAX{R,RH,RB} Rt, [Xn|SP]
      sz 001000 000 s     0 11111 n t   STX{R,RH,RB}  Ws, Rt, [Xn|SP]
      sz 001000 000 s     1 11111 n t   STLX{R,RH,RB} Ws, Rt, [Xn|SP]
   */
   /* For the "standard" implementation we pass through the LL and SC to
      the host.  For the "fallback" implementation, for details see
        https://bugs.kde.org/show_bug.cgi?id=344524 and
        https://bugs.kde.org/show_bug.cgi?id=369459,
      but in short:

      LoadLinked(addr)
        gs.LLsize = load_size  // 1, 2, 4 or 8
        gs.LLaddr = addr
        gs.LLdata = zeroExtend(*addr)

      StoreCond(addr, data)
        tmp_LLsize = gs.LLsize
        gs.LLsize = 0 // "no transaction"
        if tmp_LLsize != store_size        -> fail
        if addr != gs.LLaddr               -> fail
        if zeroExtend(*addr) != gs.LLdata  -> fail
        cas_ok = CAS(store_size, addr, gs.LLdata -> data)
        if !cas_ok                         -> fail
        succeed

      When thread scheduled
        gs.LLsize = 0 // "no transaction"
        (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
         has to do this bit)
   */
   if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
       && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
       && INSN(14,10) == BITS5(1,1,1,1,1)) {
      UInt szBlg2     = INSN(31,30);
      Bool isLD       = INSN(22,22) == 1;
      Bool isAcqOrRel = INSN(15,15) == 1;
      UInt ss         = INSN(20,16);
      UInt nn         = INSN(9,5);
      UInt tt         = INSN(4,0);

      vassert(szBlg2 < 4);
      UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
      IRType ty  = integerIRTypeOfSize(szB);
      const HChar* suffix[4] = { "rb", "rh", "r", "r" };

      IRTemp ea = newTemp(Ity_I64);
      assign(ea, getIReg64orSP(nn));
      /* FIXME generate check that ea is szB-aligned */

      if (isLD && ss == BITS5(1,1,1,1,1)) {
         IRTemp res = newTemp(ty);
         if (abiinfo->guest__use_fallback_LLSC) {
            // Do the load first so we don't update any guest state
            // if it faults.
            IRTemp loaded_data64 = newTemp(Ity_I64);
            assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
            stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
            stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
            putIReg64orZR(tt, mkexpr(loaded_data64));
         } else {
            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
            putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
         }
         if (isAcqOrRel) {
            stmt(IRStmt_MBE(Imbe_Fence));
         }
         DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
             abiinfo->guest__use_fallback_LLSC
                ? "(fallback implementation)" : "");
         return True;
      }
      if (!isLD) {
         if (isAcqOrRel) {
            stmt(IRStmt_MBE(Imbe_Fence));
         }
         IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
         if (abiinfo->guest__use_fallback_LLSC) {
            // This is really ugly, since we don't have any way to do
            // proper if-then-else.  First, set up as if the SC failed,
            // and jump forwards if it really has failed.

            // Continuation address
            IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);

            // "the SC failed".  Any non-zero value means failure.
            putIReg64orZR(ss, mkU64(1));

            IRTemp tmp_LLsize = newTemp(Ity_I64);
            assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
            ));

            // Fail if no or wrong-size transaction
            vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
            stmt( IRStmt_Exit(
                     binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
                     Ijk_Boring, nia, OFFB_PC
            ));

            // Fail if the address doesn't match the LL address
            stmt( IRStmt_Exit(
                     binop(Iop_CmpNE64, mkexpr(ea),
                                        IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
                     Ijk_Boring, nia, OFFB_PC
            ));

            // Fail if the data doesn't match the LL data
            IRTemp llsc_data64 = newTemp(Ity_I64);
            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
            stmt( IRStmt_Exit(
                     binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
                                        mkexpr(llsc_data64)),
                     Ijk_Boring, nia, OFFB_PC
            ));

            // Try to CAS the new value in.
            IRTemp old  = newTemp(ty);
            IRTemp expd = newTemp(ty);
            assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
            stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
                                     Iend_LE, mkexpr(ea),
                                     /*expdHi*/NULL, mkexpr(expd),
                                     /*dataHi*/NULL, data
            )));

            // Fail if the CAS failed (viz, old != expd)
            stmt( IRStmt_Exit(
                     binop(Iop_CmpNE64,
                           widenUto64(ty, mkexpr(old)),
                           widenUto64(ty, mkexpr(expd))),
                     Ijk_Boring, nia, OFFB_PC
            ));

            // Otherwise we succeeded (!)
            putIReg64orZR(ss, mkU64(0));
         } else {
            IRTemp res = newTemp(Ity_I1);
            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
            /* IR semantics: res is 1 if store succeeds, 0 if it fails.
               Need to set rS to 1 on failure, 0 on success. */
            putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
                                               mkU64(1)));
         }
         DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
             nameIRegOrZR(False, ss),
             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
             abiinfo->guest__use_fallback_LLSC
                ? "(fallback implementation)" : "");
         return True;
      }
      /* else fall through */
   }
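   /* Editor's note, not from the original source: the fallback SC above
      is weaker than a real STXR.  Because success is decided by comparing
      the re-loaded value (and then a CAS) against gs.LLdata, an ABA-style
      change to the monitored location between the LL and the SC goes
      undetected, whereas genuine exclusives would fail in that case.  The
      two bugzilla entries cited above discuss why this trade-off was
      considered acceptable. */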
   /* ------------------ LDA{R,RH,RB} ------------------ */
   /* ------------------ STL{R,RH,RB} ------------------ */
   /* 31 29     23  20      14    9 4
      sz 001000 110 11111 1 11111 n t   LDAR<sz> Rt, [Xn|SP]
      sz 001000 100 11111 1 11111 n t   STLR<sz> Rt, [Xn|SP]
   */
   if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
       && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
      UInt szBlg2 = INSN(31,30);
      Bool isLD   = INSN(22,22) == 1;
      UInt nn     = INSN(9,5);
      UInt tt     = INSN(4,0);

      vassert(szBlg2 < 4);
      UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
      IRType ty  = integerIRTypeOfSize(szB);
      const HChar* suffix[4] = { "rb", "rh", "r", "r" };

      IRTemp ea = newTemp(Ity_I64);
      assign(ea, getIReg64orSP(nn));
      /* FIXME generate check that ea is szB-aligned */

      if (isLD) {
         IRTemp res = newTemp(ty);
         assign(res, loadLE(ty, mkexpr(ea)));
         putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
         stmt(IRStmt_MBE(Imbe_Fence));
         DIP("lda%s %s, [%s]\n", suffix[szBlg2],
             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
      } else {
         stmt(IRStmt_MBE(Imbe_Fence));
         IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
         storeLE(mkexpr(ea), data);
         DIP("stl%s %s, [%s]\n", suffix[szBlg2],
             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
      }
      return True;
   }
   /* The PRFM cases that follow may allow Rt values (the prefetch
      operation) which are not allowed by the documentation.  This
      should be looked into. */
   /* ------------------ PRFM (immediate) ------------------ */
   /* 31           21    9 4
      11 111 00110 imm12 n t   PRFM pfrop=Rt, [Xn|SP, #pimm]
   */
   if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
      UInt imm12 = INSN(21,10);
      UInt nn    = INSN(9,5);
      UInt tt    = INSN(4,0);
      /* Generating any IR here is pointless, except for documentation
         purposes, as it will get optimised away later. */
      IRTemp ea = newTemp(Ity_I64);
      assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
      DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
      return True;
   }

   /* ------------------ PRFM (register) ------------------ */
   /* 31 29      22 20 15  12 11 9  4
      11 1110001 01 Rm opt S  10 Rn Rt    PRFM pfrop=Rt, [Xn|SP, R<m>{ext/sh}]
   */
   if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
       && INSN(11,10) == BITS2(1,0)) {
      HChar  dis_buf[64];
      UInt   tt = INSN(4,0);
      IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
      if (ea != IRTemp_INVALID) {
         /* No actual code to generate. */
         DIP("prfm prfop=%u, %s\n", tt, dis_buf);
         return True;
      }
   }

   /* ------------------ PRFM (unscaled offset) ------------------ */
   /* 31 29      22 20   11 9  4
      11 1110001 00 imm9 00 Rn Rt    PRFM pfrop=Rt, [Xn|SP, #simm]
   */
   if (INSN(31,21) == BITS11(1,1, 1,1,1,0,0,0,1, 0,0)
       && INSN(11,10) == BITS2(0,0)) {
      ULong  imm9   = INSN(20,12);
      UInt   nn     = INSN(9,5);
      UInt   tt     = INSN(4,0);
      ULong  offset = sx_to_64(imm9, 9);
      IRTemp ea     = newTemp(Ity_I64);
      assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offset)));
      /* No actual code to generate. */
      DIP("prfum prfop=%u, [%s, #0x%llx]\n", tt, nameIReg64orSP(nn), offset);
      return True;
   }
   /* ---------------- ARMv8.1-LSE: Atomic Memory Operations ---------------- */
   /* 31 29     23 22 21 20 15   11 9 4
      sz 111000 A  R  1  s  0000 00 n t LDADD{,A}{,L}<sz>  <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  0001 00 n t LDCLR{,A}{,L}<sz>  <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  0010 00 n t LDEOR{,A}{,L}<sz>  <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  0011 00 n t LDSET{,A}{,L}<sz>  <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  0100 00 n t LDSMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  0101 00 n t LDSMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  0110 00 n t LDUMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  0111 00 n t LDUMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
      sz 111000 A  R  1  s  1000 00 n t SWP{,A}{,L}<sz>    <Rs>, <Rt>, [<Xn|SP>]
   */
   if (INSN(29,24) == BITS6(1,1,1,0,0,0)
       && INSN(21,21) == 1
       && (INSN(15,12) <= BITS4(1,0,0,0))
       && INSN(11,10) == BITS2(0,0)) {
      UInt szBlg2 = INSN(31,30);
      Bool isAcq  = INSN(23,23) == 1;
      Bool isRel  = INSN(22,22) == 1;
      UInt ss     = INSN(20,16);
      UInt opc    = INSN(15,12);
      UInt nn     = INSN(9,5);
      UInt tt     = INSN(4,0);

      const HChar* nm = NULL;
      const HChar* suffix[4] = { "b", "h", "", "" };

      vassert(szBlg2 < 4);
      UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 bytes*/
      IRType ty  = integerIRTypeOfSize(szB);
      Bool is64  = szB == 8;
      Bool isSigned = (opc == 4) || (opc == 5) /*smax || smin*/;

      // IR used to emulate these atomic memory ops:
      // 1) barrier
      // 2) load
      // 3) widen operands and do arithmetic/logic op
      // 4) cas to see if target memory updated
      // 5) barrier
      // 6) repeat from 1) if cas says target memory not updated
      // 7) update register

      IRTemp ea = newTemp(Ity_I64);
      assign(ea, getIReg64orSP(nn));

      // Insert barrier before loading for acquire and acquire-release variants:
      // A and AL.
      if (isAcq && (tt != 31))
         stmt(IRStmt_MBE(Imbe_Fence));

      // Load LHS from memory, RHS from register.
      IRTemp orig = newTemp(ty);
      assign(orig, loadLE(ty, mkexpr(ea)));
      IRExpr *lhs = mkexpr(orig);
      IRExpr *rhs = narrowFrom64(ty, getIReg64orZR(ss));
      IRExpr *res = NULL;

      lhs = isSigned ? widenSto64(ty, lhs) : widenUto64(ty, lhs);
      rhs = isSigned ? widenSto64(ty, rhs) : widenUto64(ty, rhs);

      // Perform the operation.
      switch (opc) {
         case 0:
            nm = "ldadd";
            res = binop(Iop_Add64, lhs, rhs);
            break;
         case 1:
            nm = "ldclr";
            res = binop(Iop_And64, lhs, unop(mkNOT(Ity_I64), rhs));
            break;
         case 2:
            nm = "ldeor";
            res = binop(Iop_Xor64, lhs, rhs);
            break;
         case 3:
            nm = "ldset";
            res = binop(Iop_Or64, lhs, rhs);
            break;
         case 4:
            nm = "ldsmax";
            res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), rhs, lhs);
            break;
         case 5:
            nm = "ldsmin";
            res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), lhs, rhs);
            break;
         case 6:
            nm = "ldumax";
            res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), rhs, lhs);
            break;
         case 7:
            nm = "ldumin";
            res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), lhs, rhs);
            break;
         case 8:
            nm = "swp";
            res = rhs;
            break;
         default:
            vassert(0);
      }

      // Store the result back if LHS remains unchanged in memory.
      IRTemp old = newTemp(ty);
      stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
                               Iend_LE, mkexpr(ea),
                               /*expdHi*/NULL, mkexpr(orig),
                               /*dataHi*/NULL, narrowFrom64(ty, res))) );

      // Insert barrier after storing for release and acquire-release variants:
      // L and AL.
      if (isRel)
         stmt(IRStmt_MBE(Imbe_Fence));

      // Retry if the CAS failed (i.e. when old != orig).
      IRConst* nia = IRConst_U64(guest_PC_curr_instr);
      stmt( IRStmt_Exit(
                binop(Iop_CasCmpNE64,
                      widenUto64(ty, mkexpr(old)),
                      widenUto64(ty, mkexpr(orig))),
                Ijk_Boring, nia, OFFB_PC ));
      // Otherwise we succeeded.
      putIReg64orZR(tt, widenUto64(ty, mkexpr(old)));

      DIP("%s%s%s%s %s, %s, [%s]\n", nm, isAcq ? "a" : "", isRel ? "l" : "",
          suffix[szBlg2], nameIRegOrZR(is64, ss), nameIRegOrZR(is64, tt),
          nameIReg64orSP(nn));
      return True;
   }
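   /* Editor's note -- a sketch of the expansion above, not from the
      original source.  For LDADD w1, w2, [x0] the emitted IR behaves
      like:

         retry:
            orig = *(UInt*)x0;
            old  = CAS32(x0, expected=orig, data=orig + w1);
            if (old != orig) goto retry;  // IRStmt_Exit back to this insn
            w2 = old;

      i.e. a CAS loop that is re-entered by re-executing the whole
      instruction, since IR blocks cannot contain internal loops. */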
   /* ------------------ ARMv8.1-LSE: Compare-and-Swap ------------------ */
   /* 31 29      22 21 20 15 14    9 4
      sz 0010001 A  1  s  R  11111 n t CAS{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
   */
   if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
       && INSN(21,21) == 1
       && INSN(14,10) == BITS5(1,1,1,1,1)) {
      UInt szBlg2 = INSN(31,30);
      Bool isAcq  = INSN(22,22) == 1;
      Bool isRel  = INSN(15,15) == 1;
      UInt ss     = INSN(20,16);
      UInt nn     = INSN(9,5);
      UInt tt     = INSN(4,0);

      const HChar* suffix[4] = { "b", "h", "", "" };

      UInt   szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
      IRType ty  = integerIRTypeOfSize(szB);
      Bool is64  = szB == 8;

      IRExpr *exp = narrowFrom64(ty, getIReg64orZR(ss));
      IRExpr *new = narrowFrom64(ty, getIReg64orZR(tt));

      if (isAcq)
         stmt(IRStmt_MBE(Imbe_Fence));

      // Store the result back if LHS remains unchanged in memory.
      IRTemp old = newTemp(ty);
      stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
                               Iend_LE, getIReg64orSP(nn),
                               /*expdHi*/NULL, exp,
                               /*dataHi*/NULL, new)) );

      if (isRel)
         stmt(IRStmt_MBE(Imbe_Fence));

      putIReg64orZR(ss, widenUto64(ty, mkexpr(old)));
      DIP("cas%s%s%s %s, %s, [%s]\n",
          isAcq ? "a" : "", isRel ? "l" : "", suffix[szBlg2],
          nameIRegOrZR(is64, ss), nameIRegOrZR(is64, tt), nameIReg64orSP(nn));
      return True;
   }
   /* ---------------- ARMv8.1-LSE: Compare-and-Swap Pair --------------- */
   /* 31 30 29      22 21 20 15 14    9 4
      0  sz 0010000 A  1  s  R  11111 n t CASP{,A}{,L} <Rs>, <Rt>, [<Xn|SP>]
   */
   if (INSN(31,31) == 0
       && INSN(29,23) == BITS7(0,0,1,0,0,0,0)
       && INSN(21,21) == 1
       && INSN(14,10) == BITS5(1,1,1,1,1)) {
      UInt is64  = INSN(30,30);
      Bool isAcq = INSN(22,22) == 1;
      Bool isRel = INSN(15,15) == 1;
      UInt ss    = INSN(20,16);
      UInt nn    = INSN(9,5);
      UInt tt    = INSN(4,0);

      if ((ss & 0x1) || (tt & 0x1)) {
         /* undefined; fall through */
      } else {
         IRExpr *expLo = getIRegOrZR(is64, ss);
         IRExpr *expHi = getIRegOrZR(is64, ss + 1);
         IRExpr *newLo = getIRegOrZR(is64, tt);
         IRExpr *newHi = getIRegOrZR(is64, tt + 1);
         IRTemp oldLo  = newTemp(is64 ? Ity_I64 : Ity_I32);
         IRTemp oldHi  = newTemp(is64 ? Ity_I64 : Ity_I32);

         if (isAcq)
            stmt(IRStmt_MBE(Imbe_Fence));

         stmt( IRStmt_CAS(mkIRCAS(oldHi, oldLo,
                                  Iend_LE, getIReg64orSP(nn),
                                  expHi, expLo,
                                  newHi, newLo)) );

         if (isRel)
            stmt(IRStmt_MBE(Imbe_Fence));

         putIRegOrZR(is64, ss, mkexpr(oldLo));
         putIRegOrZR(is64, ss+1, mkexpr(oldHi));
         DIP("casp%s%s %s, %s, %s, %s, [%s]\n",
             isAcq ? "a" : "", isRel ? "l" : "",
             nameIRegOrZR(is64, ss), nameIRegOrZR(is64, ss+1),
             nameIRegOrZR(is64, tt), nameIRegOrZR(is64, tt+1),
             nameIReg64orSP(nn));
         return True;
      }
   }
   if (sigill_diag) {
      vex_printf("ARM64 front end: load_store\n");
   }
   return False;
#  undef INSN
}
/*------------------------------------------------------------*/
/*--- Control flow and misc instructions                   ---*/
/*------------------------------------------------------------*/

static
Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
                          const VexArchInfo* archinfo,
                          const VexAbiInfo* abiinfo, Bool sigill_diag)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   /* ---------------------- B cond ----------------------- */
   /* 31        24    4 3
      0101010 0 imm19 0 cond */
   if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
      UInt  cond   = INSN(3,0);
      ULong uimm64 = INSN(23,5) << 2;
      Long  simm64 = (Long)sx_to_64(uimm64, 21);
      vassert(dres->whatNext    == Dis_Continue);
      vassert(dres->len         == 4);
      vassert(dres->jk_StopHere == Ijk_INVALID);
      stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
                        Ijk_Boring,
                        IRConst_U64(guest_PC_curr_instr + simm64),
                        OFFB_PC) );
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Boring;
      DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
      return True;
   }
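   /* Editor's note, not from the original source: decoded this way, a
      conditional branch such as "b.ne 0x400123" becomes a two-exit block
      ending --

         if (cond) { PC = 0x400123; exit; }   // IRStmt_Exit side exit
         PC = <insn addr> + 4; exit;          // fallthrough, Dis_StopHere

      -- so the taken edge is a side exit and the not-taken edge is the
      block's final jump. */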
   /* -------------------- B{L} uncond -------------------- */
   if (INSN(30,26) == BITS5(0,0,1,0,1)) {
      /* 000101 imm26  B  (PC + sxTo64(imm26 << 2))
         100101 imm26  BL (PC + sxTo64(imm26 << 2))
      */
      UInt  bLink  = INSN(31,31);
      ULong uimm64 = INSN(25,0) << 2;
      Long  simm64 = (Long)sx_to_64(uimm64, 28);
      if (bLink) {
         putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
      }
      putPC(mkU64(guest_PC_curr_instr + simm64));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Call;
      DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
          guest_PC_curr_instr + simm64);
      return True;
   }
   /* --------------------- B{L} reg --------------------- */
   /* 31      24 22 20    15     9  4
      1101011 00 10 11111 000000 nn 00000  RET  Rn
      1101011 00 01 11111 000000 nn 00000  CALL Rn
      1101011 00 00 11111 000000 nn 00000  JMP  Rn
   */
   if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
       && INSN(20,16) == BITS5(1,1,1,1,1)
       && INSN(15,10) == BITS6(0,0,0,0,0,0)
       && INSN(4,0) == BITS5(0,0,0,0,0)) {
      UInt branch_type = INSN(22,21);
      UInt nn          = INSN(9,5);
      if (branch_type == BITS2(1,0) /* RET */) {
         putPC(getIReg64orZR(nn));
         dres->whatNext    = Dis_StopHere;
         dres->jk_StopHere = Ijk_Ret;
         DIP("ret %s\n", nameIReg64orZR(nn));
         return True;
      }
      if (branch_type == BITS2(0,1) /* CALL */) {
         IRTemp dst = newTemp(Ity_I64);
         assign(dst, getIReg64orZR(nn));
         putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
         putPC(mkexpr(dst));
         dres->whatNext    = Dis_StopHere;
         dres->jk_StopHere = Ijk_Call;
         DIP("blr %s\n", nameIReg64orZR(nn));
         return True;
      }
      if (branch_type == BITS2(0,0) /* JMP */) {
         putPC(getIReg64orZR(nn));
         dres->whatNext    = Dis_StopHere;
         dres->jk_StopHere = Ijk_Boring;
         DIP("jmp %s\n", nameIReg64orZR(nn));
         return True;
      }
   }
   /* -------------------- CB{N}Z -------------------- */
   /* sf 011 010 1 imm19 Rt   CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
      sf 011 010 0 imm19 Rt   CBZ  Xt|Wt, (PC + sxTo64(imm19 << 2))
   */
   if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
      Bool    is64   = INSN(31,31) == 1;
      Bool    bIfZ   = INSN(24,24) == 0;
      ULong   uimm64 = INSN(23,5) << 2;
      UInt    rT     = INSN(4,0);
      Long    simm64 = (Long)sx_to_64(uimm64, 21);
      IRExpr* cond   = NULL;
      if (is64) {
         cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
                      getIReg64orZR(rT), mkU64(0));
      } else {
         cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
                      getIReg32orZR(rT), mkU32(0));
      }
      stmt( IRStmt_Exit(cond,
                        Ijk_Boring,
                        IRConst_U64(guest_PC_curr_instr + simm64),
                        OFFB_PC) );
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Boring;
      DIP("cb%sz %s, 0x%llx\n",
          bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
          guest_PC_curr_instr + simm64);
      return True;
   }
   /* -------------------- TB{N}Z -------------------- */
   /* 31 30      24 23  18    5 4
      b5 011 011 1  b40 imm14 t  TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
      b5 011 011 0  b40 imm14 t  TBZ  Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
   */
   if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
      UInt  b5     = INSN(31,31);
      Bool  bIfZ   = INSN(24,24) == 0;
      UInt  b40    = INSN(23,19);
      UInt  imm14  = INSN(18,5);
      UInt  tt     = INSN(4,0);
      UInt  bitNo  = (b5 << 5) | b40;
      ULong uimm64 = imm14 << 2;
      Long  simm64 = sx_to_64(uimm64, 16);
      IRExpr* cond
         = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
                 binop(Iop_And64,
                       binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
                       mkU64(1)),
                 mkU64(0));
      stmt( IRStmt_Exit(cond,
                        Ijk_Boring,
                        IRConst_U64(guest_PC_curr_instr + simm64),
                        OFFB_PC) );
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Boring;
      DIP("tb%sz %s, #%u, 0x%llx\n",
          bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
          guest_PC_curr_instr + simm64);
      return True;
   }
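   /* Editor's note -- worked example, not from the original source.
      "tbz x3, #33, target" has b5 = 1 and b40 = 00001, so
      bitNo = (1 << 5) | 1 = 33; the IR computes (x3 >> 33) & 1 and takes
      the side exit to 'target' when that value compares equal to zero. */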
   /* -------------------- SVC -------------------- */
   /* 11010100 000 imm16 000 01
      Don't bother with anything except the imm16==0 case.
   */
   if (INSN(31,0) == 0xD4000001) {
      putPC(mkU64(guest_PC_curr_instr + 4));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_Sys_syscall;
      DIP("svc #0\n");
      return True;
   }
   /* ------------------ M{SR,RS} ------------------ */
   /* ---- Cases for TPIDR_EL0 ----
      0xD51BD0 010 Rt   MSR tpidr_el0, rT
      0xD53BD0 010 Rt   MRS rT, tpidr_el0
   */
   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
      Bool toSys = INSN(21,21) == 0;
      UInt tt    = INSN(4,0);
      if (toSys) {
         stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) );
         DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt));
      } else {
         putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 ));
         DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt));
      }
      return True;
   }
   /* ---- Cases for FPCR ----
      0xD51B44 000 Rt  MSR fpcr, rT
      0xD53B44 000 Rt  MRS rT, fpcr
   */
   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
      Bool toSys = INSN(21,21) == 0;
      UInt tt    = INSN(4,0);
      if (toSys) {
         stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) );
         DIP("msr fpcr, %s\n", nameIReg64orZR(tt));
      } else {
         putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32));
         DIP("mrs %s, fpcr\n", nameIReg64orZR(tt));
      }
      return True;
   }
   /* ---- Cases for FPSR ----
      0xD51B44 001 Rt  MSR fpsr, rT
      0xD53B44 001 Rt  MRS rT, fpsr
      The only part of this we model is FPSR.QC.  All other bits
      are ignored when writing to it and RAZ when reading from it.
   */
   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
      Bool toSys = INSN(21,21) == 0;
      UInt tt    = INSN(4,0);
      if (toSys) {
         /* Just deal with FPSR.QC.  Make up a V128 value which is
            zero if Xt[27] is zero and any other value if Xt[27] is
            nonzero. */
         IRTemp qc64 = newTemp(Ity_I64);
         assign(qc64, binop(Iop_And64,
                            binop(Iop_Shr64, getIReg64orZR(tt), mkU8(27)),
                            mkU64(1)));
         IRExpr* qcV128 = binop(Iop_64HLtoV128, mkexpr(qc64), mkexpr(qc64));
         stmt( IRStmt_Put( OFFB_QCFLAG, qcV128 ) );
         DIP("msr fpsr, %s\n", nameIReg64orZR(tt));
      } else {
         /* Generate a value which is all zeroes except for bit 27,
            which must be zero if QCFLAG is all zeroes and one otherwise. */
         IRTemp qcV128 = newTempV128();
         assign(qcV128, IRExpr_Get( OFFB_QCFLAG, Ity_V128 ));
         IRTemp qc64 = newTemp(Ity_I64);
         assign(qc64, binop(Iop_Or64, unop(Iop_V128HIto64, mkexpr(qcV128)),
                                      unop(Iop_V128to64,   mkexpr(qcV128))));
         IRExpr* res = binop(Iop_Shl64,
                             unop(Iop_1Uto64,
                                  binop(Iop_CmpNE64, mkexpr(qc64), mkU64(0))),
                             mkU8(27));
         putIReg64orZR(tt, res);
         DIP("mrs %s, fpsr\n", nameIReg64orZR(tt));
      }
      return True;
   }
   /* ---- Cases for NZCV ----
      D51B42 000 Rt  MSR nzcv, rT
      D53B42 000 Rt  MRS rT, nzcv
      The only parts of NZCV that actually exist are bits 31:28, which
      are the N Z C and V bits themselves.  Hence the flags thunk provides
      all the state we need.
   */
   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
       || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
      Bool toSys = INSN(21,21) == 0;
      UInt tt    = INSN(4,0);
      if (toSys) {
         IRTemp t = newTemp(Ity_I64);
         assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL)));
         setFlags_COPY(t);
         DIP("msr nzcv, %s\n", nameIReg32orZR(tt));
      } else {
         IRTemp res = newTemp(Ity_I64);
         assign(res, mk_arm64g_calculate_flags_nzcv());
         putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res)));
         DIP("mrs %s, nzcv\n", nameIReg64orZR(tt));
      }
      return True;
   }
   /* ---- Cases for DCZID_EL0 ----
      Don't support arbitrary reads and writes to this register.  Just
      return the value 16, which indicates that the DC ZVA instruction
      is not permitted, so we don't have to emulate it.
      D5 3B 00 111 Rt  MRS rT, dczid_el0
   */
   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
      UInt tt = INSN(4,0);
      putIReg64orZR(tt, mkU64(1<<4));
      DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt));
      return True;
   }
   /* ---- Cases for CTR_EL0 ----
      We just handle reads, and make up a value from the D and I line
      sizes in the VexArchInfo we are given, and patch in the following
      fields that the Foundation model gives ("natively"):
      CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
      D5 3B 00 001 Rt  MRS rT, ctr_el0
   */
   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
      UInt tt = INSN(4,0);
      /* Need to generate a value from dMinLine_lg2_szB and
         iMinLine_lg2_szB.  The value in the register is in 32-bit
         units, so need to subtract 2 from the values in the
         VexArchInfo.  We can assume that the values here are valid --
         disInstr_ARM64 checks them -- so there's no need to deal with
         out-of-range cases. */
      vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
              && archinfo->arm64_dMinLine_lg2_szB <= 17
              && archinfo->arm64_iMinLine_lg2_szB >= 2
              && archinfo->arm64_iMinLine_lg2_szB <= 17);
      UInt val
         = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16)
                      | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0);
      putIReg64orZR(tt, mkU64(val));
      DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt));
      return True;
   }
   /* ---- Cases for CNTVCT_EL0 ----
      This is a timestamp counter of some sort.  Support reads of it only
      by passing through to the host.
      D5 3B E0 010 Rt  MRS Xt, cntvct_el0
   */
   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
      UInt     tt   = INSN(4,0);
      IRTemp   val  = newTemp(Ity_I64);
      IRExpr** args = mkIRExprVec_0();
      IRDirty* d    = unsafeIRDirty_1_N (
                         val,
                         0/*regparms*/,
                         "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
                         &arm64g_dirtyhelper_MRS_CNTVCT_EL0,
                         args
                      );
      /* execute the dirty call, dumping the result in val. */
      stmt( IRStmt_Dirty(d) );
      putIReg64orZR(tt, mkexpr(val));
      DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt));
      return True;
   }
   /* ---- Cases for CNTFRQ_EL0 ----
      This is always RO at EL0, so it's safe to pass through to the host.
      D5 3B E0 000 Rt  MRS Xt, cntfrq_el0
   */
   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE000) {
      UInt     tt   = INSN(4,0);
      IRTemp   val  = newTemp(Ity_I64);
      IRExpr** args = mkIRExprVec_0();
      IRDirty* d    = unsafeIRDirty_1_N (
                         val,
                         0/*regparms*/,
                         "arm64g_dirtyhelper_MRS_CNTFRQ_EL0",
                         &arm64g_dirtyhelper_MRS_CNTFRQ_EL0,
                         args
                      );
      /* execute the dirty call, dumping the result in val. */
      stmt( IRStmt_Dirty(d) );
      putIReg64orZR(tt, mkexpr(val));
      DIP("mrs %s, cntfrq_el0\n", nameIReg64orZR(tt));
      return True;
   }
   /* ------------------ IC_IVAU ------------------ */
   /* D5 0B 75 001 Rt  ic ivau, rT
   */
   if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
      /* We will always be provided with a valid iMinLine value. */
      vassert(archinfo->arm64_iMinLine_lg2_szB >= 2
              && archinfo->arm64_iMinLine_lg2_szB <= 17);
      /* Round the requested address, in rT, down to the start of the
         containing block. */
      UInt   tt      = INSN(4,0);
      ULong  lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB;
      IRTemp addr    = newTemp(Ity_I64);
      assign( addr, binop( Iop_And64,
                           getIReg64orZR(tt),
                           mkU64(~(lineszB - 1))) );
      /* Set the invalidation range, request exit-and-invalidate, with
         continuation at the next instruction. */
      stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
      stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
      /* be paranoid ... */
      stmt( IRStmt_MBE(Imbe_Fence) );
      putPC(mkU64( guest_PC_curr_instr + 4 ));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_InvalICache;
      DIP("ic ivau, %s\n", nameIReg64orZR(tt));
      return True;
   }
   /* ------------------ DC_CVAU ------------------ */
   /* D5 0B 7B 001 Rt  dc cvau, rT
      D5 0B 7E 001 Rt  dc civac, rT
   */
   if (   (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20
       || (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7E20) {
      /* Exactly the same scheme as for IC IVAU, except we observe the
         dMinLine size, and request an Ijk_FlushDCache instead of
         Ijk_InvalICache. */
      /* We will always be provided with a valid dMinLine value. */
      vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
              && archinfo->arm64_dMinLine_lg2_szB <= 17);
      /* Round the requested address, in rT, down to the start of the
         containing block. */
      UInt   tt      = INSN(4,0);
      ULong  lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB;
      IRTemp addr    = newTemp(Ity_I64);
      assign( addr, binop( Iop_And64,
                           getIReg64orZR(tt),
                           mkU64(~(lineszB - 1))) );
      /* Set the flush range, request exit-and-flush, with
         continuation at the next instruction. */
      stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
      stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
      /* be paranoid ... */
      stmt( IRStmt_MBE(Imbe_Fence) );
      putPC(mkU64( guest_PC_curr_instr + 4 ));
      dres->whatNext    = Dis_StopHere;
      dres->jk_StopHere = Ijk_FlushDCache;
      DIP("dc cvau, %s\n", nameIReg64orZR(tt));
      return True;
   }
   /* ------------------ ISB, DMB, DSB ------------------ */
   /* 31          21            11  7 6  4
      11010 10100 0 00 011 0011 CRm 1 01 11111  DMB opt
      11010 10100 0 00 011 0011 CRm 1 00 11111  DSB opt
      11010 10100 0 00 011 0011 CRm 1 10 11111  ISB opt
   */
   if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
       && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
       && INSN(7,7) == 1
       && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
      UInt opc = INSN(6,5);
      UInt CRm = INSN(11,8);
      vassert(opc <= 2 && CRm <= 15);
      stmt(IRStmt_MBE(Imbe_Fence));
      const HChar* opNames[3]
         = { "dsb", "dmb", "isb" };
      const HChar* howNames[16]
         = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
             "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
      DIP("%s %s\n", opNames[opc], howNames[CRm]);
      return True;
   }
   /* -------------------- NOP -------------------- */
   if (INSN(31,0) == 0xD503201F) {
      DIP("nop\n");
      return True;
   }
7400 /* -------------------- BRK -------------------- */
7402 1101 0100 001 imm16 00000 BRK #imm16
7404 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
7405 && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
7406 UInt imm16
= INSN(20,5);
7407 /* Request SIGTRAP and then restart of this insn. */
7408 putPC(mkU64(guest_PC_curr_instr
+ 0));
7409 dres
->whatNext
= Dis_StopHere
;
7410 dres
->jk_StopHere
= Ijk_SigTRAP
;
7411 DIP("brk #%u\n", imm16
);
7415 /* ------------------- YIELD ------------------- */
7417 1101 0101 0000 0011 0010 0000 0011 1111
7419 if (INSN(31,0) == 0xD503203F) {
7420 /* Request yield followed by continuation at the next insn. */
7421 putPC(mkU64(guest_PC_curr_instr
+ 4));
7422 dres
->whatNext
= Dis_StopHere
;
7423 dres
->jk_StopHere
= Ijk_Yield
;
7428 /* -------------------- HINT ------------------- */
7430 1101 0101 0000 0011 0010 imm7 1 1111
7431 Catch otherwise unhandled HINT instructions - any
7432 like YIELD which are explicitly handled should go
7435 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,1)
7436 && INSN(23,16) == BITS8(0,0,0,0,0,0,1,1)
7437 && INSN(15,12) == BITS4(0,0,1,0)
7438 && INSN(4,0) == BITS5(1,1,1,1,1)) {
7439 UInt imm7
= INSN(11,5);
7440 DIP("hint #%u\n", imm7
);
7444 /* ------------------- CLREX ------------------ */
7446 1101 0101 0000 0011 0011 m 0101 1111 CLREX CRm
7447 CRm is apparently ignored.
7449 if ((INSN(31,0) & 0xFFFFF0FF) == 0xD503305F) {
7450 UInt mm
= INSN(11,8);
7451 /* AFAICS, this simply cancels a (all?) reservations made by a
7452 (any?) preceding LDREX(es). Arrange to hand it through to
7454 if (abiinfo
->guest__use_fallback_LLSC
) {
7455 stmt( IRStmt_Put( OFFB_LLSC_SIZE
, mkU64(0) )); // "no transaction"
7457 stmt( IRStmt_MBE(Imbe_CancelReservation
) );
7459 DIP("clrex #%u\n", mm
);
   if (sigill_diag) {
      vex_printf("ARM64 front end: branch_etc\n");
   }
   return False;
#  undef INSN
}
/*------------------------------------------------------------*/
/*--- SIMD and FP instructions:  helper functions           ---*/
/*------------------------------------------------------------*/

/* Some constructors for interleave/deinterleave expressions. */

static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
   // returns a0 b0
   return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
}

static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
   // returns a1 b1
   return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
}

static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a2 a0 b2 b0
   return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a3 a1 b3 b1
   return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a1 b1 a0 b0
   return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
   // returns a3 b3 a2 b2
   return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
}

static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a6 a4 a2 a0 b6 b4 b2 b0
   return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a7 a5 a3 a1 b7 b5 b3 b1
   return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a3 b3 a2 b2 a1 b1 a0 b0
   return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
   // returns a7 b7 a6 b6 a5 b5 a4 b4
   return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
}

static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
                                     IRTemp bFEDCBA9876543210 ) {
   // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
   return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
                                      mkexpr(bFEDCBA9876543210));
}

static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
                                    IRTemp bFEDCBA9876543210 ) {
   // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
   return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
                                     mkexpr(bFEDCBA9876543210));
}

static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
                                     IRTemp bFEDCBA9876543210 ) {
   // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
   return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
                                      mkexpr(bFEDCBA9876543210));
}

static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
                                     IRTemp bFEDCBA9876543210 ) {
   // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
   return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
                                      mkexpr(bFEDCBA9876543210));
}
/* Generate N copies of |bit| in the bottom of a ULong. */
static ULong Replicate ( ULong bit, Int N )
{
   vassert(bit <= 1 && N >= 1 && N < 64);
   if (bit == 0) {
      return 0;
   } else {
      /* Careful.  This won't work for N == 64. */
      return (1ULL << N) - 1;
   }
}

static ULong Replicate32x2 ( ULong bits32 )
{
   vassert(0 == (bits32 & ~0xFFFFFFFFULL));
   return (bits32 << 32) | bits32;
}

static ULong Replicate16x4 ( ULong bits16 )
{
   vassert(0 == (bits16 & ~0xFFFFULL));
   return Replicate32x2((bits16 << 16) | bits16);
}

static ULong Replicate8x8 ( ULong bits8 )
{
   vassert(0 == (bits8 & ~0xFFULL));
   return Replicate16x4((bits8 << 8) | bits8);
}
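/* Editor's note -- worked examples, not from the original source:
      Replicate(1, 5)     == 0b11111 == 0x1F
      Replicate32x2(0xAB) == 0x000000AB000000AB
      Replicate8x8(0xAB)  == 0xABABABABABABABAB                       */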
/* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
   |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
   is 64.  In the former case, the upper 32 bits of the returned value
   are guaranteed to be zero. */
static ULong VFPExpandImm ( ULong imm8, Int N )
{
   vassert(imm8 <= 0xFF);
   vassert(N == 32 || N == 64);
   Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2.
   Int F = N - E - 1;
   ULong imm8_6 = (imm8 >> 6) & 1;
   /* sign: 1 bit */
   /* exp:  E bits */
   /* frac: F bits */
   ULong sign = (imm8 >> 7) & 1;
   ULong exp  = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
   ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
   vassert(sign < (1ULL << 1));
   vassert(exp  < (1ULL << E));
   vassert(frac < (1ULL << F));
   vassert(1 + E + F == N);
   ULong res = (sign << (E+F)) | (exp << F) | frac;
   return res;
}
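/* Editor's note -- worked example, not from the original source.
   For FMOV d0, #1.0 the encoding is imm8 = 0x70, so imm8_6 = 1 and,
   with N = 64: E = 9, F = 54,
      sign = 0
      exp  = (0 << 8) | Replicate(1, 8) = 0xFF
      frac = 0x30 << 48
   giving res = (0xFF << 54) | (0x30 << 48) = 0x3FF0000000000000, which
   is IEEE754 double 1.0: the top two architectural fraction bits fold
   into what IEEE calls the exponent field, which is why the narrower
   E here still produces the right bit pattern. */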
/* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
   This might fail, as indicated by the returned Bool.  Page 2530 of
   the manual. */
static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
                               UInt op, UInt cmode, UInt imm8 )
{
   vassert(op <= 1);
   vassert(cmode <= 15);
   vassert(imm8 <= 255);

   *res = 0; /* will overwrite iff returning True */

   ULong imm64    = 0;
   Bool  testimm8 = False;

   switch (cmode >> 1) {
      case 0:
         testimm8 = False; imm64 = Replicate32x2(imm8); break;
      case 1:
         testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break;
      case 2:
         testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break;
      case 3:
         testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break;
      case 4:
         testimm8 = False; imm64 = Replicate16x4(imm8); break;
      case 5:
         testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
      case 6:
         testimm8 = True;
         if ((cmode & 1) == 0)
            imm64 = Replicate32x2((imm8 << 8) | 0xFF);
         else
            imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
         break;
      case 7:
         testimm8 = False;
         if ((cmode & 1) == 0 && op == 0)
            imm64 = Replicate8x8(imm8);
         if ((cmode & 1) == 0 && op == 1) {
            imm64 = 0;   imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
            imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
         }
         if ((cmode & 1) == 1 && op == 0) {
            ULong imm8_7  = (imm8 >> 7) & 1;
            ULong imm8_6  = (imm8 >> 6) & 1;
            ULong imm8_50 = imm8 & 63;
            ULong imm32 = (imm8_7                 << (1 + 5 + 6 + 19))
                          | ((imm8_6 ^ 1)         << (5 + 6 + 19))
                          | (Replicate(imm8_6, 5) << (6 + 19))
                          | (imm8_50              << 19);
            imm64 = Replicate32x2(imm32);
         }
         if ((cmode & 1) == 1 && op == 1) {
            // imm64 = imm8<7>:NOT(imm8<6>)
            //                :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
            ULong imm8_7  = (imm8 >> 7) & 1;
            ULong imm8_6  = (imm8 >> 6) & 1;
            ULong imm8_50 = imm8 & 63;
            imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
                    | (Replicate(imm8_6, 8) << 54)
                    | (imm8_50 << 48);
         }
         break;
      default:
         vassert(0);
   }

   if (testimm8 && imm8 == 0)
      return False;

   *res = imm64;
   return True;
}
/* Help a bit for decoding laneage for vector operations that can be
   of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
   and SZ bits, typically for vector floating point. */
static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI,  /*OUT*/IRType* tyF,
                               /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
                               /*OUT*/const HChar** arrSpec,
                               Bool bitQ, Bool bitSZ )
{
   vassert(bitQ == True || bitQ == False);
   vassert(bitSZ == True || bitSZ == False);
   if (bitQ && bitSZ) { // 2x64
      if (tyI)       *tyI       = Ity_I64;
      if (tyF)       *tyF       = Ity_F64;
      if (nLanes)    *nLanes    = 2;
      if (zeroUpper) *zeroUpper = False;
      if (arrSpec)   *arrSpec   = "2d";
      return True;
   }
   if (bitQ && !bitSZ) { // 4x32
      if (tyI)       *tyI       = Ity_I32;
      if (tyF)       *tyF       = Ity_F32;
      if (nLanes)    *nLanes    = 4;
      if (zeroUpper) *zeroUpper = False;
      if (arrSpec)   *arrSpec   = "4s";
      return True;
   }
   if (!bitQ && !bitSZ) { // 2x32
      if (tyI)       *tyI       = Ity_I32;
      if (tyF)       *tyF       = Ity_F32;
      if (nLanes)    *nLanes    = 2;
      if (zeroUpper) *zeroUpper = True;
      if (arrSpec)   *arrSpec   = "2s";
      return True;
   }
   // Else impliedly 1x64, which isn't allowed.
   return False;
}
/* Helper for decoding laneage for shift-style vector operations
   that involve an immediate shift amount. */
static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
                                    UInt immh, UInt immb )
{
   vassert(immh < (1<<4));
   vassert(immb < (1<<3));
   UInt immhb = (immh << 3) | immb;
   if (immh & 8) {
      if (shift)  *shift  = 128 - immhb;
      if (szBlg2) *szBlg2 = 3;
      return True;
   }
   if (immh & 4) {
      if (shift)  *shift  = 64 - immhb;
      if (szBlg2) *szBlg2 = 2;
      return True;
   }
   if (immh & 2) {
      if (shift)  *shift  = 32 - immhb;
      if (szBlg2) *szBlg2 = 1;
      return True;
   }
   if (immh & 1) {
      if (shift)  *shift  = 16 - immhb;
      if (szBlg2) *szBlg2 = 0;
      return True;
   }
   return False;
}
/* Generate IR to fold all lanes of the V128 value in 'src' as
   characterised by the operator 'op', and return the result in the
   bottom bits of a V128, with all other bits set to zero. */
static IRTemp math_FOLDV ( IRTemp src, IROp op )
{
   /* The basic idea is to use repeated applications of Iop_CatEven*
      and Iop_CatOdd* operators to 'src' so as to clone each lane into
      a complete vector.  Then fold all those vectors with 'op' and
      zero out all but the least significant lane. */
   switch (op) {
      case Iop_Min8Sx16: case Iop_Min8Ux16:
      case Iop_Max8Sx16: case Iop_Max8Ux16: case Iop_Add8x16: {
         /* NB: temp naming here is misleading -- the naming is for 8
            lanes of 16 bit, whereas what is being operated on is 16
            lanes of 8 bits. */
         IRTemp x76543210 = src;
         IRTemp x76547654 = newTempV128();
         IRTemp x32103210 = newTempV128();
         assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
         assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
         IRTemp x76767676 = newTempV128();
         IRTemp x54545454 = newTempV128();
         IRTemp x32323232 = newTempV128();
         IRTemp x10101010 = newTempV128();
         assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
         assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
         assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
         assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
         IRTemp x77777777 = newTempV128();
         IRTemp x66666666 = newTempV128();
         IRTemp x55555555 = newTempV128();
         IRTemp x44444444 = newTempV128();
         IRTemp x33333333 = newTempV128();
         IRTemp x22222222 = newTempV128();
         IRTemp x11111111 = newTempV128();
         IRTemp x00000000 = newTempV128();
         assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
         assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
         assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
         assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
         assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
         assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
         assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
         assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
         /* Naming not misleading after here. */
         IRTemp xAllF = newTempV128();
         IRTemp xAllE = newTempV128();
         IRTemp xAllD = newTempV128();
         IRTemp xAllC = newTempV128();
         IRTemp xAllB = newTempV128();
         IRTemp xAllA = newTempV128();
         IRTemp xAll9 = newTempV128();
         IRTemp xAll8 = newTempV128();
         IRTemp xAll7 = newTempV128();
         IRTemp xAll6 = newTempV128();
         IRTemp xAll5 = newTempV128();
         IRTemp xAll4 = newTempV128();
         IRTemp xAll3 = newTempV128();
         IRTemp xAll2 = newTempV128();
         IRTemp xAll1 = newTempV128();
         IRTemp xAll0 = newTempV128();
         assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777));
         assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777));
         assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666));
         assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666));
         assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555));
         assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555));
         assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444));
         assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444));
         assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333));
         assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333));
         assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222));
         assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222));
         assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111));
         assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111));
         assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000));
         assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000));
         IRTemp maxFE = newTempV128();
         IRTemp maxDC = newTempV128();
         IRTemp maxBA = newTempV128();
         IRTemp max98 = newTempV128();
         IRTemp max76 = newTempV128();
         IRTemp max54 = newTempV128();
         IRTemp max32 = newTempV128();
         IRTemp max10 = newTempV128();
         assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE)));
         assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC)));
         assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA)));
         assign(max98, binop(op, mkexpr(xAll9), mkexpr(xAll8)));
         assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6)));
         assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4)));
         assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2)));
         assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0)));
         IRTemp maxFEDC = newTempV128();
         IRTemp maxBA98 = newTempV128();
         IRTemp max7654 = newTempV128();
         IRTemp max3210 = newTempV128();
         assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC)));
         assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98)));
         assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
         assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
         IRTemp maxFEDCBA98 = newTempV128();
         IRTemp max76543210 = newTempV128();
         assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98)));
         assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
         IRTemp maxAllLanes = newTempV128();
         assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98),
                                       mkexpr(max76543210)));
         IRTemp res = newTempV128();
         assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes)));
         return res;
      }
      case Iop_Min16Sx8: case Iop_Min16Ux8:
      case Iop_Max16Sx8: case Iop_Max16Ux8: case Iop_Add16x8: {
         IRTemp x76543210 = src;
         IRTemp x76547654 = newTempV128();
         IRTemp x32103210 = newTempV128();
         assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
         assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
         IRTemp x76767676 = newTempV128();
         IRTemp x54545454 = newTempV128();
         IRTemp x32323232 = newTempV128();
         IRTemp x10101010 = newTempV128();
         assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
         assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
         assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
         assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
         IRTemp x77777777 = newTempV128();
         IRTemp x66666666 = newTempV128();
         IRTemp x55555555 = newTempV128();
         IRTemp x44444444 = newTempV128();
         IRTemp x33333333 = newTempV128();
         IRTemp x22222222 = newTempV128();
         IRTemp x11111111 = newTempV128();
         IRTemp x00000000 = newTempV128();
         assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
         assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
         assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
         assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
         assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
         assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
         assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
         assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
         IRTemp max76 = newTempV128();
         IRTemp max54 = newTempV128();
         IRTemp max32 = newTempV128();
         IRTemp max10 = newTempV128();
         assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666)));
         assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444)));
         assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222)));
         assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000)));
         IRTemp max7654 = newTempV128();
         IRTemp max3210 = newTempV128();
         assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
         assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
         IRTemp max76543210 = newTempV128();
         assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
         IRTemp res = newTempV128();
         assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210)));
         return res;
      }
      case Iop_Max32Fx4: case Iop_Min32Fx4:
      case Iop_Min32Sx4: case Iop_Min32Ux4:
      case Iop_Max32Sx4: case Iop_Max32Ux4: case Iop_Add32x4: {
         IRTemp x3210 = src;
         IRTemp x3232 = newTempV128();
         IRTemp x1010 = newTempV128();
         assign(x3232, mk_CatOddLanes64x2 (x3210, x3210));
         assign(x1010, mk_CatEvenLanes64x2(x3210, x3210));
         IRTemp x3333 = newTempV128();
         IRTemp x2222 = newTempV128();
         IRTemp x1111 = newTempV128();
         IRTemp x0000 = newTempV128();
         assign(x3333, mk_CatOddLanes32x4 (x3232, x3232));
         assign(x2222, mk_CatEvenLanes32x4(x3232, x3232));
         assign(x1111, mk_CatOddLanes32x4 (x1010, x1010));
         assign(x0000, mk_CatEvenLanes32x4(x1010, x1010));
         IRTemp max32 = newTempV128();
         IRTemp max10 = newTempV128();
         assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222)));
         assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000)));
         IRTemp max3210 = newTempV128();
         assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
         IRTemp res = newTempV128();
         assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
         return res;
      }
      case Iop_Max64Fx2: case Iop_Min64Fx2:
      case Iop_Add64x2: {
         IRTemp x10 = src;
         IRTemp x00 = newTempV128();
         IRTemp x11 = newTempV128();
         assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
         assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
         IRTemp max10 = newTempV128();
         assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
         IRTemp res = newTempV128();
         assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
         return res;
      }
      default:
         return IRTemp_INVALID;
   }
}
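/* To illustrate the folding scheme in the 4x32 case: starting from
   lanes [x3 x2 x1 x0], the CatOdd/CatEven applications produce
   [x3 x2 x3 x2] and [x1 x0 x1 x0], then [x3 x3 x3 x3] .. [x0 x0 x0 x0],
   so the binop tree computes op(op(x3,x2), op(x1,x0)) in every lane,
   and the final ZeroHI unop keeps just lane 0.  The 8- and 16-lane
   cases are the same idea with more halving steps. */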
/* Generate IR for TBL and TBX.  This deals with the 128 bit case
   only. */
static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
                             IRTemp oor_values )
{
   vassert(len >= 0 && len <= 3);

   /* Generate some useful constants as concisely as possible. */
   IRTemp half15 = newTemp(Ity_I64);
   assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
   IRTemp half16 = newTemp(Ity_I64);
   assign(half16, mkU64(0x1010101010101010ULL));

   /* A zero vector */
   IRTemp allZero = newTempV128();
   assign(allZero, mkV128(0x0000));
   /* A vector containing 15 in each 8-bit lane */
   IRTemp all15 = newTempV128();
   assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
   /* A vector containing 16 in each 8-bit lane */
   IRTemp all16 = newTempV128();
   assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
   /* A vector containing 32 in each 8-bit lane */
   IRTemp all32 = newTempV128();
   assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
   /* A vector containing 48 in each 8-bit lane */
   IRTemp all48 = newTempV128();
   assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
   /* A vector containing 64 in each 8-bit lane */
   IRTemp all64 = newTempV128();
   assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));

   /* Group the 16/32/48/64 vectors so as to be indexable. */
   IRTemp allXX[4] = { all16, all32, all48, all64 };

   /* Compute the result for each table vector, with zeroes in places
      where the index values are out of range, and OR them into the
      running result. */
   IRTemp running_result = newTempV128();
   assign(running_result, mkV128(0));

   UInt tabent;
   for (tabent = 0; tabent <= len; tabent++) {
      vassert(tabent >= 0 && tabent < 4);
      IRTemp bias = newTempV128();
      assign(bias,
             mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
      IRTemp biased_indices = newTempV128();
      assign(biased_indices,
             binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
      IRTemp valid_mask = newTempV128();
      assign(valid_mask,
             binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
      IRTemp safe_biased_indices = newTempV128();
      assign(safe_biased_indices,
             binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
      IRTemp results_or_junk = newTempV128();
      assign(results_or_junk,
             binop(Iop_Perm8x16, mkexpr(tab[tabent]),
                   mkexpr(safe_biased_indices)));
      IRTemp results_or_zero = newTempV128();
      assign(results_or_zero,
             binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
      /* And OR that into the running result. */
      IRTemp tmp = newTempV128();
      assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
                        mkexpr(running_result)));
      running_result = tmp;
   }

   /* So now running_result holds the overall result where the indices
      are in range, and zero in out-of-range lanes.  Now we need to
      compute an overall validity mask and use this to copy in the
      lanes in the oor_values for out of range indices.  This is
      unnecessary for TBL but will get folded out by iropt, so we lean
      on that and generate the same code for TBL and TBX here. */
   IRTemp overall_valid_mask = newTempV128();
   assign(overall_valid_mask,
          binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
   IRTemp result = newTempV128();
   assign(result,
          binop(Iop_OrV128,
                mkexpr(running_result),
                binop(Iop_AndV128,
                      mkexpr(oor_values),
                      unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
   return result;
}
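/* A concrete trace: with len == 1 (a two-register table) and a source
   lane holding 17, the tabent == 0 pass biases by 0, and 17 fails the
   16 > index validity test, contributing zero; the tabent == 1 pass
   biases by 16, giving index 1, which is valid, so Perm8x16 picks
   byte 1 of tab[1].  A lane holding, say, 40 fails both passes and is
   filled from |oor_values| by the final overall_valid_mask step. */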
/* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
   an op which takes two I64s and produces a V128.  That is, a widening
   operator.  Generate IR which applies |opI64x2toV128| to either the
   lower (if |is2| is False) or upper (if |is2| is True) halves of
   |argL| and |argR|, and return the value in a new IRTemp.
*/
static
IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
                                   IRExpr* argL, IRExpr* argR )
{
   IRTemp res   = newTempV128();
   IROp   slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
   assign(res, binop(opI64x2toV128, unop(slice, argL),
                                    unop(slice, argR)));
   return res;
}
/* Generate signed/unsigned absolute difference vector IR. */
static
IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
{
   vassert(size <= 3);
   IRTemp argL = newTempV128();
   IRTemp argR = newTempV128();
   IRTemp msk  = newTempV128();
   IRTemp res  = newTempV128();
   assign(argL, argLE);
   assign(argR, argRE);
   assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
                     mkexpr(argL), mkexpr(argR)));
   assign(res,
          binop(Iop_OrV128,
                binop(Iop_AndV128,
                      binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
                      mkexpr(msk)),
                binop(Iop_AndV128,
                      binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
                      unop(Iop_NotV128, mkexpr(msk)))));
   return res;
}
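/* In other words, per lane this computes
      res = argL > argR  ? argL - argR  : argR - argL
   by AND/OR selection under |msk|, with the comparison signed or
   unsigned according to |isU|.  The subtraction yields the same bit
   pattern either way, so only the mask depends on signedness. */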
/* Generate IR that takes a V128 and sign- or zero-widens
   either the lower or upper set of lanes to twice-as-wide,
   resulting in a new V128 value. */
static
IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
                                   UInt sizeNarrow, IRExpr* srcE )
{
   IRTemp src = newTempV128();
   IRTemp res = newTempV128();
   assign(src, srcE);
   switch (sizeNarrow) {
      case X10:
         assign(res,
                binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
                      binop(fromUpperHalf ? Iop_InterleaveHI32x4
                                          : Iop_InterleaveLO32x4,
                            mkexpr(src), mkexpr(src)),
                      mkU8(32)));
         break;
      case X01:
         assign(res,
                binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
                      binop(fromUpperHalf ? Iop_InterleaveHI16x8
                                          : Iop_InterleaveLO16x8,
                            mkexpr(src), mkexpr(src)),
                      mkU8(16)));
         break;
      case X00:
         assign(res,
                binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
                      binop(fromUpperHalf ? Iop_InterleaveHI8x16
                                          : Iop_InterleaveLO8x16,
                            mkexpr(src), mkexpr(src)),
                      mkU8(8)));
         break;
      default:
         vassert(0);
   }
   return res;
}
/* Generate IR that takes a V128 and sign- or zero-widens
   either the even or odd lanes to twice-as-wide,
   resulting in a new V128 value. */
static
IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
                                      UInt sizeNarrow, IRExpr* srcE )
{
   IRTemp src   = newTempV128();
   IRTemp res   = newTempV128();
   IROp   opSAR = mkVecSARN(sizeNarrow+1);
   IROp   opSHR = mkVecSHRN(sizeNarrow+1);
   IROp   opSHL = mkVecSHLN(sizeNarrow+1);
   IROp   opSxR = zWiden ? opSHR : opSAR;
   UInt   amt   = 0;
   switch (sizeNarrow) {
      case X10: amt = 32; break;
      case X01: amt = 16; break;
      case X00: amt = 8;  break;
      default: vassert(0);
   }
   assign(src, srcE);
   if (fromOdd) {
      assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
   } else {
      assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
                               mkU8(amt)));
   }
   return res;
}
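/* For instance, with sizeNarrow == X00 (8-to-16-bit widening): odd
   lanes already sit in the top halves of the 16-bit groups, so a
   single right shift by 8 widens them; even lanes are first moved to
   the top halves by the left shift and then shifted back down, which
   performs the sign- or zero-extension. */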
/* Generate IR that takes two V128s and narrows (takes lower half)
   of each lane, producing a single V128 value. */
static
IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
{
   IRTemp res = newTempV128();
   assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
                     mkexpr(argHi), mkexpr(argLo)));
   return res;
}
/* Return a temp which holds the vector dup of the lane of width
   (1 << size) obtained from src[laneNo]. */
static
IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
{
   vassert(size <= 3);
   /* Normalise |laneNo| so it is of the form
      x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
      This puts the bits we want to inspect at constant offsets
      regardless of the value of |size|.
   */
   UInt ix = laneNo << size;
   vassert(ix <= 15);
   IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
   switch (size) {
      case 0: /* B */
         ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
         /* fallthrough */
      case 1: /* H */
         ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
         /* fallthrough */
      case 2: /* S */
         ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
         /* fallthrough */
      case 3: /* D */
         ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
         break;
      default:
         vassert(0);
   }
   IRTemp res = newTempV128();
   assign(res, src);
   Int i;
   for (i = 3; i >= 0; i--) {
      if (ops[i] == Iop_INVALID)
         break;
      IRTemp tmp = newTempV128();
      assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
      res = tmp;
   }
   return res;
}
/* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
   selector encoded as shown below.  Return a new V128 holding the
   selected lane from |srcV| dup'd out to V128, and also return the
   lane number, log2 of the lane size in bytes, and width-character via
   *laneNo, *laneSzLg2 and *laneCh respectively.  It may be that imm5
   is an invalid selector, in which case return
   IRTemp_INVALID, 0, 0 and '?' respectively.

   imm5 = xxxx1   signifies .b[xxxx]
        = xxx10   .h[xxx]
        = xx100   .s[xx]
        = x1000   .d[x]
        otherwise invalid
*/
static
IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
                             /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
                             IRExpr* srcV, UInt imm5 )
{
   *laneNo    = 0;
   *laneSzLg2 = 0;
   *laneCh    = '?';

   if (imm5 & 1) {
      *laneNo    = (imm5 >> 1) & 15;
      *laneSzLg2 = 0;
      *laneCh    = 'b';
   }
   else if (imm5 & 2) {
      *laneNo    = (imm5 >> 2) & 7;
      *laneSzLg2 = 1;
      *laneCh    = 'h';
   }
   else if (imm5 & 4) {
      *laneNo    = (imm5 >> 3) & 3;
      *laneSzLg2 = 2;
      *laneCh    = 's';
   }
   else if (imm5 & 8) {
      *laneNo    = (imm5 >> 4) & 1;
      *laneSzLg2 = 3;
      *laneCh    = 'd';
   }
   else {
      /* invalid */
      return IRTemp_INVALID;
   }

   return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
}
/* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
static
IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
{
   IRType ty  = Ity_INVALID;
   IRTemp rcS = IRTemp_INVALID;
   switch (size) {
      case X01:
         vassert(imm <= 0xFFFFULL);
         ty  = Ity_I16;
         rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
         break;
      case X10:
         vassert(imm <= 0xFFFFFFFFULL);
         ty  = Ity_I32;
         rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
         break;
      case X11:
         ty  = Ity_I64;
         rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
      default:
         vassert(0);
   }
   IRTemp rcV = math_DUP_TO_V128(rcS, ty);
   return rcV;
}
/* Let |new64| be a V128 in which only the lower 64 bits are interesting,
   and the upper can contain any value -- it is ignored.  If |is2| is False,
   generate IR to put |new64| in the lower half of vector reg |dd| and zero
   the upper half.  If |is2| is True, generate IR to put |new64| in the upper
   half of vector reg |dd| and leave the lower half unchanged.  This
   simulates the behaviour of the "foo/foo2" instructions in which the
   destination is half the width of sources, for example addhn/addhn2.
*/
static
void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
{
   if (is2) {
      /* Get the old contents of Vdd, zero the upper half, and replace
         it with |new64|. */
      IRTemp t_zero_oldLO = newTempV128();
      assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
      IRTemp t_newHI_zero = newTempV128();
      assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
                                 mkV128(0x0000)));
      IRTemp res = newTempV128();
      assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
                        mkexpr(t_newHI_zero)));
      putQReg128(dd, mkexpr(res));
   } else {
      /* This is simple. */
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
   }
}
/* Compute vector SQABS at lane size |size| for |srcE|, returning
   the q result in |*qabs| and the normal result in |*nabs|. */
static
void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
                  IRExpr* srcE, UInt size )
{
   IRTemp src, mask, maskn, nsub, qsub;
   src = mask = maskn = nsub = qsub = IRTemp_INVALID;
   newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
   assign(src,   srcE);
   assign(mask,  binop(mkVecCMPGTS(size),  mkV128(0x0000), mkexpr(src)));
   assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
   assign(nsub,  binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   assign(qsub,  binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
   assign(*nabs, binop(Iop_OrV128,
                       binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
                       binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
   assign(*qabs, binop(Iop_OrV128,
                       binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
                       binop(Iop_AndV128, mkexpr(src),  mkexpr(maskn))));
}
/* Compute vector SQNEG at lane size |size| for |srcE|, returning
   the q result in |*qneg| and the normal result in |*nneg|. */
static
void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
                  IRExpr* srcE, UInt size )
{
   IRTemp src = IRTemp_INVALID;
   newTempsV128_3(&src, nneg, qneg);
   assign(src,   srcE);
   assign(*nneg, binop(mkVecSUB(size),   mkV128(0x0000), mkexpr(src)));
   assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
}
/* Zero all except the least significant lane of |srcE|, where |size|
   indicates the lane size in the usual way. */
static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
{
   vassert(size < 4);
   IRTemp t = newTempV128();
   assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
   return t;
}
/* Generate IR to compute vector widening MULL from either the lower
   (is2==False) or upper (is2==True) halves of vecN and vecM.  The
   widening multiplies are unsigned when isU==True and signed when
   isU==False.  |size| is the narrow lane size indication.  Optionally,
   the product may be added to or subtracted from vecD, at the wide lane
   size.  This happens when |mas| is 'a' (add) or 's' (sub).  When |mas|
   is 'm' (only multiply) then the accumulate part does not happen, and
   |vecD| is expected to == IRTemp_INVALID.

   Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
   are allowed.  The result is returned in a new IRTemp, which is
   returned in *res. */
static
void math_MULL_ACC ( /*OUT*/IRTemp* res,
                     Bool is2, Bool isU, UInt size, HChar mas,
                     IRTemp vecN, IRTemp vecM, IRTemp vecD )
{
   vassert(res && *res == IRTemp_INVALID);
   vassert(size <= 2);
   vassert(mas == 'm' || mas == 'a' || mas == 's');
   if (mas == 'm') vassert(vecD == IRTemp_INVALID);
   IROp   mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
   IROp   accOp = (mas == 'a') ? mkVecADD(size+1)
                  : (mas == 's' ? mkVecSUB(size+1)
                                : Iop_INVALID);
   IRTemp mul   = math_BINARY_WIDENING_V128(is2, mulOp,
                                            mkexpr(vecN), mkexpr(vecM));
   *res = newTempV128();
   assign(*res, mas == 'm' ? mkexpr(mul)
                           : binop(accOp, mkexpr(vecD), mkexpr(mul)));
}
/* Same as math_MULL_ACC, except the multiply is signed widening,
   the multiplied value is then doubled, before being added to or
   subtracted from the accumulated value.  And everything is
   saturated.  In all cases, saturation residuals are returned
   via (sat1q, sat1n), and in the accumulate cases,
   via (sat2q, sat2n) too.  All results are returned in new temporaries.
   In the no-accumulate case, *sat2q and *sat2n are never instantiated,
   so the caller can tell this has happened. */
static
void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
                        /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
                        /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
                        Bool is2, UInt size, HChar mas,
                        IRTemp vecN, IRTemp vecM, IRTemp vecD )
{
   vassert(size <= 2);
   vassert(mas == 'm' || mas == 'a' || mas == 's');
   /* Compute
         sat1q = vecN.D[is2] *sq vecM.d[is2] *q 2
         sat1n = vecN.D[is2] *s  vecM.d[is2] *  2
      IOW take either the low or high halves of vecN and vecM, signed widen,
      multiply, double that, and signedly saturate.  Also compute the same
      but without saturation.
   */
   vassert(sat2q && *sat2q == IRTemp_INVALID);
   vassert(sat2n && *sat2n == IRTemp_INVALID);
   newTempsV128_3(sat1q, sat1n, res);
   IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
                                         mkexpr(vecN), mkexpr(vecM));
   IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
                                         mkexpr(vecN), mkexpr(vecM));
   assign(*sat1q, mkexpr(tq));
   assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));

   /* If there is no accumulation, the final result is sat1q,
      and there's no assignment to sat2q or sat2n. */
   if (mas == 'm') {
      assign(*res, mkexpr(*sat1q));
      return;
   }

   /* Compute
         sat2q  = vecD +sq/-sq sat1q
         sat2n  = vecD +/-     sat1n
         result = sat2q
   */
   newTempsV128_2(sat2q, sat2n);
   assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
                        mkexpr(vecD), mkexpr(*sat1q)));
   assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
                        mkexpr(vecD), mkexpr(*sat1n)));
   assign(*res, mkexpr(*sat2q));
}
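/* Note that |sat1n| doubles the plain widening product by adding it
   to itself (tn + tn), at the wide lane size and without saturation;
   comparing it against |sat1q| is what later drives the QCFLAG
   update. */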
/* Generate IR for widening signed vector multiplies.  The operands
   have their lane width signedly widened, and they are then multiplied
   at the wider width, returning results in two new IRTemps. */
static
void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
                  UInt sizeNarrow, IRTemp argL, IRTemp argR )
{
   vassert(sizeNarrow <= 2);
   newTempsV128_2(resHI, resLO);
   IRTemp argLhi = newTemp(Ity_I64);
   IRTemp argLlo = newTemp(Ity_I64);
   IRTemp argRhi = newTemp(Ity_I64);
   IRTemp argRlo = newTemp(Ity_I64);
   assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
   assign(argLlo, unop(Iop_V128to64,   mkexpr(argL)));
   assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
   assign(argRlo, unop(Iop_V128to64,   mkexpr(argR)));
   IROp opMulls = mkVecMULLS(sizeNarrow);
   assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
   assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
}
/* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
   double that, possibly add a rounding constant (R variants), and take
   the high half. */
static
void math_SQDMULH ( /*OUT*/IRTemp* res,
                    /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
                    Bool isR, UInt size, IRTemp vN, IRTemp vM )
{
   vassert(size == X01 || size == X10); /* s or h only */

   newTempsV128_3(res, sat1q, sat1n);

   IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
   math_MULLS(&mullsHI, &mullsLO, size, vN, vM);

   /* addWide is an operator, so it must be an IROp, not an IRTemp. */
   IROp addWide = mkVecADD(size+1);

   if (isR) {
      assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));

      Int    rcShift    = size == X01 ? 15 : 31;
      IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
      assign(*sat1n,
             binop(mkVecCATODDLANES(size),
                   binop(addWide,
                         binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
                         mkexpr(roundConst)),
                   binop(addWide,
                         binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
                         mkexpr(roundConst))));
   } else {
      assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));

      assign(*sat1n,
             binop(mkVecCATODDLANES(size),
                   binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
                   binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
   }

   assign(*res, mkexpr(*sat1q));
}
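/* Numeric check for the rounding variant with 16-bit lanes: for
   inputs 0x4000 and 0x0002 the doubled 32-bit product is 0x10000;
   adding the rounding constant 0x8000 gives 0x18000, whose high half
   (extracted by mkVecCATODDLANES) is 1, as expected for
   round-to-nearest. */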
/* Generate IR for SQRDMLAH and SQRDMLSH: signedly wideningly multiply,
   double, add a rounding constant, take the high half and accumulate. */
static
void math_SQRDMLAH ( /*OUT*/IRTemp* res, /*OUT*/IRTemp* res_nosat, Bool isAdd,
                     UInt size, IRTemp vD, IRTemp vN, IRTemp vM )
{
   vassert(size == X01 || size == X10); /* s or h only */

   /* SQRDMLAH = SQADD(A, SQRDMULH(B, C)) */

   IRTemp mul, mul_nosat, dummy;
   mul = mul_nosat = dummy = IRTemp_INVALID;
   math_SQDMULH(&mul, &dummy, &mul_nosat, True/*R*/, size, vN, vM);

   IROp op  = isAdd ? mkVecADD(size)   : mkVecSUB(size);
   IROp qop = isAdd ? mkVecQADDS(size) : mkVecQSUBS(size);
   newTempsV128_2(res, res_nosat);
   assign(*res, binop(qop, mkexpr(vD), mkexpr(mul)));
   assign(*res_nosat, binop(op, mkexpr(vD), mkexpr(mul_nosat)));
}
/* Generate IR for SQSHL, UQSHL, SQSHLU by imm.  Put the result in
   a new temp in *res, and the Q difference pair in new temps in
   *qDiff1 and *qDiff2 respectively.  |nm| denotes which of the
   three operations it is. */
static
void math_QSHL_IMM ( /*OUT*/IRTemp* res,
                     /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2,
                     IRTemp src, UInt size, UInt shift, const HChar* nm )
{
   vassert(size <= 3);
   UInt laneBits = 8 << size;
   vassert(shift < laneBits);
   newTempsV128_3(res, qDiff1, qDiff2);
   IRTemp z128 = newTempV128();
   assign(z128, mkV128(0x0000));

   /* UQSHL */
   if (vex_streq(nm, "uqshl")) {
      IROp qop = mkVecQSHLNSATUU(size);
      assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
      if (shift == 0) {
         /* No shift means no saturation. */
         assign(*qDiff1, mkexpr(z128));
         assign(*qDiff2, mkexpr(z128));
      } else {
         /* Saturation has occurred if any of the shifted-out bits are
            nonzero.  We get the shifted-out bits by right-shifting the
            original value. */
         UInt rshift = laneBits - shift;
         vassert(rshift >= 1 && rshift < laneBits);
         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
         assign(*qDiff2, mkexpr(z128));
      }
      return;
   }

   /* SQSHL */
   if (vex_streq(nm, "sqshl")) {
      IROp qop = mkVecQSHLNSATSS(size);
      assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
      if (shift == 0) {
         /* No shift means no saturation. */
         assign(*qDiff1, mkexpr(z128));
         assign(*qDiff2, mkexpr(z128));
      } else {
         /* Saturation has occurred if any of the shifted-out bits are
            different from the top bit of the original value. */
         UInt rshift = laneBits - 1 - shift;
         vassert(rshift >= 0 && rshift < laneBits-1);
         /* qDiff1 is the shifted out bits, and the top bit of the original
            value, preceded by zeroes. */
         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
         /* qDiff2 is the top bit of the original value, cloned the
            correct number of times. */
         assign(*qDiff2, binop(mkVecSHRN(size),
                               binop(mkVecSARN(size), mkexpr(src),
                                     mkU8(laneBits-1)),
                               mkU8(rshift)));
         /* This also succeeds in comparing the top bit of the original
            value to itself, which is a bit stupid, but not wrong. */
      }
      return;
   }

   /* SQSHLU */
   if (vex_streq(nm, "sqshlu")) {
      IROp qop = mkVecQSHLNSATSU(size);
      assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
      if (shift == 0) {
         /* If there's no shift, saturation depends on the top bit
            of the source. */
         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src),
                               mkU8(laneBits-1)));
         assign(*qDiff2, mkexpr(z128));
      } else {
         /* Saturation has occurred if any of the shifted-out bits are
            nonzero.  We get the shifted-out bits by right-shifting the
            original value. */
         UInt rshift = laneBits - shift;
         vassert(rshift >= 1 && rshift < laneBits);
         assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
         assign(*qDiff2, mkexpr(z128));
      }
      return;
   }

   vassert(0);
}
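/* As a sanity check of the "uqshl" case: with 8-bit lanes and
   shift == 3, rshift == 5, so qDiff1 holds src >> 5 -- precisely the
   three bits that the left shift discards.  Any nonzero lane of
   qDiff1 thus marks a lane whose saturated and unsaturated results
   differ, which is what the QCFLAG machinery below looks for. */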
/* Generate IR to do SRHADD and URHADD. */
static
IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
{
   /* Generate this:
      (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
   */
   vassert(size <= 3);
   IROp opSHR = isU ? mkVecSHRN(size) : mkVecSARN(size);
   IROp opADD = mkVecADD(size);
   /* The only tricky bit is to generate the correct vector 1 constant. */
   const ULong ones64[4]
      = { 0x0101010101010101ULL, 0x0001000100010001ULL,
          0x0000000100000001ULL, 0x0000000000000001ULL };
   IRTemp imm64 = newTemp(Ity_I64);
   assign(imm64, mkU64(ones64[size]));
   IRTemp vecOne = newTempV128();
   assign(vecOne, binop(Iop_64HLtoV128, mkexpr(imm64), mkexpr(imm64)));
   IRTemp scaOne = newTemp(Ity_I8);
   assign(scaOne, mkU8(1));
   IRTemp res = newTempV128();
   assign(res,
          binop(opADD,
                binop(opSHR, mkexpr(aa), mkexpr(scaOne)),
                binop(opADD,
                      binop(opSHR, mkexpr(bb), mkexpr(scaOne)),
                      binop(opSHR,
                            binop(opADD,
                                  binop(opADD,
                                        binop(Iop_AndV128, mkexpr(aa),
                                              mkexpr(vecOne)),
                                        binop(Iop_AndV128, mkexpr(bb),
                                              mkexpr(vecOne))),
                                  mkexpr(vecOne)),
                            mkexpr(scaOne)))));
   return res;
}
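/* Worked example in one lane: for A == 5, B == 6,
   (5 >> 1) + (6 >> 1) + (((5 & 1) + (6 & 1) + 1) >> 1)
   == 2 + 3 + 1 == 6 == (5 + 6 + 1) >> 1,
   the rounded halving add, computed without overflowing the lane. */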
/* QCFLAG tracks the SIMD sticky saturation status.  Update the status
   thusly: if, after application of |opZHI| to both |qres| and |nres|,
   they have the same value, leave QCFLAG unchanged.  Otherwise, set it
   (implicitly) to 1.  |opZHI| may only be one of the Iop_ZeroHIxxofV128
   operators, or Iop_INVALID, in which case |qres| and |nres| are used
   unmodified.  The presence of |opZHI| means this function can be used
   to generate QCFLAG update code for both scalar and vector SIMD
   operations.
*/
static
void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
{
   IRTemp diff      = newTempV128();
   IRTemp oldQCFLAG = newTempV128();
   IRTemp newQCFLAG = newTempV128();
   if (opZHI == Iop_INVALID) {
      assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
   } else {
      vassert(opZHI == Iop_ZeroHI64ofV128
              || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
      assign(diff, unop(opZHI, binop(Iop_XorV128,
                                     mkexpr(qres), mkexpr(nres))));
   }
   assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
   assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
   stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
}
/* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
   are used unmodified, hence suitable for QCFLAG updates for
   whole-vector operations. */
static
void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
{
   updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
}
/* Generate IR to rearrange two vector values in a way which is useful
   for doing S/D add-pair etc operations.  There are 3 cases:

   2d:  [m1 m0] [n1 n0]  -->  [m1 n1] [m0 n0]

   4s:  [m3 m2 m1 m0] [n3 n2 n1 n0]  -->  [m3 m1 n3 n1] [m2 m0 n2 n0]

   2s:  [m3 m2 m1 m0] [n3 n2 n1 n0]  -->  [0 0 m1 n1] [0 0 m0 n0]

   The cases are distinguished as follows:
   isD == True,  bitQ == 1  =>  2d
   isD == False, bitQ == 1  =>  4s
   isD == False, bitQ == 0  =>  2s
*/
static
void math_REARRANGE_FOR_FLOATING_PAIRWISE (
        /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
        IRTemp vecM, IRTemp vecN, Bool isD, UInt bitQ
     )
{
   vassert(rearrL && *rearrL == IRTemp_INVALID);
   vassert(rearrR && *rearrR == IRTemp_INVALID);
   *rearrL = newTempV128();
   *rearrR = newTempV128();
   if (isD) {
      // 2d case
      vassert(bitQ == 1);
      assign(*rearrL, binop(Iop_InterleaveHI64x2,
                            mkexpr(vecM), mkexpr(vecN)));
      assign(*rearrR, binop(Iop_InterleaveLO64x2,
                            mkexpr(vecM), mkexpr(vecN)));
   }
   else if (!isD && bitQ == 1) {
      // 4s case
      assign(*rearrL, binop(Iop_CatOddLanes32x4,
                            mkexpr(vecM), mkexpr(vecN)));
      assign(*rearrR, binop(Iop_CatEvenLanes32x4,
                            mkexpr(vecM), mkexpr(vecN)));
   } else {
      // 2s case
      vassert(!isD && bitQ == 0);
      IRTemp m1n1m0n0 = newTempV128();
      IRTemp m0n0m1n1 = newTempV128();
      assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
                             mkexpr(vecM), mkexpr(vecN)));
      assign(m0n0m1n1, triop(Iop_SliceV128,
                             mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8)));
      assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0)));
      assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1)));
   }
}
/* Returns 2.0 ^ (-n) for n in 1 .. 64 */
static Double two_to_the_minus ( Int n )
{
   if (n == 1) return 0.5;
   vassert(n >= 2 && n <= 64);
   Int half = n / 2;
   return two_to_the_minus(half) * two_to_the_minus(n - half);
}


/* Returns 2.0 ^ n for n in 1 .. 64 */
static Double two_to_the_plus ( Int n )
{
   if (n == 1) return 2.0;
   vassert(n >= 2 && n <= 64);
   Int half = n / 2;
   return two_to_the_plus(half) * two_to_the_plus(n - half);
}
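/* Both helpers compute exact powers of two by recursively halving the
   exponent: e.g. two_to_the_plus(5) == two_to_the_plus(2) *
   two_to_the_plus(3) == 4.0 * 8.0 == 32.0.  Every intermediate value
   is exactly representable as a Double, so no libm support or rounding
   is involved. */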
/*------------------------------------------------------------*/
/*--- SIMD and FP instructions                             ---*/
/*------------------------------------------------------------*/

static
Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  29     23  21 20 15 14   10 9 4
      0 q 101110 op2 0  m  0  imm4 0  n d
      Decode fields: op2
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(29,24) != BITS6(1,0,1,1,1,0)
       || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
      return False;
   }
   UInt bitQ = INSN(30,30);
   UInt op2  = INSN(23,22);
   UInt mm   = INSN(20,16);
   UInt imm4 = INSN(14,11);
   UInt nn   = INSN(9,5);
   UInt dd   = INSN(4,0);

   if (op2 == BITS2(0,0)) {
      /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
      IRTemp sHi = newTempV128();
      IRTemp sLo = newTempV128();
      IRTemp res = newTempV128();
      assign(sHi, getQReg128(mm));
      assign(sLo, getQReg128(nn));
      if (bitQ == 1) {
         if (imm4 == 0) {
            assign(res, mkexpr(sLo));
         } else {
            vassert(imm4 >= 1 && imm4 <= 15);
            assign(res, triop(Iop_SliceV128,
                              mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
         }
         putQReg128(dd, mkexpr(res));
         DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
      } else {
         if (imm4 >= 8) return False;
         if (imm4 == 0) {
            assign(res, mkexpr(sLo));
         } else {
            vassert(imm4 >= 1 && imm4 <= 7);
            IRTemp hi64lo64 = newTempV128();
            assign(hi64lo64, binop(Iop_InterleaveLO64x2,
                                   mkexpr(sHi), mkexpr(sLo)));
            assign(res, triop(Iop_SliceV128,
                              mkexpr(hi64lo64), mkexpr(hi64lo64),
                              mkU8(imm4)));
         }
         putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
         DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
      }
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  29     23  21 20 15 14  12 11 9 4
      0 q 001110 op2 0  m  0  len op 00 n d
      Decode fields: op2,len,op
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(29,24) != BITS6(0,0,1,1,1,0)
       || INSN(21,21) != 0
       || INSN(15,15) != 0
       || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt bitQ  = INSN(30,30);
   UInt op2   = INSN(23,22);
   UInt mm    = INSN(20,16);
   UInt len   = INSN(14,13);
   UInt bitOP = INSN(12,12);
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   if (op2 == X00) {
      /* -------- 00,xx,0 TBL, xx register table -------- */
      /* -------- 00,xx,1 TBX, xx register table -------- */
      /* 31  28        20 15 14  12  9 4
         0q0 01110 000 m  0  len 000 n d  TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
         0q0 01110 000 m  0  len 100 n d  TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
         where Ta = 16b(q=1) or 8b(q=0)
      */
      Bool isTBX = bitOP == 1;
      /* The out-of-range values to use. */
      IRTemp oor_values = newTempV128();
      assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
      /* src value */
      IRTemp src = newTempV128();
      assign(src, getQReg128(mm));
      /* The table values */
      IRTemp tab[4];
      UInt   i;
      for (i = 0; i <= len; i++) {
         vassert(i < 4);
         tab[i] = newTempV128();
         assign(tab[i], getQReg128((nn + i) % 32));
      }
      IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* Ta = bitQ ==1 ? "16b" : "8b";
      const HChar* nm = isTBX ? "tbx" : "tbl";
      DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
          nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  29     23   21 20 15 14     11 9 4
      0 q 001110 size 0  m  0  opcode 10 n d
      Decode fields: opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(29,24) != BITS6(0,0,1,1,1,0)
       || INSN(21,21) != 0 || INSN(15,15) != 0
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(14,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (opcode == BITS3(0,0,1) || opcode == BITS3(1,0,1)) {
      /* -------- 001 UZP1 std7_std7_std7 -------- */
      /* -------- 101 UZP2 std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isUZP1 = opcode == BITS3(0,0,1);
      IROp op     = isUZP1 ? mkVecCATEVENLANES(size)
                           : mkVecCATODDLANES(size);
      IRTemp preL = newTempV128();
      IRTemp preR = newTempV128();
      IRTemp res  = newTempV128();
      if (bitQ == 0) {
         assign(preL, binop(Iop_InterleaveLO64x2, getQReg128(mm),
                            getQReg128(nn)));
         assign(preR, mkexpr(preL));
      } else {
         assign(preL, getQReg128(mm));
         assign(preR, getQReg128(nn));
      }
      assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isUZP1 ? "uzp1" : "uzp2";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (opcode == BITS3(0,1,0) || opcode == BITS3(1,1,0)) {
      /* -------- 010 TRN1 std7_std7_std7 -------- */
      /* -------- 110 TRN2 std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isTRN1 = opcode == BITS3(0,1,0);
      IROp op1    = isTRN1 ? mkVecCATEVENLANES(size)
                           : mkVecCATODDLANES(size);
      IROp op2    = mkVecINTERLEAVEHI(size);
      IRTemp srcM = newTempV128();
      IRTemp srcN = newTempV128();
      IRTemp res  = newTempV128();
      assign(srcM, getQReg128(mm));
      assign(srcN, getQReg128(nn));
      assign(res, binop(op2, binop(op1, mkexpr(srcM), mkexpr(srcM)),
                             binop(op1, mkexpr(srcN), mkexpr(srcN))));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isTRN1 ? "trn1" : "trn2";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   if (opcode == BITS3(0,1,1) || opcode == BITS3(1,1,1)) {
      /* -------- 011 ZIP1 std7_std7_std7 -------- */
      /* -------- 111 ZIP2 std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isZIP1 = opcode == BITS3(0,1,1);
      IROp op     = isZIP1 ? mkVecINTERLEAVELO(size)
                           : mkVecINTERLEAVEHI(size);
      IRTemp preL = newTempV128();
      IRTemp preR = newTempV128();
      IRTemp res  = newTempV128();
      if (bitQ == 0 && !isZIP1) {
         IRTemp z128 = newTempV128();
         assign(z128, mkV128(0x0000));
         // preL = Vm shifted left 32 bits
         // preR = Vn shifted left 32 bits
         assign(preL, triop(Iop_SliceV128,
                            getQReg128(mm), mkexpr(z128), mkU8(12)));
         assign(preR, triop(Iop_SliceV128,
                            getQReg128(nn), mkexpr(z128), mkU8(12)));
      } else {
         assign(preL, getQReg128(mm));
         assign(preR, getQReg128(nn));
      }
      assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isZIP1 ? "zip1" : "zip2";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28    23   21    16     11 9 4
      0 q u 01110 size 11000 opcode 10 n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (opcode == BITS5(0,0,0,1,1)) {
      /* -------- 0,xx,00011 SADDLV -------- */
      /* -------- 1,xx,00011 UADDLV -------- */
      /* size is the narrow size */
      if (size == X11 || (size == X10 && bitQ == 0)) return False;
      Bool   isU = bitU == 1;
      IRTemp src = newTempV128();
      assign(src, getQReg128(nn));
      /* The basic plan is to widen the lower half, and if Q = 1,
         the upper half too.  Add them together (if Q = 1), and in
         either case fold with add at twice the lane width.
      */
      IRExpr* widened
         = mkexpr(math_WIDEN_LO_OR_HI_LANES(
                     isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
      if (bitQ == 1) {
         widened
            = binop(mkVecADD(size+1),
                    widened,
                    mkexpr(math_WIDEN_LO_OR_HI_LANES(
                              isU, True/*fromUpperHalf*/, size,
                              mkexpr(src))));
      }
      /* Now fold. */
      IRTemp tWi = newTempV128();
      assign(tWi, widened);
      IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
      putQReg128(dd, mkexpr(res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar  ch  = "bhsd"[size];
      DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
          nameQReg128(dd), ch, nameQReg128(nn), arr);
      return True;
   }

   UInt ix = 0;
   /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
   else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
   else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
   /**/
   if (ix != 0) {
      /* -------- 0,xx,01010: SMAXV -------- (1) */
      /* -------- 1,xx,01010: UMAXV -------- (2) */
      /* -------- 0,xx,11010: SMINV -------- (3) */
      /* -------- 1,xx,11010: UMINV -------- (4) */
      /* -------- 0,xx,11011: ADDV  -------- (5) */
      vassert(ix >= 1 && ix <= 5);
      if (size == X11) return False; // 1d,2d cases not allowed
      if (size == X10 && bitQ == 0) return False; // 2s case not allowed
      const IROp opMAXS[3]
         = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
      const IROp opMAXU[3]
         = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
      const IROp opMINS[3]
         = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
      const IROp opMINU[3]
         = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
      const IROp opADD[3]
         = { Iop_Add8x16,  Iop_Add16x8,  Iop_Add32x4 };
      vassert(size < 3);
      IROp op = Iop_INVALID;
      const HChar* nm = NULL;
      switch (ix) {
         case 1: op = opMAXS[size]; nm = "smaxv"; break;
         case 2: op = opMAXU[size]; nm = "umaxv"; break;
         case 3: op = opMINS[size]; nm = "sminv"; break;
         case 4: op = opMINU[size]; nm = "uminv"; break;
         case 5: op = opADD[size];  nm = "addv";  break;
         default: vassert(0);
      }
      vassert(op != Iop_INVALID && nm != NULL);
      IRTemp tN1 = newTempV128();
      assign(tN1, getQReg128(nn));
      /* If Q == 0, we're just folding lanes in the lower half of
         the value.  In which case, copy the lower half of the
         source into the upper half, so we can then treat it the
         same as the full width case.  Except for the addition case,
         in which we have to zero out the upper half. */
      IRTemp tN2 = newTempV128();
      assign(tN2, bitQ == 0
                     ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
                                : mk_CatEvenLanes64x2(tN1,tN1))
                     : mkexpr(tN1));
      IRTemp res = math_FOLDV(tN2, op);
      if (res == IRTemp_INVALID)
         return False; /* means math_FOLDV
                          doesn't handle this case yet */
      putQReg128(dd, mkexpr(res));
      const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
      IRType laneTy = tys[size];
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s, %s.%s\n", nm,
          nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
      return True;
   }

   if ((size == X00 || size == X10)
       && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
      /* -------- 0,00,01100: FMAXNMV s_4s -------- */
      /* -------- 0,10,01100: FMINNMV s_4s -------- */
      /* -------- 1,00,01111: FMAXV   s_4s -------- */
      /* -------- 1,10,01111: FMINV   s_4s -------- */
      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
      if (bitQ == 0) return False; // Only 4s is allowed
      Bool   isMIN = (size & 2) == 2;
      Bool   isNM  = opcode == BITS5(0,1,1,0,0);
      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
      IRTemp src   = newTempV128();
      assign(src, getQReg128(nn));
      IRTemp res = math_FOLDV(src, opMXX);
      putQReg128(dd, mkexpr(res));
      DIP("%s%sv s%u, %u.4s\n",
          isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31     28       20   15 14   10 9 4
      0 q op 01110000 imm5 0  imm4 1  n d
      Decode fields: q,op,imm4
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
       || INSN(15,15) != 0 || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ  = INSN(30,30);
   UInt bitOP = INSN(29,29);
   UInt imm5  = INSN(20,16);
   UInt imm4  = INSN(14,11);
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   /* -------- x,0,0000: DUP (element, vector) -------- */
   /* 31  28       20   15     9 4
      0q0 01110000 imm5 000001 n d  DUP Vd.T, Vn.Ts[index]
   */
   if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
      UInt   laneNo    = 0;
      UInt   laneSzLg2 = 0;
      HChar  laneCh    = '?';
      IRTemp res = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
                                       getQReg128(nn), imm5);
      if (res == IRTemp_INVALID)
         return False;
      if (bitQ == 0 && laneSzLg2 == X11)
         return False; /* .1d case */
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
      DIP("dup %s.%s, %s.%c[%u]\n",
          nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
      return True;
   }

   /* -------- x,0,0001: DUP (general, vector) -------- */
   /* 31  28       20   15      9 4
      0q0 01110000 imm5 0 0001 1 n d  DUP Vd.T, Rn
      Q=0 writes 64, Q=1 writes 128
      imm5: xxxx1  8B(q=0)      or 16b(q=1),     R=W
            xxx10  4H(q=0)      or 8H(q=1),      R=W
            xx100  2S(q=0)      or 4S(q=1),      R=W
            x1000  Invalid(q=0) or 2D(q=1),      R=X
            x0000  Invalid(q=0) or Invalid(q=1)
      Require op=0, imm4=0001
   */
   if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
      Bool   isQ = bitQ == 1;
      IRTemp w0  = newTemp(Ity_I64);
      const HChar* arT = "??";
      IRType laneTy = Ity_INVALID;
      if (imm5 & 1) {
         arT    = isQ ? "16b" : "8b";
         laneTy = Ity_I8;
         assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
      }
      else if (imm5 & 2) {
         arT    = isQ ? "8h" : "4h";
         laneTy = Ity_I16;
         assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
      }
      else if (imm5 & 4) {
         arT    = isQ ? "4s" : "2s";
         laneTy = Ity_I32;
         assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
      }
      else if ((imm5 & 8) && isQ) {
         arT    = "2d";
         laneTy = Ity_I64;
         assign(w0, getIReg64orZR(nn));
      }
      else {
         /* invalid; leave laneTy unchanged. */
      }
      if (laneTy != Ity_INVALID) {
         IRTemp w1 = math_DUP_TO_64(w0, laneTy);
         putQReg128(dd, binop(Iop_64HLtoV128,
                              isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
         DIP("dup %s.%s, %s\n",
             nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
         return True;
      }
      /* invalid */
      return False;
   }

   /* -------- 1,0,0011: INS (general) -------- */
   /* 31  28       20   15     9 4
      010 01110000 imm5 000111 n d  INS Vd.Ts[ix], Rn
      where Ts,ix = case imm5 of xxxx1 -> B, xxxx
                                 xxx10 -> H, xxx
                                 xx100 -> S, xx
                                 x1000 -> D, x
   */
   if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
      HChar   ts     = '?';
      UInt    laneNo = 16;
      IRExpr* src    = NULL;
      if (imm5 & 1) {
         src    = unop(Iop_64to8, getIReg64orZR(nn));
         laneNo = (imm5 >> 1) & 15;
         ts     = 'b';
      }
      else if (imm5 & 2) {
         src    = unop(Iop_64to16, getIReg64orZR(nn));
         laneNo = (imm5 >> 2) & 7;
         ts     = 'h';
      }
      else if (imm5 & 4) {
         src    = unop(Iop_64to32, getIReg64orZR(nn));
         laneNo = (imm5 >> 3) & 3;
         ts     = 's';
      }
      else if (imm5 & 8) {
         src    = getIReg64orZR(nn);
         laneNo = (imm5 >> 4) & 1;
         ts     = 'd';
      }
      if (src) {
         vassert(laneNo < 16);
         putQRegLane(dd, laneNo, src);
         DIP("ins %s.%c[%u], %s\n",
             nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
         return True;
      }
      /* invalid */
      return False;
   }

   /* -------- x,0,0101: SMOV -------- */
   /* -------- x,0,0111: UMOV -------- */
   /* 31  28        20   15     9 4
      0q0 01110 000 imm5 001111 n d  UMOV Xd/Wd, Vn.Ts[index]
      0q0 01110 000 imm5 001011 n d  SMOV Xd/Wd, Vn.Ts[index]
      dest is Xd when q==1, Wd when q==0
      UMOV:
         Ts,index,ops = case q:imm5 of
                          0:xxxx1 -> B, xxxx, 8Uto64
                          1:xxxx1 -> invalid
                          0:xxx10 -> H, xxx,  16Uto64
                          1:xxx10 -> invalid
                          0:xx100 -> S, xx,   32Uto64
                          1:xx100 -> invalid
                          1:x1000 -> D, x,    copy64
                          other   -> invalid
      SMOV:
         Ts,index,ops = case q:imm5 of
                          0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
                          1:xxxx1 -> B, xxxx, 8Sto64
                          0:xxx10 -> H, xxx,  (32Uto64 . 16Sto32)
                          1:xxx10 -> H, xxx,  16Sto64
                          0:xx100 -> invalid
                          1:xx100 -> S, xx,   32Sto64
                          other   -> invalid
   */
   if (bitOP == 0 && (imm4 == BITS4(0,1,0,1) || imm4 == BITS4(0,1,1,1))) {
      Bool isU = (imm4 & 2) == 2;
      const HChar* arTs = "??";
      UInt    laneNo = 16; /* invalid */
      // Setting 'res' to non-NULL determines valid/invalid
      IRExpr* res    = NULL;
      if (!bitQ && (imm5 & 1)) { // 0:xxxx1
         laneNo = (imm5 >> 1) & 15;
         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
         res = isU ? unop(Iop_8Uto64, lane)
                   : unop(Iop_32Uto64, unop(Iop_8Sto32, lane));
         arTs = "b";
      }
      else if (bitQ && (imm5 & 1)) { // 1:xxxx1
         laneNo = (imm5 >> 1) & 15;
         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
         res = isU ? NULL
                   : unop(Iop_8Sto64, lane);
         arTs = "b";
      }
      else if (!bitQ && (imm5 & 2)) { // 0:xxx10
         laneNo = (imm5 >> 2) & 7;
         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
         res = isU ? unop(Iop_16Uto64, lane)
                   : unop(Iop_32Uto64, unop(Iop_16Sto32, lane));
         arTs = "h";
      }
      else if (bitQ && (imm5 & 2)) { // 1:xxx10
         laneNo = (imm5 >> 2) & 7;
         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
         res = isU ? NULL
                   : unop(Iop_16Sto64, lane);
         arTs = "h";
      }
      else if (!bitQ && (imm5 & 4)) { // 0:xx100
         laneNo = (imm5 >> 3) & 3;
         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
         res = isU ? unop(Iop_32Uto64, lane)
                   : NULL;
         arTs = "s";
      }
      else if (bitQ && (imm5 & 4)) { // 1:xx100
         laneNo = (imm5 >> 3) & 3;
         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
         res = isU ? NULL
                   : unop(Iop_32Sto64, lane);
         arTs = "s";
      }
      else if (bitQ && (imm5 & 8)) { // 1:x1000
         laneNo = (imm5 >> 4) & 1;
         IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64);
         res = isU ? lane
                   : NULL;
         arTs = "d";
      }
      if (res) {
         vassert(laneNo < 16);
         putIReg64orZR(dd, res);
         DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's',
             nameIRegOrZR(bitQ == 1, dd),
             nameQReg128(nn), arTs, laneNo);
         return True;
      }
      /* invalid */
      return False;
   }

   /* -------- 1,1,xxxx: INS (element) -------- */
   /* 31  28       20     14   9 4
      011 01110000 imm5 0 imm4 n d  INS Vd.Ts[ix1], Vn.Ts[ix2]
      where Ts,ix1,ix2
         = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
                        xxx10 -> H, xxx,  imm4[3:1]
                        xx100 -> S, xx,   imm4[3:2]
                        x1000 -> D, x,    imm4[3:3]
   */
   if (bitQ == 1 && bitOP == 1) {
      HChar  ts  = '?';
      IRType ity = Ity_INVALID;
      UInt   ix1 = 16;
      UInt   ix2 = 16;
      if (imm5 & 1) {
         ts  = 'b';
         ity = Ity_I8;
         ix1 = (imm5 >> 1) & 15;
         ix2 = (imm4 >> 0) & 15;
      }
      else if (imm5 & 2) {
         ts  = 'h';
         ity = Ity_I16;
         ix1 = (imm5 >> 2) & 7;
         ix2 = (imm4 >> 1) & 7;
      }
      else if (imm5 & 4) {
         ts  = 's';
         ity = Ity_I32;
         ix1 = (imm5 >> 3) & 3;
         ix2 = (imm4 >> 2) & 3;
      }
      else if (imm5 & 8) {
         ts  = 'd';
         ity = Ity_I64;
         ix1 = (imm5 >> 4) & 1;
         ix2 = (imm4 >> 3) & 1;
      }
      if (ity != Ity_INVALID) {
         vassert(ix1 < 16);
         vassert(ix2 < 16);
         putQRegLane(dd, ix1, getQRegLane(nn, ix2, ity));
         DIP("ins %s.%c[%u], %s.%c[%u]\n",
             nameQReg128(dd), ts, ix1, nameQReg128(nn), ts, ix2);
         return True;
      }
      /* invalid */
      return False;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28          18  15    11 9     4
      0q op 01111 00000 abc cmode 01 defgh d
      Decode fields: q,op,cmode
      Bit 11 is really "o2", but it is always zero.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
       || INSN(11,10) != BITS2(0,1)) {
      return False;
   }
   UInt bitQ     = INSN(30,30);
   UInt bitOP    = INSN(29,29);
   UInt cmode    = INSN(15,12);
   UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
   UInt dd       = INSN(4,0);

   ULong imm64lo  = 0;
   UInt  op_cmode = (bitOP << 4) | cmode;
   Bool  ok       = False;
   Bool  isORR    = False;
   Bool  isBIC    = False;
   Bool  isMOV    = False;
   Bool  isMVN    = False;
   Bool  isFMOV   = False;
   switch (op_cmode) {
      /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
      /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
      /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
      /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
      case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
      case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
         ok = True; isMOV = True; break;

      /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
      /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
      /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
      /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
      case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
      case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
         ok = True; isORR = True; break;

      /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
      /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
      case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
         ok = True; isMOV = True; break;

      /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
      /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
      case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
         ok = True; isORR = True; break;

      /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
      /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
      case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
         ok = True; isMOV = True; break;

      /* -------- x,0,1110 MOVI 8-bit -------- */
      case BITS5(0,1,1,1,0):
         ok = True; isMOV = True; break;

      /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
      case BITS5(0,1,1,1,1): // 0:1111
         ok = True; isFMOV = True; break;

      /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
      /* -------- x,1,0010 MVNI 32-bit shifted imm -------- */
      /* -------- x,1,0100 MVNI 32-bit shifted imm -------- */
      /* -------- x,1,0110 MVNI 32-bit shifted imm -------- */
      case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
      case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
         ok = True; isMVN = True; break;

      /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
      /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
      /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
      /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
      case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
      case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
         ok = True; isBIC = True; break;

      /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
      /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
      case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
         ok = True; isMVN = True; break;

      /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
      /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
      case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
         ok = True; isBIC = True; break;

      /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
      /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
      case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
         ok = True; isMVN = True; break;

      /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
      /* -------- 1,1,1110 MOVI 64-bit vector -------- */
      case BITS5(1,1,1,1,0):
         ok = True; isMOV = True; break;

      /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
      case BITS5(1,1,1,1,1): // 1:1111
         ok = bitQ == 1; isFMOV = True; break;

      default:
         break;
   }
   if (ok) {
      vassert(1 == (isMOV ? 1 : 0) + (isMVN ? 1 : 0)
                   + (isORR ? 1 : 0) + (isBIC ? 1 : 0) + (isFMOV ? 1 : 0));
      ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, abcdefgh);
   }
   if (ok) {
      if (isORR || isBIC) {
         ULong inv
            = isORR ? 0ULL : ~0ULL;
         IRExpr* immV128
            = binop(Iop_64HLtoV128, mkU64(inv ^ imm64lo),
                    mkU64(inv ^ imm64lo));
         IRExpr* res
            = binop(isORR ? Iop_OrV128 : Iop_AndV128, getQReg128(dd),
                    immV128);
         const HChar* nm = isORR ? "orr" : "bic";
         if (bitQ == 0) {
            putQReg128(dd, unop(Iop_ZeroHI64ofV128, res));
            DIP("%s %s.1d, %016llx\n", nm, nameQReg128(dd), imm64lo);
         } else {
            putQReg128(dd, res);
            DIP("%s %s.2d, #0x%016llx'%016llx\n", nm,
                nameQReg128(dd), imm64lo, imm64lo);
         }
      }
      else if (isMOV || isMVN || isFMOV) {
         if (isMVN) imm64lo = ~imm64lo;
         ULong   imm64hi = bitQ == 0 ? 0 : imm64lo;
         IRExpr* immV128 = binop(Iop_64HLtoV128, mkU64(imm64hi),
                                 mkU64(imm64lo));
         putQReg128(dd, immV128);
         DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi,
             imm64lo);
      }
      return True;
   }
   /* else fall through */

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28       20   15 14   10 9 4
      01 op 11110000 imm5 0  imm4 1  n d
      Decode fields: op,imm4
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
       || INSN(15,15) != 0 || INSN(10,10) != 1) {
      return False;
   }
   UInt bitOP = INSN(29,29);
   UInt imm5  = INSN(20,16);
   UInt imm4  = INSN(14,11);
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
      /* -------- 0,0000 DUP (element, scalar) -------- */
      IRTemp w0 = newTemp(Ity_I64);
      const HChar* arTs = "??";
      IRType laneTy = Ity_INVALID;
      UInt   laneNo = 16; /* invalid */
      if (imm5 & 1) {
         arTs   = "b";
         laneNo = (imm5 >> 1) & 15;
         laneTy = Ity_I8;
         assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
      }
      else if (imm5 & 2) {
         arTs   = "h";
         laneNo = (imm5 >> 2) & 7;
         laneTy = Ity_I16;
         assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
      }
      else if (imm5 & 4) {
         arTs   = "s";
         laneNo = (imm5 >> 3) & 3;
         laneTy = Ity_I32;
         assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
      }
      else if (imm5 & 8) {
         arTs   = "d";
         laneNo = (imm5 >> 4) & 1;
         laneTy = Ity_I64;
         assign(w0, getQRegLane(nn, laneNo, laneTy));
      }
      else {
         /* invalid; leave laneTy unchanged. */
      }
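      /* Note: imm5 encodes both the lane size and the lane number: the
         position of the lowest set bit selects the size, and the bits
         above it give the index.  E.g. imm5 = 0b00111 selects a B lane
         (bit 0 set) with laneNo = imm5[4:1] = 3. */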
      if (laneTy != Ity_INVALID) {
         vassert(laneNo < 16);
         putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
         DIP("dup %s, %s.%s[%u]\n",
             nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
         return True;
      }
      /* else fall through */
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28    23 21    16     11 9 4
      01 u 11110 sz 11000 opcode 10 n d
      Decode fields: u,sz,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,17) != BITS5(1,1,0,0,0)
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt sz     = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (bitU == 0 && sz == X11 && opcode == BITS5(1,1,0,1,1)) {
      /* -------- 0,11,11011 ADDP d_2d -------- */
      IRTemp xy = newTempV128();
      IRTemp xx = newTempV128();
      assign(xy, getQReg128(nn));
      assign(xx, binop(Iop_InterleaveHI64x2, mkexpr(xy), mkexpr(xy)));
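      /* xx now has the upper D lane of nn duplicated into both halves, so
         lane 0 of the Add64x2 below computes nn[63:0] + nn[127:64], which
         is exactly the pairwise sum. */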
      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
                          binop(Iop_Add64x2, mkexpr(xy), mkexpr(xx))));
      DIP("addp d%u, %s.2d\n", dd, nameQReg128(nn));
      return True;
   }
   if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
      /* -------- 1,00,01101 ADDP s_2s -------- */
      /* -------- 1,01,01101 ADDP d_2d -------- */
      Bool   isD   = sz == X01;
      IROp   opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
      IROp   opADD = mkVecADDF(isD ? 3 : 2);
      IRTemp src   = newTempV128();
      IRTemp argL  = newTempV128();
      IRTemp argR  = newTempV128();
      assign(src, getQReg128(nn));
      assign(argL, unop(opZHI, mkexpr(src)));
      assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
                                                    mkU8(isD ? 8 : 4))));
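      /* Iop_SliceV128(hi, lo, n) selects bytes [n+15 : n] of hi:lo, so
         slicing src:src by 8 (or 4) bytes in effect rotates the second
         element of the pair down to lane 0; argL and argR thus hold the
         two pair elements, each isolated in the lowest lane. */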
      putQReg128(dd, unop(opZHI,
                          triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
                                       mkexpr(argL), mkexpr(argR))));
      DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
      return True;
   }

   if (bitU == 1
       && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
      /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
      /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
      /* -------- 1,0x,01111 FMAXP d_2d, s_2s -------- */
      /* -------- 1,1x,01111 FMINP d_2d, s_2s -------- */
      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
      Bool   isD   = (sz & 1) == 1;
      Bool   isMIN = (sz & 2) == 2;
      Bool   isNM  = opcode == BITS5(0,1,1,0,0);
      IROp   opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
      IROp   opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
      IRTemp src   = newTempV128();
      IRTemp argL  = newTempV128();
      IRTemp argR  = newTempV128();
      assign(src, getQReg128(nn));
      assign(argL, unop(opZHI, mkexpr(src)));
      assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
                                                    mkU8(isD ? 8 : 4))));
      putQReg128(dd, unop(opZHI,
                          binop(opMXX, mkexpr(argL), mkexpr(argR))));
      HChar c = isD ? 'd' : 's';
      DIP("%s%sp %c%u, v%u.2%c\n",
          isMIN ? "fmin" : "fmax", isNM ? "nm" : "", c, dd, nn, c);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28     22   18   15     10 9 4
      01 u 111110 immh immb opcode 1  n d
      Decode fields: u,immh,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt immh   = INSN(22,19);
   UInt immb   = INSN(18,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   UInt immhb  = (immh << 3) | immb;

   if ((immh & 8) == 8
       && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
      /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
      /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
      /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
      /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
      Bool isU   = bitU == 1;
      Bool isAcc = opcode == BITS5(0,0,0,1,0);
      UInt sh    = 128 - immhb;
      vassert(sh >= 1 && sh <= 64);
      IROp    op  = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
      IRExpr* src = getQReg128(nn);
      IRTemp  shf = newTempV128();
      IRTemp  res = newTempV128();
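      /* For these D-sized scalar forms immh is 1xxx, so sh = 128 - immhb
         lies in 1..64: e.g. immh:immb = 1111:111 encodes sh = 1, and
         1000:000 encodes sh = 64.  The IR shift ops only accept amounts
         0..63, hence the special-casing below: an unsigned shift by 64
         yields zero, while a signed shift by 64 gives the same result as
         a shift by 63. */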
      if (sh == 64 && isU) {
         assign(shf, mkV128(0x0000));
      } else {
         UInt nudge = 0;
         if (sh == 64) {
            vassert(!isU);
            nudge = 1;
         }
         assign(shf, binop(op, src, mkU8(sh - nudge)));
      }
      assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
                        : mkexpr(shf));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
                              : (isU ? "ushr" : "sshr");
      DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
      return True;
   }
   if ((immh & 8) == 8
       && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
      /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
      /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
      /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
      /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
      Bool isU   = bitU == 1;
      Bool isAcc = opcode == BITS5(0,0,1,1,0);
      UInt sh    = 128 - immhb;
      vassert(sh >= 1 && sh <= 64);
      IROp    op   = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
      IRExpr* src  = getQReg128(nn);
      IRTemp  imm8 = newTemp(Ity_I8);
      assign(imm8, mkU8((UChar)(-sh)));
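      /* The Iop_Rsh* ops take a per-lane signed shift amount, where a
         negative value means a rounding shift right.  So duplicating
         (UChar)(-sh) across the vector below requests a rounding right
         shift by sh in every lane. */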
      IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
      IRTemp  shf = newTempV128();
      IRTemp  res = newTempV128();
      assign(shf, binop(op, src, amt));
      assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
                        : mkexpr(shf));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
                              : (isU ? "urshr" : "srshr");
      DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
      return True;
   }
   if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
      /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
      UInt sh = 128 - immhb;
      vassert(sh >= 1 && sh <= 64);
      if (sh == 64) {
         putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
      } else {
         /* sh is in range 1 .. 63 */
         ULong nmask = (ULong)(((Long)0x8000000000000000ULL) >> (sh-1));
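         /* The arithmetic shift replicates the sign bit, so nmask has
            exactly the top sh bits set -- e.g. sh = 8 gives
            0xFF00000000000000.  Those bits of dd are retained below; the
            remaining bits come from nn >> sh. */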
         IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
         IRTemp res = newTempV128();
         assign(res, binop(Iop_OrV128,
                           binop(Iop_AndV128, getQReg128(dd), nmaskV),
                           binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
         putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      }
      DIP("sri d%u, d%u, #%u\n", dd, nn, sh);
      return True;
   }
   if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
      UInt sh = immhb - 64;
      vassert(sh >= 0 && sh < 64);
      putQReg128(dd,
                 unop(Iop_ZeroHI64ofV128,
                      sh == 0 ? getQReg128(nn)
                              : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
      DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
      return True;
   }
   if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
      /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
      UInt sh = immhb - 64;
      vassert(sh >= 0 && sh < 64);
      if (sh == 0) {
         putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(nn)));
      } else {
         /* sh is in range 1 .. 63 */
         ULong nmask = (1ULL << sh) - 1;
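         /* nmask has the low sh bits set -- e.g. sh = 3 gives 7 -- so
            those bits of dd survive and the rest come from nn << sh,
            which is the defined SLI behaviour. */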
         IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
         IRTemp res = newTempV128();
         assign(res, binop(Iop_OrV128,
                           binop(Iop_AndV128, getQReg128(dd), nmaskV),
                           binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
         putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      }
      DIP("sli d%u, d%u, #%u\n", dd, nn, sh);
      return True;
   }
   if (opcode == BITS5(0,1,1,1,0)
       || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
      /* -------- 0,01110  SQSHL  #imm -------- */
      /* -------- 1,01110  UQSHL  #imm -------- */
      /* -------- 1,01100  SQSHLU #imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok) return False;
      vassert(size >= 0 && size <= 3);
      /* The shift encoding has opposite sign for the leftwards case.
         Adjust shift to compensate. */
      UInt lanebits = 8 << size;
      shift = lanebits - shift;
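      /* Worked example: immh:immb = 0101:010 (immhb = 42) selects S
         lanes; getLaneInfo_IMMH_IMMB returns shift = 64 - 42 = 22, and
         the adjustment gives 32 - 22 = 10, ie a left shift of
         immhb - 32 = 10, as the architecture specifies. */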
      vassert(shift >= 0 && shift < lanebits);
      const HChar* nm = NULL;
      /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
      else vassert(0);
      IRTemp qDiff1 = IRTemp_INVALID;
      IRTemp qDiff2 = IRTemp_INVALID;
      IRTemp res    = IRTemp_INVALID;
      IRTemp src    = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
      /* This relies on the fact that the zeroed out lanes generate zeroed
         result lanes and don't saturate, so there's no point in trimming
         the resulting res, qDiff1 or qDiff2 values. */
      math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
      putQReg128(dd, mkexpr(res));
      updateQCFLAGwithDifference(qDiff1, qDiff2);
      const HChar arr = "bhsd"[size];
      DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
      return True;
   }
   if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
       || (bitU == 1
           && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
      /* -------- 0,10010   SQSHRN #imm -------- */
      /* -------- 1,10010   UQSHRN #imm -------- */
      /* -------- 0,10011  SQRSHRN #imm -------- */
      /* -------- 1,10011  UQRSHRN #imm -------- */
      /* -------- 1,10000  SQSHRUN #imm -------- */
      /* -------- 1,10001 SQRSHRUN #imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || size == X11) return False;
      vassert(size >= X00 && size <= X10);
      vassert(shift >= 1 && shift <= (8 << size));
      const HChar* nm = "??";
      IROp op = Iop_INVALID;
      /* Decide on the name and the operation. */
      /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
         nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
         nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
      }
      else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
         nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
         nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
         nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
         nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
      }
      else vassert(0);
      /* Compute the result (Q, shifted value) pair. */
      IRTemp src128 = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size+1, getQReg128(nn));
      IRTemp pair   = newTempV128();
      assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
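      /* The QANDq...NARROW ops return a pair packed into one V128: the
         64-bit narrowed result in the lower half and a "saturation
         happened" indication in the upper half.  The lower half goes to
         the destination; the upper half is extracted below and compared
         against zero to drive the QC flag. */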
      /* Update the result reg */
      IRTemp res64in128 = newTempV128();
      assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
      putQReg128(dd, mkexpr(res64in128));
      /* Update the Q flag. */
      IRTemp q64q64 = newTempV128();
      assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
      IRTemp z128 = newTempV128();
      assign(z128, mkV128(0x0000));
      updateQCFLAGwithDifference(q64q64, z128);

      const HChar arrNarrow = "bhsd"[size];
      const HChar arrWide   = "bhsd"[size+1];
      DIP("%s %c%u, %c%u, #%u\n", nm, arrNarrow, dd, arrWide, nn, shift);
      return True;
   }
   if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
      /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double scale = two_to_the_minus(fbits);
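      /* Fixed-point conversion is done by converting the integer and then
         scaling by 2^-fbits: e.g. "ucvtf s0, s1, #8" computes
         (F32)src * 1/256.  The FCVTZ{S,U} case below mirrors this by
         scaling by 2^+fbits before truncating to an integer. */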
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
      IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
                       : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      IRTemp src = newTemp(tyI);
      IRTemp res = newTemp(tyF);
      IRTemp rm  = mk_get_IR_rounding_mode();
      assign(src, getQRegLane(nn, 0, tyI));
      assign(res, triop(opMUL, mkexpr(rm),
                        binop(opCVT, mkexpr(rm), mkexpr(src)), scaleE));
      putQRegLane(dd, 0, mkexpr(res));
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0));
      }
      putQRegLane(dd, 1, mkU64(0));
      const HChar ch = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, #%u\n", isU ? "ucvtf" : "scvtf",
          ch, dd, ch, nn, fbits);
      return True;
   }
   if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
      /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double scale = two_to_the_plus(fbits);
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
      IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
                       : (isD ? Iop_F64toI64S : Iop_F32toI32S);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      IRTemp src = newTemp(tyF);
      IRTemp res = newTemp(tyI);
      IRTemp rm  = newTemp(Ity_I32);
      assign(src, getQRegLane(nn, 0, tyF));
      assign(rm,  mkU32(Irrm_ZERO));
      assign(res, binop(opCVT, mkexpr(rm),
                        triop(opMUL, mkexpr(rm), mkexpr(src), scaleE)));
      putQRegLane(dd, 0, mkexpr(res));
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0));
      }
      putQRegLane(dd, 1, mkU64(0));
      const HChar ch = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, #%u\n", isU ? "fcvtzu" : "fcvtzs",
          ch, dd, ch, nn, fbits);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 29 28    23   21 20 15     11 9 4
      01 U  11110 size 1  m  opcode 00 n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);

   if (bitU == 0
       && (opcode == BITS4(1,1,0,1)
           || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
      /* -------- 0,1101  SQDMULL -------- */ // 0 (ks)
      /* -------- 0,1001  SQDMLAL -------- */ // 1
      /* -------- 0,1011  SQDMLSL -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,1,0,1): ks = 0; break;
         case BITS4(1,0,0,1): ks = 1; break;
         case BITS4(1,0,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      if (size == X00 || size == X11) return False;
      vassert(size <= 2);
      IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
      newTempsV128_3(&vecN, &vecM, &vecD);
      assign(vecN, getQReg128(nn));
      assign(vecM, getQReg128(mm));
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       False/*!is2*/, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
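      /* math_SQDMULL_ACC computes the widened, doubled product and, for
         ks != 0, the accumulate.  Note the doubling can itself saturate:
         e.g. for the h variant, 0x8000 * 0x8000 doubled is 2^31, which
         saturates to 0x7FFFFFFF and must set QC -- hence the two
         saturation-pair comparisons below. */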
      IROp opZHI = mkVecZEROHIxxOFV128(size+1);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
      }
      const HChar* nm        = ks == 0 ? "sqdmull"
                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      const HChar  arrNarrow = "bhsd"[size];
      const HChar  arrWide   = "bhsd"[size+1];
      DIP("%s %c%u, %c%u, %c%u\n",
          nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 29 28    23   21 20 15     10 9 4
      01 U  11110 size 1  m  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);

   if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
      /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
      /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
      /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
      /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
      Bool isADD = opcode == BITS5(0,0,0,0,1);
      Bool isU   = bitU == 1;
      IROp qop   = Iop_INVALID;
      IROp nop   = Iop_INVALID;
      if (isADD) {
         qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
         nop = mkVecADD(size);
      } else {
         qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
         nop = mkVecSUB(size);
      }
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      IRTemp qres = newTempV128();
      IRTemp nres = newTempV128();
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(mm));
      assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                             size, binop(qop, mkexpr(argL), mkexpr(argR)))));
      assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                             size, binop(nop, mkexpr(argL), mkexpr(argR)))));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
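      /* QC is driven by comparing the saturating and non-saturating
         results: they differ exactly when saturation occurred.  E.g.
         "uqadd b0, b1, b2" with 0xFF + 0x01 gives qres = 0xFF but
         nres = 0x00, so the difference sets QC. */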
      const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd")
                               : (isU ? "uqsub" : "sqsub");
      const HChar  arr = "bhsd"[size];
      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
      return True;
   }
   if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
      /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
      /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
      Bool    isGT = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
                  : binop(Iop_CmpGT64Ux2, argL, argR));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      DIP("%s %s, %s, %s\n", isGT ? "cmgt" : "cmhi",
          nameQRegLO(dd, Ity_I64),
          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
      return True;
   }

   if (size == X11 && opcode == BITS5(0,0,1,1,1)) {
      /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
      /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
      Bool    isGE = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isGE ? unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL))
                  : unop(Iop_NotV128, binop(Iop_CmpGT64Ux2, argR, argL)));
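      /* a >= b is computed as NOT(b > a); the identity holds for both
         the signed (CMGE) and unsigned (CMHS) orderings, since each is
         a total order. */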
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      DIP("%s %s, %s, %s\n", isGE ? "cmge" : "cmhs",
          nameQRegLO(dd, Ity_I64),
          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
      return True;
   }
   if (size == X11 && (opcode == BITS5(0,1,0,0,0)
                       || opcode == BITS5(0,1,0,1,0))) {
      /* -------- 0,xx,01000 SSHL  d_d_d -------- */
      /* -------- 0,xx,01010 SRSHL d_d_d -------- */
      /* -------- 1,xx,01000 USHL  d_d_d -------- */
      /* -------- 1,xx,01010 URSHL d_d_d -------- */
      Bool isU = bitU == 1;
      Bool isR = opcode == BITS5(0,1,0,1,0);
      IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
                     : (isU ? mkVecSHU(size)  : mkVecSHS(size));
      IRTemp res = newTempV128();
      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      const HChar* nm = isR ? (isU ? "urshl" : "srshl")
                            : (isU ? "ushl"  : "sshl");
      DIP("%s %s, %s, %s\n", nm,
          nameQRegLO(dd, Ity_I64),
          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
      return True;
   }

   if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
      /* -------- 0,xx,01001 SQSHL  std4_std4_std4 -------- */
      /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
      /* -------- 1,xx,01001 UQSHL  std4_std4_std4 -------- */
      /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
      Bool isU = bitU == 1;
      Bool isR = opcode == BITS5(0,1,0,1,1);
      IROp op  = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
                     : (isU ? mkVecQANDUQSH(size)  : mkVecQANDSQSH(size));
      /* This is a bit tricky.  Since we're only interested in the lowest
         lane of the result, we zero out all the rest in the operands, so
         as to ensure that other lanes don't pollute the returned Q value.
         This works because it means, for the lanes we don't care about, we
         are shifting zero by zero, which can never saturate. */
      IRTemp res256 = newTemp(Ity_V256);
      IRTemp resSH  = newTempV128();
      IRTemp resQ   = newTempV128();
      IRTemp zero   = newTempV128();
      assign(
         res256,
         binop(op,
               mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
               mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
      assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
      assign(resQ,  unop(Iop_V256toV128_1, mkexpr(res256)));
      assign(zero,  mkV128(0x0000));
      putQReg128(dd, mkexpr(resSH));
      updateQCFLAGwithDifference(resQ, zero);
      const HChar* nm  = isR ? (isU ? "uqrshl" : "sqrshl")
                             : (isU ? "uqshl"  : "sqshl");
      const HChar  arr = "bhsd"[size];
      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
      return True;
   }
   if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
      /* -------- 0,11,10000 ADD d_d_d -------- */
      /* -------- 1,11,10000 SUB d_d_d -------- */
      Bool   isSUB = bitU == 1;
      IRTemp res   = newTemp(Ity_I64);
      assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64,
                        getQRegLane(nn, 0, Ity_I64),
                        getQRegLane(mm, 0, Ity_I64)));
      putQRegLane(dd, 0, mkexpr(res));
      putQRegLane(dd, 1, mkU64(0));
      DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add",
          nameQRegLO(dd, Ity_I64),
          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
      return True;
   }

   if (size == X11 && opcode == BITS5(1,0,0,0,1)) {
      /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
      /* -------- 1,11,10001 CMEQ  d_d_d -------- */ // ==
      Bool    isEQ = bitU == 1;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
                  : unop(Iop_NotV128, binop(Iop_CmpEQ64x2,
                                            binop(Iop_AndV128, argL, argR),
                                            mkV128(0x0000))));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      DIP("%s %s, %s, %s\n", isEQ ? "cmeq" : "cmtst",
          nameQRegLO(dd, Ity_I64),
          nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
      return True;
   }
   if (opcode == BITS5(1,0,1,1,0)) {
      /* -------- 0,xx,10110 SQDMULH  s and h variants only -------- */
      /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
      if (size == X00 || size == X11) return False;
      Bool isR = bitU == 1;
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vN, &vM);
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
      updateQCFLAGwithDifference(
         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
      const HChar  arr = "bhsd"[size];
      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
      return True;
   }
   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
      IRType ity = size == X11 ? Ity_F64 : Ity_F32;
      IRTemp res = newTemp(ity);
      assign(res, unop(mkABSF(ity),
                       triop(mkSUBF(ity),
                             mkexpr(mk_get_IR_rounding_mode()),
                             getQRegLO(nn,ity), getQRegLO(mm,ity))));
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("fabd %s, %s, %s\n",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (bitU == 0 && size <= X01 && opcode == BITS5(1,1,0,1,1)) {
      /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
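      // For reference: a correct FMULX would differ from FMUL only on the
      // special cases +/-0 * +/-Inf, where it returns +/-2.0 rather than
      // the default NaN.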
      IRType ity = size == X01 ? Ity_F64 : Ity_F32;
      IRTemp res = newTemp(ity);
      assign(res, triop(mkMULF(ity),
                        mkexpr(mk_get_IR_rounding_mode()),
                        getQRegLO(nn,ity), getQRegLO(mm,ity)));
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("fmulx %s, %s, %s\n",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }
   if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
      /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
      Bool   isD   = size == X01;
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      Bool   isGE  = bitU == 1;
      IROp   opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
                          : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
      IRTemp res   = newTempV128();
      assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
                       : binop(opCMP, getQReg128(nn), getQReg128(mm)));
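      /* VEX provides only EQ/LE/LT as FP vector comparisons, so GE is
         synthesised by swapping the operands: a >= b  <=>  b <= a.  The
         FCMGT and FACGE/FACGT cases below use the same trick with LT/LE. */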
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
      Bool   isD   = size == X11;
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IROp   opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
      IRTemp res   = newTempV128();
      assign(res, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      DIP("%s %s, %s, %s\n", "fcmgt",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }
   if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
      /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
      Bool   isD   = (size & 1) == 1;
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      Bool   isGT  = (size & 2) == 2;
      IROp   opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
                          : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
      IROp   opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
      IRTemp res   = newTempV128();
      assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
                               unop(opABS, getQReg128(nn)))); // swapd
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,0x,11111: FRECPS  d_d_d, s_s_s -------- */
      /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
      Bool isSQRT = (size & 2) == 2;
      Bool isD    = (size & 1) == 1;
      IROp op     = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
                           : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
      IRTemp res = newTempV128();
      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
          c, dd, c, nn, c, mm);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 29 28    23   21 20 15     10 9 4
      01 U  11110 size 0  m  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 0
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   vassert(mm < 32 && nn < 32 && dd < 32);

   if (bitU == 1 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
      /* -------- xx,10000 SQRDMLAH s and h variants only -------- */
      /* -------- xx,10001 SQRDMLSH s and h variants only -------- */
      if (size == X00 || size == X11) return False;
      Bool isAdd = opcode == BITS5(1,0,0,0,0);

      IRTemp res, res_nosat, vD, vN, vM;
      res = res_nosat = vD = vN = vM = IRTemp_INVALID;
      newTempsV128_3(&vD, &vN, &vM);
      assign(vD, getQReg128(dd));
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));

      math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
      updateQCFLAGwithDifference(
         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res)),
         math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res_nosat)));

      const HChar  arr = size == X01 ? 'h' : 's';
      const HChar* nm  = isAdd ? "sqrdmlah" : "sqrdmlsh";
      DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 29 28    23   21    16     11 9 4
      01 U  11110 size 10000 opcode 10 n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,0,0,0)
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);

   if (opcode == BITS5(0,0,0,1,1)) {
      /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
      /* -------- 1,xx,00011: USQADD std4_std4 -------- */
      /* These are a bit tricky (to say the least).  See comments on
         the vector variants (in dis_AdvSIMD_two_reg_misc) below for
         details. */
      Bool isUSQADD = bitU == 1;
      IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
                             : mkVecQADDEXTUSSATSS(size);
      IROp   nop  = mkVecADD(size);
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(dd));
      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size, binop(qop, mkexpr(argL), mkexpr(argR)));
      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size, binop(nop, mkexpr(argL), mkexpr(argR)));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar arr = "bhsd"[size];
      DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
      return True;
   }
   if (opcode == BITS5(0,0,1,1,1)) {
      /* -------- 0,xx,00111 SQABS std4_std4 -------- */
      /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
      Bool isNEG = bitU == 1;
      IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
      (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
                                         getQReg128(nn), size );
      IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
      IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar arr = "bhsd"[size];
      DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
      return True;
   }

   if (size == X11 && opcode == BITS5(0,1,0,0,0)) {
      /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
      /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
      Bool    isGT = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
                       : unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL)));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      DIP("cm%s d%u, d%u, #0\n", isGT ? "gt" : "ge", dd, nn);
      return True;
   }

   if (size == X11 && opcode == BITS5(0,1,0,0,1)) {
      /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
      /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
      Bool    isEQ = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
                       : unop(Iop_NotV128,
                              binop(Iop_CmpGT64Sx2, argL, argR)));
      putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
      DIP("cm%s d%u, d%u, #0\n", isEQ ? "eq" : "le", dd, nn);
      return True;
   }

   if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
                          binop(Iop_CmpGT64Sx2, mkV128(0x0000),
                                                getQReg128(nn))));
      DIP("cm%s d%u, d%u, #0\n", "lt", dd, nn);
      return True;
   }

   if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
      /* -------- 0,11,01011 ABS d_d -------- */
      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
                          unop(Iop_Abs64x2, getQReg128(nn))));
      DIP("abs d%u, d%u\n", dd, nn);
      return True;
   }

   if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
      /* -------- 1,11,01011 NEG d_d -------- */
      putQReg128(dd, unop(Iop_ZeroHI64ofV128,
                          binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
      DIP("neg d%u, d%u\n", dd, nn);
      return True;
   }
   UInt ix = 0; /*INVALID*/
   if (size >= X10) {
      switch (opcode) {
         case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
         case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
         case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
         default: break;
      }
   }
   if (ix > 0) {
      /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
      /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
      /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
      /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
      /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
      Bool   isD     = size == X11;
      IRType ity     = isD ? Ity_F64 : Ity_F32;
      IROp   opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
      IROp   opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
      IROp   opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
      IROp   opCmp   = Iop_INVALID;
      Bool   swap    = False;
      const HChar* nm = "??";
      switch (ix) {
         case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
         case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
         case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
         case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
         case 5: nm = "fcmle"; opCmp = opCmpLE; break;
         default: vassert(0);
      }
      IRExpr* zero = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
                       : binop(opCmp, getQReg128(nn), zero));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(res))));
      DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
      return True;
   }
   if (opcode == BITS5(1,0,1,0,0)
       || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
      /* -------- 0,xx,10100: SQXTN -------- */
      /* -------- 1,xx,10100: UQXTN -------- */
      /* -------- 1,xx,10010: SQXTUN -------- */
      if (size == X11) return False;
      vassert(size < 3);
      IROp  opN    = Iop_INVALID;
      Bool  zWiden = True;
      const HChar* nm = "??";
      /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
         opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
      }
      else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
         opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
         opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
      }
      else vassert(0);
      IRTemp src  = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size+1, getQReg128(nn));
      IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
                       size, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
      putQReg128(dd, mkexpr(resN));
      /* This widens zero lanes to zero, and compares it against zero, so all
         of the non-participating lanes make no contribution to the
         Q flag state. */
      IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
                                              size, mkexpr(resN));
      updateQCFLAGwithDifference(src, resW);
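      /* Example of the scheme: "uqxtn b0, h1" with src = 0x0123 narrows
         to 0xFF (saturated); re-widening 0xFF gives 0x00FF != 0x0123, so
         the difference sets QC.  An in-range value re-widens to itself
         and leaves QC alone. */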
      const HChar arrNarrow = "bhsd"[size];
      const HChar arrWide   = "bhsd"[size+1];
      DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn);
      return True;
   }
   if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
      /* -------- 1,01,10110 FCVTXN s_d -------- */
      /* Using Irrm_NEAREST here isn't right.  The docs say "round to
         odd" but I don't know what that really means. */
      putQRegLO(dd,
                binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
                      getQRegLO(nn, Ity_F64)));
      putQRegLane(dd, 1, mkU32(0));
      putQRegLane(dd, 1, mkU64(0));
      DIP("fcvtxn s%u, d%u\n", dd, nn);
      return True;
   }
   ix = 0; /*INVALID*/
   switch (opcode) {
      case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
      case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
      case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
      default: break;
   }
   if (ix > 0) {
      /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
      /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
      /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
      /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
      /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
      /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
      /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
      /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
      /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
      /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
      Bool   isD = (size & 1) == 1;
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      IRRoundingMode irrm = 8; /*impossible*/
      HChar ch = '?';
      switch (ix) {
         case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
         case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
         case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
         case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
         case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
         default: vassert(0);
      }
      IROp cvt = Iop_INVALID;
      if (bitU == 1) {
         cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
      } else {
         cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
      }
      IRTemp src = newTemp(tyF);
      IRTemp res = newTemp(tyI);
      assign(src, getQRegLane(nn, 0, tyF));
      assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
      putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
      }
      putQRegLane(dd, 1, mkU64(0));    /* bits 127-64 */
      HChar sOrD = isD ? 'd' : 's';
      DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
          sOrD, dd, sOrD, nn);
      return True;
   }
   if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
      /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
      Bool   isU = bitU == 1;
      Bool   isD = (size & 1) == 1;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      IROp   iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
                       : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
      IRTemp rm  = mk_get_IR_rounding_mode();
      putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
      if (!isD) {
         putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
      }
      putQRegLane(dd, 1, mkU64(0));    /* bits 127-64 */
      HChar c = isD ? 'd' : 's';
      DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
      return True;
   }
   if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,1x,11101: FRECPE  d_d, s_s -------- */
      /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
      Bool   isSQRT = bitU == 1;
      Bool   isD    = (size & 1) == 1;
      IROp   op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
                             : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
      IRTemp resV   = newTempV128();
      assign(resV, unop(op, getQReg128(nn)));
      putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
                                                             mkexpr(resV))));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
      return True;
   }

   if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,1x,11111: FRECPX d_d, s_s -------- */
      Bool   isD = (size & 1) == 1;
      IRType ty  = isD ? Ity_F64 : Ity_F32;
      IROp   op  = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
      IRTemp res = newTemp(ty);
      IRTemp rm  = mk_get_IR_rounding_mode();
      assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
      putQReg128(dd, mkV128(0x0000));
      putQRegLane(dd, 0, mkexpr(res));
      HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31   28    23   21 20 19 15     11   9 4
      01 U 11111 size L  M  m  opcode H  0 n d
      Decode fields are: u,size,opcode
      M is really part of the mm register number.  Individual
      cases need to inspect L and H though.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,30) != BITS2(0,1)
       || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) != 0) {
      return False;
   }
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt bitL   = INSN(21,21);
   UInt bitM   = INSN(20,20);
   UInt mmLO4  = INSN(19,16);
   UInt opcode = INSN(15,12);
   UInt bitH   = INSN(11,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   vassert(bitH < 2 && bitM < 2 && bitL < 2);

   if (bitU == 0 && size >= X10
       && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
      /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
      /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
      Bool isD   = (size & 1) == 1;
      Bool isSUB = opcode == BITS4(0,1,0,1);
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
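      /* The lane index is assembled from H and L: two bits (H:L, lanes
         0..3) for the S variant, and just H (lanes 0..1) for the D
         variant; sz:L == x11 has no meaning and is rejected above. */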
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IRTemp elem  = newTemp(ity);
      UInt   mm    = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
      IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm    = mk_get_IR_rounding_mode();
      IRTemp t1    = newTempV128();
      IRTemp t2    = newTempV128();
      // FIXME: double rounding; use FMA primops instead
      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
      assign(t2, triop(isSUB ? opSUB : opADD,
                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
                                                         mkexpr(t2))));
      const HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
          c, dd, c, nn, nameQReg128(mm), c, index);
      return True;
   }
   if (size >= X10 && opcode == BITS4(1,0,0,1)) {
      /* -------- 0,1x,1001 FMUL  d_d_d[], s_s_s[] -------- */
      /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
      Bool isD    = (size & 1) == 1;
      Bool isMULX = bitU == 1;
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
      IRType ity  = isD ? Ity_F64 : Ity_F32;
      IRTemp elem = newTemp(ity);
      UInt   mm   = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm    = mk_get_IR_rounding_mode();
      IRTemp t1    = newTempV128();
      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
      putQReg128(dd,
                 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
                                                         mkexpr(t1))));
      const HChar c = isD ? 'd' : 's';
      DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
          c, dd, c, nn, nameQReg128(mm), c, index);
      return True;
   }
   if (bitU == 0
       && (opcode == BITS4(1,0,1,1)
           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,0,1,1): ks = 0; break;
         case BITS4(0,0,1,1): ks = 1; break;
         case BITS4(0,1,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // h_b_b[] case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q_d_d[] case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
      newTempsV128_2(&vecN, &vecD);
      assign(vecN, getQReg128(nn));
      IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       False/*!is2*/, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      IROp opZHI = mkVecZEROHIxxOFV128(size+1);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
      }
      const HChar* nm        = ks == 0 ? "sqdmull"
                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      const HChar  arrNarrow = "bhsd"[size];
      const HChar  arrWide   = "bhsd"[size+1];
      DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
          nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
      return True;
   }
   if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
      /* -------- 0,xx,1100 SQDMULH  s and h variants only -------- */
      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      Bool isR = opcode == BITS4(1,1,0,1);
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      vN = newTempV128();
      assign(vN, getQReg128(nn));
      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      IROp opZHI = mkVecZEROHIxxOFV128(size);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
      HChar ch        = size == X01 ? 'h' : 's';
      DIP("%s %c%u, %c%u, v%u.%c[%u]\n", nm, ch, dd, ch, nn, mm, ch, ix);
      return True;
   }
   if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
      /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
      /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // d case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      Bool isAdd = opcode == BITS4(1,1,0,1);

      IRTemp res, res_nosat, vD, vN, vM;
      res = res_nosat = vD = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vD, &vN);
      assign(vD, getQReg128(dd));
      assign(vN, getQReg128(nn));
      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);

      math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
      IROp opZHI = mkVecZEROHIxxOFV128(size);
      putQReg128(dd, unop(opZHI, mkexpr(res)));
      updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);

      const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
      HChar ch        = size == X01 ? 'h' : 's';
      DIP("%s %c%u, %c%u, v%u.%c[%u]\n", nm, ch, dd, ch, nn, mm, ch, ix);
      return True;
   }

   return False;
#  undef INSN
}
static
Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28     22   18   15     10 9 4
      0 q u 011110 immh immb opcode 1  n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt immh   = INSN(22,19);
   UInt immb   = INSN(18,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
      /* -------- 0,00000 SSHR std7_std7_#imm -------- */
      /* -------- 1,00000 USHR std7_std7_#imm -------- */
      /* -------- 0,00010 SSRA std7_std7_#imm -------- */
      /* -------- 1,00010 USRA std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool isU   = bitU == 1;
      Bool isAcc = opcode == BITS5(0,0,0,1,0);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IROp    op  = isU ? mkVecSHRN(size) : mkVecSARN(size);
      IRExpr* src = getQReg128(nn);
      IRTemp  shf = newTempV128();
      IRTemp  res = newTempV128();
      if (shift == lanebits && isU) {
         assign(shf, mkV128(0x0000));
      } else {
         UInt nudge = 0;
         if (shift == lanebits) {
            vassert(!isU);
            nudge = 1;
         }
         assign(shf, binop(op, src, mkU8(shift - nudge)));
      }
      assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
                        : mkexpr(shf));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
                              : (isU ? "ushr" : "sshr");
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
== BITS5(0,0,1,0,0) || opcode
== BITS5(0,0,1,1,0)) {
11115 /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
11116 /* -------- 1,00100 URSHR std7_std7_#imm -------- */
11117 /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
11118 /* -------- 1,00110 URSRA std7_std7_#imm -------- */
11119 /* laneTy, shift = case immh:immb of
11120 0001:xxx -> B, SHR:8-xxx
11121 001x:xxx -> H, SHR:16-xxxx
11122 01xx:xxx -> S, SHR:32-xxxxx
11123 1xxx:xxx -> D, SHR:64-xxxxxx
11128 Bool isQ
= bitQ
== 1;
11129 Bool isU
= bitU
== 1;
11130 Bool isAcc
= opcode
== BITS5(0,0,1,1,0);
11131 Bool ok
= getLaneInfo_IMMH_IMMB(&shift
, &size
, immh
, immb
);
11132 if (!ok
|| (bitQ
== 0 && size
== X11
)) return False
;
11133 vassert(size
>= 0 && size
<= 3);
11134 UInt lanebits
= 8 << size
;
11135 vassert(shift
>= 1 && shift
<= lanebits
);
11136 IROp op
= isU
? mkVecRSHU(size
) : mkVecRSHS(size
);
11137 IRExpr
* src
= getQReg128(nn
);
11138 IRTemp imm8
= newTemp(Ity_I8
);
11139 assign(imm8
, mkU8((UChar
)(-shift
)));
11140 IRExpr
* amt
= mkexpr(math_DUP_TO_V128(imm8
, Ity_I8
));
11141 IRTemp shf
= newTempV128();
11142 IRTemp res
= newTempV128();
11143 assign(shf
, binop(op
, src
, amt
));
11144 assign(res
, isAcc
? binop(mkVecADD(size
), getQReg128(dd
), mkexpr(shf
))
11146 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
11147 HChar laneCh
= "bhsd"[size
];
11148 UInt nLanes
= (isQ
? 128 : 64) / lanebits
;
11149 const HChar
* nm
= isAcc
? (isU
? "ursra" : "srsra")
11150 : (isU
? "urshr" : "srshr");
11151 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm
,
11152 nameQReg128(dd
), nLanes
, laneCh
,
11153 nameQReg128(nn
), nLanes
, laneCh
, shift
);
   if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
      /* -------- 1,01000 SRI std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, SHR:8-xxx
                         001x:xxx -> H, SHR:16-xxxx
                         01xx:xxx -> S, SHR:32-xxxxx
                         1xxx:xxx -> D, SHR:64-xxxxxx
                         other -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      UInt lanebits = 8 << size;
      vassert(shift >= 1 && shift <= lanebits);
      IRExpr* src = getQReg128(nn);
      IRTemp  res = newTempV128();
      if (shift == lanebits) {
         assign(res, getQReg128(dd));
      } else {
         assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
         IRExpr* nmask = binop(mkVecSHLN(size),
                               mkV128(0xFFFF), mkU8(lanebits - shift));
         IRTemp tmp = newTempV128();
         assign(tmp, binop(Iop_OrV128,
                           mkexpr(res),
                           binop(Iop_AndV128, getQReg128(dd), nmask)));
         res = tmp;
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
   if (opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,01010 SHL std7_std7_#imm -------- */
      /* -------- 1,01010 SLI std7_std7_#imm -------- */
      /* laneTy, shift = case immh:immb of
                         0001:xxx -> B, xxx
                         001x:xxx -> H, xxxx
                         01xx:xxx -> S, xxxxx
                         1xxx:xxx -> D, xxxxxx
                         other -> invalid
      */
      UInt size  = 0;
      UInt shift = 0;
      Bool isSLI = bitU == 1;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      /* The shift encoding has opposite sign for the leftwards case.
         Adjust shift to compensate. */
      UInt lanebits = 8 << size;
      shift = lanebits - shift;
      vassert(shift >= 0 && shift < lanebits);
      IROp    op  = mkVecSHLN(size);
      IRExpr* src = getQReg128(nn);
      IRTemp  res = newTempV128();
      if (shift == 0) {
         assign(res, src);
      } else {
         assign(res, binop(op, src, mkU8(shift)));
         if (isSLI) {
            IRExpr* nmask = binop(mkVecSHRN(size),
                                  mkV128(0xFFFF), mkU8(lanebits - shift));
            IRTemp  tmp   = newTempV128();
            assign(tmp, binop(Iop_OrV128,
                              mkexpr(res),
                              binop(Iop_AndV128, getQReg128(dd), nmask)));
            res = tmp;
         }
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      HChar laneCh = "bhsd"[size];
      UInt  nLanes = (isQ ? 128 : 64) / lanebits;
      const HChar* nm = isSLI ? "sli" : "shl";
      DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
          nameQReg128(dd), nLanes, laneCh,
          nameQReg128(nn), nLanes, laneCh, shift);
      return True;
   }
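   /* Note on the SLI case above: dual to SRI.  A left shift vacates the
      low |shift| bits of each lane, so those are kept from Vd: here
      nmask is all-ones shifted right by lanebits-shift, e.g. 0x07 per
      8-bit lane for shift 3. */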
   if (opcode == BITS5(0,1,1,1,0)
       || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
      /* -------- 0,01110  SQSHL  std7_std7_#imm -------- */
      /* -------- 1,01110  UQSHL  std7_std7_#imm -------- */
      /* -------- 1,01100  SQSHLU std7_std7_#imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool isQ   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || (bitQ == 0 && size == X11)) return False;
      vassert(size >= 0 && size <= 3);
      /* The shift encoding has opposite sign for the leftwards case.
         Adjust shift to compensate. */
      UInt lanebits = 8 << size;
      shift = lanebits - shift;
      vassert(shift >= 0 && shift < lanebits);
      const HChar* nm = NULL;
      /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
      else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
      else vassert(0);
      IRTemp qDiff1 = IRTemp_INVALID;
      IRTemp qDiff2 = IRTemp_INVALID;
      IRTemp res    = IRTemp_INVALID;
      IRTemp src    = newTempV128();
      assign(src, getQReg128(nn));
      math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
                                    isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
      return True;
   }
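   /* Note on the SQSHL/UQSHL/SQSHLU case above: math_QSHL_IMM returns,
      alongside the shifted result, a (qDiff1, qDiff2) pair which differ
      iff some lane saturated.  updateQCFLAGwithDifferenceZHI compares
      them, masking off the upper 64 bits (Iop_ZeroHI64ofV128) when only
      the lower half of the vector is architecturally in use. */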
   if (bitU == 0
       && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
      /* -------- 0,10000 SHRN{,2} #imm -------- */
      /* -------- 0,10001 RSHRN{,2} #imm -------- */
      /* Narrows, and size is the narrow size. */
      UInt size  = 0;
      UInt shift = 0;
      Bool is2   = bitQ == 1;
      Bool isR   = opcode == BITS5(1,0,0,0,1);
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || size == X11) return False;
      vassert(shift >= 1);
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      IRTemp t3 = newTempV128();
      assign(t1, getQReg128(nn));
      assign(t2, isR ? binop(mkVecADD(size+1),
                             mkexpr(t1),
                             mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
                     : mkexpr(t1));
      assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
      IRTemp t4 = math_NARROW_LANES(t3, t3, size);
      putLO64andZUorPutHI64(is2, dd, t4);
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
      return True;
   }
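   /* Note on the SHRN/RSHRN case above: rounding is performed in the
      wide lanes by adding 1 << (shift-1) before the right shift, so no
      precision is lost prior to narrowing. */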
   if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
       || (bitU == 1
           && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
      /* -------- 0,10010   SQSHRN{,2} #imm -------- */
      /* -------- 1,10010   UQSHRN{,2} #imm -------- */
      /* -------- 0,10011  SQRSHRN{,2} #imm -------- */
      /* -------- 1,10011  UQRSHRN{,2} #imm -------- */
      /* -------- 1,10000  SQSHRUN{,2} #imm -------- */
      /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
      UInt size  = 0;
      UInt shift = 0;
      Bool is2   = bitQ == 1;
      Bool ok    = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
      if (!ok || size == X11) return False;
      vassert(shift >= 1 && shift <= (8 << size));
      const HChar* nm = "??";
      IROp op = Iop_INVALID;
      /* Decide on the name and the operation. */
      /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
         nm = "sqshrn";   op = mkVecQANDqsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
         nm = "uqshrn";   op = mkVecQANDqshrNNARROWUU(size);
      }
      else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
         nm = "sqrshrn";  op = mkVecQANDqrsarNNARROWSS(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
         nm = "uqrshrn";  op = mkVecQANDqrshrNNARROWUU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
         nm = "sqshrun";  op = mkVecQANDqsarNNARROWSU(size);
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
         nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
      }
      else vassert(0);
      /* Compute the result (Q, shifted value) pair. */
      IRTemp src128 = newTempV128();
      assign(src128, getQReg128(nn));
      IRTemp pair = newTempV128();
      assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
      /* Update the result reg */
      IRTemp res64in128 = newTempV128();
      assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
      putLO64andZUorPutHI64(is2, dd, res64in128);
      /* Update the Q flag. */
      IRTemp q64q64 = newTempV128();
      assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
      IRTemp z128 = newTempV128();
      assign(z128, mkV128(0x0000));
      updateQCFLAGwithDifference(q64q64, z128);
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      DIP("%s %s.%s, %s.%s, #%u\n", nm,
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
      return True;
   }
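   /* Note on the SQSHRN et al case above: the QANDq*NNARROW primops
      return a 128-bit pair -- the narrowed result in the lower 64 bits
      and saturation residue in the upper 64.  The Q flag is updated by
      duplicating that upper half (InterleaveHI64x2 of the pair with
      itself) and comparing it against zero. */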
   if (opcode == BITS5(1,0,1,0,0)) {
      /* -------- 0,10100 SSHLL{,2} #imm -------- */
      /* -------- 1,10100 USHLL{,2} #imm -------- */
      /* 31  28     22   18    15     9 4
         0q0 011110 immh immb  101001 n d  SSHLL Vd.Ta, Vn.Tb, #sh
         0q1 011110 immh immb  101001 n d  USHLL Vd.Ta, Vn.Tb, #sh
         where Ta,Tb,sh
           = case immh of 1xxx -> invalid
                          01xx -> 2d, 2s(q0)/4s(q1),  immh:immb - 32 (0..31)
                          001x -> 4s, 4h(q0)/8h(q1),  immh:immb - 16 (0..15)
                          0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8  (0..7)
                          0000 -> AdvSIMD modified immediate (???)
      */
      Bool    isQ   = bitQ == 1;
      Bool    isU   = bitU == 1;
      UInt    immhb = (immh << 3) | immb;
      IRTemp  src   = newTempV128();
      IRTemp  zero  = newTempV128();
      IRExpr* res   = NULL;
      UInt    sh    = 0;
      const HChar* ta = "??";
      const HChar* tb = "??";
      assign(src, getQReg128(nn));
      assign(zero, mkV128(0x0000));
      if (immh & 8) {
         /* invalid; don't assign to res */
      }
      else if (immh & 4) {
         sh = immhb - 32;
         vassert(sh < 32); /* so 32-sh is 1..32 */
         ta = "2d";
         tb = isQ ? "4s" : "2s";
         IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
                           : mk_InterleaveLO32x4(src, zero);
         res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
      }
      else if (immh & 2) {
         sh = immhb - 16;
         vassert(sh < 16); /* so 16-sh is 1..16 */
         ta = "4s";
         tb = isQ ? "8h" : "4h";
         IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
                           : mk_InterleaveLO16x8(src, zero);
         res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
      }
      else if (immh & 1) {
         sh = immhb - 8;
         vassert(sh < 8); /* so 8-sh is 1..8 */
         ta = "8h";
         tb = isQ ? "16b" : "8b";
         IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
                           : mk_InterleaveLO8x16(src, zero);
         res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
      } else {
         vassert(immh == 0);
         /* invalid; don't assign to res */
      }
      if (res == NULL) return False;
      putQReg128(dd, res);
      DIP("%cshll%s %s.%s, %s.%s, #%u\n",
          isU ? 'u' : 's', isQ ? "2" : "",
          nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
      return True;
   }
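   /* Note on the SSHLL/USHLL case above: widening is done by
      interleaving the source with zero, which parks each narrow lane in
      the top half of a wide lane; a signed (sshll) or unsigned (ushll)
      right shift by widelanebits-sh then performs the sign/zero
      extension and the left shift in a single operation. */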
   if (opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,11100 SCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* -------- 1,11100 UCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* If immh is of the form 00xx, the insn is invalid. */
      if (immh < BITS4(0,1,0,0)) return False;
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      Bool isQ = bitQ == 1;
      if (isD && !isQ) return False; /* reject .1d case */
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double  scale  = two_to_the_minus(fbits);
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
      IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
                       : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
      vassert(nLanes == 2 || nLanes == 4);
      for (UInt i = 0; i < nLanes; i++) {
         IRTemp src = newTemp(tyI);
         IRTemp res = newTemp(tyF);
         IRTemp rm  = mk_get_IR_rounding_mode();
         assign(src, getQRegLane(nn, i, tyI));
         assign(res, triop(opMUL, mkexpr(rm),
                                  binop(opCVT, mkexpr(rm), mkexpr(src)),
                                  scaleE));
         putQRegLane(dd, i, mkexpr(res));
      }
      if (!isQ) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
          nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
      return True;
   }
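   /* Note on the SCVTF/UCVTF case above: the fixed-point conversion is
      an int-to-FP convert followed by a scaling multiply by the
      constant 2^-fbits, both done under the current rounding mode. */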
   if (opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,11111 FCVTZS {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* -------- 1,11111 FCVTZU {2d_2d,4s_4s,2s_2s}_imm -------- */
      /* If immh is of the form 00xx, the insn is invalid. */
      if (immh < BITS4(0,1,0,0)) return False;
      UInt size  = 0;
      UInt fbits = 0;
      Bool ok    = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
      /* The following holds because immh is never zero. */
      vassert(ok);
      /* The following holds because immh >= 0100. */
      vassert(size == X10 || size == X11);
      Bool isD = size == X11;
      Bool isU = bitU == 1;
      Bool isQ = bitQ == 1;
      if (isD && !isQ) return False; /* reject .1d case */
      vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
      Double  scale  = two_to_the_plus(fbits);
      IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
                           : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
      IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
                       : (isD ? Iop_F64toI64S : Iop_F32toI32S);
      IRType tyF = isD ? Ity_F64 : Ity_F32;
      IRType tyI = isD ? Ity_I64 : Ity_I32;
      UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
      vassert(nLanes == 2 || nLanes == 4);
      for (UInt i = 0; i < nLanes; i++) {
         IRTemp src = newTemp(tyF);
         IRTemp res = newTemp(tyI);
         IRTemp rm  = newTemp(Ity_I32);
         assign(src, getQRegLane(nn, i, tyF));
         assign(rm,  mkU32(Irrm_ZERO));
         assign(res, binop(opCVT, mkexpr(rm),
                                  triop(opMUL, mkexpr(rm),
                                               mkexpr(src), scaleE)));
         putQRegLane(dd, i, mkexpr(res));
      }
      if (!isQ) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
          nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
      return True;
   }
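   /* Note on the FCVTZS/FCVTZU case above: the value is first scaled up
      by 2^+fbits and then converted with the rounding mode forced to
      zero (truncation), as the Z in the mnemonic requires. */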
   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20 15     11 9 4
      0  Q  U  01110 size 1  m  opcode 00 n d
      Decode fields: u,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   Bool is2    = bitQ == 1;
   if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
      /* -------- 0,0000 SADDL{2} -------- */
      /* -------- 1,0000 UADDL{2} -------- */
      /* -------- 0,0010 SSUBL{2} -------- */
      /* -------- 1,0010 USUBL{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      Bool isU   = bitU == 1;
      Bool isADD = opcode == BITS4(0,0,0,0);
      IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
      IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
      IRTemp res  = newTempV128();
      assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                        mkexpr(argL), mkexpr(argR)));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      const HChar* nm        = isADD ? (isU ? "uaddl" : "saddl")
                                     : (isU ? "usubl" : "ssubl");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
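   /* Note on the widening cases in this function:
      math_WIDEN_LO_OR_HI_LANES picks the low (base form) or high ("2"
      form) half of the 128-bit source and sign- or zero-extends each
      lane to double width, so the actual add/sub happens at size+1. */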
   if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
      /* -------- 0,0001 SADDW{2} -------- */
      /* -------- 1,0001 UADDW{2} -------- */
      /* -------- 0,0011 SSUBW{2} -------- */
      /* -------- 1,0011 USUBW{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      Bool isU   = bitU == 1;
      Bool isADD = opcode == BITS4(0,0,0,1);
      IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
      IRTemp res  = newTempV128();
      assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                        getQReg128(nn), mkexpr(argR)));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      const HChar* nm        = isADD ? (isU ? "uaddw" : "saddw")
                                     : (isU ? "usubw" : "ssubw");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
      return True;
   }
   if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
      /* -------- 0,0100  ADDHN{2} -------- */
      /* -------- 1,0100 RADDHN{2} -------- */
      /* -------- 0,0110  SUBHN{2} -------- */
      /* -------- 1,0110 RSUBHN{2} -------- */
      /* Narrows, and size refers to the narrowed lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      const UInt shift[3] = { 8, 16, 32 };
      Bool isADD = opcode == BITS4(0,1,0,0);
      Bool isR   = bitU == 1;
      /* Combined elements in wide lanes */
      IRTemp  wide  = newTempV128();
      IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
                            getQReg128(nn), getQReg128(mm));
      if (isR) {
         wideE = binop(mkVecADD(size+1),
                       wideE,
                       mkexpr(math_VEC_DUP_IMM(size+1,
                                               1ULL << (shift[size]-1))));
      }
      assign(wide, wideE);
      /* Top halves of elements, still in wide lanes */
      IRTemp shrd = newTempV128();
      assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
      /* Elements now compacted into lower 64 bits */
      IRTemp new64 = newTempV128();
      assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
      putLO64andZUorPutHI64(is2, dd, new64);
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
                              : (isR ? "rsubhn" : "subhn");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrNarrow,
          nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
      return True;
   }
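   /* Note on the ADDHN/RADDHN/SUBHN/RSUBHN case above: the optional
      rounding adds 1 << (shift[size]-1) in the wide lanes; the high
      halves are then exposed by a wide right shift and compacted into
      the low 64 bits with mkVecCATEVENLANES. */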
   if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
      /* -------- 0,0101 SABAL{2} -------- */
      /* -------- 1,0101 UABAL{2} -------- */
      /* -------- 0,0111 SABDL{2} -------- */
      /* -------- 1,0111 UABDL{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size == X11) return False;
      vassert(size <= 2);
      Bool isU   = bitU == 1;
      Bool isACC = opcode == BITS4(0,1,0,1);
      IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
      IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
      IRTemp abd  = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
      IRTemp res  = newTempV128();
      assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
                        : mkexpr(abd));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      const HChar* nm        = isACC ? (isU ? "uabal" : "sabal")
                                     : (isU ? "uabdl" : "sabdl");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
   if (opcode == BITS4(1,1,0,0)
       || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
      /* -------- 0,1100  SMULL{2} -------- */ // 0 (ks)
      /* -------- 1,1100  UMULL{2} -------- */ // 0
      /* -------- 0,1000  SMLAL{2} -------- */ // 1
      /* -------- 1,1000  UMLAL{2} -------- */ // 1
      /* -------- 0,1010  SMLSL{2} -------- */ // 2
      /* -------- 1,1010  UMLSL{2} -------- */ // 2
      /* Widens, and size refers to the narrow lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,1,0,0): ks = 0; break;
         case BITS4(1,0,0,0): ks = 1; break;
         case BITS4(1,0,1,0): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      if (size == X11) return False;
      vassert(size <= 2);
      Bool isU = bitU == 1;
      IRTemp vecN = newTempV128();
      IRTemp vecM = newTempV128();
      IRTemp vecD = newTempV128();
      assign(vecN, getQReg128(nn));
      assign(vecM, getQReg128(mm));
      assign(vecD, getQReg128(dd));
      IRTemp res = IRTemp_INVALID;
      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
      DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
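   /* Note on the {S,U}MULL/MLAL/MLSL case above: ks selects the
      accumulate mode (0 = plain multiply, 1 = add, 2 = subtract), and
      "mas"[ks] passes the matching 'm'/'a'/'s' tag to math_MULL_ACC. */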
   if (bitU == 0
       && (opcode == BITS4(1,1,0,1)
           || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
      /* -------- 0,1101  SQDMULL{2} -------- */ // 0 (ks)
      /* -------- 0,1001  SQDMLAL{2} -------- */ // 1
      /* -------- 0,1011  SQDMLSL{2} -------- */ // 2
      /* Widens, and size refers to the narrow lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,1,0,1): ks = 0; break;
         case BITS4(1,0,0,1): ks = 1; break;
         case BITS4(1,0,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      if (size == X00 || size == X11) return False;
      vassert(size <= 2);
      IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n
         = IRTemp_INVALID;
      newTempsV128_3(&vecN, &vecM, &vecD);
      assign(vecN, getQReg128(nn));
      assign(vecM, getQReg128(mm));
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       is2, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifference(sat1q, sat1n);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifference(sat2q, sat2n);
      }
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      const HChar* nm        = ks == 0 ? "sqdmull"
                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
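   /* Note on the SQDMULL/SQDMLAL/SQDMLSL case above: saturation can
      occur both in the doubling multiply (sat1q vs sat1n) and, for the
      accumulating forms, in the add/sub step (sat2q vs sat2n); each
      pair that exists contributes to the QC flag. */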
   if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
      /* -------- 0,1110  PMULL{2} -------- */
      /* Widens, and size refers to the narrow lanes. */
      if (size != X00 && size != X11) return False;
      IRTemp  res  = IRTemp_INVALID;
      IRExpr* srcN = getQReg128(nn);
      IRExpr* srcM = getQReg128(mm);
      const HChar* arrNarrow = NULL;
      const HChar* arrWide   = NULL;
      if (size == X00) {
         res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
                                         srcN, srcM);
         arrNarrow = nameArr_Q_SZ(bitQ, size);
         arrWide   = nameArr_Q_SZ(1,    size+1);
      } else {
         /* The same thing as the X00 case, except we have to call
            a helper to do it. */
         vassert(size == X11);
         res = newTemp(Ity_V128);
         IROp slice
            = is2 ? Iop_V128HIto64 : Iop_V128to64;
         IRExpr** args
            = mkIRExprVec_3( IRExpr_VECRET(),
                             unop(slice, srcN), unop(slice, srcM));
         IRDirty* di
            = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                 "arm64g_dirtyhelper_PMULLQ",
                                 &arm64g_dirtyhelper_PMULLQ, args);
         stmt(IRStmt_Dirty(di));
         /* We can't use nameArr_Q_SZ for this because it can't deal with
            Q-sized (128 bit) results.  Hence do it by hand. */
         arrNarrow = bitQ == 0 ? "1d" : "2d";
         arrWide   = "1q";
      }
      putQReg128(dd, mkexpr(res));
      DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
      return True;
   }
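   /* Note on the PMULL{2} case above: the 8x8->16 polynomial multiply
      maps directly onto Iop_PolynomialMull8x8, but the 64x64->128 form
      has no IR primop and so is routed through a dirty helper
      (arm64g_dirtyhelper_PMULLQ) applied to the selected 64-bit
      halves. */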
   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20 15     10 9 4
      0  Q  U  01110 size 1  m  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
      /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
      /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
      /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
      /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
      if (size == X11) return False;
      Bool isADD = opcode == BITS5(0,0,0,0,0);
      Bool isU   = bitU == 1;
      /* Widen both args out, do the math, narrow to final result. */
      IRTemp argL   = newTempV128();
      IRTemp argLhi = IRTemp_INVALID;
      IRTemp argLlo = IRTemp_INVALID;
      IRTemp argR   = newTempV128();
      IRTemp argRhi = IRTemp_INVALID;
      IRTemp argRlo = IRTemp_INVALID;
      IRTemp resHi  = newTempV128();
      IRTemp resLo  = newTempV128();
      IRTemp res    = IRTemp_INVALID;
      assign(argL, getQReg128(nn));
      argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
      argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argL));
      assign(argR, getQReg128(mm));
      argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
      argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True,  size, mkexpr(argR));
      IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
      IROp opSxR    = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
      assign(resHi, binop(opSxR,
                          binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
                          mkU8(1)));
      assign(resLo, binop(opSxR,
                          binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
                          mkU8(1)));
      res = math_NARROW_LANES ( resHi, resLo, size );
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isADD ? (isU ? "uhadd" : "shadd")
                               : (isU ? "uhsub" : "shsub");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
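   /* Note on the SHADD/UHADD/SHSUB/UHSUB case above: each operand is
      widened to double-width lanes so the add/sub cannot overflow; the
      halving is then a one-bit right shift (arithmetic when signed,
      logical when unsigned) before narrowing back, giving (a +/- b)>>1
      exactly. */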
   if (opcode == BITS5(0,0,0,1,0)) {
      /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
      /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool   isU  = bitU == 1;
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(mm));
      IRTemp res = math_RHADD(size, isU, argL, argR);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
      /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
      /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
      /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
      /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isADD = opcode == BITS5(0,0,0,0,1);
      Bool isU   = bitU == 1;
      IROp qop   = Iop_INVALID;
      IROp nop   = Iop_INVALID;
      if (isADD) {
         qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
         nop = mkVecADD(size);
      } else {
         qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
         nop = mkVecSUB(size);
      }
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      IRTemp qres = newTempV128();
      IRTemp nres = newTempV128();
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(mm));
      assign(qres, math_MAYBE_ZERO_HI64_fromE(
                      bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
      assign(nres, math_MAYBE_ZERO_HI64_fromE(
                      bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar* nm  = isADD ? (isU ? "uqadd" : "sqadd")
                               : (isU ? "uqsub" : "sqsub");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
      /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 0,11,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
      Bool   isORx  = (size & 2) == 2;
      Bool   invert = (size & 1) == 1;
      IRTemp res    = newTempV128();
      assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
                        getQReg128(nn),
                        invert ? unop(Iop_NotV128, getQReg128(mm))
                               : getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* names[4] = { "and", "bic", "orr", "orn" };
      const HChar* ar = bitQ == 1 ? "16b" : "8b";
      DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
          nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
      return True;
   }
   if (bitU == 1 && opcode == BITS5(0,0,0,1,1)) {
      /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
      /* -------- 1,11,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
      IRTemp argD = newTempV128();
      IRTemp argN = newTempV128();
      IRTemp argM = newTempV128();
      assign(argD, getQReg128(dd));
      assign(argN, getQReg128(nn));
      assign(argM, getQReg128(mm));
      const IROp opXOR = Iop_XorV128;
      const IROp opAND = Iop_AndV128;
      const IROp opNOT = Iop_NotV128;
      IRTemp res = newTempV128();
      switch (size) {
         case BITS2(0,0): /* EOR */
            assign(res, binop(opXOR, mkexpr(argM), mkexpr(argN)));
            break;
         case BITS2(0,1): /* BSL */
            assign(res, binop(opXOR, mkexpr(argM),
                              binop(opAND,
                                    binop(opXOR, mkexpr(argM), mkexpr(argN)),
                                    mkexpr(argD))));
            break;
         case BITS2(1,0): /* BIT */
            assign(res, binop(opXOR, mkexpr(argD),
                              binop(opAND,
                                    binop(opXOR, mkexpr(argD), mkexpr(argN)),
                                    mkexpr(argM))));
            break;
         case BITS2(1,1): /* BIF */
            assign(res, binop(opXOR, mkexpr(argD),
                              binop(opAND,
                                    binop(opXOR, mkexpr(argD), mkexpr(argN)),
                                    unop(opNOT, mkexpr(argM)))));
            break;
         default:
            vassert(0);
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
      const HChar* arr = bitQ == 1 ? "16b" : "8b";
      DIP("%s %s.%s, %s.%s, %s.%s\n", nms[size],
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
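   /* Note on the EOR/BSL/BIT/BIF case above: the three selects all use
      the identity x ^ ((x ^ y) & m), which yields y where m is 1 and x
      where m is 0.  BSL selects between N and M under D as the mask,
      BIT inserts N into D under M, and BIF does the same under ~M. */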
   if (opcode == BITS5(0,0,1,1,0)) {
      /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
      /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool    isGT = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isGT ? binop(mkVecCMPGTS(size), argL, argR)
                  : binop(mkVecCMPGTU(size), argL, argR));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isGT ? "cmgt" : "cmhi";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,0,1,1,1)) {
      /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
      /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool    isGE = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
                  : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isGE ? "cmge" : "cmhs";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
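   /* Note on the CMGE/CMHS case above: the IR only provides greater-than
      comparisons, so a >= b is computed as NOT(b > a) with the operands
      swapped. */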
   if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,xx,01000 SSHL  std7_std7_std7 -------- */
      /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
      /* -------- 1,xx,01000 USHL  std7_std7_std7 -------- */
      /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isU = bitU == 1;
      Bool isR = opcode == BITS5(0,1,0,1,0);
      IROp op  = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
                     : (isU ? mkVecSHU(size)  : mkVecSHS(size));
      IRTemp res = newTempV128();
      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isR ? (isU ? "urshl" : "srshl")
                             : (isU ? "ushl"  : "sshl");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
      /* -------- 0,xx,01001 SQSHL  std7_std7_std7 -------- */
      /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
      /* -------- 1,xx,01001 UQSHL  std7_std7_std7 -------- */
      /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isU = bitU == 1;
      Bool isR = opcode == BITS5(0,1,0,1,1);
      IROp op  = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
                     : (isU ? mkVecQANDUQSH(size)  : mkVecQANDSQSH(size));
      /* This is a bit tricky.  If we're only interested in the lowest 64 bits
         of the result (viz, bitQ == 0), then we must adjust the operands to
         ensure that the upper part of the result, that we don't care about,
         doesn't pollute the returned Q value.  To do this, zero out the upper
         operand halves beforehand.  This works because it means, for the
         lanes we don't care about, we are shifting zero by zero, which can
         never saturate. */
      IRTemp res256 = newTemp(Ity_V256);
      IRTemp resSH  = newTempV128();
      IRTemp resQ   = newTempV128();
      IRTemp zero   = newTempV128();
      assign(res256, binop(op,
                           math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
                           math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
      assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
      assign(resQ,  unop(Iop_V256toV128_1, mkexpr(res256)));
      assign(zero,  mkV128(0x0000));
      putQReg128(dd, mkexpr(resSH));
      updateQCFLAGwithDifference(resQ, zero);
      const HChar* nm  = isR ? (isU ? "uqrshl" : "sqrshl")
                             : (isU ? "uqshl"  : "sqshl");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
      /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
      /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
      /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
      /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isU   = bitU == 1;
      Bool isMAX = (opcode & 1) == 0;
      IROp op    = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
                         : (isU ? mkVecMINU(size) : mkVecMINS(size));
      IRTemp t = newTempV128();
      assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
      const HChar* nm  = isMAX ? (isU ? "umax" : "smax")
                               : (isU ? "umin" : "smin");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
      /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
      /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
      /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
      /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
      if (size == X11) return False; // 1d/2d cases not allowed
      Bool isU   = bitU == 1;
      Bool isACC = opcode == BITS5(0,1,1,1,1);
      vassert(size <= 2);
      IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
      IRTemp t2 = newTempV128();
      assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
                       : mkexpr(t1));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* nm  = isACC ? (isU ? "uaba" : "saba")
                               : (isU ? "uabd" : "sabd");
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(1,0,0,0,0)) {
      /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
      /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isSUB = bitU == 1;
      IROp op    = isSUB ? mkVecSUB(size) : mkVecADD(size);
      IRTemp t = newTempV128();
      assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
      const HChar* nm  = isSUB ? "sub" : "add";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(1,0,0,0,1)) {
      /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
      /* -------- 1,xx,10001 CMEQ  std7_std7_std7 -------- */ // ==
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool    isEQ = bitU == 1;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = getQReg128(mm);
      IRTemp  res  = newTempV128();
      assign(res,
             isEQ ? binop(mkVecCMPEQ(size), argL, argR)
                  : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
                                            binop(Iop_AndV128, argL, argR),
                                            mkV128(0x0000))));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isEQ ? "cmeq" : "cmtst";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (opcode == BITS5(1,0,0,1,0)) {
      /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
      /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isMLS    = bitU == 1;
      IROp opMUL    = mkVecMUL(size);
      IROp opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
      IRTemp res = newTempV128();
      if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
         assign(res, binop(opADDSUB,
                           getQReg128(dd),
                           binop(opMUL, getQReg128(nn), getQReg128(mm))));
         putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
         const HChar* arr = nameArr_Q_SZ(bitQ, size);
         DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
             nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
         return True;
      }
      return False;
   }
   if (opcode == BITS5(1,0,0,1,1)) {
      /* -------- 0,xx,10011 MUL  std7_std7_std7 -------- */
      /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isPMUL = bitU == 1;
      const IROp opsPMUL[4]
         = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID };
      IROp   opMUL = isPMUL ? opsPMUL[size] : mkVecMUL(size);
      IRTemp res   = newTempV128();
      if (opMUL != Iop_INVALID) {
         assign(res, binop(opMUL, getQReg128(nn), getQReg128(mm)));
         putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
         const HChar* arr = nameArr_Q_SZ(bitQ, size);
         DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL ? "pmul" : "mul",
             nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
         return True;
      }
      return False;
   }
   if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
      /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
      /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
      /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
      /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
      if (size == X11) return False;
      Bool isU   = bitU == 1;
      Bool isMAX = opcode == BITS5(1,0,1,0,0);
      IRTemp vN  = newTempV128();
      IRTemp vM  = newTempV128();
      IROp op    = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
                         : (isU ? mkVecMINU(size) : mkVecMINS(size));
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      IRTemp res128 = newTempV128();
      assign(res128,
             binop(op,
                   binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
                   binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
      /* In the half-width case, use CatEL32x4 to extract the half-width
         result from the full-width result. */
      IRExpr* res
         = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
                            binop(Iop_CatEvenLanes32x4, mkexpr(res128),
                                                        mkexpr(res128)))
                     : mkexpr(res128);
      putQReg128(dd, res);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isMAX ? (isU ? "umaxp" : "smaxp")
                               : (isU ? "uminp" : "sminp");
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
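   /* Note on the pairwise cases (SMAXP/UMAXP/SMINP/UMINP, and ADDP
      below): concatenating the even lanes of [M:N] with the odd lanes
      of [M:N] lines up each adjacent pair vertically, so the pairwise
      operation becomes an ordinary lanewise one. */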
   if (opcode == BITS5(1,0,1,1,0)) {
      /* -------- 0,xx,10110 SQDMULH  s and h variants only -------- */
      /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
      if (size == X00 || size == X11) return False;
      Bool isR = bitU == 1;
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vN, &vM);
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
      /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      IRTemp vN = newTempV128();
      IRTemp vM = newTempV128();
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));
      IRTemp res128 = newTempV128();
      assign(res128,
             binop(mkVecADD(size),
                   binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
                   binop(mkVecCATODDLANES(size),  mkexpr(vM), mkexpr(vN))));
      /* In the half-width case, use CatEL32x4 to extract the half-width
         result from the full-width result. */
      IRExpr* res
         = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
                            binop(Iop_CatEvenLanes32x4, mkexpr(res128),
                                                        mkexpr(res128)))
                     : mkexpr(res128);
      putQReg128(dd, res);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("addp %s.%s, %s.%s, %s.%s\n",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0
       && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
      /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,0x,11110 FMAX   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11110 FMIN   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      Bool isMIN = (size & 2) == 2;
      Bool isNM  = opcode == BITS5(1,1,0,0,0);
      IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
      IRTemp res = newTempV128();
      assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s%s %s.%s, %s.%s, %s.%s\n",
          isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
      /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD   = (size & 1) == 1;
      Bool isSUB = (size & 2) == 2;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
      IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      // FIXME: double rounding; use FMA primops instead
      assign(t1, triop(opMUL,
                       mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, triop(isSUB ? opSUB : opADD,
                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD   = (size & 1) == 1;
      Bool isSUB = (size & 2) == 2;
      if (bitQ == 0 && isD) return False; // implied 1d case
      const IROp ops[4]
         = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
      IROp   op = ops[size];
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
      putQReg128(dd, mkexpr(t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      // FIXME: use Abd primop instead?
      assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, unop(opABS, mkexpr(t1)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("fabd %s.%s, %s.%s, %s.%s\n",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
      /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,0x,11011 FMUL  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
      Bool isD    = (size & 1) == 1;
      Bool isMULX = bitU == 0;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
                       mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      Bool isGE  = bitU == 1;
      IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
                        : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
      IRTemp t1 = newTempV128();
      assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
                      : binop(opCMP, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
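   /* Note on the FP compares above and below: the IR supplies only EQ,
      LE and LT vector forms, so FCMGE/FCMGT (and FACGE/FACGT) swap
      their operands and use LE/LT instead. */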
   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
      IRTemp t1 = newTempV128();
      assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD  = (size & 1) == 1;
      Bool isGT = (size & 2) == 2;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
                        : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
      IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
      IRTemp t1 = newTempV128();
      assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
                              unop(opABS, getQReg128(nn)))); // swapd
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1
       && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
      /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,0x,11110 FMAXP   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 1,1x,11110 FMINP   2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* FMAXNM, FMINNM: FIXME -- KLUDGED */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      Bool isMIN = (size & 2) == 2;
      Bool isNM  = opcode == BITS5(1,1,0,0,0);
      IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
      IRTemp srcN = newTempV128();
      IRTemp srcM = newTempV128();
      IRTemp preL = IRTemp_INVALID;
      IRTemp preR = IRTemp_INVALID;
      assign(srcN, getQReg128(nn));
      assign(srcM, getQReg128(mm));
      math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
                                           srcM, srcN, isD, bitQ);
      putQReg128(
         dd, math_MAYBE_ZERO_HI64_fromE(
                bitQ,
                binop(opMXX, mkexpr(preL), mkexpr(preR))));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
          isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
      /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = size == X01;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IRTemp srcN = newTempV128();
      IRTemp srcM = newTempV128();
      IRTemp preL = IRTemp_INVALID;
      IRTemp preR = IRTemp_INVALID;
      assign(srcN, getQReg128(nn));
      assign(srcM, getQReg128(mm));
      math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
                                           srcM, srcN, isD, bitQ);
      putQReg128(
         dd, math_MAYBE_ZERO_HI64_fromE(
                bitQ,
                triop(mkVecADDF(isD ? 3 : 2),
                      mkexpr(mk_get_IR_rounding_mode()),
                      mkexpr(preL), mkexpr(preR))));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      vassert(size <= 1);
      const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
      IROp   op = ops[size];
      IRTemp rm = mk_get_IR_rounding_mode();
      IRTemp t1 = newTempV128();
      IRTemp t2 = newTempV128();
      assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
      assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
      putQReg128(dd, mkexpr(t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 0,0x,11111: FRECPS  2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
      Bool isSQRT = (size & 2) == 2;
      Bool isD    = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
                       : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
      IRTemp res = newTempV128();
      assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20 15 14     10 9 4
      0  Q  U  01110 size 0  m  1  opcode 1  n d
      Decode fields: u,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,21) != 0
       || INSN(15,15) != 1
       || INSN(10,10) != 1) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(14,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   vassert(mm < 32 && nn < 32 && dd < 32);
   if (bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,0,1))) {
      /* -------- 1,xx,0000 SQRDMLAH s and h variants only -------- */
      /* -------- 1,xx,0001 SQRDMLSH s and h variants only -------- */
      if (size == X00 || size == X11) return False;
      Bool isAdd = opcode == BITS4(0,0,0,0);

      IRTemp res, res_nosat, vD, vN, vM;
      res = res_nosat = vD = vN = vM = IRTemp_INVALID;
      newTempsV128_3(&vD, &vN, &vM);
      assign(vD, getQReg128(dd));
      assign(vN, getQReg128(nn));
      assign(vM, getQReg128(mm));

      math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));

      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isAdd ? "sqrdmlah" : "sqrdmlsh";
      DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
      return True;
   }
   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21    16     11 9 4
      0  Q  U  01110 size 10000 opcode 10 n d
      Decode fields: U,size,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,0,0,0)
       || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   vassert(size < 4);
   if (bitU == 0 && size <= X10 && opcode == BITS5(0,0,0,0,0)) {
      /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
      /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
      /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
      const IROp iops[3] = { Iop_Reverse8sIn64_x2,
                             Iop_Reverse16sIn64_x2, Iop_Reverse32sIn64_x2 };
      vassert(size <= 2);
      IRTemp res = newTempV128();
      assign(res, unop(iops[size], getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s\n", "rev64",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }
   if (bitU == 1 && size <= X01 && opcode == BITS5(0,0,0,0,0)) {
      /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
      /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
      Bool   isH = size == X01;
      IRTemp res = newTempV128();
      IROp   iop = isH ? Iop_Reverse16sIn32_x4 : Iop_Reverse8sIn32_x4;
      assign(res, unop(iop, getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s\n", "rev32",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }
   if (bitU == 0 && size == X00 && opcode == BITS5(0,0,0,0,1)) {
      /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
      IRTemp res = newTempV128();
      assign(res, unop(Iop_Reverse8sIn16_x8, getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s\n", "rev16",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }
12662 if (opcode
== BITS5(0,0,0,1,0) || opcode
== BITS5(0,0,1,1,0)) {
12663 /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
12664 /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
12665 /* -------- 0,xx,00110: SADALP std6_std6 -------- */
12666 /* -------- 1,xx,00110: UADALP std6_std6 -------- */
12667 /* Widens, and size refers to the narrow size. */
12668 if (size
== X11
) return False
; // no 1d or 2d cases
12669 Bool isU
= bitU
== 1;
12670 Bool isACC
= opcode
== BITS5(0,0,1,1,0);
12671 IRTemp src
= newTempV128();
12672 IRTemp sum
= newTempV128();
12673 IRTemp res
= newTempV128();
12674 assign(src
, getQReg128(nn
));
12676 binop(mkVecADD(size
+1),
12677 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12678 isU
, True
/*fromOdd*/, size
, mkexpr(src
))),
12679 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12680 isU
, False
/*!fromOdd*/, size
, mkexpr(src
)))));
12681 assign(res
, isACC
? binop(mkVecADD(size
+1), mkexpr(sum
), getQReg128(dd
))
12683 putQReg128(dd
, math_MAYBE_ZERO_HI64(bitQ
, res
));
12684 const HChar
* arrNarrow
= nameArr_Q_SZ(bitQ
, size
);
12685 const HChar
* arrWide
= nameArr_Q_SZ(bitQ
, size
+1);
12686 DIP("%s %s.%s, %s.%s\n", isACC
? (isU
? "uadalp" : "sadalp")
12687 : (isU
? "uaddlp" : "saddlp"),
12688 nameQReg128(dd
), arrWide
, nameQReg128(nn
), arrNarrow
);
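   /* Illustrative example for the pairwise widening adds above, not
      part of the decoder.  For SADDLP Vd.4H, Vn.8B with
         Vn.8B = [a7 a6 a5 a4 a3 a2 a1 a0]
      the odd-lane widening gives  [sx(a7) sx(a5) sx(a3) sx(a1)] and
      the even-lane widening gives [sx(a6) sx(a4) sx(a2) sx(a0)],
      where sx() is sign extension to 16 bits (zero extension for the
      UADDLP/UADALP forms).  Adding the two with mkVecADD(size+1)
      yields exactly the required pairwise sums
         [a7+a6 a5+a4 a3+a2 a1+a0]. */
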
   if (opcode == BITS5(0,0,0,1,1)) {
      /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
      /* -------- 1,xx,00011: USQADD std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isUSQADD = bitU == 1;
      /* This is switched (in the US vs SU sense) deliberately.
         SUQADD corresponds to the ExtUSsatSS variants and
         USQADD corresponds to the ExtSUsatUU variants.
         See libvex_ir for more details. */
      IROp   qop  = isUSQADD ? mkVecQADDEXTSUSATUU(size)
                             : mkVecQADDEXTUSSATSS(size);
      IROp   nop  = mkVecADD(size);
      IRTemp argL = newTempV128();
      IRTemp argR = newTempV128();
      IRTemp qres = newTempV128();
      IRTemp nres = newTempV128();
      /* Because the two arguments to the addition are implicitly
         extended differently (one signedly, the other unsignedly) it is
         important to present them to the primop in the correct order. */
      assign(argL, getQReg128(nn));
      assign(argR, getQReg128(dd));
      assign(qres, math_MAYBE_ZERO_HI64_fromE(
                      bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
      assign(nres, math_MAYBE_ZERO_HI64_fromE(
                      bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

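   /* Sketch of the argument discipline above, assuming the usual
      reading of the SUQADD/USQADD semantics: for SUQADD the Vn lanes
      are treated as unsigned and the Vd lanes as signed, with signed
      saturation (USQADD is the mirror image).  That is why argL must
      be Vn and argR must be Vd -- the primop extends its two operands
      with different signedness, so swapping them would compute a
      different function -- and why the qop selection looks "switched"
      relative to the mnemonics. */
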
   if (opcode == BITS5(0,0,1,0,0)) {
      /* -------- 0,xx,00100: CLS std6_std6 -------- */
      /* -------- 1,xx,00100: CLZ std6_std6 -------- */
      if (size == X11) return False; // no 1d or 2d cases
      const IROp opsCLS[3] = { Iop_Cls8x16, Iop_Cls16x8, Iop_Cls32x4 };
      const IROp opsCLZ[3] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4 };
      Bool   isCLZ = bitU == 1;
      IRTemp res   = newTempV128();
      vassert(size <= 2);
      assign(res, unop(isCLZ ? opsCLZ[size] : opsCLS[size], getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s\n", isCLZ ? "clz" : "cls",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (size == X00 && opcode == BITS5(0,0,1,0,1)) {
      /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
      /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
      IRTemp res = newTempV128();
      assign(res, unop(bitU == 0 ? Iop_Cnt8x16 : Iop_NotV128,
                       getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, 0);
      DIP("%s %s.%s, %s.%s\n", bitU == 0 ? "cnt" : "not",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (bitU == 1 && size == X01 && opcode == BITS5(0,0,1,0,1)) {
      /* -------- 1,01,00101  RBIT 16b_16b, 8b_8b -------- */
      IRTemp res = newTempV128();
      assign(res, unop(Iop_Reverse1sIn8_x16, getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, 0);
      DIP("%s %s.%s, %s.%s\n", "rbit",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (opcode == BITS5(0,0,1,1,1)) {
      /* -------- 0,xx,00111 SQABS std7_std7 -------- */
      /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool   isNEG  = bitU == 1;
      IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
      (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
                                         getQReg128(nn), size );
      IRTemp qres = newTempV128(), nres = newTempV128();
      assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
      assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
      putQReg128(dd, mkexpr(qres));
      updateQCFLAGwithDifference(qres, nres);
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (opcode == BITS5(0,1,0,0,0)) {
      /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
      /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool    isGT  = bitU == 0;
      IRExpr* argL  = getQReg128(nn);
      IRExpr* argR  = mkV128(0x0000);
      IRTemp  res   = newTempV128();
      IROp    opGTS = mkVecCMPGTS(size);
      assign(res, isGT ? binop(opGTS, argL, argR)
                       : unop(Iop_NotV128, binop(opGTS, argR, argL)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("cm%s %s.%s, %s.%s, #0\n", isGT ? "gt" : "ge",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

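   /* Note on the comparison trick above: VEX only provides a signed
      greater-than primop, so CMGE Vd, Vn, #0 (lane >= 0) is computed
      as NOT(0 >s lane).  For example, a lane holding -3 gives
      0 >s -3 == all-ones, and the NOT produces all-zeroes, correctly
      reporting -3 >= 0 as false. */
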
   if (opcode == BITS5(0,1,0,0,1)) {
      /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
      /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool    isEQ = bitU == 0;
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, isEQ ? binop(mkVecCMPEQ(size), argL, argR)
                       : unop(Iop_NotV128,
                              binop(mkVecCMPGTS(size), argL, argR)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("cm%s %s.%s, %s.%s, #0\n", isEQ ? "eq" : "le",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (bitU == 0 && opcode == BITS5(0,1,0,1,0)) {
      /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      IRExpr* argL = getQReg128(nn);
      IRExpr* argR = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, binop(mkVecCMPGTS(size), argR, argL));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
      /* -------- 0,xx,01011: ABS std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      IRTemp res = newTempV128();
      assign(res, unop(mkVecABS(size), getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
      /* -------- 1,xx,01011: NEG std7_std7 -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      IRTemp res = newTempV128();
      assign(res, binop(mkVecSUB(size), mkV128(0x0000), getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("neg %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

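   /* As used above and throughout: mkV128(0x0000) builds a 128-bit
      constant from a 16-bit mask, one mask bit per byte, so 0x0000 is
      the all-zeroes vector.  NEG is then just a vector subtract of
      the operand from that zero vector. */
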
   UInt ix = 0; /*INVALID*/
   if (size >= X10) {
      switch (opcode) {
         case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
         case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
         case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
         default: break;
      }
   }
   if (ix > 0) {
      /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
      /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
      /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
      /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
      /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isD     = size == X11;
      IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
      IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
      IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
      IROp opCmp   = Iop_INVALID;
      Bool swap    = False;
      const HChar* nm = "??";
      switch (ix) {
         case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
         case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
         case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
         case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
         case 5: nm = "fcmle"; opCmp = opCmpLE; break;
         default: vassert(0);
      }
      IRExpr* zero = mkV128(0x0000);
      IRTemp  res  = newTempV128();
      assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
                       : binop(opCmp, getQReg128(nn), zero));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, #0.0\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

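   /* Note on the 'swap' flag above: VEX provides FP vector compares
      only for EQ, LE and LT, so the GT/GE forms are synthesised by
      swapping the operands: x > 0.0 is computed as 0.0 < x, and
      x >= 0.0 as 0.0 <= x.  Unlike the integer cases, a NOT-based
      fallback would be wrong here, since negating an FP comparison
      would report unordered (NaN) lanes as true. */
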
   if (size >= X10 && opcode == BITS5(0,1,1,1,1)) {
      /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
      /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isFNEG = bitU == 1;
      IROp op     = isFNEG ? (size == X10 ? Iop_Neg32Fx4 : Iop_Neg64Fx2)
                           : (size == X10 ? Iop_Abs32Fx4 : Iop_Abs64Fx2);
      IRTemp res  = newTempV128();
      assign(res, unop(op, getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
      /* -------- 0,xx,10010: XTN{,2} -------- */
      if (size == X11) return False;
      vassert(size < 3);
      Bool   is2  = bitQ == 1;
      IROp   opN  = mkVecNARROWUN(size);
      IRTemp resN = newTempV128();
      assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn))));
      putLO64andZUorPutHI64(is2, dd, resN);
      const HChar* nm        = "xtn";
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
      return True;
   }

   if (opcode == BITS5(1,0,1,0,0)
       || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
      /* -------- 0,xx,10100: SQXTN{,2} -------- */
      /* -------- 1,xx,10100: UQXTN{,2} -------- */
      /* -------- 1,xx,10010: SQXTUN{,2} -------- */
      if (size == X11) return False;
      vassert(size < 3);
      Bool  is2    = bitQ == 1;
      IROp  opN    = Iop_INVALID;
      Bool  zWiden = True;
      const HChar* nm = "??";
      /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
         opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
      }
      else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
         opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
      }
      else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
         opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
      }
      else vassert(0);
      IRTemp src  = newTempV128();
      assign(src, getQReg128(nn));
      IRTemp resN = newTempV128();
      assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
      putLO64andZUorPutHI64(is2, dd, resN);
      IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
                                              size, mkexpr(resN));
      updateQCFLAGwithDifference(src, resW);
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
      return True;
   }

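   /* Illustrative example of the saturation check above, not part of
      the decoder.  The narrowed result is re-widened (signedly for
      sqxtn, otherwise zero-widened) and compared against the source;
      a lane that saturated cannot survive the round trip.  For SQXTN
      8B,8H a source lane 0x1234 narrows to the saturated value 0x7F,
      which re-widens to 0x007F != 0x1234, so QC gets set, whereas
      0x0042 narrows to 0x42 and re-widens to 0x0042, leaving QC
      unchanged. */
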
   if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
      /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
      /* Widens, and size is the narrow size. */
      if (size == X11) return False;
      Bool is2   = bitQ == 1;
      IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
      IROp opSHL = mkVecSHLN(size+1);
      IRTemp src = newTempV128();
      IRTemp res = newTempV128();
      assign(src, getQReg128(nn));
      assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
                               mkU8(8 << size)));
      putQReg128(dd, mkexpr(res));
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      DIP("shll%s %s.%s, %s.%s, #%d\n", is2 ? "2" : "",
          nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
      return True;
   }

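   /* Worked example for the SHLL implementation above, not part of
      the decoder.  SHLL Vd.8H, Vn.8B, #8 must zero-extend each byte
      and shift it left by the lane width.  Interleaving the source
      with itself duplicates each byte: [.. a1 a0] -> [.. a1 a1 a0 a0].
      Viewed as 16-bit lanes each pair is (a << 8) | a, so the
      subsequent SHLN by 8 on the widened lanes leaves exactly
      (a << 8), i.e. the zero-extended byte shifted by the lane
      width. */
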
   if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
      /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
      UInt   nLanes = size == X00 ? 4 : 2;
      IRType srcTy  = size == X00 ? Ity_F32 : Ity_F64;
      IROp   opCvt  = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
      IRTemp rm     = mk_get_IR_rounding_mode();
      IRTemp src[nLanes];
      for (UInt i = 0; i < nLanes; i++) {
         src[i] = newTemp(srcTy);
         assign(src[i], getQRegLane(nn, i, srcTy));
      }
      for (UInt i = 0; i < nLanes; i++) {
         putQRegLane(dd, nLanes * bitQ + i,
                     binop(opCvt, mkexpr(rm), mkexpr(src[i])));
      }
      if (bitQ == 0) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
      DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
      return True;
   }

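   /* Note on the destination laneage above: for the narrowing "2"
      form (bitQ == 1) the converted lanes are written to the upper
      half of Vd (lane numbers nLanes * bitQ + i) and the lower half
      is left untouched, while for the non-"2" form they land in the
      lower half and the upper 64 bits are explicitly zeroed. */
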
   if (bitU == 1 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
      /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
      /* Using Irrm_NEAREST here isn't right.  The docs say "round to
         odd" but I don't know what that really means. */
      IRType srcTy = Ity_F64;
      IROp   opCvt = Iop_F64toF32;
      IRTemp src[2];
      for (UInt i = 0; i < 2; i++) {
         src[i] = newTemp(srcTy);
         assign(src[i], getQRegLane(nn, i, srcTy));
      }
      for (UInt i = 0; i < 2; i++) {
         putQRegLane(dd, 2 * bitQ + i,
                     binop(opCvt, mkU32(Irrm_NEAREST), mkexpr(src[i])));
      }
      if (bitQ == 0) {
         putQRegLane(dd, 1, mkU64(0));
      }
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
      DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
          nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
      return True;
   }

   if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
      /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
      UInt   nLanes = size == X00 ? 4 : 2;
      IRType srcTy  = size == X00 ? Ity_F16 : Ity_F32;
      IROp   opCvt  = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
      IRTemp src[nLanes];
      for (UInt i = 0; i < nLanes; i++) {
         src[i] = newTemp(srcTy);
         assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
      }
      for (UInt i = 0; i < nLanes; i++) {
         putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
      }
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    1+size+1);
      DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
          nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
      return True;
   }

   if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
      ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
      // = 1 + bitU[0]:size[1]:opcode[0]
      vassert(ix >= 1 && ix <= 8);
      if (ix == 7) ix = 0;
   }
   if (ix > 0) {
      /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
      /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
      /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
      /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
      /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
      /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
      /* -------- 1,1x,11000 (apparently unassigned)    (7) -------- */
      /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
      /* rm plan:
         FRINTN: tieeven -- !! FIXME KLUDGED !!
         FRINTM: -inf
         FRINTP: +inf
         FRINTZ: zero
         FRINTA: tieaway -- !! FIXME KLUDGED !!
         FRINTX: per FPCR + "exact = TRUE"
         FRINTI: per FPCR
      */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case

      IRTemp irrmRM = mk_get_IR_rounding_mode();

      HChar  ch   = '?';
      IRTemp irrm = newTemp(Ity_I32);
      switch (ix) {
         case 1: ch = 'n'; assign(irrm, mkU32(Irrm_NEAREST)); break;
         case 2: ch = 'm'; assign(irrm, mkU32(Irrm_NegINF)); break;
         case 3: ch = 'p'; assign(irrm, mkU32(Irrm_PosINF)); break;
         case 4: ch = 'z'; assign(irrm, mkU32(Irrm_ZERO)); break;
         // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
         case 5: ch = 'a'; assign(irrm, mkU32(Irrm_NEAREST)); break;
         // I am unsure about the following, due to the "integral exact"
         // description in the manual.  What does it mean? (frintx, that is)
         case 6: ch = 'x'; assign(irrm, mkexpr(irrmRM)); break;
         case 8: ch = 'i'; assign(irrm, mkexpr(irrmRM)); break;
         default: vassert(0);
      }

      IROp opRND = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
      if (isD) {
         for (UInt i = 0; i < 2; i++) {
            putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
                                     getQRegLane(nn, i, Ity_F64)));
         }
      } else {
         UInt n = bitQ==1 ? 4 : 2;
         for (UInt i = 0; i < n; i++) {
            putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
                                     getQRegLane(nn, i, Ity_F32)));
         }
         if (bitQ == 0)
            putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("frint%c %s.%s, %s.%s\n", ch,
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

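   /* Worked example of the ix encoding above, not part of the
      decoder: ix = 1 + bitU[0]:size[1]:opcode[0].  FRINTP is encoded
      as 0,1x,11000, giving ix = 1 + 0b010 = 3 as per the table, and
      the unassigned encoding 1,1x,11000 gives ix = 1 + 0b110 = 7,
      which is mapped back to 0 so that it fails to decode. */
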
   ix = 0; /*INVALID*/
   switch (opcode) {
      case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
      case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
      case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
      default: break;
   }
   if (ix > 0) {
      /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
      /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
      /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
      /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
      /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
      /* -------- 1,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
      /* -------- 1,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
      /* -------- 1,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
      /* -------- 1,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
      /* -------- 1,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
      Bool isD = (size & 1) == 1;
      if (bitQ == 0 && isD) return False; // implied 1d case

      IRRoundingMode irrm = 8; /*impossible*/
      HChar ch = '?';
      switch (ix) {
         case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
         case 2: ch = 'm'; irrm = Irrm_NegINF;  break;
         case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
         case 4: ch = 'p'; irrm = Irrm_PosINF;  break;
         case 5: ch = 'z'; irrm = Irrm_ZERO;    break;
         default: vassert(0);
      }
      IROp cvt = Iop_INVALID;
      if (bitU == 1) {
         cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
      } else {
         cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
      }
      if (isD) {
         for (UInt i = 0; i < 2; i++) {
            putQRegLane(dd, i, binop(cvt, mkU32(irrm),
                                     getQRegLane(nn, i, Ity_F64)));
         }
      } else {
         UInt n = bitQ==1 ? 4 : 2;
         for (UInt i = 0; i < n; i++) {
            putQRegLane(dd, i, binop(cvt, mkU32(irrm),
                                     getQRegLane(nn, i, Ity_F32)));
         }
         if (bitQ == 0)
            putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
      }
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("fcvt%c%c %s.%s, %s.%s\n", ch, bitU == 1 ? 'u' : 's',
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
      /* -------- 0,10,11100: URECPE  4s_4s, 2s_2s -------- */
      /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
      Bool isREC = bitU == 0;
      IROp op    = isREC ? Iop_RecipEst32Ux4 : Iop_RSqrtEst32Ux4;
      IRTemp res = newTempV128();
      assign(res, unop(op, getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* nm  = isREC ? "urecpe" : "ursqrte";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,0x,11101: SCVTF -------- */
      /* -------- 1,0x,11101: UCVTF -------- */
      /* 31  28      22 21       15     9 4
         0q0 01110 0 sz 1  00001 110110 n d  SCVTF Vd, Vn
         0q1 01110 0 sz 1  00001 110110 n d  UCVTF Vd, Vn
         with laneage:
         case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
      */
      Bool isQ   = bitQ == 1;
      Bool isU   = bitU == 1;
      Bool isF64 = (size & 1) == 1;
      if (isQ || !isF64) {
         IRType tyF = Ity_INVALID, tyI = Ity_INVALID;
         UInt   nLanes = 0;
         Bool   zeroHI = False;
         const HChar* arrSpec = NULL;
         Bool   ok  = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec,
                                       isQ, isF64);
         IROp   iop = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32)
                          : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32);
         IRTemp rm  = mk_get_IR_rounding_mode();
         UInt   i;
         vassert(ok); /* the 'if' above should ensure this */
         for (i = 0; i < nLanes; i++) {
            putQRegLane(dd, i,
                        binop(iop, mkexpr(rm), getQRegLane(nn, i, tyI)));
         }
         if (zeroHI) {
            putQRegLane(dd, 1, mkU64(0));
         }
         DIP("%ccvtf %s.%s, %s.%s\n", isU ? 'u' : 's',
             nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
         return True;
      }
      /* else fall through */
   }

   if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
      /* -------- 0,1x,11101: FRECPE  2d_2d, 4s_4s, 2s_2s -------- */
      /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
      Bool isSQRT = bitU == 1;
      Bool isD    = (size & 1) == 1;
      IROp op     = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
                           : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
      if (bitQ == 0 && isD) return False; // implied 1d case
      IRTemp resV = newTempV128();
      assign(resV, unop(op, getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
      /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
      Bool isD = (size & 1) == 1;
      IROp op  = isD ? Iop_Sqrt64Fx2 : Iop_Sqrt32Fx4;
      if (bitQ == 0 && isD) return False; // implied 1d case
      IRTemp resV = newTempV128();
      assign(resV, binop(op, mkexpr(mk_get_IR_rounding_mode()),
                         getQReg128(nn)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
      const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s\n", "fsqrt",
          nameQReg128(dd), arr, nameQReg128(nn), arr);
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31    28    23   21 20 19 15     11 9 4
      0 Q U 01111 size L  M  m  opcode H 0 n d
      Decode fields are: u,size,opcode
      M is really part of the mm register number.  Individual
      cases need to inspect L and H though.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,31) != 0
       || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) != 0) {
      return False;
   }
   UInt bitQ   = INSN(30,30);
   UInt bitU   = INSN(29,29);
   UInt size   = INSN(23,22);
   UInt bitL   = INSN(21,21);
   UInt bitM   = INSN(20,20);
   UInt mmLO4  = INSN(19,16);
   UInt opcode = INSN(15,12);
   UInt bitH   = INSN(11,11);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   vassert(bitH < 2 && bitM < 2 && bitL < 2);

   if (bitU == 0 && size >= X10
       && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
      /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
      /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isD   = (size & 1) == 1;
      Bool isSUB = opcode == BITS4(0,1,0,1);
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
      IRType ity   = isD ? Ity_F64 : Ity_F32;
      IRTemp elem  = newTemp(ity);
      UInt   mm    = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd  = math_DUP_TO_V128(elem, ity);
      IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
      IRTemp rm    = mk_get_IR_rounding_mode();
      IRTemp t1    = newTempV128();
      IRTemp t2    = newTempV128();
      // FIXME: double rounding; use FMA primops instead
      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
      assign(t2, triop(isSUB ? opSUB : opADD,
                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
          isD ? 'd' : 's', index);
      return True;
   }

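   /* Worked example of the index decoding above, not part of the
      decoder.  The element register number is always M:mmLO4; the
      lane index is assembled from H and L as the element size allows.
      For FMLA Vd.4S, Vn.4S, Vm.S[3] the index 3 is encoded as
      H:L = 0b11; for D lanes only H is usable (index 0 or 1), and
      sz:L == x11 has no allocated encoding, hence the return False. */
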
   if (size >= X10 && opcode == BITS4(1,0,0,1)) {
      /* -------- 0,1x,1001 FMUL  2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
      /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
      if (bitQ == 0 && size == X11) return False; // implied 1d case
      Bool isD    = (size & 1) == 1;
      Bool isMULX = bitU == 1;
      UInt index;
      if      (!isD)             index = (bitH << 1) | bitL;
      else if (isD && bitL == 0) index = bitH;
      else return False; // sz:L == x11 => unallocated encoding
      vassert(index < (isD ? 2 : 4));
      IRType ity  = isD ? Ity_F64 : Ity_F32;
      IRTemp elem = newTemp(ity);
      UInt   mm   = (bitM << 4) | mmLO4;
      assign(elem, getQRegLane(mm, index, ity));
      IRTemp dupd = math_DUP_TO_V128(elem, ity);
      // KLUDGE: FMULX is treated the same way as FMUL.  That can't be right.
      IRTemp res  = newTempV128();
      assign(res, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
                        mkexpr(mk_get_IR_rounding_mode()),
                        getQReg128(nn), mkexpr(dupd)));
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
          isMULX ? "fmulx" : "fmul", nameQReg128(dd), arr,
          nameQReg128(nn), arr, nameQReg128(mm), isD ? 'd' : 's', index);
      return True;
   }

   if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
       || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
      /* -------- 1,xx,0000 MLA s/h variants only -------- */
      /* -------- 1,xx,0100 MLS s/h variants only -------- */
      /* -------- 0,xx,1000 MUL s/h variants only -------- */
      Bool isMLA = opcode == BITS4(0,0,0,0);
      Bool isMLS = opcode == BITS4(0,1,0,0);
      UInt mm    = 32; // invalid
      UInt ix    = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // d case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IROp   opMUL = mkVecMUL(size);
      IROp   opADD = mkVecADD(size);
      IROp   opSUB = mkVecSUB(size);
      HChar  ch    = size == X01 ? 'h' : 's';
      IRTemp vecM  = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      IRTemp vecD  = newTempV128();
      IRTemp vecN  = newTempV128();
      IRTemp res   = newTempV128();
      assign(vecD, getQReg128(dd));
      assign(vecN, getQReg128(nn));
      IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
      if (isMLA || isMLS) {
         assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
      } else {
         assign(res, prod);
      }
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
                                                : (isMLS ? "mls" : "mul"),
          nameQReg128(dd), arr,
          nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
      return True;
   }

   if (opcode == BITS4(1,0,1,0)
       || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
      /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
      /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
      /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
      /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
      /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
      /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,0,1,0): ks = 0; break;
         case BITS4(0,0,1,0): ks = 1; break;
         case BITS4(0,1,1,0): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      Bool isU = bitU == 1;
      Bool is2 = bitQ == 1;
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // h_b_b[] case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q_d_d[] case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IRTemp vecN = newTempV128();
      IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      IRTemp vecD = newTempV128();
      assign(vecN, getQReg128(nn));
      assign(vecD, getQReg128(dd));
      IRTemp res = IRTemp_INVALID;
      math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
                    vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      const HChar* nm        = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      HChar ch               = size == X01 ? 'h' : 's';
      DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
          isU ? 'u' : 's', nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
      return True;
   }

   if (bitU == 0
       && (opcode == BITS4(1,0,1,1)
           || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
      /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
      /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
      /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
      /* Widens, and size refers to the narrowed lanes. */
      UInt ks = 3;
      switch (opcode) {
         case BITS4(1,0,1,1): ks = 0; break;
         case BITS4(0,0,1,1): ks = 1; break;
         case BITS4(0,1,1,1): ks = 2; break;
         default: vassert(0);
      }
      vassert(ks >= 0 && ks <= 2);
      Bool is2 = bitQ == 1;
      UInt mm  = 32; // invalid
      UInt ix  = 16; // invalid
      switch (size) {
         case X00:
            return False; // h_b_b[] case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q_d_d[] case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
      vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
      newTempsV128_2(&vecN, &vecD);
      assign(vecN, getQReg128(nn));
      IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      assign(vecD, getQReg128(dd));
      math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
                       is2, size, "mas"[ks],
                       vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
      putQReg128(dd, mkexpr(res));
      vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
      updateQCFLAGwithDifference(sat1q, sat1n);
      if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
         updateQCFLAGwithDifference(sat2q, sat2n);
      }
      const HChar* nm        = ks == 0 ? "sqdmull"
                                       : (ks == 1 ? "sqdmlal" : "sqdmlsl");
      const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
      const HChar* arrWide   = nameArr_Q_SZ(1,    size+1);
      HChar ch               = size == X01 ? 'h' : 's';
      DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
          nm, is2 ? "2" : "",
          nameQReg128(dd), arrWide,
          nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
      return True;
   }

   if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
      /* -------- 0,xx,1100 SQDMULH  s and h variants only -------- */
      /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // q case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);
      Bool isR = opcode == BITS4(1,1,0,1);
      IRTemp res, sat1q, sat1n, vN, vM;
      res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
      vN = newTempV128();
      assign(vN, getQReg128(nn));
      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
      const HChar* nm  = isR ? "sqrdmulh" : "sqdmulh";
      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      HChar ch         = size == X01 ? 'h' : 's';
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
      return True;
   }

   if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
      /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
      /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
      UInt mm = 32; // invalid
      UInt ix = 16; // invalid
      switch (size) {
         case X00:
            return False; // b case is not allowed
         case X01:
            mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
         case X10:
            mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
         case X11:
            return False; // d case is not allowed
         default:
            vassert(0);
      }
      vassert(mm < 32 && ix < 16);

      IRTemp res, res_nosat, vD, vN, vM;
      res = res_nosat = vD = vN = vM = IRTemp_INVALID;
      newTempsV128_2(&vD, &vN);
      assign(vD, getQReg128(dd));
      assign(vN, getQReg128(nn));

      vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
      Bool isAdd = opcode == BITS4(1,1,0,1);
      math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
      IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
      updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));

      const HChar* arr = nameArr_Q_SZ(bitQ, size);
      const HChar* nm  = isAdd ? "sqrdmlah" : "sqrdmlsh";
      HChar ch         = size == X01 ? 'h' : 's';
      DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
          nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31        23   21    16     11 9 4
      0100 1110 size 10100 opcode 10 n d
      Decode fields are: size,opcode
      Size is always 00 in ARMv8, it appears.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,1,0,0,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt size   = INSN(23,22);
   UInt opcode = INSN(16,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (size == BITS2(0,0)
       && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,0,1))) {
      /* -------- 00,00100: AESE Vd.16b, Vn.16b -------- */
      /* -------- 00,00101: AESD Vd.16b, Vn.16b -------- */
      Bool   isD  = opcode == BITS5(0,0,1,0,1);
      IRTemp op1  = newTemp(Ity_V128);
      IRTemp op2  = newTemp(Ity_V128);
      IRTemp xord = newTemp(Ity_V128);
      IRTemp res  = newTemp(Ity_V128);
      void*        helper = isD ? &arm64g_dirtyhelper_AESD
                                : &arm64g_dirtyhelper_AESE;
      const HChar* hname  = isD ? "arm64g_dirtyhelper_AESD"
                                : "arm64g_dirtyhelper_AESE";
      assign(op1, getQReg128(dd));
      assign(op2, getQReg128(nn));
      assign(xord, binop(Iop_XorV128, mkexpr(op1), mkexpr(op2)));
      IRDirty* di
         = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
                              mkIRExprVec_3(
                                 IRExpr_VECRET(),
                                 unop(Iop_V128HIto64, mkexpr(xord)),
                                 unop(Iop_V128to64, mkexpr(xord)) ) );
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      DIP("aes%c %s.16b, %s.16b\n", isD ? 'd' : 'e',
          nameQReg128(dd), nameQReg128(nn));
      return True;
   }

   if (size == BITS2(0,0)
       && (opcode == BITS5(0,0,1,1,0) || opcode == BITS5(0,0,1,1,1))) {
      /* -------- 00,00110: AESMC  Vd.16b, Vn.16b -------- */
      /* -------- 00,00111: AESIMC Vd.16b, Vn.16b -------- */
      Bool   isI = opcode == BITS5(0,0,1,1,1);
      IRTemp src = newTemp(Ity_V128);
      IRTemp res = newTemp(Ity_V128);
      void*        helper = isI ? &arm64g_dirtyhelper_AESIMC
                                : &arm64g_dirtyhelper_AESMC;
      const HChar* hname  = isI ? "arm64g_dirtyhelper_AESIMC"
                                : "arm64g_dirtyhelper_AESMC";
      assign(src, getQReg128(nn));
      IRDirty* di
         = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
                              mkIRExprVec_3(
                                 IRExpr_VECRET(),
                                 unop(Iop_V128HIto64, mkexpr(src)),
                                 unop(Iop_V128to64, mkexpr(src)) ) );
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      DIP("aes%s %s.16b, %s.16b\n", isI ? "imc" : "mc",
          nameQReg128(dd), nameQReg128(nn));
      return True;
   }

   return False;
#  undef INSN
}

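/* Note on the dirty-helper pattern used above (and in the SHA cases
   below): the crypto primitives are not expanded into IR but are
   implemented as calls to C helper functions.  Each helper returns
   its 128-bit result through a V128* out-parameter, declared to the
   IR as IRExpr_VECRET(), and the 128-bit inputs are passed as pairs
   of 64-bit halves split with Iop_V128HIto64/Iop_V128to64, since
   dirty-helper arguments must be scalar. */
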
static
Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31        23 21 20 15 14  11 9 4
      0101 1110 sz 0  m  0  opc 00 n d
      Decode fields are: sz,opc
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0) || INSN(21,21) != 0
       || INSN(15,15) != 0 || INSN(11,10) != BITS2(0,0)) {
      return False;
   }
   UInt sz  = INSN(23,22);
   UInt mm  = INSN(20,16);
   UInt opc = INSN(14,12);
   UInt nn  = INSN(9,5);
   UInt dd  = INSN(4,0);
   if (sz == BITS2(0,0) && opc <= BITS3(1,1,0)) {
      /* -------- 00,000 SHA1C     Qd,    Sn,    Vm.4S -------- */
      /* -------- 00,001 SHA1P     Qd,    Sn,    Vm.4S -------- */
      /* -------- 00,010 SHA1M     Qd,    Sn,    Vm.4S -------- */
      /* -------- 00,011 SHA1SU0   Vd.4S, Vn.4S, Vm.4S -------- */
      /* -------- 00,100 SHA256H   Qd,    Qn,    Vm.4S -------- */
      /* -------- 00,101 SHA256H2  Qd,    Qn,    Vm.4S -------- */
      /* -------- 00,110 SHA256SU1 Vd.4S, Vn.4S, Vm.4S -------- */
      vassert(opc < 7);
      const HChar* inames[7]
         = { "sha1c", "sha1p", "sha1m", "sha1su0",
             "sha256h", "sha256h2", "sha256su1" };
      void(*helpers[7])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
         = { &arm64g_dirtyhelper_SHA1C,   &arm64g_dirtyhelper_SHA1P,
             &arm64g_dirtyhelper_SHA1M,   &arm64g_dirtyhelper_SHA1SU0,
             &arm64g_dirtyhelper_SHA256H, &arm64g_dirtyhelper_SHA256H2,
             &arm64g_dirtyhelper_SHA256SU1 };
      const HChar* hnames[7]
         = { "arm64g_dirtyhelper_SHA1C",   "arm64g_dirtyhelper_SHA1P",
             "arm64g_dirtyhelper_SHA1M",   "arm64g_dirtyhelper_SHA1SU0",
             "arm64g_dirtyhelper_SHA256H", "arm64g_dirtyhelper_SHA256H2",
             "arm64g_dirtyhelper_SHA256SU1" };
      IRTemp vD      = newTemp(Ity_V128);
      IRTemp vN      = newTemp(Ity_V128);
      IRTemp vM      = newTemp(Ity_V128);
      IRTemp vDhi    = newTemp(Ity_I64);
      IRTemp vDlo    = newTemp(Ity_I64);
      IRTemp vNhiPre = newTemp(Ity_I64);
      IRTemp vNloPre = newTemp(Ity_I64);
      IRTemp vNhi    = newTemp(Ity_I64);
      IRTemp vNlo    = newTemp(Ity_I64);
      IRTemp vMhi    = newTemp(Ity_I64);
      IRTemp vMlo    = newTemp(Ity_I64);
      assign(vD,      getQReg128(dd));
      assign(vN,      getQReg128(nn));
      assign(vM,      getQReg128(mm));
      assign(vDhi,    unop(Iop_V128HIto64, mkexpr(vD)));
      assign(vDlo,    unop(Iop_V128to64,   mkexpr(vD)));
      assign(vNhiPre, unop(Iop_V128HIto64, mkexpr(vN)));
      assign(vNloPre, unop(Iop_V128to64,   mkexpr(vN)));
      assign(vMhi,    unop(Iop_V128HIto64, mkexpr(vM)));
      assign(vMlo,    unop(Iop_V128to64,   mkexpr(vM)));
      /* Mask off any bits of the N register operand that aren't actually
         needed, so that Memcheck doesn't complain unnecessarily. */
      switch (opc) {
         case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
            assign(vNhi, mkU64(0));
            assign(vNlo, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(vNloPre))));
            break;
         case BITS3(0,1,1): case BITS3(1,0,0):
         case BITS3(1,0,1): case BITS3(1,1,0):
            assign(vNhi, mkexpr(vNhiPre));
            assign(vNlo, mkexpr(vNloPre));
            break;
         default:
            vassert(0);
      }
      IRTemp res = newTemp(Ity_V128);
      IRDirty* di
         = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
                              mkIRExprVec_7(
                                 IRExpr_VECRET(),
                                 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
                                 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      switch (opc) {
         case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
            DIP("%s q%u, s%u, v%u.4s\n", inames[opc], dd, nn, mm);
            break;
         case BITS3(0,1,1): case BITS3(1,1,0):
            DIP("%s v%u.4s, v%u.4s, v%u.4s\n", inames[opc], dd, nn, mm);
            break;
         case BITS3(1,0,0): case BITS3(1,0,1):
            DIP("%s q%u, q%u, v%u.4s\n", inames[opc], dd, nn, mm);
            break;
         default:
            vassert(0);
      }
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31        23 21    16  11 9 4
      0101 1110 sz 10100 opc 10 n d
      Decode fields are: sz,opc
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0)
       || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt sz  = INSN(23,22);
   UInt opc = INSN(16,12);
   UInt nn  = INSN(9,5);
   UInt dd  = INSN(4,0);
   if (sz == BITS2(0,0) && opc <= BITS5(0,0,0,1,0)) {
      /* -------- 00,00000 SHA1H     Sd,    Sn    -------- */
      /* -------- 00,00001 SHA1SU1   Vd.4S, Vn.4S -------- */
      /* -------- 00,00010 SHA256SU0 Vd.4S, Vn.4S -------- */
      vassert(opc < 3);
      const HChar* inames[3] = { "sha1h", "sha1su1", "sha256su0" };
      IRTemp vD   = newTemp(Ity_V128);
      IRTemp vN   = newTemp(Ity_V128);
      IRTemp vDhi = newTemp(Ity_I64);
      IRTemp vDlo = newTemp(Ity_I64);
      IRTemp vNhi = newTemp(Ity_I64);
      IRTemp vNlo = newTemp(Ity_I64);
      assign(vD,   getQReg128(dd));
      assign(vN,   getQReg128(nn));
      assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
      assign(vDlo, unop(Iop_V128to64,   mkexpr(vD)));
      assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
      assign(vNlo, unop(Iop_V128to64,   mkexpr(vN)));
      /* Mask off any bits of the N register operand that aren't actually
         needed, so that Memcheck doesn't complain unnecessarily.  Also
         construct the calls, given that the helper functions don't take
         the same number of arguments. */
      IRDirty* di  = NULL;
      IRTemp   res = newTemp(Ity_V128);
      switch (opc) {
         case BITS5(0,0,0,0,0): {
            IRExpr* vNloMasked = unop(Iop_32Uto64,
                                      unop(Iop_64to32, mkexpr(vNlo)));
            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                    "arm64g_dirtyhelper_SHA1H",
                                    &arm64g_dirtyhelper_SHA1H,
                                    mkIRExprVec_3(
                                       IRExpr_VECRET(),
                                       mkU64(0), vNloMasked) );
            break;
         }
         case BITS5(0,0,0,0,1):
            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                    "arm64g_dirtyhelper_SHA1SU1",
                                    &arm64g_dirtyhelper_SHA1SU1,
                                    mkIRExprVec_5(
                                       IRExpr_VECRET(),
                                       mkexpr(vDhi), mkexpr(vDlo),
                                       mkexpr(vNhi), mkexpr(vNlo)) );
            break;
         case BITS5(0,0,0,1,0):
            di = unsafeIRDirty_1_N( res, 0/*regparms*/,
                                    "arm64g_dirtyhelper_SHA256SU0",
                                    &arm64g_dirtyhelper_SHA256SU0,
                                    mkIRExprVec_5(
                                       IRExpr_VECRET(),
                                       mkexpr(vDhi), mkexpr(vDlo),
                                       mkexpr(vNhi), mkexpr(vNlo)) );
            break;
         default:
            vassert(0);
      }
      stmt(IRStmt_Dirty(di));
      putQReg128(dd, mkexpr(res));
      switch (opc) {
         case BITS5(0,0,0,0,0):
            DIP("%s s%u, s%u\n", inames[opc], dd, nn);
            break;
         case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,0):
            DIP("%s v%u.4s, v%u.4s\n", inames[opc], dd, nn);
            break;
         default:
            vassert(0);
      }
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20 15 13   9 4
      000 11110 ty 1  m  op 1000 n opcode2
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields are: ty,op,opcode2
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
      return False;
   }
   UInt ty      = INSN(23,22);
   UInt mm      = INSN(20,16);
   UInt op      = INSN(15,14);
   UInt nn      = INSN(9,5);
   UInt opcode2 = INSN(4,0);
   vassert(ty < 4);

   if (ty <= X01 && op == X00
       && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
      /* -------- 0x,00,00000 FCMP  d_d,  s_s -------- */
      /* -------- 0x,00,01000 FCMP  d_#0, s_#0 -------- */
      /* -------- 0x,00,10000 FCMPE d_d,  s_s -------- */
      /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
      /*
         000 11110 01 1     m 00 1000 n 10 000  FCMPE Dn, Dm
         000 11110 01 1 00000 00 1000 n 11 000  FCMPE Dn, #0.0
         000 11110 01 1     m 00 1000 n 00 000  FCMP  Dn, Dm
         000 11110 01 1 00000 00 1000 n 01 000  FCMP  Dn, #0.0

         000 11110 00 1     m 00 1000 n 10 000  FCMPE Sn, Sm
         000 11110 00 1 00000 00 1000 n 11 000  FCMPE Sn, #0.0
         000 11110 00 1     m 00 1000 n 00 000  FCMP  Sn, Sm
         000 11110 00 1 00000 00 1000 n 01 000  FCMP  Sn, #0.0

         FCMPE generates Invalid Operation exn if either arg is any kind
         of NaN.  FCMP generates Invalid Operation exn if either arg is a
         signalling NaN.  We ignore this detail here and produce the same
         IR for both.
      */
      Bool   isD     = (ty & 1) == 1;
      Bool   isCMPE  = (opcode2 & 16) == 16;
      Bool   cmpZero = (opcode2 & 8) == 8;
      IRType ity     = isD ? Ity_F64 : Ity_F32;
      Bool   valid   = True;
      if (cmpZero && mm != 0) valid = False;
      if (valid) {
         IRTemp argL  = newTemp(ity);
         IRTemp argR  = newTemp(ity);
         IRTemp irRes = newTemp(Ity_I32);
         assign(argL, getQRegLO(nn, ity));
         assign(argR,
                cmpZero
                   ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
                   : getQRegLO(mm, ity));
         assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
                             mkexpr(argL), mkexpr(argR)));
         IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
         IRTemp nzcv_28x0 = newTemp(Ity_I64);
         assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
         setFlags_COPY(nzcv_28x0);
         DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
             cmpZero ? "#0.0" : nameQRegLO(mm, ity));
         return True;
      }
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20 15   11 9 4  3
      000 11110 ty 1  m  cond 01 n op nzcv
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields are: ty,op
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
      return False;
   }
   UInt ty   = INSN(23,22);
   UInt mm   = INSN(20,16);
   UInt cond = INSN(15,12);
   UInt nn   = INSN(9,5);
   UInt op   = INSN(4,4);
   UInt nzcv = INSN(3,0);
   vassert(ty < 4 && op <= 1);

   if (ty <= BITS2(0,1)) {
      /* -------- 00,0 FCCMP  s_s -------- */
      /* -------- 00,1 FCCMPE s_s -------- */
      /* -------- 01,0 FCCMP  d_d -------- */
      /* -------- 01,1 FCCMPE d_d -------- */

      /* FCCMPE generates Invalid Operation exn if either arg is any kind
         of NaN.  FCCMP generates Invalid Operation exn if either arg is a
         signalling NaN.  We ignore this detail here and produce the same
         IR for both.
      */
      Bool   isD    = (ty & 1) == 1;
      Bool   isCMPE = op == 1;
      IRType ity    = isD ? Ity_F64 : Ity_F32;
      IRTemp argL   = newTemp(ity);
      IRTemp argR   = newTemp(ity);
      IRTemp irRes  = newTemp(Ity_I32);
      assign(argL,  getQRegLO(nn, ity));
      assign(argR,  getQRegLO(mm, ity));
      assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
                          mkexpr(argL), mkexpr(argR)));
      IRTemp condT = newTemp(Ity_I1);
      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
      IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);

      IRTemp nzcvT_28x0 = newTemp(Ity_I64);
      assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));

      IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);

      IRTemp nzcv_28x0 = newTemp(Ity_I64);
      assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
                                   mkexpr(nzcvT_28x0), nzcvF_28x0));
      setFlags_COPY(nzcv_28x0);
      DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
          nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31        23 21 20 15   11 9 5
      000 11110 ty 1  m  cond 11 n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
       || INSN(11,10) != BITS2(1,1)) {
      return False;
   }
   UInt ty   = INSN(23,22);
   UInt mm   = INSN(20,16);
   UInt cond = INSN(15,12);
   UInt nn   = INSN(9,5);
   UInt dd   = INSN(4,0);
   if (ty <= X01) {
      /* -------- 00: FCSEL s_s -------- */
      /* -------- 01: FCSEL d_d -------- */
      IRType ity  = ty == X01 ? Ity_F64 : Ity_F32;
      IRTemp srcT = newTemp(ity);
      IRTemp srcF = newTemp(ity);
      IRTemp res  = newTemp(ity);
      assign(srcT, getQRegLO(nn, ity));
      assign(srcF, getQRegLO(mm, ity));
      assign(res, IRExpr_ITE(
                     unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
                     mkexpr(srcT), mkexpr(srcF)));
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("fcsel %s, %s, %s, %s\n",
          nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
          nameCC(cond));
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20     14    9 4
      000 11110 ty 1  opcode 10000 n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
      return False;
   }
   UInt ty     = INSN(23,22);
   UInt opcode = INSN(20,15);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);

   if (ty <= X01 && opcode <= BITS6(0,0,0,0,1,1)) {
      /* -------- 0x,000000: FMOV  d_d, s_s -------- */
      /* -------- 0x,000001: FABS  d_d, s_s -------- */
      /* -------- 0x,000010: FNEG  d_d, s_s -------- */
      /* -------- 0x,000011: FSQRT d_d, s_s -------- */
      IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
      IRTemp src = newTemp(ity);
      IRTemp res = newTemp(ity);
      const HChar* nm = "??";
      assign(src, getQRegLO(nn, ity));
      switch (opcode) {
         case BITS6(0,0,0,0,0,0):
            nm = "fmov"; assign(res, mkexpr(src)); break;
         case BITS6(0,0,0,0,0,1):
            nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
         case BITS6(0,0,0,0,1,0):
            nm = "fneg"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
         case BITS6(0,0,0,0,1,1):
            nm = "fsqrt";
            assign(res, binop(mkSQRTF(ity),
                              mkexpr(mk_get_IR_rounding_mode()),
                              mkexpr(src))); break;
         default:
            vassert(0);
      }
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
      return True;
   }

   if (   (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
                         || opcode == BITS6(0,0,0,1,0,1)))
       || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
                         || opcode == BITS6(0,0,0,1,0,1)))
       || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
                         || opcode == BITS6(0,0,0,1,0,0)))) {
      /* -------- 11,000100: FCVT s_h -------- */
      /* -------- 11,000101: FCVT d_h -------- */
      /* -------- 00,000111: FCVT h_s -------- */
      /* -------- 00,000101: FCVT d_s -------- */
      /* -------- 01,000111: FCVT h_d -------- */
      /* -------- 01,000100: FCVT s_d -------- */
      /* 31        23 21    16 14    9 4
         000 11110 11 10001 00 10000 n d   FCVT Sd, Hn
         --------- 11 ----- 01 ---------   FCVT Dd, Hn
         --------- 00 ----- 11 ---------   FCVT Hd, Sn
         --------- 00 ----- 01 ---------   FCVT Dd, Sn
         --------- 01 ----- 11 ---------   FCVT Hd, Dn
         --------- 01 ----- 00 ---------   FCVT Sd, Dn
         Rounding, when dst is smaller than src, is per the FPCR.
      */
      UInt b2322 = ty;
      UInt b1615 = opcode & BITS2(1,1);
      switch ((b2322 << 2) | b1615) {
         case BITS4(0,0,0,1):   // S -> D
         case BITS4(1,1,0,1): { // H -> D
            Bool   srcIsH = b2322 == BITS2(1,1);
            IRType srcTy  = srcIsH ? Ity_F16 : Ity_F32;
            IRTemp res    = newTemp(Ity_F64);
            assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
                             getQRegLO(nn, srcTy)));
            putQReg128(dd, mkV128(0x0000));
            putQRegLO(dd, mkexpr(res));
            DIP("fcvt %s, %s\n",
                nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
            return True;
         }
         case BITS4(0,1,0,0):   // D -> S
         case BITS4(0,1,1,1): { // D -> H
            Bool   dstIsH = b1615 == BITS2(1,1);
            IRType dstTy  = dstIsH ? Ity_F16 : Ity_F32;
            IRTemp res    = newTemp(dstTy);
            assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
                              mkexpr(mk_get_IR_rounding_mode()),
                              getQRegLO(nn, Ity_F64)));
            putQReg128(dd, mkV128(0x0000));
            putQRegLO(dd, mkexpr(res));
            DIP("fcvt %s, %s\n",
                nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
            return True;
         }
         case BITS4(0,0,1,1):   // S -> H
         case BITS4(1,1,0,0): { // H -> S
            Bool   toH   = b1615 == BITS2(1,1);
            IRType srcTy = toH ? Ity_F32 : Ity_F16;
            IRType dstTy = toH ? Ity_F16 : Ity_F32;
            IRTemp res   = newTemp(dstTy);
            if (toH) {
               assign(res, binop(Iop_F32toF16,
                                 mkexpr(mk_get_IR_rounding_mode()),
                                 getQRegLO(nn, srcTy)));
            } else {
               assign(res, unop(Iop_F16toF32,
                                getQRegLO(nn, srcTy)));
            }
            putQReg128(dd, mkV128(0x0000));
            putQRegLO(dd, mkexpr(res));
            DIP("fcvt %s, %s\n",
                nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
            return True;
         }
         default:
            break;
      }
      /* else unhandled */
   }

   if (ty <= X01
       && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
       && opcode != BITS6(0,0,1,1,0,1)) {
      /* -------- 0x,001000 FRINTN d_d, s_s -------- */
      /* -------- 0x,001001 FRINTP d_d, s_s -------- */
      /* -------- 0x,001010 FRINTM d_d, s_s -------- */
      /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
      /* -------- 0x,001100 FRINTA d_d, s_s -------- */
      /* -------- 0x,001110 FRINTX d_d, s_s -------- */
      /* -------- 0x,001111 FRINTI d_d, s_s -------- */
      /* 31        23 21   17  14    9 4
         000 11110 0x 1001 111 10000 n d  FRINTI Fd, Fm (round per FPCR)
         x==0 => S-registers, x==1 => D-registers
         rm (17:15) encodings:
            111 per FPCR (FRINTI)
            001 +inf (FRINTP)
            010 -inf (FRINTM)
            011 zero (FRINTZ)
            000 tieeven (FRINTN) -- !! FIXME KLUDGED !!
            100 tieaway (FRINTA) -- !! FIXME KLUDGED !!
            110 per FPCR + "exact = TRUE" (FRINTX)
            101 unallocated
      */
      Bool    isD   = (ty & 1) == 1;
      UInt    rm    = opcode & BITS6(0,0,0,1,1,1);
      IRType  ity   = isD ? Ity_F64 : Ity_F32;
      IRExpr* irrmE = NULL;
      HChar   ch    = '?';
      switch (rm) {
         case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
         case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
         case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
         // The following is a kludge.  Should be: Irrm_NEAREST_TIE_AWAY_0
         case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
         // I am unsure about the following, due to the "integral exact"
         // description in the manual.  What does it mean? (frintx, that is)
         case BITS3(1,1,0):
            ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
         case BITS3(1,1,1):
            ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
         // The following is a kludge.  There's no Irrm_ value to represent
         // this ("to nearest, with ties to even")
         case BITS3(0,0,0): ch = 'n'; irrmE = mkU32(Irrm_NEAREST); break;
         default: break;
      }
      if (irrmE) {
         IRTemp src = newTemp(ity);
         IRTemp dst = newTemp(ity);
         assign(src, getQRegLO(nn, ity));
         assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           irrmE, mkexpr(src)));
         putQReg128(dd, mkV128(0x0000));
         putQRegLO(dd, mkexpr(dst));
         DIP("frint%c %s, %s\n",
             ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
         return True;
      }
      return False;
   }

   return False;
#  undef INSN
}


static
Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20 15     11 9 4
      000 11110 ty 1  m  opcode 10 n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty, opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
      return False;
   }
   UInt ty     = INSN(23,22);
   UInt mm     = INSN(20,16);
   UInt opcode = INSN(15,12);
   UInt nn     = INSN(9,5);
   UInt dd     = INSN(4,0);
   if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
      /* ------- 0x,0000: FMUL d_d, s_s ------- */
      /* ------- 0x,0001: FDIV d_d, s_s ------- */
      /* ------- 0x,0010: FADD d_d, s_s ------- */
      /* ------- 0x,0011: FSUB d_d, s_s ------- */
      /* ------- 0x,0100: FMAX d_d, s_s ------- */
      /* ------- 0x,0101: FMIN d_d, s_s ------- */
      /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
      /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
      IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
      IROp   iop = Iop_INVALID;
      const HChar* nm = "???";
      switch (opcode) {
         case BITS4(0,0,0,0): nm = "fmul";   iop = mkMULF(ity); break;
         case BITS4(0,0,0,1): nm = "fdiv";   iop = mkDIVF(ity); break;
         case BITS4(0,0,1,0): nm = "fadd";   iop = mkADDF(ity); break;
         case BITS4(0,0,1,1): nm = "fsub";   iop = mkSUBF(ity); break;
         case BITS4(0,1,0,0): nm = "fmax";   iop = mkVecMAXF(ty+2); break;
         case BITS4(0,1,0,1): nm = "fmin";   iop = mkVecMINF(ty+2); break;
         case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
         case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
         default: vassert(0);
      }
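      /* Illustration of the FMAXNM/FMINNM kludge flagged above
         (editorial, hedged): architecturally fmaxnm(qNaN, 5.0) returns
         5.0, but since fmaxnm is mapped to the same max operation as
         fmax, a NaN operand presumably propagates instead of being
         suppressed. */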
      if (opcode <= BITS4(0,0,1,1)) {
         // This is really not good code.  TODO: avoid width-changing
         IRTemp res = newTemp(ity);
         assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
                           getQRegLO(nn, ity), getQRegLO(mm, ity)));
         putQReg128(dd, mkV128(0));
         putQRegLO(dd, mkexpr(res));
      } else {
         putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
                             binop(iop, getQReg128(nn), getQReg128(mm))));
      }
      DIP("%s %s, %s, %s\n",
          nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }
   if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
      /* ------- 0x,1000: FNMUL d_d, s_s ------- */
      IRType ity  = ty == X00 ? Ity_F32 : Ity_F64;
      IROp   iop  = mkMULF(ity);
      IROp   iopn = mkNEGF(ity);
      const HChar* nm = "fnmul";
      IRExpr* resE = unop(iopn,
                          triop(iop, mkexpr(mk_get_IR_rounding_mode()),
                                getQRegLO(nn, ity), getQRegLO(mm, ity)));
      IRTemp res = newTemp(ity);
      assign(res, resE);
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, mkexpr(res));
      DIP("%s %s, %s, %s\n",
          nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20 15 14 9 4
      000 11111 ty o1 m  o0 a  n d
      The first 3 bits are really "M 0 S", but M and S are always zero.
      Decode fields: ty,o1,o0
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
      return False;
   }
   UInt ty    = INSN(23,22);
   UInt bitO1 = INSN(21,21);
   UInt mm    = INSN(20,16);
   UInt bitO0 = INSN(15,15);
   UInt aa    = INSN(14,10);
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);

   if (ty <= X01) {
      /* -------- 0x,0,0 FMADD  d_d_d_d, s_s_s_s -------- */
      /* -------- 0x,0,1 FMSUB  d_d_d_d, s_s_s_s -------- */
      /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
      /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
      /* -------------------- F{N}M{ADD,SUB} -------------------- */
      /* 31          22 20 15 14 9 4   ix
         000 11111 0 sz 0 m  0  a  n d  0   FMADD  Fd,Fn,Fm,Fa
         000 11111 0 sz 0 m  1  a  n d  1   FMSUB  Fd,Fn,Fm,Fa
         000 11111 0 sz 1 m  0  a  n d  2   FNMADD Fd,Fn,Fm,Fa
         000 11111 0 sz 1 m  1  a  n d  3   FNMSUB Fd,Fn,Fm,Fa
         where Fx=Dx when sz=1, Fx=Sx when sz=0

                  -----SPEC------    ----IMPL----
         fmadd       a +  n * m         a + n * m
         fmsub       a + (-n) * m       a - n * m
         fnmadd    (-a) + (-n) * m    -(a + n * m)
         fnmsub    (-a) +   n * m    -(a - n * m)
      */
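      /* Editorial note on the IMPL column (hedged): the identities are
         exact in real arithmetic but not always bit-exact in IEEE754.
         E.g. with a == +0.0 and n*m == +0.0, fnmsub per SPEC gives
         (-0.0) + 0.0 == +0.0 under round-to-nearest, whereas the IMPL
         form -(a - n*m) == -(+0.0) == -0.0. */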
      Bool    isD   = (ty & 1) == 1;
      UInt    ix    = (bitO1 << 1) | bitO0;
      IRType  ity   = isD ? Ity_F64 : Ity_F32;
      IROp    opADD = mkADDF(ity);
      IROp    opSUB = mkSUBF(ity);
      IROp    opMUL = mkMULF(ity);
      IROp    opNEG = mkNEGF(ity);
      IRTemp  res   = newTemp(ity);
      IRExpr* eA    = getQRegLO(aa, ity);
      IRExpr* eN    = getQRegLO(nn, ity);
      IRExpr* eM    = getQRegLO(mm, ity);
      IRExpr* rm    = mkexpr(mk_get_IR_rounding_mode());
      IRExpr* eNxM  = triop(opMUL, rm, eN, eM);
      switch (ix) {
         case 0:  assign(res, triop(opADD, rm, eA, eNxM)); break;
         case 1:  assign(res, triop(opSUB, rm, eA, eNxM)); break;
         case 2:  assign(res, unop(opNEG, triop(opADD, rm, eA, eNxM))); break;
         case 3:  assign(res, unop(opNEG, triop(opSUB, rm, eA, eNxM))); break;
         default: vassert(0);
      }
      putQReg128(dd, mkV128(0x0000));
      putQRegLO(dd, mkexpr(res));
      const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
      DIP("%s %s, %s, %s, %s\n",
          names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
          nameQRegLO(mm, ity), nameQRegLO(aa, ity));
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31  28    23 21 20   12  9    4
      000 11110 ty 1  imm8 100 imm5 d
      The first 3 bits are really "M 0 S", but M and S are always zero.
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
       || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
      return False;
   }
   UInt ty   = INSN(23,22);
   UInt imm8 = INSN(20,13);
   UInt imm5 = INSN(9,5);
   UInt dd   = INSN(4,0);

   /* ------- 00,00000: FMOV s_imm ------- */
   /* ------- 01,00000: FMOV d_imm ------- */
   if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
      Bool  isD = (ty & 1) == 1;
      ULong imm = VFPExpandImm(imm8, isD ? 64 : 32);
      if (!isD) {
         vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
      }
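      /* Example (editorial, hedged): VFPExpandImm decodes imm8 ==
         abcdefgh as sign a, a small exponent derived from b:c:d, and
         fraction efgh; e.g. imm8 == 0x70 expands to 1.0, that is
         0x3F800000 as F32 or 0x3FF0000000000000 as F64. */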
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
      DIP("fmov %s, #0x%llx\n",
          nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20    18     15    9 4
      sf  0  0 11110 type 0  rmode opcode scale n d
      The first 3 bits are really "sf 0 S", but S is always zero.
      Decode fields: sf,type,rmode,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(30,29) != BITS2(0,0)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 0) {
      return False;
   }
   UInt bitSF = INSN(31,31);
   UInt ty    = INSN(23,22); // type
   UInt rm    = INSN(20,19); // rmode
   UInt op    = INSN(18,16); // opcode
   UInt sc    = INSN(15,10); // scale
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);
   if (ty <= X01 && rm == X11
       && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
      /* -------- (ix) sf ty rm opc -------- */
      /* -------- 0    0  00 11 000: FCVTZS w_s_#fbits -------- */
      /* -------- 1    0  01 11 000: FCVTZS w_d_#fbits -------- */
      /* -------- 2    1  00 11 000: FCVTZS x_s_#fbits -------- */
      /* -------- 3    1  01 11 000: FCVTZS x_d_#fbits -------- */

      /* -------- 4    0  00 11 001: FCVTZU w_s_#fbits -------- */
      /* -------- 5    0  01 11 001: FCVTZU w_d_#fbits -------- */
      /* -------- 6    1  00 11 001: FCVTZU x_s_#fbits -------- */
      /* -------- 7    1  01 11 001: FCVTZU x_d_#fbits -------- */
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
      Int fbits = 64 - sc;
      vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
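      /* Worked example (editorial): FCVTZS Wd, Sn, #4 encodes scale
         field sc == 60, so fbits == 4; the input is multiplied by 2^4
         before truncation, turning e.g. 1.25 into 20. */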
      Double  scale  = two_to_the_plus(fbits);
      IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
                             : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;

      const IROp ops[8]
         = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
             Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
      IRTemp irrm = newTemp(Ity_I32);
      assign(irrm, mkU32(Irrm_ZERO));
      IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
      IRExpr* res = binop(ops[ix], mkexpr(irrm),
                          triop(opMUL, mkexpr(irrm), src, scaleE));
      putIRegOrZR(isI64, dd, res);

      DIP("fcvtz%c %s, %s, #%d\n",
          isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
          nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
      return True;
   }
   /* ------ sf,ty,rm,opc ------ */
   /* ------ x,0x,00,010  SCVTF s/d, w/x, #fbits ------ */
   /* ------ x,0x,00,011  UCVTF s/d, w/x, #fbits ------ */
   /* (ix) sf S 28    ty   rm opc   15    9 4
      0    0  0 0 11110 00 0 00 010 scale n d  SCVTF Sd, Wn, #fbits
      1    0  0 0 11110 01 0 00 010 scale n d  SCVTF Dd, Wn, #fbits
      2    1  0 0 11110 00 0 00 010 scale n d  SCVTF Sd, Xn, #fbits
      3    1  0 0 11110 01 0 00 010 scale n d  SCVTF Dd, Xn, #fbits

      4    0  0 0 11110 00 0 00 011 scale n d  UCVTF Sd, Wn, #fbits
      5    0  0 0 11110 01 0 00 011 scale n d  UCVTF Dd, Wn, #fbits
      6    1  0 0 11110 00 0 00 011 scale n d  UCVTF Sd, Xn, #fbits
      7    1  0 0 11110 01 0 00 011 scale n d  UCVTF Dd, Xn, #fbits

      These are signed/unsigned conversions from integer registers to
      FP registers, all 4 32/64-bit combinations, rounded per FPCR,
      scaled per |scale|.
   */
   if (ty <= X01 && rm == X00
       && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
       && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);

      Int fbits = 64 - sc;
      vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
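      /* Worked example (editorial): SCVTF Sd, Wn, #4 has fbits == 4,
         so the converted integer is scaled by 2^-4; Wn == 20 becomes
         1.25. */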
      Double  scale  = two_to_the_minus(fbits);
      IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
                             : IRExpr_Const(IRConst_F32( (Float)scale ));
      IROp    opMUL  = isF64 ? Iop_MulF64 : Iop_MulF32;

      const IROp ops[8]
         = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
             Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
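      /* Why the unop/binop split below (editorial note): I32 -> F64
         conversions are exact, since every 32-bit integer is exactly
         representable as an F64, so Iop_I32StoF64 and Iop_I32UtoF64
         take no rounding mode; all the other conversions can round. */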
      IRExpr* src = getIRegOrZR(isI64, nn);
      IRExpr* res = (isF64 && !isI64)
                       ? unop(ops[ix], src)
                       : binop(ops[ix],
                               mkexpr(mk_get_IR_rounding_mode()), src);
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));

      DIP("%ccvtf %s, %s, #%d\n",
          isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
          nameIRegOrZR(isI64, nn), fbits);
      return True;
   }

   return False;
#  undef INSN
}

static
Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
{
   /* 31 30 29 28    23   21 20    18     15     9 4
      sf  0  0 11110 type 1  rmode opcode 000000 n d
      The first 3 bits are really "sf 0 S", but S is always zero.
      Decode fields: sf,type,rmode,opcode
   */
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
   if (INSN(30,29) != BITS2(0,0)
       || INSN(28,24) != BITS5(1,1,1,1,0)
       || INSN(21,21) != 1
       || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
      return False;
   }
   UInt bitSF = INSN(31,31);
   UInt ty    = INSN(23,22); // type
   UInt rm    = INSN(20,19); // rmode
   UInt op    = INSN(18,16); // opcode
   UInt nn    = INSN(9,5);
   UInt dd    = INSN(4,0);
   /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
   /* 30       23   20 18  15     9 4
      sf 00 11110 0x 1 00 000 000000 n d  FCVTNS Rd, Fn (round to
      sf 00 11110 0x 1 00 001 000000 n d  FCVTNU Rd, Fn  nearest)
      ---------------- 01 --------------  FCVTP-------- (round to +inf)
      ---------------- 10 --------------  FCVTM-------- (round to -inf)
      ---------------- 11 --------------  FCVTZ-------- (round to zero)
      ---------------- 00 100 ----------  FCVTAS------- (nearest, ties away)
      ---------------- 00 101 ----------  FCVTAU------- (nearest, ties away)

      Rd is Xd when sf==1, Wd when sf==0
      Fn is Dn when x==1, Sn when x==0
      20:19 carry the rounding mode, using the same encoding as FPCR
   */
   if (ty <= X01
       && (   ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
           || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
          )
      ) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      /* Decide on the IR rounding mode to use. */
      IRRoundingMode irrm = 8; /*impossible*/
      HChar ch = '?';
      if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
         switch (rm) {
            case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
            case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
            case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
            case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
            default: vassert(0);
         }
      } else {
         vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
         switch (rm) {
            case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
            default: vassert(0);
         }
      }
      vassert(irrm != 8);
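      /* Example of irrm's effect (editorial): FCVTMS Wd, Dn applied to
         -1.1 rounds towards -inf and yields -2, whereas FCVTZS of the
         same value truncates towards zero and yields -1. */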
      /* Decide on the conversion primop, based on the source size,
         dest size and signedness (8 possibilities).  Case coding:
            F32 ->s I32   0
            F32 ->u I32   1
            F32 ->s I64   2
            F32 ->u I64   3
            F64 ->s I32   4
            F64 ->u I32   5
            F64 ->s I64   6
            F64 ->u I64   7
      */
      UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);

      const IROp iops[8]
         = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
             Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
      IROp iop = iops[ix];
      // A bit of ATCery: bounce all cases we haven't seen an example of.
      if (/* F32toI32S */
             (iop == Iop_F32toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
          || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */
          /* F32toI32U */
          || (iop == Iop_F32toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
          || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */
          /* F32toI64S */
          || (iop == Iop_F32toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
          || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,S */
          /* F32toI64U */
          || (iop == Iop_F32toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
          || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,S */
          /* F64toI32S */
          || (iop == Iop_F64toI32S && irrm == Irrm_ZERO)   /* FCVTZS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
          || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */
          /* F64toI32U */
          || (iop == Iop_F64toI32U && irrm == Irrm_ZERO)   /* FCVTZU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
          || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,D */
          /* F64toI64S */
          || (iop == Iop_F64toI64S && irrm == Irrm_ZERO)   /* FCVTZS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
          || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */
          /* F64toI64U */
          || (iop == Iop_F64toI64U && irrm == Irrm_ZERO)   /* FCVTZU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
          || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,D */
         ) {
         /* validated */
      } else {
         return False;
      }
      IRType srcTy = isF64 ? Ity_F64 : Ity_F32;
      IRType dstTy = isI64 ? Ity_I64 : Ity_I32;
      IRTemp src = newTemp(srcTy);
      IRTemp dst = newTemp(dstTy);
      assign(src, getQRegLO(nn, srcTy));
      assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
      putIRegOrZR(isI64, dd, mkexpr(dst));
      DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
          nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
      return True;
   }
   /* -------------- {S,U}CVTF (scalar, integer) -------------- */
   /* (ix) sf S 28    ty   rm op    15     9 4
      0    0  0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Wn
      1    0  0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Wn
      2    1  0 0 11110 00 1 00 010 000000 n d  SCVTF Sd, Xn
      3    1  0 0 11110 01 1 00 010 000000 n d  SCVTF Dd, Xn

      4    0  0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Wn
      5    0  0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Wn
      6    1  0 0 11110 00 1 00 011 000000 n d  UCVTF Sd, Xn
      7    1  0 0 11110 01 1 00 011 000000 n d  UCVTF Dd, Xn

      These are signed/unsigned conversions from integer registers to
      FP registers, all 4 32/64-bit combinations, rounded per FPCR.
   */
   if (ty <= X01 && rm == X00
       && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
      Bool isI64 = bitSF == 1;
      Bool isF64 = (ty & 1) == 1;
      Bool isU   = (op & 1) == 1;
      UInt ix    = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
      const IROp ops[8]
         = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
             Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
      IRExpr* src = getIRegOrZR(isI64, nn);
      IRExpr* res = (isF64 && !isI64)
                       ? unop(ops[ix], src)
                       : binop(ops[ix],
                               mkexpr(mk_get_IR_rounding_mode()), src);
      putQReg128(dd, mkV128(0));
      putQRegLO(dd, res);
      DIP("%ccvtf %s, %s\n",
          isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
          nameIRegOrZR(isI64, nn));
      return True;
   }
   /* -------- FMOV (general) -------- */
   /* case sf S    ty   rm op    15     9 4
       (1)  0  0 0 11110 00 1 00 111 000000 n d  FMOV Sd,      Wn
       (2)  1  0 0 11110 01 1 00 111 000000 n d  FMOV Dd,      Xn
       (3)  1  0 0 11110 10 1 01 111 000000 n d  FMOV Vd.D[1], Xn

       (4)  0  0 0 11110 00 1 00 110 000000 n d  FMOV Wd, Sn
       (5)  1  0 0 11110 01 1 00 110 000000 n d  FMOV Xd, Dn
       (6)  1  0 0 11110 10 1 01 110 000000 n d  FMOV Xd, Vn.D[1]
   */
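   /* Editorial note: these are raw bit copies with no numeric
      conversion; e.g. FMOV Wd, Sn of 1.0f simply moves the pattern
      0x3F800000 into Wd. */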
   if (1) {
      UInt ix = 0; // case
      if (bitSF == 0) {
         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
            ix = 1;
         else
         if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
            ix = 4;
      } else {
         vassert(bitSF == 1);
         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
            ix = 2;
         else
         if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
            ix = 5;
         else
         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
            ix = 3;
         else
         if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
            ix = 6;
      }
      if (ix > 0) {
         switch (ix) {
            case 1:
               putQReg128(dd, mkV128(0));
               putQRegLO(dd, getIReg32orZR(nn));
               DIP("fmov s%u, w%u\n", dd, nn);
               break;
            case 2:
               putQReg128(dd, mkV128(0));
               putQRegLO(dd, getIReg64orZR(nn));
               DIP("fmov d%u, x%u\n", dd, nn);
               break;
            case 3:
               putQRegHI64(dd, getIReg64orZR(nn));
               DIP("fmov v%u.d[1], x%u\n", dd, nn);
               break;
            case 4:
               putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
               DIP("fmov w%u, s%u\n", dd, nn);
               break;
            case 5:
               putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
               DIP("fmov x%u, d%u\n", dd, nn);
               break;
            case 6:
               putIReg64orZR(dd, getQRegHI64(nn));
               DIP("fmov x%u, v%u.d[1]\n", dd, nn);
               break;
            default:
               vassert(0);
         }
         return True;
      }
      /* undecodable; fall through */
   }

   return False;
#  undef INSN
}

static
Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
{
   Bool ok;
   ok = dis_AdvSIMD_EXT(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_TBL_TBX(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_across_lanes(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_copy(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_modified_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_copy(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_pairwise(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_different(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_same(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_three_same_extra(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_different(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_three_same_extra(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_two_reg_misc(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_aes(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_compare(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_conditional_compare(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_immediate(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
   if (UNLIKELY(ok)) return True;
   ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
   if (UNLIKELY(ok)) return True;
   return False;
}

/*------------------------------------------------------------*/
/*--- Disassemble a single ARM64 instruction               ---*/
/*------------------------------------------------------------*/

/* Disassemble a single ARM64 instruction into IR.  The instruction
   is located at |guest_instr| and has guest IP of
   |guest_PC_curr_instr|, which will have been set before the call
   here.  Returns True iff the instruction was decoded, in which case
   *dres will be set accordingly, or False, in which case *dres should
   be ignored by the caller. */
static
Bool disInstr_ARM64_WRK (
        /*MB_OUT*/DisResult* dres,
        const UChar* guest_instr,
        const VexArchInfo* archinfo,
        const VexAbiInfo*  abiinfo,
        Bool sigill_diag
     )
{
   // A macro to fish bits out of 'insn'.
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
//ZZ    DisResult dres;
//ZZ    //Bool allow_VFP = False;
//ZZ    //UInt hwcaps = archinfo->hwcaps;
//ZZ    IRTemp condT; /* :: Ity_I32 */
//ZZ    HChar dis_buf[128];  // big enough to hold LDMIA etc text
//ZZ    /* What insn variants are we supporting today? */
//ZZ    //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
   /* Set result defaults. */
   dres->whatNext    = Dis_Continue;
   dres->len         = 4;
   dres->jk_StopHere = Ijk_INVALID;
   dres->hint        = Dis_HintNone;
   /* At least this is simple on ARM64: insns are all 4 bytes long, and
      4-aligned.  So just fish the whole thing out of memory right now
      and have done. */
   UInt insn = getUIntLittleEndianly( guest_instr );

   if (0) vex_printf("insn: 0x%x\n", insn);

   DIP("\t(arm64) 0x%llx:  ", (ULong)guest_PC_curr_instr);

   vassert(0 == (guest_PC_curr_instr & 3ULL));
   /* ----------------------------------------------------------- */

   /* Spot "Special" instructions (see comment at top of file). */
   {
      const UChar* code = guest_instr;
      /* Spot the 16-byte preamble:
            93CC0D8C   ror x12, x12, #3
            93CC358C   ror x12, x12, #13
            93CCCD8C   ror x12, x12, #51
            93CCF58C   ror x12, x12, #61
      */
      UInt word1 = 0x93CC0D8C;
      UInt word2 = 0x93CC358C;
      UInt word3 = 0x93CCCD8C;
      UInt word4 = 0x93CCF58C;
      if (getUIntLittleEndianly(code+ 0) == word1 &&
          getUIntLittleEndianly(code+ 4) == word2 &&
          getUIntLittleEndianly(code+ 8) == word3 &&
          getUIntLittleEndianly(code+12) == word4) {
         /* Got a "Special" instruction preamble.  Which one is it? */
         if (getUIntLittleEndianly(code+16) == 0xAA0A014A
                                               /* orr x10,x10,x10 */) {
            /* X3 = client_request ( X4 ) */
            DIP("x3 = client_request ( x4 )\n");
            putPC(mkU64( guest_PC_curr_instr + 20 ));
            dres->jk_StopHere = Ijk_ClientReq;
            dres->whatNext    = Dis_StopHere;
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA0B016B
                                               /* orr x11,x11,x11 */) {
            /* X3 = guest_NRADDR */
            DIP("x3 = guest_NRADDR\n");
            dres->len = 20;
            putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA0C018C
                                               /* orr x12,x12,x12 */) {
            /* branch-and-link-to-noredir X8 */
            DIP("branch-and-link-to-noredir x8\n");
            putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20));
            putPC(getIReg64orZR(8));
            dres->jk_StopHere = Ijk_NoRedir;
            dres->whatNext    = Dis_StopHere;
            return True;
         }
         else
         if (getUIntLittleEndianly(code+16) == 0xAA090129
                                               /* orr x9,x9,x9 */) {
            /* IR injection */
            DIP("IR injection\n");
            vex_inject_ir(irsb, Iend_LE);
            // Invalidate the current insn. The reason is that the IRop we're
            // injecting here can change. In which case the translation has to
            // be redone. For ease of handling, we simply invalidate all the
            // time.
            stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr)));
            stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(20)));
            putPC(mkU64( guest_PC_curr_instr + 20 ));
            dres->whatNext    = Dis_StopHere;
            dres->jk_StopHere = Ijk_InvalICache;
            return True;
         }
         /* We don't know what it is. */
         return False;
         /*NOTREACHED*/
      }
   }
   /* ----------------------------------------------------------- */

   /* Main ARM64 instruction decoder starts here. */

   Bool ok = False;

   /* insn[28:25] determines the top-level grouping, so let's start
      off with that.

      For all of these dis_ARM64_ functions, we pass *dres with the
      normal default results "insn OK, 4 bytes long, keep decoding" so
      they don't need to change it.  However, decodes of control-flow
      insns may cause *dres to change.
   */
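   /* Example of the grouping (editorial): an ADD (immediate) such as
      0x91000421 (add x1, x1, #1) has insn[28:25] == 1000 and so is
      routed to dis_ARM64_data_processing_immediate. */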
   switch (INSN(28,25)) {
      case BITS4(1,0,0,0): case BITS4(1,0,0,1):
         // Data processing - immediate
         ok = dis_ARM64_data_processing_immediate(dres, insn, sigill_diag);
         break;
      case BITS4(1,0,1,0): case BITS4(1,0,1,1):
         // Branch, exception generation and system instructions
         ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo, sigill_diag);
         break;
      case BITS4(0,1,0,0): case BITS4(0,1,1,0):
      case BITS4(1,1,0,0): case BITS4(1,1,1,0):
         // Loads and stores
         ok = dis_ARM64_load_store(dres, insn, abiinfo, sigill_diag);
         break;
      case BITS4(0,1,0,1): case BITS4(1,1,0,1):
         // Data processing - register
         ok = dis_ARM64_data_processing_register(dres, insn, sigill_diag);
         break;
      case BITS4(0,1,1,1): case BITS4(1,1,1,1):
         // Data processing - SIMD and floating point
         ok = dis_ARM64_simd_and_fp(dres, insn);
         break;
      case BITS4(0,0,0,0): case BITS4(0,0,0,1):
      case BITS4(0,0,1,0): case BITS4(0,0,1,1):
         // UNALLOCATED
         break;
      default:
         vassert(0); /* Can't happen */
   }
   /* If the next-level down decoders failed, make sure |dres| didn't
      get changed. */
   if (!ok) {
      vassert(dres->whatNext    == Dis_Continue);
      vassert(dres->len         == 4);
      vassert(dres->jk_StopHere == Ijk_INVALID);
   }

   return ok;

#  undef INSN
}

/*------------------------------------------------------------*/
/*--- Top-level fn                                         ---*/
/*------------------------------------------------------------*/

/* Disassemble a single instruction into IR.  The instruction
   is located in host memory at &guest_code[delta]. */
DisResult disInstr_ARM64 ( IRSB*        irsb_IN,
                           const UChar* guest_code_IN,
                           Long         delta_IN,
                           Addr         guest_IP,
                           VexArch      guest_arch,
                           const VexArchInfo* archinfo,
                           const VexAbiInfo*  abiinfo,
                           VexEndness   host_endness_IN,
                           Bool         sigill_diag_IN )
{
   DisResult dres;
   vex_bzero(&dres, sizeof(dres));
   /* Set globals (see top of this file) */
   vassert(guest_arch == VexArchARM64);

   irsb                = irsb_IN;
   host_endness        = host_endness_IN;
   guest_PC_curr_instr = (Addr64)guest_IP;

   /* Sanity checks */
   /* (x::UInt - 2) <= 15   ===   x >= 2 && x <= 17 (I hope) */
   vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
   vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);
   /* Try to decode */
   Bool ok = disInstr_ARM64_WRK( &dres,
                                 &guest_code_IN[delta_IN],
                                 archinfo, abiinfo, sigill_diag_IN );
   if (ok) {
      /* All decode successes end up here. */
      vassert(dres.len == 4 || dres.len == 20);
      switch (dres.whatNext) {
         case Dis_Continue:
            putPC( mkU64(dres.len + guest_PC_curr_instr) );
            break;
         case Dis_StopHere:
            break;
         default:
            vassert(0);
      }
      DIP("\n");
   } else {
      /* All decode failures end up here. */
      if (sigill_diag_IN) {
         Int   i, j;
         HChar buf[64];
         UInt  insn
            = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
         vex_bzero(buf, sizeof(buf));
         for (i = j = 0; i < 32; i++) {
            if (i > 0) {
               if ((i & 7) == 0) buf[j++] = ' ';
               else if ((i & 3) == 0) buf[j++] = '\'';
            }
            buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
         }
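         /* The buffer now reads e.g.
            "1001'0001 0000'0000 0000'0100 0010'0001"
            for insn 0x91000421 (editorial example). */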
         vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
         vex_printf("disInstr(arm64): %s\n", buf);
      }
      /* Tell the dispatcher that this insn cannot be decoded, and so
         has not been executed, and (is currently) the next to be
         executed.  PC should be up-to-date since it is made so at the
         start of each insn, but nevertheless be paranoid and update
         it again right now. */
      putPC( mkU64(guest_PC_curr_instr) );
      dres.len         = 0;
      dres.whatNext    = Dis_StopHere;
      dres.jk_StopHere = Ijk_NoDecode;
   }
   return dres;
}

15151 /*--------------------------------------------------------------------*/
15152 /*--- end guest_arm64_toIR.c ---*/
15153 /*--------------------------------------------------------------------*/