arm64 front end: add early-writeback handling for w (32 bit int) and q (128 bit)...
[valgrind.git] / VEX / priv / guest_arm64_toIR.c
1 /* -*- mode: C; c-basic-offset: 3; -*- */
3 /*--------------------------------------------------------------------*/
4 /*--- begin guest_arm64_toIR.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of Valgrind, a dynamic binary instrumentation
9 framework.
11 Copyright (C) 2013-2017 OpenWorks
12 info@open-works.net
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, write to the Free Software
26 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 02110-1301, USA.
29 The GNU General Public License is contained in the file COPYING.
32 /* KNOWN LIMITATIONS 2014-Nov-16
34 * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.
36 Also FP comparison "unordered" .. is implemented as normal FP
37 comparison.
39 Both should be fixed. They behave incorrectly in the presence of
40 NaNs.
42 FMULX is treated the same as FMUL. That's also not correct.
 44  * Floating multiply-add (etc) insns are split into a multiply and
45 an add, and so suffer double rounding and hence sometimes the
46 least significant mantissa bit is incorrect. Fix: use the IR
47 multiply-add IROps instead.
49 * FRINTA, FRINTN are kludged .. they just round to nearest. No special
50 handling for the "ties" case. FRINTX might be dubious too.
52 * Ditto FCVTXN. No idea what "round to odd" means. This implementation
53 just rounds to nearest.
56 /* "Special" instructions.
58 This instruction decoder can decode four special instructions
59 which mean nothing natively (are no-ops as far as regs/mem are
60 concerned) but have meaning for supporting Valgrind. A special
61 instruction is flagged by a 16-byte preamble:
63 93CC0D8C 93CC358C 93CCCD8C 93CCF58C
64 (ror x12, x12, #3; ror x12, x12, #13
65 ror x12, x12, #51; ror x12, x12, #61)
 67    Following that, one of the following 4 is allowed
68 (standard interpretation in parentheses):
70 AA0A014A (orr x10,x10,x10) X3 = client_request ( X4 )
71 AA0B016B (orr x11,x11,x11) X3 = guest_NRADDR
72 AA0C018C (orr x12,x12,x12) branch-and-link-to-noredir X8
73 AA090129 (orr x9,x9,x9) IR injection
75 Any other bytes following the 16-byte preamble are illegal and
76 constitute a failure in instruction decoding. This all assumes
77 that the preamble will never occur except in specific code
78 fragments designed for Valgrind to catch.
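/* [Editor's sketch, not part of the original decoder.]  Recognising the
   preamble amounts to comparing four consecutive little-endian 32-bit
   words against the fixed ROR encodings listed above.  The helper name
   below is hypothetical; it relies on getUIntLittleEndianly, which is
   defined further down in this file. */
static Bool looksLikeSpecialPreamble ( const UChar* code )
{
   return getUIntLittleEndianly(code +  0) == 0x93CC0D8CU   /* ror x12,x12,#3  */
       && getUIntLittleEndianly(code +  4) == 0x93CC358CU   /* ror x12,x12,#13 */
       && getUIntLittleEndianly(code +  8) == 0x93CCCD8CU   /* ror x12,x12,#51 */
       && getUIntLittleEndianly(code + 12) == 0x93CCF58CU;  /* ror x12,x12,#61 */
}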
81 /* Translates ARM64 code to IR. */
83 #include "libvex_basictypes.h"
84 #include "libvex_ir.h"
85 #include "libvex.h"
86 #include "libvex_guest_arm64.h"
88 #include "main_util.h"
89 #include "main_globals.h"
90 #include "guest_generic_bb_to_IR.h"
91 #include "guest_arm64_defs.h"
94 /*------------------------------------------------------------*/
95 /*--- Globals ---*/
96 /*------------------------------------------------------------*/
 98 /* These are set at the start of the translation of an instruction, so
99 that we don't have to pass them around endlessly. CONST means does
100 not change during translation of the instruction.
103 /* CONST: what is the host's endianness? We need to know this in
104 order to do sub-register accesses to the SIMD/FP registers
105 correctly. */
106 static VexEndness host_endness;
108 /* CONST: The guest address for the instruction currently being
109 translated. */
110 static Addr64 guest_PC_curr_instr;
112 /* MOD: The IRSB* into which we're generating code. */
113 static IRSB* irsb;
116 /*------------------------------------------------------------*/
117 /*--- Debugging output ---*/
118 /*------------------------------------------------------------*/
120 #define DIP(format, args...) \
121 if (vex_traceflags & VEX_TRACE_FE) \
122 vex_printf(format, ## args)
124 #define DIS(buf, format, args...) \
125 if (vex_traceflags & VEX_TRACE_FE) \
126 vex_sprintf(buf, format, ## args)
129 /*------------------------------------------------------------*/
130 /*--- Helper bits and pieces for deconstructing the ---*/
131 /*--- arm insn stream. ---*/
132 /*------------------------------------------------------------*/
134 /* Do a little-endian load of a 32-bit word, regardless of the
135 endianness of the underlying host. */
136 static inline UInt getUIntLittleEndianly ( const UChar* p )
138 UInt w = 0;
139 w = (w << 8) | p[3];
140 w = (w << 8) | p[2];
141 w = (w << 8) | p[1];
142 w = (w << 8) | p[0];
143 return w;
146 /* Sign extend a N-bit value up to 64 bits, by copying
147 bit N-1 into all higher positions. */
148 static ULong sx_to_64 ( ULong x, UInt n )
150 vassert(n > 1 && n < 64);
151 x <<= (64-n);
152 Long r = (Long)x;
153 r >>= (64-n);
154 return (ULong)r;
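/* [Editor's worked example.]  With n == 9, bit 8 is the sign bit, so

      sx_to_64(0x1FF, 9) == 0xFFFFFFFFFFFFFFFFULL
      sx_to_64(0x0FF, 9) == 0x00000000000000FFULL
*/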
157 //ZZ /* Do a little-endian load of a 16-bit word, regardless of the
158 //ZZ endianness of the underlying host. */
159 //ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
160 //ZZ {
161 //ZZ UShort w = 0;
162 //ZZ w = (w << 8) | p[1];
163 //ZZ w = (w << 8) | p[0];
164 //ZZ return w;
165 //ZZ }
166 //ZZ
167 //ZZ static UInt ROR32 ( UInt x, UInt sh ) {
168 //ZZ vassert(sh >= 0 && sh < 32);
169 //ZZ if (sh == 0)
170 //ZZ return x;
171 //ZZ else
172 //ZZ return (x << (32-sh)) | (x >> sh);
173 //ZZ }
174 //ZZ
175 //ZZ static Int popcount32 ( UInt x )
176 //ZZ {
177 //ZZ Int res = 0, i;
178 //ZZ for (i = 0; i < 32; i++) {
179 //ZZ res += (x & 1);
180 //ZZ x >>= 1;
181 //ZZ }
182 //ZZ return res;
183 //ZZ }
184 //ZZ
185 //ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
186 //ZZ {
187 //ZZ UInt mask = 1 << ix;
188 //ZZ x &= ~mask;
189 //ZZ x |= ((b << ix) & mask);
190 //ZZ return x;
191 //ZZ }
193 #define BITS2(_b1,_b0) \
194 (((_b1) << 1) | (_b0))
196 #define BITS3(_b2,_b1,_b0) \
197 (((_b2) << 2) | ((_b1) << 1) | (_b0))
199 #define BITS4(_b3,_b2,_b1,_b0) \
200 (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))
202 #define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
203 ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4) \
204 | BITS4((_b3),(_b2),(_b1),(_b0)))
206 #define BITS5(_b4,_b3,_b2,_b1,_b0) \
207 (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
208 #define BITS6(_b5,_b4,_b3,_b2,_b1,_b0) \
209 (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
210 #define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
211 (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
213 #define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
214 (((_b8) << 8) \
215 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
217 #define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
218 (((_b9) << 9) | ((_b8) << 8) \
219 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
221 #define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
222 (((_b10) << 10) \
223 | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
225 #define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
226 (((_b11) << 11) \
227 | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
229 #define X00 BITS2(0,0)
230 #define X01 BITS2(0,1)
231 #define X10 BITS2(1,0)
232 #define X11 BITS2(1,1)
234 // produces _uint[_bMax:_bMin]
235 #define SLICE_UInt(_uint,_bMax,_bMin) \
236 (( ((UInt)(_uint)) >> (_bMin)) \
237 & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
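/* [Editor's usage sketch.]  These macros are how the decoder below
   pattern-matches fixed bit fields and extracts register numbers.  For
   example, with insn = 0x8B020020, the encoding of "add x0, x1, x2":

      SLICE_UInt(insn, 31, 24) == BITS8(1,0,0,0,1,0,1,1)   (0x8B)
      SLICE_UInt(insn, 20, 16) == 2    (Rm = x2)
      SLICE_UInt(insn,  9,  5) == 1    (Rn = x1)
      SLICE_UInt(insn,  4,  0) == 0    (Rd = x0)
*/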
240 /*------------------------------------------------------------*/
241 /*--- Helper bits and pieces for creating IR fragments. ---*/
242 /*------------------------------------------------------------*/
244 static IRExpr* mkV128 ( UShort w )
246 return IRExpr_Const(IRConst_V128(w));
249 static IRExpr* mkU64 ( ULong i )
251 return IRExpr_Const(IRConst_U64(i));
254 static IRExpr* mkU32 ( UInt i )
256 return IRExpr_Const(IRConst_U32(i));
259 static IRExpr* mkU16 ( UInt i )
261 vassert(i < 65536);
262 return IRExpr_Const(IRConst_U16(i));
265 static IRExpr* mkU8 ( UInt i )
267 vassert(i < 256);
268 return IRExpr_Const(IRConst_U8( (UChar)i ));
271 static IRExpr* mkexpr ( IRTemp tmp )
273 return IRExpr_RdTmp(tmp);
276 static IRExpr* unop ( IROp op, IRExpr* a )
278 return IRExpr_Unop(op, a);
281 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
283 return IRExpr_Binop(op, a1, a2);
286 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
288 return IRExpr_Triop(op, a1, a2, a3);
291 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
293 return IRExpr_Load(Iend_LE, ty, addr);
 296 /* Add a statement to the list held by "irsb". */
297 static void stmt ( IRStmt* st )
299 addStmtToIRSB( irsb, st );
302 static void assign ( IRTemp dst, IRExpr* e )
304 stmt( IRStmt_WrTmp(dst, e) );
307 static void storeLE ( IRExpr* addr, IRExpr* data )
309 stmt( IRStmt_Store(Iend_LE, addr, data) );
312 //ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
313 //ZZ {
314 //ZZ if (guardT == IRTemp_INVALID) {
315 //ZZ /* unconditional */
316 //ZZ storeLE(addr, data);
317 //ZZ } else {
318 //ZZ stmt( IRStmt_StoreG(Iend_LE, addr, data,
319 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
320 //ZZ }
321 //ZZ }
322 //ZZ
323 //ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
324 //ZZ IRExpr* addr, IRExpr* alt,
325 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
326 //ZZ {
327 //ZZ if (guardT == IRTemp_INVALID) {
328 //ZZ /* unconditional */
329 //ZZ IRExpr* loaded = NULL;
330 //ZZ switch (cvt) {
331 //ZZ case ILGop_Ident32:
332 //ZZ loaded = loadLE(Ity_I32, addr); break;
333 //ZZ case ILGop_8Uto32:
334 //ZZ loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
335 //ZZ case ILGop_8Sto32:
336 //ZZ loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
337 //ZZ case ILGop_16Uto32:
338 //ZZ loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
339 //ZZ case ILGop_16Sto32:
340 //ZZ loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
341 //ZZ default:
342 //ZZ vassert(0);
343 //ZZ }
344 //ZZ vassert(loaded != NULL);
345 //ZZ assign(dst, loaded);
346 //ZZ } else {
347 //ZZ /* Generate a guarded load into 'dst', but apply 'cvt' to the
348 //ZZ loaded data before putting the data in 'dst'. If the load
349 //ZZ does not take place, 'alt' is placed directly in 'dst'. */
350 //ZZ stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
351 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
352 //ZZ }
353 //ZZ }
355 /* Generate a new temporary of the given type. */
356 static IRTemp newTemp ( IRType ty )
358 vassert(isPlausibleIRType(ty));
359 return newIRTemp( irsb->tyenv, ty );
362 /* This is used in many places, so the brevity is an advantage. */
363 static IRTemp newTempV128(void)
365 return newTemp(Ity_V128);
368 /* Initialise V128 temporaries en masse. */
369 static
370 void newTempsV128_2(IRTemp* t1, IRTemp* t2)
372 vassert(t1 && *t1 == IRTemp_INVALID);
373 vassert(t2 && *t2 == IRTemp_INVALID);
374 *t1 = newTempV128();
375 *t2 = newTempV128();
378 static
379 void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
381 vassert(t1 && *t1 == IRTemp_INVALID);
382 vassert(t2 && *t2 == IRTemp_INVALID);
383 vassert(t3 && *t3 == IRTemp_INVALID);
384 *t1 = newTempV128();
385 *t2 = newTempV128();
386 *t3 = newTempV128();
389 static
390 void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
392 vassert(t1 && *t1 == IRTemp_INVALID);
393 vassert(t2 && *t2 == IRTemp_INVALID);
394 vassert(t3 && *t3 == IRTemp_INVALID);
395 vassert(t4 && *t4 == IRTemp_INVALID);
396 *t1 = newTempV128();
397 *t2 = newTempV128();
398 *t3 = newTempV128();
399 *t4 = newTempV128();
402 static
403 void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
404 IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
406 vassert(t1 && *t1 == IRTemp_INVALID);
407 vassert(t2 && *t2 == IRTemp_INVALID);
408 vassert(t3 && *t3 == IRTemp_INVALID);
409 vassert(t4 && *t4 == IRTemp_INVALID);
410 vassert(t5 && *t5 == IRTemp_INVALID);
411 vassert(t6 && *t6 == IRTemp_INVALID);
412 vassert(t7 && *t7 == IRTemp_INVALID);
413 *t1 = newTempV128();
414 *t2 = newTempV128();
415 *t3 = newTempV128();
416 *t4 = newTempV128();
417 *t5 = newTempV128();
418 *t6 = newTempV128();
419 *t7 = newTempV128();
422 //ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
423 //ZZ IRRoundingMode. */
424 //ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
425 //ZZ {
426 //ZZ return mkU32(Irrm_NEAREST);
427 //ZZ }
428 //ZZ
429 //ZZ /* Generate an expression for SRC rotated right by ROT. */
430 //ZZ static IRExpr* genROR32( IRTemp src, Int rot )
431 //ZZ {
432 //ZZ vassert(rot >= 0 && rot < 32);
433 //ZZ if (rot == 0)
434 //ZZ return mkexpr(src);
435 //ZZ return
436 //ZZ binop(Iop_Or32,
437 //ZZ binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
438 //ZZ binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
439 //ZZ }
440 //ZZ
441 //ZZ static IRExpr* mkU128 ( ULong i )
442 //ZZ {
443 //ZZ return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
444 //ZZ }
445 //ZZ
446 //ZZ /* Generate a 4-aligned version of the given expression if
447 //ZZ the given condition is true. Else return it unchanged. */
448 //ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
449 //ZZ {
450 //ZZ if (b)
451 //ZZ return binop(Iop_And32, e, mkU32(~3));
452 //ZZ else
453 //ZZ return e;
454 //ZZ }
456 /* Other IR construction helpers. */
457 static IROp mkAND ( IRType ty ) {
458 switch (ty) {
459 case Ity_I32: return Iop_And32;
460 case Ity_I64: return Iop_And64;
461 default: vpanic("mkAND");
465 static IROp mkOR ( IRType ty ) {
466 switch (ty) {
467 case Ity_I32: return Iop_Or32;
468 case Ity_I64: return Iop_Or64;
469 default: vpanic("mkOR");
473 static IROp mkXOR ( IRType ty ) {
474 switch (ty) {
475 case Ity_I32: return Iop_Xor32;
476 case Ity_I64: return Iop_Xor64;
477 default: vpanic("mkXOR");
481 static IROp mkSHL ( IRType ty ) {
482 switch (ty) {
483 case Ity_I32: return Iop_Shl32;
484 case Ity_I64: return Iop_Shl64;
485 default: vpanic("mkSHL");
489 static IROp mkSHR ( IRType ty ) {
490 switch (ty) {
491 case Ity_I32: return Iop_Shr32;
492 case Ity_I64: return Iop_Shr64;
493 default: vpanic("mkSHR");
497 static IROp mkSAR ( IRType ty ) {
498 switch (ty) {
499 case Ity_I32: return Iop_Sar32;
500 case Ity_I64: return Iop_Sar64;
501 default: vpanic("mkSAR");
505 static IROp mkNOT ( IRType ty ) {
506 switch (ty) {
507 case Ity_I32: return Iop_Not32;
508 case Ity_I64: return Iop_Not64;
509 default: vpanic("mkNOT");
513 static IROp mkADD ( IRType ty ) {
514 switch (ty) {
515 case Ity_I32: return Iop_Add32;
516 case Ity_I64: return Iop_Add64;
517 default: vpanic("mkADD");
521 static IROp mkSUB ( IRType ty ) {
522 switch (ty) {
523 case Ity_I32: return Iop_Sub32;
524 case Ity_I64: return Iop_Sub64;
525 default: vpanic("mkSUB");
529 static IROp mkADDF ( IRType ty ) {
530 switch (ty) {
531 case Ity_F32: return Iop_AddF32;
532 case Ity_F64: return Iop_AddF64;
533 default: vpanic("mkADDF");
537 static IROp mkSUBF ( IRType ty ) {
538 switch (ty) {
539 case Ity_F32: return Iop_SubF32;
540 case Ity_F64: return Iop_SubF64;
541 default: vpanic("mkSUBF");
545 static IROp mkMULF ( IRType ty ) {
546 switch (ty) {
547 case Ity_F32: return Iop_MulF32;
548 case Ity_F64: return Iop_MulF64;
549 default: vpanic("mkMULF");
553 static IROp mkDIVF ( IRType ty ) {
554 switch (ty) {
555 case Ity_F32: return Iop_DivF32;
556 case Ity_F64: return Iop_DivF64;
 557       default: vpanic("mkDIVF");
561 static IROp mkNEGF ( IRType ty ) {
562 switch (ty) {
563 case Ity_F32: return Iop_NegF32;
564 case Ity_F64: return Iop_NegF64;
565 default: vpanic("mkNEGF");
569 static IROp mkABSF ( IRType ty ) {
570 switch (ty) {
571 case Ity_F32: return Iop_AbsF32;
572 case Ity_F64: return Iop_AbsF64;
 573       default: vpanic("mkABSF");
577 static IROp mkSQRTF ( IRType ty ) {
578 switch (ty) {
579 case Ity_F32: return Iop_SqrtF32;
580 case Ity_F64: return Iop_SqrtF64;
 581       default: vpanic("mkSQRTF");
585 static IROp mkVecADD ( UInt size ) {
586 const IROp ops[4]
587 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
588 vassert(size < 4);
589 return ops[size];
592 static IROp mkVecQADDU ( UInt size ) {
593 const IROp ops[4]
594 = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
595 vassert(size < 4);
596 return ops[size];
599 static IROp mkVecQADDS ( UInt size ) {
600 const IROp ops[4]
601 = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
602 vassert(size < 4);
603 return ops[size];
606 static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
607 const IROp ops[4]
608 = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
609 Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
610 vassert(size < 4);
611 return ops[size];
614 static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
615 const IROp ops[4]
616 = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
617 Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
618 vassert(size < 4);
619 return ops[size];
622 static IROp mkVecSUB ( UInt size ) {
623 const IROp ops[4]
624 = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
625 vassert(size < 4);
626 return ops[size];
629 static IROp mkVecQSUBU ( UInt size ) {
630 const IROp ops[4]
631 = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
632 vassert(size < 4);
633 return ops[size];
636 static IROp mkVecQSUBS ( UInt size ) {
637 const IROp ops[4]
638 = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
639 vassert(size < 4);
640 return ops[size];
643 static IROp mkVecSARN ( UInt size ) {
644 const IROp ops[4]
645 = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
646 vassert(size < 4);
647 return ops[size];
650 static IROp mkVecSHRN ( UInt size ) {
651 const IROp ops[4]
652 = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
653 vassert(size < 4);
654 return ops[size];
657 static IROp mkVecSHLN ( UInt size ) {
658 const IROp ops[4]
659 = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
660 vassert(size < 4);
661 return ops[size];
664 static IROp mkVecCATEVENLANES ( UInt size ) {
665 const IROp ops[4]
666 = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
667 Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
668 vassert(size < 4);
669 return ops[size];
672 static IROp mkVecCATODDLANES ( UInt size ) {
673 const IROp ops[4]
674 = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
675 Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
676 vassert(size < 4);
677 return ops[size];
680 static IROp mkVecINTERLEAVELO ( UInt size ) {
681 const IROp ops[4]
682 = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
683 Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
684 vassert(size < 4);
685 return ops[size];
688 static IROp mkVecINTERLEAVEHI ( UInt size ) {
689 const IROp ops[4]
690 = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
691 Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
692 vassert(size < 4);
693 return ops[size];
696 static IROp mkVecMAXU ( UInt size ) {
697 const IROp ops[4]
698 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
699 vassert(size < 4);
700 return ops[size];
703 static IROp mkVecMAXS ( UInt size ) {
704 const IROp ops[4]
705 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
706 vassert(size < 4);
707 return ops[size];
710 static IROp mkVecMINU ( UInt size ) {
711 const IROp ops[4]
712 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
713 vassert(size < 4);
714 return ops[size];
717 static IROp mkVecMINS ( UInt size ) {
718 const IROp ops[4]
719 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
720 vassert(size < 4);
721 return ops[size];
724 static IROp mkVecMUL ( UInt size ) {
725 const IROp ops[4]
726 = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
727 vassert(size < 3);
728 return ops[size];
731 static IROp mkVecMULLU ( UInt sizeNarrow ) {
732 const IROp ops[4]
733 = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
734 vassert(sizeNarrow < 3);
735 return ops[sizeNarrow];
738 static IROp mkVecMULLS ( UInt sizeNarrow ) {
739 const IROp ops[4]
740 = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
741 vassert(sizeNarrow < 3);
742 return ops[sizeNarrow];
745 static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
746 const IROp ops[4]
747 = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
748 vassert(sizeNarrow < 3);
749 return ops[sizeNarrow];
752 static IROp mkVecCMPEQ ( UInt size ) {
753 const IROp ops[4]
754 = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
755 vassert(size < 4);
756 return ops[size];
759 static IROp mkVecCMPGTU ( UInt size ) {
760 const IROp ops[4]
761 = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
762 vassert(size < 4);
763 return ops[size];
766 static IROp mkVecCMPGTS ( UInt size ) {
767 const IROp ops[4]
768 = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
769 vassert(size < 4);
770 return ops[size];
773 static IROp mkVecABS ( UInt size ) {
774 const IROp ops[4]
775 = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
776 vassert(size < 4);
777 return ops[size];
780 static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
781 const IROp ops[4]
782 = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
783 Iop_ZeroHI96ofV128, Iop_ZeroHI64ofV128 };
784 vassert(size < 4);
785 return ops[size];
788 static IRExpr* mkU ( IRType ty, ULong imm ) {
789 switch (ty) {
790 case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
791 case Ity_I64: return mkU64(imm);
792 default: vpanic("mkU");
796 static IROp mkVecQDMULHIS ( UInt size ) {
797 const IROp ops[4]
798 = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
799 vassert(size < 4);
800 return ops[size];
803 static IROp mkVecQRDMULHIS ( UInt size ) {
804 const IROp ops[4]
805 = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
806 vassert(size < 4);
807 return ops[size];
810 static IROp mkVecQANDUQSH ( UInt size ) {
811 const IROp ops[4]
812 = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
813 Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
814 vassert(size < 4);
815 return ops[size];
818 static IROp mkVecQANDSQSH ( UInt size ) {
819 const IROp ops[4]
820 = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
821 Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
822 vassert(size < 4);
823 return ops[size];
826 static IROp mkVecQANDUQRSH ( UInt size ) {
827 const IROp ops[4]
828 = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
829 Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
830 vassert(size < 4);
831 return ops[size];
834 static IROp mkVecQANDSQRSH ( UInt size ) {
835 const IROp ops[4]
836 = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
837 Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
838 vassert(size < 4);
839 return ops[size];
842 static IROp mkVecSHU ( UInt size ) {
843 const IROp ops[4]
844 = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
845 vassert(size < 4);
846 return ops[size];
849 static IROp mkVecSHS ( UInt size ) {
850 const IROp ops[4]
851 = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
852 vassert(size < 4);
853 return ops[size];
856 static IROp mkVecRSHU ( UInt size ) {
857 const IROp ops[4]
858 = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
859 vassert(size < 4);
860 return ops[size];
863 static IROp mkVecRSHS ( UInt size ) {
864 const IROp ops[4]
865 = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
866 vassert(size < 4);
867 return ops[size];
870 static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
871 const IROp ops[4]
872 = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
873 Iop_NarrowUn64to32x2, Iop_INVALID };
874 vassert(sizeNarrow < 4);
875 return ops[sizeNarrow];
878 static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
879 const IROp ops[4]
880 = { Iop_QNarrowUn16Sto8Ux8, Iop_QNarrowUn32Sto16Ux4,
881 Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
882 vassert(sizeNarrow < 4);
883 return ops[sizeNarrow];
886 static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
887 const IROp ops[4]
888 = { Iop_QNarrowUn16Sto8Sx8, Iop_QNarrowUn32Sto16Sx4,
889 Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
890 vassert(sizeNarrow < 4);
891 return ops[sizeNarrow];
894 static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
895 const IROp ops[4]
896 = { Iop_QNarrowUn16Uto8Ux8, Iop_QNarrowUn32Uto16Ux4,
897 Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
898 vassert(sizeNarrow < 4);
899 return ops[sizeNarrow];
902 static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
903 const IROp ops[4]
904 = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4,
905 Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
906 vassert(sizeNarrow < 4);
907 return ops[sizeNarrow];
910 static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
911 const IROp ops[4]
912 = { Iop_QandQSarNnarrow16Sto8Sx8, Iop_QandQSarNnarrow32Sto16Sx4,
913 Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
914 vassert(sizeNarrow < 4);
915 return ops[sizeNarrow];
918 static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
919 const IROp ops[4]
920 = { Iop_QandQSarNnarrow16Sto8Ux8, Iop_QandQSarNnarrow32Sto16Ux4,
921 Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
922 vassert(sizeNarrow < 4);
923 return ops[sizeNarrow];
926 static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
927 const IROp ops[4]
928 = { Iop_QandQRShrNnarrow16Uto8Ux8, Iop_QandQRShrNnarrow32Uto16Ux4,
929 Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
930 vassert(sizeNarrow < 4);
931 return ops[sizeNarrow];
934 static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
935 const IROp ops[4]
936 = { Iop_QandQRSarNnarrow16Sto8Sx8, Iop_QandQRSarNnarrow32Sto16Sx4,
937 Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
938 vassert(sizeNarrow < 4);
939 return ops[sizeNarrow];
942 static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
943 const IROp ops[4]
944 = { Iop_QandQRSarNnarrow16Sto8Ux8, Iop_QandQRSarNnarrow32Sto16Ux4,
945 Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
946 vassert(sizeNarrow < 4);
947 return ops[sizeNarrow];
950 static IROp mkVecQSHLNSATUU ( UInt size ) {
951 const IROp ops[4]
952 = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
953 Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
954 vassert(size < 4);
955 return ops[size];
958 static IROp mkVecQSHLNSATSS ( UInt size ) {
959 const IROp ops[4]
960 = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
961 Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
962 vassert(size < 4);
963 return ops[size];
966 static IROp mkVecQSHLNSATSU ( UInt size ) {
967 const IROp ops[4]
968 = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
969 Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
970 vassert(size < 4);
971 return ops[size];
974 static IROp mkVecADDF ( UInt size ) {
975 const IROp ops[4]
976 = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 };
977 vassert(size < 4);
978 return ops[size];
981 static IROp mkVecMAXF ( UInt size ) {
982 const IROp ops[4]
983 = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
984 vassert(size < 4);
985 return ops[size];
988 static IROp mkVecMINF ( UInt size ) {
989 const IROp ops[4]
990 = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
991 vassert(size < 4);
992 return ops[size];
995 /* Generate IR to create 'arg rotated right by imm', for sane values
996 of 'ty' and 'imm'. */
997 static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
999 UInt w = 0;
1000 if (ty == Ity_I64) {
1001 w = 64;
1002 } else {
1003 vassert(ty == Ity_I32);
1004 w = 32;
1006 vassert(w != 0);
1007 vassert(imm < w);
1008 if (imm == 0) {
1009 return arg;
1011 IRTemp res = newTemp(ty);
1012 assign(res, binop(mkOR(ty),
1013 binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
1014 binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
1015 return res;
1018 /* Generate IR to set the returned temp to either all-zeroes or
1019 all ones, as a copy of arg<imm>. */
1020 static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
1022 UInt w = 0;
1023 if (ty == Ity_I64) {
1024 w = 64;
1025 } else {
1026 vassert(ty == Ity_I32);
1027 w = 32;
1029 vassert(w != 0);
1030 vassert(imm < w);
1031 IRTemp res = newTemp(ty);
1032 assign(res, binop(mkSAR(ty),
1033 binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
1034 mkU8(w - 1)));
1035 return res;
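/* [Editor's worked example.]  Assuming a 32-bit temp t that holds
   0x000000FF:

      mathROR(Ity_I32, t, 8)         computes 0xFF000000
      mathREPLICATE(Ity_I32, t, 7)   computes 0xFFFFFFFF  (bit 7 of 0xFF is 1)
      mathREPLICATE(Ity_I32, t, 8)   computes 0x00000000  (bit 8 of 0xFF is 0)
*/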
1038 /* U-widen 8/16/32/64 bit int expr to 64. */
1039 static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
1041 switch (srcTy) {
1042 case Ity_I64: return e;
1043 case Ity_I32: return unop(Iop_32Uto64, e);
1044 case Ity_I16: return unop(Iop_16Uto64, e);
1045 case Ity_I8: return unop(Iop_8Uto64, e);
1046 default: vpanic("widenUto64(arm64)");
1050 /* Narrow 64 bit int expr to 8/16/32/64. Clearly only some
1051 of these combinations make sense. */
1052 static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
1054 switch (dstTy) {
1055 case Ity_I64: return e;
1056 case Ity_I32: return unop(Iop_64to32, e);
1057 case Ity_I16: return unop(Iop_64to16, e);
1058 case Ity_I8: return unop(Iop_64to8, e);
1059 default: vpanic("narrowFrom64(arm64)");
1064 /*------------------------------------------------------------*/
1065 /*--- Helpers for accessing guest registers. ---*/
1066 /*------------------------------------------------------------*/
1068 #define OFFB_X0 offsetof(VexGuestARM64State,guest_X0)
1069 #define OFFB_X1 offsetof(VexGuestARM64State,guest_X1)
1070 #define OFFB_X2 offsetof(VexGuestARM64State,guest_X2)
1071 #define OFFB_X3 offsetof(VexGuestARM64State,guest_X3)
1072 #define OFFB_X4 offsetof(VexGuestARM64State,guest_X4)
1073 #define OFFB_X5 offsetof(VexGuestARM64State,guest_X5)
1074 #define OFFB_X6 offsetof(VexGuestARM64State,guest_X6)
1075 #define OFFB_X7 offsetof(VexGuestARM64State,guest_X7)
1076 #define OFFB_X8 offsetof(VexGuestARM64State,guest_X8)
1077 #define OFFB_X9 offsetof(VexGuestARM64State,guest_X9)
1078 #define OFFB_X10 offsetof(VexGuestARM64State,guest_X10)
1079 #define OFFB_X11 offsetof(VexGuestARM64State,guest_X11)
1080 #define OFFB_X12 offsetof(VexGuestARM64State,guest_X12)
1081 #define OFFB_X13 offsetof(VexGuestARM64State,guest_X13)
1082 #define OFFB_X14 offsetof(VexGuestARM64State,guest_X14)
1083 #define OFFB_X15 offsetof(VexGuestARM64State,guest_X15)
1084 #define OFFB_X16 offsetof(VexGuestARM64State,guest_X16)
1085 #define OFFB_X17 offsetof(VexGuestARM64State,guest_X17)
1086 #define OFFB_X18 offsetof(VexGuestARM64State,guest_X18)
1087 #define OFFB_X19 offsetof(VexGuestARM64State,guest_X19)
1088 #define OFFB_X20 offsetof(VexGuestARM64State,guest_X20)
1089 #define OFFB_X21 offsetof(VexGuestARM64State,guest_X21)
1090 #define OFFB_X22 offsetof(VexGuestARM64State,guest_X22)
1091 #define OFFB_X23 offsetof(VexGuestARM64State,guest_X23)
1092 #define OFFB_X24 offsetof(VexGuestARM64State,guest_X24)
1093 #define OFFB_X25 offsetof(VexGuestARM64State,guest_X25)
1094 #define OFFB_X26 offsetof(VexGuestARM64State,guest_X26)
1095 #define OFFB_X27 offsetof(VexGuestARM64State,guest_X27)
1096 #define OFFB_X28 offsetof(VexGuestARM64State,guest_X28)
1097 #define OFFB_X29 offsetof(VexGuestARM64State,guest_X29)
1098 #define OFFB_X30 offsetof(VexGuestARM64State,guest_X30)
1100 #define OFFB_XSP offsetof(VexGuestARM64State,guest_XSP)
1101 #define OFFB_PC offsetof(VexGuestARM64State,guest_PC)
1103 #define OFFB_CC_OP offsetof(VexGuestARM64State,guest_CC_OP)
1104 #define OFFB_CC_DEP1 offsetof(VexGuestARM64State,guest_CC_DEP1)
1105 #define OFFB_CC_DEP2 offsetof(VexGuestARM64State,guest_CC_DEP2)
1106 #define OFFB_CC_NDEP offsetof(VexGuestARM64State,guest_CC_NDEP)
1108 #define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
1109 #define OFFB_NRADDR offsetof(VexGuestARM64State,guest_NRADDR)
1111 #define OFFB_Q0 offsetof(VexGuestARM64State,guest_Q0)
1112 #define OFFB_Q1 offsetof(VexGuestARM64State,guest_Q1)
1113 #define OFFB_Q2 offsetof(VexGuestARM64State,guest_Q2)
1114 #define OFFB_Q3 offsetof(VexGuestARM64State,guest_Q3)
1115 #define OFFB_Q4 offsetof(VexGuestARM64State,guest_Q4)
1116 #define OFFB_Q5 offsetof(VexGuestARM64State,guest_Q5)
1117 #define OFFB_Q6 offsetof(VexGuestARM64State,guest_Q6)
1118 #define OFFB_Q7 offsetof(VexGuestARM64State,guest_Q7)
1119 #define OFFB_Q8 offsetof(VexGuestARM64State,guest_Q8)
1120 #define OFFB_Q9 offsetof(VexGuestARM64State,guest_Q9)
1121 #define OFFB_Q10 offsetof(VexGuestARM64State,guest_Q10)
1122 #define OFFB_Q11 offsetof(VexGuestARM64State,guest_Q11)
1123 #define OFFB_Q12 offsetof(VexGuestARM64State,guest_Q12)
1124 #define OFFB_Q13 offsetof(VexGuestARM64State,guest_Q13)
1125 #define OFFB_Q14 offsetof(VexGuestARM64State,guest_Q14)
1126 #define OFFB_Q15 offsetof(VexGuestARM64State,guest_Q15)
1127 #define OFFB_Q16 offsetof(VexGuestARM64State,guest_Q16)
1128 #define OFFB_Q17 offsetof(VexGuestARM64State,guest_Q17)
1129 #define OFFB_Q18 offsetof(VexGuestARM64State,guest_Q18)
1130 #define OFFB_Q19 offsetof(VexGuestARM64State,guest_Q19)
1131 #define OFFB_Q20 offsetof(VexGuestARM64State,guest_Q20)
1132 #define OFFB_Q21 offsetof(VexGuestARM64State,guest_Q21)
1133 #define OFFB_Q22 offsetof(VexGuestARM64State,guest_Q22)
1134 #define OFFB_Q23 offsetof(VexGuestARM64State,guest_Q23)
1135 #define OFFB_Q24 offsetof(VexGuestARM64State,guest_Q24)
1136 #define OFFB_Q25 offsetof(VexGuestARM64State,guest_Q25)
1137 #define OFFB_Q26 offsetof(VexGuestARM64State,guest_Q26)
1138 #define OFFB_Q27 offsetof(VexGuestARM64State,guest_Q27)
1139 #define OFFB_Q28 offsetof(VexGuestARM64State,guest_Q28)
1140 #define OFFB_Q29 offsetof(VexGuestARM64State,guest_Q29)
1141 #define OFFB_Q30 offsetof(VexGuestARM64State,guest_Q30)
1142 #define OFFB_Q31 offsetof(VexGuestARM64State,guest_Q31)
1144 #define OFFB_FPCR offsetof(VexGuestARM64State,guest_FPCR)
1145 #define OFFB_QCFLAG offsetof(VexGuestARM64State,guest_QCFLAG)
1147 #define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART)
1148 #define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN)
1150 #define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
1151 #define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
1152 #define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
1155 /* ---------------- Integer registers ---------------- */
1157 static Int offsetIReg64 ( UInt iregNo )
1159 /* Do we care about endianness here? We do if sub-parts of integer
1160 registers are accessed. */
1161 switch (iregNo) {
1162 case 0: return OFFB_X0;
1163 case 1: return OFFB_X1;
1164 case 2: return OFFB_X2;
1165 case 3: return OFFB_X3;
1166 case 4: return OFFB_X4;
1167 case 5: return OFFB_X5;
1168 case 6: return OFFB_X6;
1169 case 7: return OFFB_X7;
1170 case 8: return OFFB_X8;
1171 case 9: return OFFB_X9;
1172 case 10: return OFFB_X10;
1173 case 11: return OFFB_X11;
1174 case 12: return OFFB_X12;
1175 case 13: return OFFB_X13;
1176 case 14: return OFFB_X14;
1177 case 15: return OFFB_X15;
1178 case 16: return OFFB_X16;
1179 case 17: return OFFB_X17;
1180 case 18: return OFFB_X18;
1181 case 19: return OFFB_X19;
1182 case 20: return OFFB_X20;
1183 case 21: return OFFB_X21;
1184 case 22: return OFFB_X22;
1185 case 23: return OFFB_X23;
1186 case 24: return OFFB_X24;
1187 case 25: return OFFB_X25;
1188 case 26: return OFFB_X26;
1189 case 27: return OFFB_X27;
1190 case 28: return OFFB_X28;
1191 case 29: return OFFB_X29;
1192 case 30: return OFFB_X30;
1193 /* but not 31 */
1194 default: vassert(0);
1198 static Int offsetIReg64orSP ( UInt iregNo )
1200 return iregNo == 31 ? OFFB_XSP : offsetIReg64(iregNo);
1203 static const HChar* nameIReg64orZR ( UInt iregNo )
1205 vassert(iregNo < 32);
1206 static const HChar* names[32]
1207 = { "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
1208 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
1209 "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
1210 "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
1211 return names[iregNo];
1214 static const HChar* nameIReg64orSP ( UInt iregNo )
1216 if (iregNo == 31) {
1217 return "sp";
1219 vassert(iregNo < 31);
1220 return nameIReg64orZR(iregNo);
1223 static IRExpr* getIReg64orSP ( UInt iregNo )
1225 vassert(iregNo < 32);
1226 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1229 static IRExpr* getIReg64orZR ( UInt iregNo )
1231 if (iregNo == 31) {
1232 return mkU64(0);
1234 vassert(iregNo < 31);
1235 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1238 static void putIReg64orSP ( UInt iregNo, IRExpr* e )
1240 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1241 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1244 static void putIReg64orZR ( UInt iregNo, IRExpr* e )
1246 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1247 if (iregNo == 31) {
1248 return;
1250 vassert(iregNo < 31);
1251 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1254 static const HChar* nameIReg32orZR ( UInt iregNo )
1256 vassert(iregNo < 32);
1257 static const HChar* names[32]
1258 = { "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7",
1259 "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15",
1260 "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
1261 "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
1262 return names[iregNo];
1265 static const HChar* nameIReg32orSP ( UInt iregNo )
1267 if (iregNo == 31) {
1268 return "wsp";
1270 vassert(iregNo < 31);
1271 return nameIReg32orZR(iregNo);
1274 static IRExpr* getIReg32orSP ( UInt iregNo )
1276 vassert(iregNo < 32);
1277 return unop(Iop_64to32,
1278 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1281 static IRExpr* getIReg32orZR ( UInt iregNo )
1283 if (iregNo == 31) {
1284 return mkU32(0);
1286 vassert(iregNo < 31);
1287 return unop(Iop_64to32,
1288 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1291 static void putIReg32orSP ( UInt iregNo, IRExpr* e )
1293 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1294 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1297 static void putIReg32orZR ( UInt iregNo, IRExpr* e )
1299 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1300 if (iregNo == 31) {
1301 return;
1303 vassert(iregNo < 31);
1304 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1307 static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
1309 vassert(is64 == True || is64 == False);
1310 return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
1313 static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
1315 vassert(is64 == True || is64 == False);
1316 return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
1319 static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
1321 vassert(is64 == True || is64 == False);
1322 return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
1325 static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
1327 vassert(is64 == True || is64 == False);
1328 if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
1331 static void putPC ( IRExpr* e )
1333 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1334 stmt( IRStmt_Put(OFFB_PC, e) );
1338 /* ---------------- Vector (Q) registers ---------------- */
1340 static Int offsetQReg128 ( UInt qregNo )
1342 /* We don't care about endianness at this point. It only becomes
1343 relevant when dealing with sections of these registers.*/
1344 switch (qregNo) {
1345 case 0: return OFFB_Q0;
1346 case 1: return OFFB_Q1;
1347 case 2: return OFFB_Q2;
1348 case 3: return OFFB_Q3;
1349 case 4: return OFFB_Q4;
1350 case 5: return OFFB_Q5;
1351 case 6: return OFFB_Q6;
1352 case 7: return OFFB_Q7;
1353 case 8: return OFFB_Q8;
1354 case 9: return OFFB_Q9;
1355 case 10: return OFFB_Q10;
1356 case 11: return OFFB_Q11;
1357 case 12: return OFFB_Q12;
1358 case 13: return OFFB_Q13;
1359 case 14: return OFFB_Q14;
1360 case 15: return OFFB_Q15;
1361 case 16: return OFFB_Q16;
1362 case 17: return OFFB_Q17;
1363 case 18: return OFFB_Q18;
1364 case 19: return OFFB_Q19;
1365 case 20: return OFFB_Q20;
1366 case 21: return OFFB_Q21;
1367 case 22: return OFFB_Q22;
1368 case 23: return OFFB_Q23;
1369 case 24: return OFFB_Q24;
1370 case 25: return OFFB_Q25;
1371 case 26: return OFFB_Q26;
1372 case 27: return OFFB_Q27;
1373 case 28: return OFFB_Q28;
1374 case 29: return OFFB_Q29;
1375 case 30: return OFFB_Q30;
1376 case 31: return OFFB_Q31;
1377 default: vassert(0);
1381 /* Write to a complete Qreg. */
1382 static void putQReg128 ( UInt qregNo, IRExpr* e )
1384 vassert(qregNo < 32);
1385 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
1386 stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
1389 /* Read a complete Qreg. */
1390 static IRExpr* getQReg128 ( UInt qregNo )
1392 vassert(qregNo < 32);
1393 return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
1396 /* Produce the IR type for some sub-part of a vector. For 32- and 64-
1397 bit sub-parts we can choose either integer or float types, and
1398 choose float on the basis that that is the common use case and so
1399 will give least interference with Put-to-Get forwarding later
1400 on. */
1401 static IRType preferredVectorSubTypeFromSize ( UInt szB )
1403 switch (szB) {
1404 case 1: return Ity_I8;
1405 case 2: return Ity_I16;
1406 case 4: return Ity_I32; //Ity_F32;
1407 case 8: return Ity_F64;
1408 case 16: return Ity_V128;
1409 default: vassert(0);
1413 /* Find the offset of the laneNo'th lane of type laneTy in the given
1414 Qreg. Since the host is little-endian, the least significant lane
1415 has the lowest offset. */
1416 static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
1418 vassert(host_endness == VexEndnessLE);
1419 Int base = offsetQReg128(qregNo);
1420 /* Since the host is little-endian, the least significant lane
1421 will be at the lowest address. */
1422 /* Restrict this to known types, so as to avoid silently accepting
1423 stupid types. */
1424 UInt laneSzB = 0;
1425 switch (laneTy) {
1426 case Ity_I8: laneSzB = 1; break;
1427 case Ity_F16: case Ity_I16: laneSzB = 2; break;
1428 case Ity_F32: case Ity_I32: laneSzB = 4; break;
1429 case Ity_F64: case Ity_I64: laneSzB = 8; break;
1430 case Ity_V128: laneSzB = 16; break;
1431 default: break;
1433 vassert(laneSzB > 0);
1434 UInt minOff = laneNo * laneSzB;
1435 UInt maxOff = minOff + laneSzB - 1;
1436 vassert(maxOff < 16);
1437 return base + minOff;
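/* [Editor's worked example.]  On the (little-endian) host, lane 2 of Q3
   viewed as Ity_I32 starts 2*4 bytes into the register, so
   offsetQRegLane(3, Ity_I32, 2) == offsetQReg128(3) + 8.  The vassert on
   maxOff guarantees the requested lane fits inside the 16-byte register. */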
1440 /* Put to the least significant lane of a Qreg. */
1441 static void putQRegLO ( UInt qregNo, IRExpr* e )
1443 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1444 Int off = offsetQRegLane(qregNo, ty, 0);
1445 switch (ty) {
1446 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
1447 case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
1448 break;
1449 default:
1450 vassert(0); // Other cases are probably invalid
1452 stmt(IRStmt_Put(off, e));
1455 /* Get from the least significant lane of a Qreg. */
1456 static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
1458 Int off = offsetQRegLane(qregNo, ty, 0);
1459 switch (ty) {
1460 case Ity_I8:
1461 case Ity_F16: case Ity_I16:
1462 case Ity_I32: case Ity_I64:
1463 case Ity_F32: case Ity_F64: case Ity_V128:
1464 break;
1465 default:
1466 vassert(0); // Other cases are ATC
1468 return IRExpr_Get(off, ty);
1471 static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
1473 static const HChar* namesQ[32]
1474 = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1475 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
1476 "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
1477 "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
1478 static const HChar* namesD[32]
1479 = { "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1480 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
1481 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1482 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
1483 static const HChar* namesS[32]
1484 = { "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
1485 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
1486 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
1487 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
1488 static const HChar* namesH[32]
1489 = { "h0", "h1", "h2", "h3", "h4", "h5", "h6", "h7",
1490 "h8", "h9", "h10", "h11", "h12", "h13", "h14", "h15",
1491 "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
1492 "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
1493 static const HChar* namesB[32]
1494 = { "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7",
1495 "b8", "b9", "b10", "b11", "b12", "b13", "b14", "b15",
1496 "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
1497 "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
1498 vassert(qregNo < 32);
1499 switch (sizeofIRType(laneTy)) {
1500 case 1: return namesB[qregNo];
1501 case 2: return namesH[qregNo];
1502 case 4: return namesS[qregNo];
1503 case 8: return namesD[qregNo];
1504 case 16: return namesQ[qregNo];
1505 default: vassert(0);
1507 /*NOTREACHED*/
1510 static const HChar* nameQReg128 ( UInt qregNo )
1512 return nameQRegLO(qregNo, Ity_V128);
1515 /* Find the offset of the most significant half (8 bytes) of the given
1516 Qreg. This requires knowing the endianness of the host. */
1517 static Int offsetQRegHI64 ( UInt qregNo )
1519 return offsetQRegLane(qregNo, Ity_I64, 1);
1522 static IRExpr* getQRegHI64 ( UInt qregNo )
1524 return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
1527 static void putQRegHI64 ( UInt qregNo, IRExpr* e )
1529 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1530 Int off = offsetQRegHI64(qregNo);
1531 switch (ty) {
1532 case Ity_I64: case Ity_F64:
1533 break;
1534 default:
1535 vassert(0); // Other cases are plain wrong
1537 stmt(IRStmt_Put(off, e));
1540 /* Put to a specified lane of a Qreg. */
1541 static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
1543 IRType laneTy = typeOfIRExpr(irsb->tyenv, e);
1544 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1545 switch (laneTy) {
1546 case Ity_F64: case Ity_I64:
1547 case Ity_I32: case Ity_F32:
1548 case Ity_I16: case Ity_F16:
1549 case Ity_I8:
1550 break;
1551 default:
1552 vassert(0); // Other cases are ATC
1554 stmt(IRStmt_Put(off, e));
1557 /* Get from a specified lane of a Qreg. */
1558 static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
1560 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1561 switch (laneTy) {
1562 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
1563 case Ity_F64: case Ity_F32: case Ity_F16:
1564 break;
1565 default:
1566 vassert(0); // Other cases are ATC
1568 return IRExpr_Get(off, laneTy);
1572 //ZZ /* ---------------- Misc registers ---------------- */
1573 //ZZ
1574 //ZZ static void putMiscReg32 ( UInt gsoffset,
1575 //ZZ IRExpr* e, /* :: Ity_I32 */
1576 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
1577 //ZZ {
1578 //ZZ switch (gsoffset) {
1579 //ZZ case OFFB_FPSCR: break;
1580 //ZZ case OFFB_QFLAG32: break;
1581 //ZZ case OFFB_GEFLAG0: break;
1582 //ZZ case OFFB_GEFLAG1: break;
1583 //ZZ case OFFB_GEFLAG2: break;
1584 //ZZ case OFFB_GEFLAG3: break;
1585 //ZZ default: vassert(0); /* awaiting more cases */
1586 //ZZ }
1587 //ZZ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1588 //ZZ
1589 //ZZ if (guardT == IRTemp_INVALID) {
1590 //ZZ /* unconditional write */
1591 //ZZ stmt(IRStmt_Put(gsoffset, e));
1592 //ZZ } else {
1593 //ZZ stmt(IRStmt_Put(
1594 //ZZ gsoffset,
1595 //ZZ IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
1596 //ZZ e, IRExpr_Get(gsoffset, Ity_I32) )
1597 //ZZ ));
1598 //ZZ }
1599 //ZZ }
1600 //ZZ
1601 //ZZ static IRTemp get_ITSTATE ( void )
1602 //ZZ {
1603 //ZZ ASSERT_IS_THUMB;
1604 //ZZ IRTemp t = newTemp(Ity_I32);
1605 //ZZ assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
1606 //ZZ return t;
1607 //ZZ }
1608 //ZZ
1609 //ZZ static void put_ITSTATE ( IRTemp t )
1610 //ZZ {
1611 //ZZ ASSERT_IS_THUMB;
1612 //ZZ stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
1613 //ZZ }
1614 //ZZ
1615 //ZZ static IRTemp get_QFLAG32 ( void )
1616 //ZZ {
1617 //ZZ IRTemp t = newTemp(Ity_I32);
1618 //ZZ assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
1619 //ZZ return t;
1620 //ZZ }
1621 //ZZ
1622 //ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
1623 //ZZ {
1624 //ZZ putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
1625 //ZZ }
1626 //ZZ
1627 //ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
1628 //ZZ Status Register) to indicate that overflow or saturation occurred.
1629 //ZZ Nb: t must be zero to denote no saturation, and any nonzero
1630 //ZZ value to indicate saturation. */
1631 //ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
1632 //ZZ {
1633 //ZZ IRTemp old = get_QFLAG32();
1634 //ZZ IRTemp nyu = newTemp(Ity_I32);
1635 //ZZ assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
1636 //ZZ put_QFLAG32(nyu, condT);
1637 //ZZ }
1640 /* ---------------- FPCR stuff ---------------- */
1642 /* Generate IR to get hold of the rounding mode bits in FPCR, and
1643 convert them to IR format. Bind the final result to the
1644 returned temp. */
1645 static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
1647 /* The ARMvfp encoding for rounding mode bits is:
1648 00 to nearest
1649 01 to +infinity
1650 10 to -infinity
1651 11 to zero
1652 We need to convert that to the IR encoding:
1653 00 to nearest (the default)
1654 10 to +infinity
1655 01 to -infinity
1656 11 to zero
1657 Which can be done by swapping bits 0 and 1.
 1658       The rmode bits are at 23:22 in FPCR.
1660 IRTemp armEncd = newTemp(Ity_I32);
1661 IRTemp swapped = newTemp(Ity_I32);
1662 /* Fish FPCR[23:22] out, and slide to bottom. Doesn't matter that
1663 we don't zero out bits 24 and above, since the assignment to
1664 'swapped' will mask them out anyway. */
1665 assign(armEncd,
1666 binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
1667 /* Now swap them. */
1668 assign(swapped,
1669 binop(Iop_Or32,
1670 binop(Iop_And32,
1671 binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
1672 mkU32(2)),
1673 binop(Iop_And32,
1674 binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
1675 mkU32(1))
1677 return swapped;
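/* [Editor's sketch.]  The same conversion written in plain C, purely to
   illustrate the bit swap that the IR above performs.  The function name
   is illustrative only and is not used elsewhere. */
static UInt sketch_fpcrRM_to_irRM ( UInt fpcr )
{
   UInt armEncd = fpcr >> 22;        /* FPCR[23:22] now in bits 1:0   */
   return ((armEncd << 1) & 2)       /* old bit 0 becomes new bit 1   */
        | ((armEncd >> 1) & 1);      /* old bit 1 becomes new bit 0   */
}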
1681 /*------------------------------------------------------------*/
1682 /*--- Helpers for flag handling and conditional insns ---*/
1683 /*------------------------------------------------------------*/
1685 static const HChar* nameARM64Condcode ( ARM64Condcode cond )
1687 switch (cond) {
1688 case ARM64CondEQ: return "eq";
1689 case ARM64CondNE: return "ne";
1690 case ARM64CondCS: return "cs"; // or 'hs'
1691 case ARM64CondCC: return "cc"; // or 'lo'
1692 case ARM64CondMI: return "mi";
1693 case ARM64CondPL: return "pl";
1694 case ARM64CondVS: return "vs";
1695 case ARM64CondVC: return "vc";
1696 case ARM64CondHI: return "hi";
1697 case ARM64CondLS: return "ls";
1698 case ARM64CondGE: return "ge";
1699 case ARM64CondLT: return "lt";
1700 case ARM64CondGT: return "gt";
1701 case ARM64CondLE: return "le";
1702 case ARM64CondAL: return "al";
1703 case ARM64CondNV: return "nv";
1704 default: vpanic("name_ARM64Condcode");
1708 /* and a handy shorthand for it */
1709 static const HChar* nameCC ( ARM64Condcode cond ) {
1710 return nameARM64Condcode(cond);
1714 /* Build IR to calculate some particular condition from stored
1715 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1716 Ity_I64, suitable for narrowing. Although the return type is
1717 Ity_I64, the returned value is either 0 or 1. 'cond' must be
1718 :: Ity_I64 and must denote the condition to compute in
1719 bits 7:4, and be zero everywhere else.
1721 static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
1723 vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
1724 /* And 'cond' had better produce a value in which only bits 7:4 are
1725 nonzero. However, obviously we can't assert for that. */
1727 /* So what we're constructing for the first argument is
1728 "(cond << 4) | stored-operation".
1729 However, as per comments above, 'cond' must be supplied
1730 pre-shifted to this function.
1732 This pairing scheme requires that the ARM64_CC_OP_ values all fit
1733 in 4 bits. Hence we are passing a (COND, OP) pair in the lowest
1734 8 bits of the first argument. */
1735 IRExpr** args
1736 = mkIRExprVec_4(
1737 binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
1738 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1739 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1740 IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
1742 IRExpr* call
1743 = mkIRExprCCall(
1744 Ity_I64,
1745 0/*regparm*/,
1746 "arm64g_calculate_condition", &arm64g_calculate_condition,
1747 args
1750 /* Exclude the requested condition, OP and NDEP from definedness
1751 checking. We're only interested in DEP1 and DEP2. */
1752 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1753 return call;
1757 /* Build IR to calculate some particular condition from stored
1758 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1759 Ity_I64, suitable for narrowing. Although the return type is
1760 Ity_I64, the returned value is either 0 or 1.
1762 static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
 1764    /* The first arg to the helper ends up as "(cond << 4) | stored cc_op".  This requires that the
1765 ARM64_CC_OP_ values all fit in 4 bits. Hence we are passing a
1766 (COND, OP) pair in the lowest 8 bits of the first argument. */
1767 vassert(cond >= 0 && cond <= 15);
1768 return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
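/* [Editor's example.]  For a "b.ne", the caller passes ARM64CondNE (1),
   so the dynamic helper above receives mkU64(1 << 4) == 0x10; after the
   stored CC_OP is OR'd in, the C helper sees the (COND, OP) pair in the
   low 8 bits of its first argument. */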
1772 /* Build IR to calculate just the carry flag from stored
1773 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1774 Ity_I64. */
1775 static IRExpr* mk_arm64g_calculate_flag_c ( void )
1777 IRExpr** args
1778 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1779 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1780 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1781 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1782 IRExpr* call
1783 = mkIRExprCCall(
1784 Ity_I64,
1785 0/*regparm*/,
1786 "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
1787 args
1789 /* Exclude OP and NDEP from definedness checking. We're only
1790 interested in DEP1 and DEP2. */
1791 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1792 return call;
1796 //ZZ /* Build IR to calculate just the overflow flag from stored
1797 //ZZ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1798 //ZZ Ity_I32. */
1799 //ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
1800 //ZZ {
1801 //ZZ IRExpr** args
1802 //ZZ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
1803 //ZZ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
1804 //ZZ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
1805 //ZZ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
1806 //ZZ IRExpr* call
1807 //ZZ = mkIRExprCCall(
1808 //ZZ Ity_I32,
1809 //ZZ 0/*regparm*/,
1810 //ZZ "armg_calculate_flag_v", &armg_calculate_flag_v,
1811 //ZZ args
1812 //ZZ );
1813 //ZZ /* Exclude OP and NDEP from definedness checking. We're only
1814 //ZZ interested in DEP1 and DEP2. */
1815 //ZZ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1816 //ZZ return call;
1817 //ZZ }
1820 /* Build IR to calculate N Z C V in bits 31:28 of the
1821 returned word. */
1822 static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
1824 IRExpr** args
1825 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1826 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1827 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1828 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1829 IRExpr* call
1830 = mkIRExprCCall(
1831 Ity_I64,
1832 0/*regparm*/,
1833 "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
1834 args
1836 /* Exclude OP and NDEP from definedness checking. We're only
1837 interested in DEP1 and DEP2. */
1838 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1839 return call;
1843 /* Build IR to set the flags thunk, in the most general case. */
1844 static
1845 void setFlags_D1_D2_ND ( UInt cc_op,
1846 IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
 1848    vassert(typeOfIRTemp(irsb->tyenv, t_dep1) == Ity_I64);
 1849    vassert(typeOfIRTemp(irsb->tyenv, t_dep2) == Ity_I64);
 1850    vassert(typeOfIRTemp(irsb->tyenv, t_ndep) == Ity_I64);
1851 vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
1852 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(cc_op) ));
1853 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
1854 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
1855 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
1858 /* Build IR to set the flags thunk after ADD or SUB. */
1859 static
1860 void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
1862 IRTemp argL64 = IRTemp_INVALID;
1863 IRTemp argR64 = IRTemp_INVALID;
1864 IRTemp z64 = newTemp(Ity_I64);
1865 if (is64) {
1866 argL64 = argL;
1867 argR64 = argR;
1868 } else {
1869 argL64 = newTemp(Ity_I64);
1870 argR64 = newTemp(Ity_I64);
1871 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1872 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1874 assign(z64, mkU64(0));
1875 UInt cc_op = ARM64G_CC_OP_NUMBER;
1876 /**/ if ( isSUB && is64) { cc_op = ARM64G_CC_OP_SUB64; }
1877 else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
1878 else if (!isSUB && is64) { cc_op = ARM64G_CC_OP_ADD64; }
1879 else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
1880 else { vassert(0); }
1881 setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
1884 /* Build IR to set the flags thunk after ADC or SBC. */
1885 static
1886 void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
1887 IRTemp argL, IRTemp argR, IRTemp oldC )
1889 IRTemp argL64 = IRTemp_INVALID;
1890 IRTemp argR64 = IRTemp_INVALID;
1891 IRTemp oldC64 = IRTemp_INVALID;
1892 if (is64) {
1893 argL64 = argL;
1894 argR64 = argR;
1895 oldC64 = oldC;
1896 } else {
1897 argL64 = newTemp(Ity_I64);
1898 argR64 = newTemp(Ity_I64);
1899 oldC64 = newTemp(Ity_I64);
1900 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1901 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1902 assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
1904 UInt cc_op = ARM64G_CC_OP_NUMBER;
1905 /**/ if ( isSBC && is64) { cc_op = ARM64G_CC_OP_SBC64; }
1906 else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
1907 else if (!isSBC && is64) { cc_op = ARM64G_CC_OP_ADC64; }
1908 else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
1909 else { vassert(0); }
1910 setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
1913 /* Build IR to set the flags thunk after ADD or SUB, if the given
1914 condition evaluates to True at run time. If not, the flags are set
1915 to the specified NZCV value. */
1916 static
1917 void setFlags_ADD_SUB_conditionally (
1918 Bool is64, Bool isSUB,
1919 IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
1922 /* Generate IR as follows:
1923 CC_OP = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
1924 CC_DEP1 = ITE(cond, argL64, nzcv << 28)
1925 CC_DEP2 = ITE(cond, argR64, 0)
1926 CC_NDEP = 0
1929 IRTemp z64 = newTemp(Ity_I64);
1930 assign(z64, mkU64(0));
1932 /* Establish the operation and operands for the True case. */
1933 IRTemp t_dep1 = IRTemp_INVALID;
1934 IRTemp t_dep2 = IRTemp_INVALID;
1935 UInt t_op = ARM64G_CC_OP_NUMBER;
1936 /**/ if ( isSUB && is64) { t_op = ARM64G_CC_OP_SUB64; }
1937 else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
1938 else if (!isSUB && is64) { t_op = ARM64G_CC_OP_ADD64; }
1939 else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
1940 else { vassert(0); }
1941 /* */
1942 if (is64) {
1943 t_dep1 = argL;
1944 t_dep2 = argR;
1945 } else {
1946 t_dep1 = newTemp(Ity_I64);
1947 t_dep2 = newTemp(Ity_I64);
1948 assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
1949 assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
1952 /* Establish the operation and operands for the False case. */
1953 IRTemp f_dep1 = newTemp(Ity_I64);
1954 IRTemp f_dep2 = z64;
1955 UInt f_op = ARM64G_CC_OP_COPY;
1956 assign(f_dep1, mkU64(nzcv << 28));
1958 /* Final thunk values */
1959 IRTemp dep1 = newTemp(Ity_I64);
1960 IRTemp dep2 = newTemp(Ity_I64);
1961 IRTemp op = newTemp(Ity_I64);
1963 assign(op, IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
1964 assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
1965 assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));
1967 /* finally .. */
1968 stmt( IRStmt_Put( OFFB_CC_OP, mkexpr(op) ));
1969 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
1970 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
1971 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) ));
1974 /* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
1975 static
1976 void setFlags_LOGIC ( Bool is64, IRTemp res )
1978 IRTemp res64 = IRTemp_INVALID;
1979 IRTemp z64 = newTemp(Ity_I64);
1980 UInt cc_op = ARM64G_CC_OP_NUMBER;
1981 if (is64) {
1982 res64 = res;
1983 cc_op = ARM64G_CC_OP_LOGIC64;
1984 } else {
1985 res64 = newTemp(Ity_I64);
1986 assign(res64, unop(Iop_32Uto64, mkexpr(res)));
1987 cc_op = ARM64G_CC_OP_LOGIC32;
1989 assign(z64, mkU64(0));
1990 setFlags_D1_D2_ND(cc_op, res64, z64, z64);
1993 /* Build IR to set the flags thunk to a given NZCV value. NZCV is
1994 located in bits 31:28 of the supplied value. */
1995 static
1996 void setFlags_COPY ( IRTemp nzcv_28x0 )
1998 IRTemp z64 = newTemp(Ity_I64);
1999 assign(z64, mkU64(0));
2000 setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
2004 //ZZ /* Minor variant of the above that sets NDEP to zero (if it
2005 //ZZ sets it at all) */
2006 //ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
2007 //ZZ IRTemp t_dep2,
2008 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2009 //ZZ {
2010 //ZZ IRTemp z32 = newTemp(Ity_I32);
2011 //ZZ assign( z32, mkU32(0) );
2012 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
2013 //ZZ }
2014 //ZZ
2015 //ZZ
2016 //ZZ /* Minor variant of the above that sets DEP2 to zero (if it
2017 //ZZ sets it at all) */
2018 //ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
2019 //ZZ IRTemp t_ndep,
2020 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2021 //ZZ {
2022 //ZZ IRTemp z32 = newTemp(Ity_I32);
2023 //ZZ assign( z32, mkU32(0) );
2024 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
2025 //ZZ }
2026 //ZZ
2027 //ZZ
2028 //ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
2029 //ZZ sets them at all) */
2030 //ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
2031 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2032 //ZZ {
2033 //ZZ IRTemp z32 = newTemp(Ity_I32);
2034 //ZZ assign( z32, mkU32(0) );
2035 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
2036 //ZZ }
2039 /*------------------------------------------------------------*/
2040 /*--- Misc math helpers ---*/
2041 /*------------------------------------------------------------*/
2043 /* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
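/* Worked example (added for illustration, not in the original source):
   with mask = 0xFF00FF00FF00FF00 and sh = 8, an input
   x = 0x1122334455667788 gives
      ((x & mask) >>u 8) | ((x << 8) & mask) = 0x2211443366558877,
   i.e. the bytes are swapped within each 16-bit lane.  The byte/bit
   swap helpers below just compose calls like this. */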
2044 static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
2046 IRTemp maskT = newTemp(Ity_I64);
2047 IRTemp res = newTemp(Ity_I64);
2048 vassert(sh >= 1 && sh <= 63);
2049 assign(maskT, mkU64(mask));
2050 assign( res,
2051 binop(Iop_Or64,
2052 binop(Iop_Shr64,
2053 binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
2054 mkU8(sh)),
2055 binop(Iop_And64,
2056 binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
2057 mkexpr(maskT))
2060 return res;
2063 /* Generates byte swaps within 32-bit lanes. */
2064 static IRTemp math_UINTSWAP64 ( IRTemp src )
2066 IRTemp res;
2067 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2068 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2069 return res;
2072 /* Generates byte swaps within 16-bit lanes. */
2073 static IRTemp math_USHORTSWAP64 ( IRTemp src )
2075 IRTemp res;
2076 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2077 return res;
2080 /* Generates a 64-bit byte swap. */
2081 static IRTemp math_BYTESWAP64 ( IRTemp src )
2083 IRTemp res;
2084 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2085 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2086 res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
2087 return res;
2090 /* Generates a 64-bit bit swap. */
2091 static IRTemp math_BITSWAP64 ( IRTemp src )
2093 IRTemp res;
2094 res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
2095 res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
2096 res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
2097 return math_BYTESWAP64(res);
2100 /* Duplicates the bits at the bottom of the given word to fill the
2101 whole word. src :: Ity_I64 is assumed to have zeroes everywhere
2102 except for the bottom bits. */
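/* Worked example (illustrative only): for src = 0xAB with srcTy == Ity_I8,
   the OR/SHL steps below produce 0xABAB, then 0xABABABAB, then
   0xABABABABABABABAB, i.e. the low byte replicated across all 64 bits. */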
2103 static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
2105 if (srcTy == Ity_I8) {
2106 IRTemp t16 = newTemp(Ity_I64);
2107 assign(t16, binop(Iop_Or64, mkexpr(src),
2108 binop(Iop_Shl64, mkexpr(src), mkU8(8))));
2109 IRTemp t32 = newTemp(Ity_I64);
2110 assign(t32, binop(Iop_Or64, mkexpr(t16),
2111 binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
2112 IRTemp t64 = newTemp(Ity_I64);
2113 assign(t64, binop(Iop_Or64, mkexpr(t32),
2114 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2115 return t64;
2117 if (srcTy == Ity_I16) {
2118 IRTemp t32 = newTemp(Ity_I64);
2119 assign(t32, binop(Iop_Or64, mkexpr(src),
2120 binop(Iop_Shl64, mkexpr(src), mkU8(16))));
2121 IRTemp t64 = newTemp(Ity_I64);
2122 assign(t64, binop(Iop_Or64, mkexpr(t32),
2123 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2124 return t64;
2126 if (srcTy == Ity_I32) {
2127 IRTemp t64 = newTemp(Ity_I64);
2128 assign(t64, binop(Iop_Or64, mkexpr(src),
2129 binop(Iop_Shl64, mkexpr(src), mkU8(32))));
2130 return t64;
2132 if (srcTy == Ity_I64) {
2133 return src;
2135 vassert(0);
2139 /* Duplicates the src element exactly so as to fill a V128 value. */
2140 static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
2142 IRTemp res = newTempV128();
2143 if (srcTy == Ity_F64) {
2144 IRTemp i64 = newTemp(Ity_I64);
2145 assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
2146 assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
2147 return res;
2149 if (srcTy == Ity_F32) {
2150 IRTemp i64a = newTemp(Ity_I64);
2151 assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
2152 IRTemp i64b = newTemp(Ity_I64);
2153 assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
2154 mkexpr(i64a)));
2155 assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
2156 return res;
2158 if (srcTy == Ity_I64) {
2159 assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
2160 return res;
2162 if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
2163 IRTemp t1 = newTemp(Ity_I64);
2164 assign(t1, widenUto64(srcTy, mkexpr(src)));
2165 IRTemp t2 = math_DUP_TO_64(t1, srcTy);
2166 assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
2167 return res;
2169 vassert(0);
2173 /* |fullWidth| is a full V128 width result. Depending on bitQ,
2174 zero out the upper half. */
2175 static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
2177 if (bitQ == 1) return mkexpr(fullWidth);
2178 if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
2179 vassert(0);
2182 /* The same, but from an expression instead. */
2183 static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
2185 IRTemp fullWidthT = newTempV128();
2186 assign(fullWidthT, fullWidth);
2187 return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
2191 /*------------------------------------------------------------*/
2192 /*--- FP comparison helpers ---*/
2193 /*------------------------------------------------------------*/
2195 /* irRes :: Ity_I32 holds a floating point comparison result encoded
2196 as an IRCmpF64Result. Generate code to convert it to an
2197 ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
2198 Assign a new temp to hold that value, and return the temp. */
2199 static
2200 IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
2202 IRTemp ix = newTemp(Ity_I64);
2203 IRTemp termL = newTemp(Ity_I64);
2204 IRTemp termR = newTemp(Ity_I64);
2205 IRTemp nzcv = newTemp(Ity_I64);
2206 IRTemp irRes = newTemp(Ity_I64);
2208 /* This is where the fun starts. We have to convert 'irRes' from
2209 an IR-convention return result (IRCmpF64Result) to an
2210 ARM-encoded (N,Z,C,V) group. The final result is in the bottom
2211 4 bits of 'nzcv'. */
2212 /* Map compare result from IR to ARM(nzcv) */
2214 FP cmp result | IR   | ARM(nzcv)
2215 --------------+------+----------
2216 UN            | 0x45 | 0011
2217 LT            | 0x01 | 1000
2218 GT            | 0x00 | 0010
2219 EQ            | 0x40 | 0110
2221 /* Now since you're probably wondering WTF ..
2223 ix fishes the useful bits out of the IR value, bits 6 and 0, and
2224 places them side by side, giving a number which is 0, 1, 2 or 3.
2226 termL is a sequence cooked up by GNU superopt. It converts ix
2227 into an almost correct NZCV value (incredibly), except
2228 for the case of UN, where it produces 0100 instead of the
2229 required 0011.
2231 termR is therefore a correction term, also computed from ix. It
2232 is 1 in the UN case and 0 for LT, GT and EQ. Hence, to get
2233 the final correct value, we subtract termR from termL.
2235 Don't take my word for it. There's a test program at the bottom
2236 of guest_arm_toIR.c, to try this out with.
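
   A quick trace of the arithmetic (added here as a sanity check, using
   the ix values produced by the code below):

      GT: ix=0: termL = ((((0^1)<<62)-1) >>u 61)+1 = 2 (0010), termR = 0
      LT: ix=1: termL = ((((1^1)<<62)-1) >>u 61)+1 = 8 (1000), termR = 0
      EQ: ix=2: termL = ((((2^1)<<62)-1) >>u 61)+1 = 6 (0110), termR = 0
      UN: ix=3: termL = ((((3^1)<<62)-1) >>u 61)+1 = 4 (0100), termR = 1

   so for UN, nzcv = termL - termR = 0011, as required.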
2238 assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));
2240 assign(
2242 binop(Iop_Or64,
2243 binop(Iop_And64,
2244 binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
2245 mkU64(3)),
2246 binop(Iop_And64, mkexpr(irRes), mkU64(1))));
2248 assign(
2249 termL,
2250 binop(Iop_Add64,
2251 binop(Iop_Shr64,
2252 binop(Iop_Sub64,
2253 binop(Iop_Shl64,
2254 binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
2255 mkU8(62)),
2256 mkU64(1)),
2257 mkU8(61)),
2258 mkU64(1)));
2260 assign(
2261 termR,
2262 binop(Iop_And64,
2263 binop(Iop_And64,
2264 mkexpr(ix),
2265 binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
2266 mkU64(1)));
2268 assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
2269 return nzcv;
2273 /*------------------------------------------------------------*/
2274 /*--- Data processing (immediate) ---*/
2275 /*------------------------------------------------------------*/
2277 /* Helper functions for supporting "DecodeBitMasks" */
2279 static ULong dbm_ROR ( Int width, ULong x, Int rot )
2281 vassert(width > 0 && width <= 64);
2282 vassert(rot >= 0 && rot < width);
2283 if (rot == 0) return x;
2284 ULong res = x >> rot;
2285 res |= (x << (width - rot));
2286 if (width < 64)
2287 res &= ((1ULL << width) - 1);
2288 return res;
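/* Example (added): dbm_ROR(8, 0x0B, 2) rotates 0000_1011 right by 2
   within an 8-bit field, giving 1100_0010 == 0xC2. */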
2291 static ULong dbm_RepTo64( Int esize, ULong x )
2293 switch (esize) {
2294 case 64:
2295 return x;
2296 case 32:
2297 x &= 0xFFFFFFFF; x |= (x << 32);
2298 return x;
2299 case 16:
2300 x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
2301 return x;
2302 case 8:
2303 x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
2304 return x;
2305 case 4:
2306 x &= 0xF; x |= (x << 4); x |= (x << 8);
2307 x |= (x << 16); x |= (x << 32);
2308 return x;
2309 case 2:
2310 x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
2311 x |= (x << 16); x |= (x << 32);
2312 return x;
2313 default:
2314 break;
2316 vpanic("dbm_RepTo64");
2317 /*NOTREACHED*/
2318 return 0;
2321 static Int dbm_highestSetBit ( ULong x )
2323 Int i;
2324 for (i = 63; i >= 0; i--) {
2325 if (x & (1ULL << i))
2326 return i;
2328 vassert(x == 0);
2329 return -1;
2332 static
2333 Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
2334 ULong immN, ULong imms, ULong immr, Bool immediate,
2335 UInt M /*32 or 64*/)
2337 vassert(immN < (1ULL << 1));
2338 vassert(imms < (1ULL << 6));
2339 vassert(immr < (1ULL << 6));
2340 vassert(immediate == False || immediate == True);
2341 vassert(M == 32 || M == 64);
2343 Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
2344 if (len < 1) { /* printf("fail1\n"); */ return False; }
2345 vassert(len <= 6);
2346 vassert(M >= (1 << len));
2348 vassert(len >= 1 && len <= 6);
2349 ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
2350 (1 << len) - 1;
2351 vassert(levels >= 1 && levels <= 63);
2353 if (immediate && ((imms & levels) == levels)) {
2354 /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
2355 return False;
2358 ULong S = imms & levels;
2359 ULong R = immr & levels;
2360 Int diff = S - R;
2361 diff &= 63;
2362 Int esize = 1 << len;
2363 vassert(2 <= esize && esize <= 64);
2365 /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
2366 same below with d. S can be 63 in which case we have an out of
2367 range and hence undefined shift. */
2368 vassert(S >= 0 && S <= 63);
2369 vassert(esize >= (S+1));
2370 ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
2371 //(1ULL << (S+1)) - 1;
2372 ((1ULL << S) - 1) + (1ULL << S);
2374 Int d = // diff<len-1:0>
2375 diff & ((1 << len)-1);
2376 vassert(esize >= (d+1));
2377 vassert(d >= 0 && d <= 63);
2379 ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
2380 //(1ULL << (d+1)) - 1;
2381 ((1ULL << d) - 1) + (1ULL << d);
2383 if (esize != 64) vassert(elem_s < (1ULL << esize));
2384 if (esize != 64) vassert(elem_d < (1ULL << esize));
2386 if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
2387 if (tmask) *tmask = dbm_RepTo64(esize, elem_d);
2389 return True;
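/* Worked example (added as an illustration): the 64-bit logical
   immediate 0x5555555555555555 is encoded as immN=0, immr=0,
   imms=0b111100.  Then len = dbm_highestSetBit(0b0000011) = 1,
   esize = 2, levels = 1, S = 0, R = 0, elem_s = 0b01, and
   dbm_RepTo64(2, dbm_ROR(2, 0b01, 0)) replicates 0b01 across the word,
   giving wmask = 0x5555555555555555. */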
2393 static
2394 Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
2395 UInt insn)
2397 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2399 /* insn[28:23]
2400 10000x PC-rel addressing
2401 10001x Add/subtract (immediate)
2402 100100 Logical (immediate)
2403 100101 Move Wide (immediate)
2404 100110 Bitfield
2405 100111 Extract
2408 /* ------------------ ADD/SUB{,S} imm12 ------------------ */
2409 if (INSN(28,24) == BITS5(1,0,0,0,1)) {
2410 Bool is64 = INSN(31,31) == 1;
2411 Bool isSub = INSN(30,30) == 1;
2412 Bool setCC = INSN(29,29) == 1;
2413 UInt sh = INSN(23,22);
2414 UInt uimm12 = INSN(21,10);
2415 UInt nn = INSN(9,5);
2416 UInt dd = INSN(4,0);
2417 const HChar* nm = isSub ? "sub" : "add";
2418 if (sh >= 2) {
2419 /* Invalid; fall through */
2420 } else {
2421 vassert(sh <= 1);
2422 uimm12 <<= (12 * sh);
2423 if (is64) {
2424 IRTemp argL = newTemp(Ity_I64);
2425 IRTemp argR = newTemp(Ity_I64);
2426 IRTemp res = newTemp(Ity_I64);
2427 assign(argL, getIReg64orSP(nn));
2428 assign(argR, mkU64(uimm12));
2429 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
2430 mkexpr(argL), mkexpr(argR)));
2431 if (setCC) {
2432 putIReg64orZR(dd, mkexpr(res));
2433 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
2434 DIP("%ss %s, %s, 0x%x\n",
2435 nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
2436 } else {
2437 putIReg64orSP(dd, mkexpr(res));
2438 DIP("%s %s, %s, 0x%x\n",
2439 nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
2441 } else {
2442 IRTemp argL = newTemp(Ity_I32);
2443 IRTemp argR = newTemp(Ity_I32);
2444 IRTemp res = newTemp(Ity_I32);
2445 assign(argL, getIReg32orSP(nn));
2446 assign(argR, mkU32(uimm12));
2447 assign(res, binop(isSub ? Iop_Sub32 : Iop_Add32,
2448 mkexpr(argL), mkexpr(argR)));
2449 if (setCC) {
2450 putIReg32orZR(dd, mkexpr(res));
2451 setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
2452 DIP("%ss %s, %s, 0x%x\n",
2453 nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
2454 } else {
2455 putIReg32orSP(dd, mkexpr(res));
2456 DIP("%s %s, %s, 0x%x\n",
2457 nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
2460 return True;
2464 /* -------------------- ADR/ADRP -------------------- */
2465 if (INSN(28,24) == BITS5(1,0,0,0,0)) {
2466 UInt bP = INSN(31,31);
2467 UInt immLo = INSN(30,29);
2468 UInt immHi = INSN(23,5);
2469 UInt rD = INSN(4,0);
2470 ULong uimm = (immHi << 2) | immLo;
2471 ULong simm = sx_to_64(uimm, 21);
2472 ULong val;
2473 if (bP) {
2474 val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
2475 } else {
2476 val = guest_PC_curr_instr + simm;
2478 putIReg64orZR(rD, mkU64(val));
2479 DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
2480 return True;
2483 /* -------------------- LOGIC(imm) -------------------- */
2484 if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
2485 /* 31 30 28 22 21 15 9 4
2486 sf op 100100 N immr imms Rn Rd
2487 op=00: AND Rd|SP, Rn, #imm
2488 op=01: ORR Rd|SP, Rn, #imm
2489 op=10: EOR Rd|SP, Rn, #imm
2490 op=11: ANDS Rd|ZR, Rn, #imm
2492 Bool is64 = INSN(31,31) == 1;
2493 UInt op = INSN(30,29);
2494 UInt N = INSN(22,22);
2495 UInt immR = INSN(21,16);
2496 UInt immS = INSN(15,10);
2497 UInt nn = INSN(9,5);
2498 UInt dd = INSN(4,0);
2499 ULong imm = 0;
2500 Bool ok;
2501 if (N == 1 && !is64)
2502 goto after_logic_imm; /* not allowed; fall through */
2503 ok = dbm_DecodeBitMasks(&imm, NULL,
2504 N, immS, immR, True, is64 ? 64 : 32);
2505 if (!ok)
2506 goto after_logic_imm;
2508 const HChar* names[4] = { "and", "orr", "eor", "ands" };
2509 const IROp ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
2510 const IROp ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };
2512 vassert(op < 4);
2513 if (is64) {
2514 IRExpr* argL = getIReg64orZR(nn);
2515 IRExpr* argR = mkU64(imm);
2516 IRTemp res = newTemp(Ity_I64);
2517 assign(res, binop(ops64[op], argL, argR));
2518 if (op < 3) {
2519 putIReg64orSP(dd, mkexpr(res));
2520 DIP("%s %s, %s, 0x%llx\n", names[op],
2521 nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
2522 } else {
2523 putIReg64orZR(dd, mkexpr(res));
2524 setFlags_LOGIC(True/*is64*/, res);
2525 DIP("%s %s, %s, 0x%llx\n", names[op],
2526 nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
2528 } else {
2529 IRExpr* argL = getIReg32orZR(nn);
2530 IRExpr* argR = mkU32((UInt)imm);
2531 IRTemp res = newTemp(Ity_I32);
2532 assign(res, binop(ops32[op], argL, argR));
2533 if (op < 3) {
2534 putIReg32orSP(dd, mkexpr(res));
2535 DIP("%s %s, %s, 0x%x\n", names[op],
2536 nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
2537 } else {
2538 putIReg32orZR(dd, mkexpr(res));
2539 setFlags_LOGIC(False/*!is64*/, res);
2540 DIP("%s %s, %s, 0x%x\n", names[op],
2541 nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
2544 return True;
2546 after_logic_imm:
2548 /* -------------------- MOV{Z,N,K} -------------------- */
2549 if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
2550 /* 31 30 28 22 20 4
2551 | | | | | |
2552 sf 10 100 101 hw imm16 Rd MOV(Z) Rd, (imm16 << (16*hw))
2553 sf 00 100 101 hw imm16 Rd MOV(N) Rd, ~(imm16 << (16*hw))
2554 sf 11 100 101 hw imm16 Rd MOV(K) Rd, (imm16 << (16*hw))
2556 Bool is64 = INSN(31,31) == 1;
2557 UInt subopc = INSN(30,29);
2558 UInt hw = INSN(22,21);
2559 UInt imm16 = INSN(20,5);
2560 UInt dd = INSN(4,0);
2561 if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
2562 /* invalid; fall through */
2563 } else {
2564 ULong imm64 = ((ULong)imm16) << (16 * hw);
2565 if (!is64)
2566 vassert(imm64 < 0x100000000ULL);
2567 switch (subopc) {
2568 case BITS2(1,0): // MOVZ
2569 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2570 DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2571 break;
2572 case BITS2(0,0): // MOVN
2573 imm64 = ~imm64;
2574 if (!is64)
2575 imm64 &= 0xFFFFFFFFULL;
2576 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2577 DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2578 break;
2579 case BITS2(1,1): // MOVK
2580 /* This is more complex. We are inserting a slice into
2581 the destination register, so we need to have the old
2582 value of it. */
2583 if (is64) {
2584 IRTemp old = newTemp(Ity_I64);
2585 assign(old, getIReg64orZR(dd));
2586 ULong mask = 0xFFFFULL << (16 * hw);
2587 IRExpr* res
2588 = binop(Iop_Or64,
2589 binop(Iop_And64, mkexpr(old), mkU64(~mask)),
2590 mkU64(imm64));
2591 putIReg64orZR(dd, res);
2592 DIP("movk %s, 0x%x, lsl %u\n",
2593 nameIReg64orZR(dd), imm16, 16*hw);
2594 } else {
2595 IRTemp old = newTemp(Ity_I32);
2596 assign(old, getIReg32orZR(dd));
2597 vassert(hw <= 1);
2598 UInt mask = ((UInt)0xFFFF) << (16 * hw);
2599 IRExpr* res
2600 = binop(Iop_Or32,
2601 binop(Iop_And32, mkexpr(old), mkU32(~mask)),
2602 mkU32((UInt)imm64));
2603 putIReg32orZR(dd, res);
2604 DIP("movk %s, 0x%x, lsl %u\n",
2605 nameIReg32orZR(dd), imm16, 16*hw);
2607 break;
2608 default:
2609 vassert(0);
2611 return True;
2615 /* -------------------- {U,S,}BFM -------------------- */
2616 /* 30 28 22 21 15 9 4
2618 sf 10 100110 N immr imms nn dd
2619 UBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2620 UBFM Xd, Xn, #immr, #imms when sf=1, N=1
2622 sf 00 100110 N immr imms nn dd
2623 SBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2624 SBFM Xd, Xn, #immr, #imms when sf=1, N=1
2626 sf 01 100110 N immr imms nn dd
2627 BFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2628 BFM Xd, Xn, #immr, #imms when sf=1, N=1
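
   Note (added): the common shift and bitfield aliases all map onto
   these, e.g. LSR Xd, Xn, #sh is UBFM Xd, Xn, #sh, #63 and
   UBFX Xd, Xn, #lsb, #w is UBFM Xd, Xn, #lsb, #(lsb+w-1), so this one
   decoder covers them all.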
2630 if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
2631 UInt sf = INSN(31,31);
2632 UInt opc = INSN(30,29);
2633 UInt N = INSN(22,22);
2634 UInt immR = INSN(21,16);
2635 UInt immS = INSN(15,10);
2636 UInt nn = INSN(9,5);
2637 UInt dd = INSN(4,0);
2638 Bool inZero = False;
2639 Bool extend = False;
2640 const HChar* nm = "???";
2641 /* skip invalid combinations */
2642 switch (opc) {
2643 case BITS2(0,0):
2644 inZero = True; extend = True; nm = "sbfm"; break;
2645 case BITS2(0,1):
2646 inZero = False; extend = False; nm = "bfm"; break;
2647 case BITS2(1,0):
2648 inZero = True; extend = False; nm = "ubfm"; break;
2649 case BITS2(1,1):
2650 goto after_bfm; /* invalid */
2651 default:
2652 vassert(0);
2654 if (sf == 1 && N != 1) goto after_bfm;
2655 if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
2656 || ((immS >> 5) & 1) != 0)) goto after_bfm;
2657 ULong wmask = 0, tmask = 0;
2658 Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
2659 N, immS, immR, False, sf == 1 ? 64 : 32);
2660 if (!ok) goto after_bfm; /* hmmm */
2662 Bool is64 = sf == 1;
2663 IRType ty = is64 ? Ity_I64 : Ity_I32;
2665 IRTemp dst = newTemp(ty);
2666 IRTemp src = newTemp(ty);
2667 IRTemp bot = newTemp(ty);
2668 IRTemp top = newTemp(ty);
2669 IRTemp res = newTemp(ty);
2670 assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
2671 assign(src, getIRegOrZR(is64, nn));
2672 /* perform bitfield move on low bits */
2673 assign(bot, binop(mkOR(ty),
2674 binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
2675 binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
2676 mkU(ty, wmask))));
2677 /* determine extension bits (sign, zero or dest register) */
2678 assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
2679 /* combine extension bits and result bits */
2680 assign(res, binop(mkOR(ty),
2681 binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
2682 binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
2683 putIRegOrZR(is64, dd, mkexpr(res));
2684 DIP("%s %s, %s, immR=%u, immS=%u\n",
2685 nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
2686 return True;
2688 after_bfm:
2690 /* ---------------------- EXTR ---------------------- */
2691 /* 30 28 22 20 15 9 4
2692 1 00 100111 10 m imm6 n d EXTR Xd, Xn, Xm, #imm6
2693 0 00 100111 00 m imm6 n d EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
2695 if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
2696 Bool is64 = INSN(31,31) == 1;
2697 UInt mm = INSN(20,16);
2698 UInt imm6 = INSN(15,10);
2699 UInt nn = INSN(9,5);
2700 UInt dd = INSN(4,0);
2701 Bool valid = True;
2702 if (INSN(31,31) != INSN(22,22))
2703 valid = False;
2704 if (!is64 && imm6 >= 32)
2705 valid = False;
2706 if (!valid) goto after_extr;
2707 IRType ty = is64 ? Ity_I64 : Ity_I32;
2708 IRTemp srcHi = newTemp(ty);
2709 IRTemp srcLo = newTemp(ty);
2710 IRTemp res = newTemp(ty);
2711 assign(srcHi, getIRegOrZR(is64, nn));
2712 assign(srcLo, getIRegOrZR(is64, mm));
2713 if (imm6 == 0) {
2714 assign(res, mkexpr(srcLo));
2715 } else {
2716 UInt szBits = 8 * sizeofIRType(ty);
2717 vassert(imm6 > 0 && imm6 < szBits);
2718 assign(res, binop(mkOR(ty),
2719 binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
2720 binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
2722 putIRegOrZR(is64, dd, mkexpr(res));
2723 DIP("extr %s, %s, %s, #%u\n",
2724 nameIRegOrZR(is64,dd),
2725 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
2726 return True;
2728 after_extr:
2730 vex_printf("ARM64 front end: data_processing_immediate\n");
2731 return False;
2732 # undef INSN
2736 /*------------------------------------------------------------*/
2737 /*--- Data processing (register) instructions ---*/
2738 /*------------------------------------------------------------*/
2740 static const HChar* nameSH ( UInt sh ) {
2741 switch (sh) {
2742 case 0: return "lsl";
2743 case 1: return "lsr";
2744 case 2: return "asr";
2745 case 3: return "ror";
2746 default: vassert(0);
2750 /* Generate IR to get a register value, possibly shifted by an
2751 immediate. Returns either a 32- or 64-bit temporary holding the
2752 result. After the shift, the value can optionally be NOT-ed
2753 too.
2755 sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR. sh_amt may only be
2756 in the range 0 to (is64 ? 64 : 32)-1. For some instructions, ROR
2757 isn't allowed, but it's the job of the caller to check that.
2759 static IRTemp getShiftedIRegOrZR ( Bool is64,
2760 UInt sh_how, UInt sh_amt, UInt regNo,
2761 Bool invert )
2763 vassert(sh_how < 4);
2764 vassert(sh_amt < (is64 ? 64 : 32));
2765 IRType ty = is64 ? Ity_I64 : Ity_I32;
2766 IRTemp t0 = newTemp(ty);
2767 assign(t0, getIRegOrZR(is64, regNo));
2768 IRTemp t1 = newTemp(ty);
2769 switch (sh_how) {
2770 case BITS2(0,0):
2771 assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
2772 break;
2773 case BITS2(0,1):
2774 assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
2775 break;
2776 case BITS2(1,0):
2777 assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
2778 break;
2779 case BITS2(1,1):
2780 assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
2781 break;
2782 default:
2783 vassert(0);
2785 if (invert) {
2786 IRTemp t2 = newTemp(ty);
2787 assign(t2, unop(mkNOT(ty), mkexpr(t1)));
2788 return t2;
2789 } else {
2790 return t1;
2795 static
2796 Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
2797 UInt insn)
2799 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2801 /* ------------------- ADD/SUB(reg) ------------------- */
2802 /* x==0 => 32 bit op x==1 => 64 bit op
2803 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)
2805 31 30 29 28 23 21 20 15 9 4
2806 | | | | | | | | | |
2807 x 0 0 01011 sh 0 Rm imm6 Rn Rd ADD Rd,Rn, sh(Rm,imm6)
2808 x 0 1 01011 sh 0 Rm imm6 Rn Rd ADDS Rd,Rn, sh(Rm,imm6)
2809 x 1 0 01011 sh 0 Rm imm6 Rn Rd SUB Rd,Rn, sh(Rm,imm6)
2810 x 1 1 01011 sh 0 Rm imm6 Rn Rd SUBS Rd,Rn, sh(Rm,imm6)
2812 if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
2813 UInt bX = INSN(31,31);
2814 UInt bOP = INSN(30,30); /* 0: ADD, 1: SUB */
2815 UInt bS = INSN(29, 29); /* set flags? */
2816 UInt sh = INSN(23,22);
2817 UInt rM = INSN(20,16);
2818 UInt imm6 = INSN(15,10);
2819 UInt rN = INSN(9,5);
2820 UInt rD = INSN(4,0);
2821 Bool isSUB = bOP == 1;
2822 Bool is64 = bX == 1;
2823 IRType ty = is64 ? Ity_I64 : Ity_I32;
2824 if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
2825 /* invalid; fall through */
2826 } else {
2827 IRTemp argL = newTemp(ty);
2828 assign(argL, getIRegOrZR(is64, rN));
2829 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
2830 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
2831 IRTemp res = newTemp(ty);
2832 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2833 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2834 if (bS) {
2835 setFlags_ADD_SUB(is64, isSUB, argL, argR);
2837 DIP("%s%s %s, %s, %s, %s #%u\n",
2838 bOP ? "sub" : "add", bS ? "s" : "",
2839 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2840 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2841 return True;
2845 /* ------------------- ADC/SBC(reg) ------------------- */
2846 /* x==0 => 32 bit op x==1 => 64 bit op
2848 31 30 29 28 23 21 20 15 9 4
2849 | | | | | | | | | |
2850 x 0 0 11010 00 0 Rm 000000 Rn Rd ADC Rd,Rn,Rm
2851 x 0 1 11010 00 0 Rm 000000 Rn Rd ADCS Rd,Rn,Rm
2852 x 1 0 11010 00 0 Rm 000000 Rn Rd SBC Rd,Rn,Rm
2853 x 1 1 11010 00 0 Rm 000000 Rn Rd SBCS Rd,Rn,Rm
2856 if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
2857 UInt bX = INSN(31,31);
2858 UInt bOP = INSN(30,30); /* 0: ADC, 1: SBC */
2859 UInt bS = INSN(29,29); /* set flags */
2860 UInt rM = INSN(20,16);
2861 UInt rN = INSN(9,5);
2862 UInt rD = INSN(4,0);
2864 Bool isSUB = bOP == 1;
2865 Bool is64 = bX == 1;
2866 IRType ty = is64 ? Ity_I64 : Ity_I32;
2868 IRTemp oldC = newTemp(ty);
2869 assign(oldC,
2870 is64 ? mk_arm64g_calculate_flag_c()
2871 : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );
2873 IRTemp argL = newTemp(ty);
2874 assign(argL, getIRegOrZR(is64, rN));
2875 IRTemp argR = newTemp(ty);
2876 assign(argR, getIRegOrZR(is64, rM));
2878 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
2879 IRTemp res = newTemp(ty);
2880 if (isSUB) {
2881 IRExpr* one = is64 ? mkU64(1) : mkU32(1);
2882 IROp xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
2883 assign(res,
2884 binop(op,
2885 binop(op, mkexpr(argL), mkexpr(argR)),
2886 binop(xorOp, mkexpr(oldC), one)));
2887 } else {
2888 assign(res,
2889 binop(op,
2890 binop(op, mkexpr(argL), mkexpr(argR)),
2891 mkexpr(oldC)));
2894 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2896 if (bS) {
2897 setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
2900 DIP("%s%s %s, %s, %s\n",
2901 bOP ? "sbc" : "adc", bS ? "s" : "",
2902 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2903 nameIRegOrZR(is64, rM));
2904 return True;
2907 /* -------------------- LOGIC(reg) -------------------- */
2908 /* x==0 => 32 bit op x==1 => 64 bit op
2909 N==0 => inv? is no-op (no inversion)
2910 N==1 => inv? is NOT
2911 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR
2913 31 30 28 23 21 20 15 9 4
2914 | | | | | | | | |
2915 x 00 01010 sh N Rm imm6 Rn Rd AND Rd,Rn, inv?(sh(Rm,imm6))
2916 x 01 01010 sh N Rm imm6 Rn Rd ORR Rd,Rn, inv?(sh(Rm,imm6))
2917 x 10 01010 sh N Rm imm6 Rn Rd EOR Rd,Rn, inv?(sh(Rm,imm6))
2918 x 11 01010 sh N Rm imm6 Rn Rd ANDS Rd,Rn, inv?(sh(Rm,imm6))
2919 With N=1, the names are: BIC ORN EON BICS
2921 if (INSN(28,24) == BITS5(0,1,0,1,0)) {
2922 UInt bX = INSN(31,31);
2923 UInt sh = INSN(23,22);
2924 UInt bN = INSN(21,21);
2925 UInt rM = INSN(20,16);
2926 UInt imm6 = INSN(15,10);
2927 UInt rN = INSN(9,5);
2928 UInt rD = INSN(4,0);
2929 Bool is64 = bX == 1;
2930 IRType ty = is64 ? Ity_I64 : Ity_I32;
2931 if (!is64 && imm6 > 31) {
2932 /* invalid; fall through */
2933 } else {
2934 IRTemp argL = newTemp(ty);
2935 assign(argL, getIRegOrZR(is64, rN));
2936 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
2937 IROp op = Iop_INVALID;
2938 switch (INSN(30,29)) {
2939 case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
2940 case BITS2(0,1): op = mkOR(ty); break;
2941 case BITS2(1,0): op = mkXOR(ty); break;
2942 default: vassert(0);
2944 IRTemp res = newTemp(ty);
2945 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2946 if (INSN(30,29) == BITS2(1,1)) {
2947 setFlags_LOGIC(is64, res);
2949 putIRegOrZR(is64, rD, mkexpr(res));
2951 static const HChar* names_op[8]
2952 = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
2953 vassert(((bN << 2) | INSN(30,29)) < 8);
2954 const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
2955 /* Special-case the printing of "MOV" */
2956 if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
2957 DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
2958 nameIRegOrZR(is64, rM));
2959 } else {
2960 DIP("%s %s, %s, %s, %s #%u\n", nm_op,
2961 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2962 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2964 return True;
2968 /* -------------------- {U,S}MULH -------------------- */
2969 /* 31 23 22 20 15 9 4
2970 10011011 1 10 Rm 011111 Rn Rd UMULH Xd,Xn,Xm
2971 10011011 0 10 Rm 011111 Rn Rd SMULH Xd,Xn,Xm
2973 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
2974 && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
2975 Bool isU = INSN(23,23) == 1;
2976 UInt mm = INSN(20,16);
2977 UInt nn = INSN(9,5);
2978 UInt dd = INSN(4,0);
2979 putIReg64orZR(dd, unop(Iop_128HIto64,
2980 binop(isU ? Iop_MullU64 : Iop_MullS64,
2981 getIReg64orZR(nn), getIReg64orZR(mm))));
2982 DIP("%cmulh %s, %s, %s\n",
2983 isU ? 'u' : 's',
2984 nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
2985 return True;
2988 /* -------------------- M{ADD,SUB} -------------------- */
2989 /* 31 30 20 15 14 9 4
2990 sf 00 11011 000 m 0 a n r MADD Rd,Rn,Rm,Ra d = a+m*n
2991 sf 00 11011 000 m 1 a n r MSUB Rd,Rn,Rm,Ra d = a-m*n
2993 if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
2994 Bool is64 = INSN(31,31) == 1;
2995 UInt mm = INSN(20,16);
2996 Bool isAdd = INSN(15,15) == 0;
2997 UInt aa = INSN(14,10);
2998 UInt nn = INSN(9,5);
2999 UInt dd = INSN(4,0);
3000 if (is64) {
3001 putIReg64orZR(
3003 binop(isAdd ? Iop_Add64 : Iop_Sub64,
3004 getIReg64orZR(aa),
3005 binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
3006 } else {
3007 putIReg32orZR(
3009 binop(isAdd ? Iop_Add32 : Iop_Sub32,
3010 getIReg32orZR(aa),
3011 binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
3013 DIP("%s %s, %s, %s, %s\n",
3014 isAdd ? "madd" : "msub",
3015 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3016 nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
3017 return True;
3020 /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
3021 /* 31 30 28 20 15 11 9 4
3022 sf 00 1101 0100 mm cond 00 nn dd CSEL Rd,Rn,Rm
3023 sf 00 1101 0100 mm cond 01 nn dd CSINC Rd,Rn,Rm
3024 sf 10 1101 0100 mm cond 00 nn dd CSINV Rd,Rn,Rm
3025 sf 10 1101 0100 mm cond 01 nn dd CSNEG Rd,Rn,Rm
3026 In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
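
   Note (added): the usual conditional-set aliases map onto these too,
   e.g. CSET Rd, cond is CSINC Rd, ZR, ZR, invert(cond), and
   CINC Rd, Rn, cond is CSINC Rd, Rn, Rn, invert(cond).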
3028 if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
3029 Bool is64 = INSN(31,31) == 1;
3030 UInt b30 = INSN(30,30);
3031 UInt mm = INSN(20,16);
3032 UInt cond = INSN(15,12);
3033 UInt b10 = INSN(10,10);
3034 UInt nn = INSN(9,5);
3035 UInt dd = INSN(4,0);
3036 UInt op = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
3037 IRType ty = is64 ? Ity_I64 : Ity_I32;
3038 IRExpr* argL = getIRegOrZR(is64, nn);
3039 IRExpr* argR = getIRegOrZR(is64, mm);
3040 switch (op) {
3041 case BITS2(0,0):
3042 break;
3043 case BITS2(0,1):
3044 argR = binop(mkADD(ty), argR, mkU(ty,1));
3045 break;
3046 case BITS2(1,0):
3047 argR = unop(mkNOT(ty), argR);
3048 break;
3049 case BITS2(1,1):
3050 argR = binop(mkSUB(ty), mkU(ty,0), argR);
3051 break;
3052 default:
3053 vassert(0);
3055 putIRegOrZR(
3056 is64, dd,
3057 IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
3058 argL, argR)
3060 const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
3061 DIP("%s %s, %s, %s, %s\n", op_nm[op],
3062 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3063 nameIRegOrZR(is64, mm), nameCC(cond));
3064 return True;
3067 /* -------------- ADD/SUB(extended reg) -------------- */
3068 /* 28 20 15 12 9 4
3069 000 01011 00 1 m opt imm3 n d ADD Wd|SP, Wn|SP, Wm ext&lsld
3070 100 01011 00 1 m opt imm3 n d ADD Xd|SP, Xn|SP, Rm ext&lsld
3072 001 01011 00 1 m opt imm3 n d ADDS Wd, Wn|SP, Wm ext&lsld
3073 101 01011 00 1 m opt imm3 n d ADDS Xd, Xn|SP, Rm ext&lsld
3075 010 01011 00 1 m opt imm3 n d SUB Wd|SP, Wn|SP, Wm ext&lsld
3076 110 01011 00 1 m opt imm3 n d SUB Xd|SP, Xn|SP, Rm ext&lsld
3078 011 01011 00 1 m opt imm3 n d SUBS Wd, Wn|SP, Wm ext&lsld
3079 111 01011 00 1 m opt imm3 n d SUBS Xd, Xn|SP, Rm ext&lsld
3081 The 'm' operand is extended per opt, thusly:
3083 000 Xm & 0xFF UXTB
3084 001 Xm & 0xFFFF UXTH
3085 010 Xm & (2^32)-1 UXTW
3086 011 Xm UXTX
3088 100 Xm sx from bit 7 SXTB
3089 101 Xm sx from bit 15 SXTH
3090 110 Xm sx from bit 31 SXTW
3091 111 Xm SXTX
3093 In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
3094 operation on Xm. In the 32 bit case, UXTW, UXTX, SXTW and SXTX
3095 are the identity operation on Wm.
3097 After extension, the value is shifted left by imm3 bits, which
3098 may only be in the range 0 .. 4 inclusive.
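
   A concrete example (added for illustration): with opt=000 (UXTB) and
   imm3=3, "add x0, x1, w2, uxtb #3" computes X0 = X1 + ((X2 & 0xFF) << 3),
   which is exactly the widen-then-shift sequence generated below.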
3100 if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
3101 Bool is64 = INSN(31,31) == 1;
3102 Bool isSub = INSN(30,30) == 1;
3103 Bool setCC = INSN(29,29) == 1;
3104 UInt mm = INSN(20,16);
3105 UInt opt = INSN(15,13);
3106 UInt imm3 = INSN(12,10);
3107 UInt nn = INSN(9,5);
3108 UInt dd = INSN(4,0);
3109 const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
3110 "sxtb", "sxth", "sxtw", "sxtx" };
3111 /* Do almost the same thing in the 32- and 64-bit cases. */
3112 IRTemp xN = newTemp(Ity_I64);
3113 IRTemp xM = newTemp(Ity_I64);
3114 assign(xN, getIReg64orSP(nn));
3115 assign(xM, getIReg64orZR(mm));
3116 IRExpr* xMw = mkexpr(xM); /* "xM widened" */
3117 Int shSX = 0;
3118 /* widen Xm .. */
3119 switch (opt) {
3120 case BITS3(0,0,0): // UXTB
3121 xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
3122 case BITS3(0,0,1): // UXTH
3123 xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
3124 case BITS3(0,1,0): // UXTW -- noop for the 32bit case
3125 if (is64) {
3126 xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
3128 break;
3129 case BITS3(0,1,1): // UXTX -- always a noop
3130 break;
3131 case BITS3(1,0,0): // SXTB
3132 shSX = 56; goto sxTo64;
3133 case BITS3(1,0,1): // SXTH
3134 shSX = 48; goto sxTo64;
3135 case BITS3(1,1,0): // SXTW -- noop for the 32bit case
3136 if (is64) {
3137 shSX = 32; goto sxTo64;
3139 break;
3140 case BITS3(1,1,1): // SXTX -- always a noop
3141 break;
3142 sxTo64:
3143 vassert(shSX >= 32);
3144 xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
3145 mkU8(shSX));
3146 break;
3147 default:
3148 vassert(0);
3150 /* and now shift */
3151 IRTemp argL = xN;
3152 IRTemp argR = newTemp(Ity_I64);
3153 assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
3154 IRTemp res = newTemp(Ity_I64);
3155 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
3156 mkexpr(argL), mkexpr(argR)));
3157 if (is64) {
3158 if (setCC) {
3159 putIReg64orZR(dd, mkexpr(res));
3160 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
3161 } else {
3162 putIReg64orSP(dd, mkexpr(res));
3164 } else {
3165 if (setCC) {
3166 IRTemp argL32 = newTemp(Ity_I32);
3167 IRTemp argR32 = newTemp(Ity_I32);
3168 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
3169 assign(argL32, unop(Iop_64to32, mkexpr(argL)));
3170 assign(argR32, unop(Iop_64to32, mkexpr(argR)));
3171 setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
3172 } else {
3173 putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
3176 DIP("%s%s %s, %s, %s %s lsl %u\n",
3177 isSub ? "sub" : "add", setCC ? "s" : "",
3178 setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
3179 nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm),
3180 nameExt[opt], imm3);
3181 return True;
3184 /* ---------------- CCMP/CCMN(imm) ---------------- */
3185 /* Bizarrely, these appear in the "data processing register"
3186 category, even though they are operations against an
3187 immediate. */
3188 /* 31 29 20 15 11 9 3
3189 sf 1 111010010 imm5 cond 10 Rn 0 nzcv CCMP Rn, #imm5, #nzcv, cond
3190 sf 0 111010010 imm5 cond 10 Rn 0 nzcv CCMN Rn, #imm5, #nzcv, cond
3192 Operation is:
3193 (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
3194 (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
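
   For example (added): "ccmp x1, #5, #0, ne" leaves the flags of
   (X1 - 5) if the NE condition currently holds, and otherwise sets
   NZCV to 0000.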
3196 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3197 && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
3198 Bool is64 = INSN(31,31) == 1;
3199 Bool isSUB = INSN(30,30) == 1;
3200 UInt imm5 = INSN(20,16);
3201 UInt cond = INSN(15,12);
3202 UInt nn = INSN(9,5);
3203 UInt nzcv = INSN(3,0);
3205 IRTemp condT = newTemp(Ity_I1);
3206 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3208 IRType ty = is64 ? Ity_I64 : Ity_I32;
3209 IRTemp argL = newTemp(ty);
3210 IRTemp argR = newTemp(ty);
3212 if (is64) {
3213 assign(argL, getIReg64orZR(nn));
3214 assign(argR, mkU64(imm5));
3215 } else {
3216 assign(argL, getIReg32orZR(nn));
3217 assign(argR, mkU32(imm5));
3219 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3221 DIP("ccm%c %s, #%u, #%u, %s\n",
3222 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3223 imm5, nzcv, nameCC(cond));
3224 return True;
3227 /* ---------------- CCMP/CCMN(reg) ---------------- */
3228 /* 31 29 20 15 11 9 3
3229 sf 1 111010010 Rm cond 00 Rn 0 nzcv CCMP Rn, Rm, #nzcv, cond
3230 sf 0 111010010 Rm cond 00 Rn 0 nzcv CCMN Rn, Rm, #nzcv, cond
3231 Operation is:
3232 (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
3233 (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
3235 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3236 && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
3237 Bool is64 = INSN(31,31) == 1;
3238 Bool isSUB = INSN(30,30) == 1;
3239 UInt mm = INSN(20,16);
3240 UInt cond = INSN(15,12);
3241 UInt nn = INSN(9,5);
3242 UInt nzcv = INSN(3,0);
3244 IRTemp condT = newTemp(Ity_I1);
3245 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3247 IRType ty = is64 ? Ity_I64 : Ity_I32;
3248 IRTemp argL = newTemp(ty);
3249 IRTemp argR = newTemp(ty);
3251 if (is64) {
3252 assign(argL, getIReg64orZR(nn));
3253 assign(argR, getIReg64orZR(mm));
3254 } else {
3255 assign(argL, getIReg32orZR(nn));
3256 assign(argR, getIReg32orZR(mm));
3258 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3260 DIP("ccm%c %s, %s, #%u, %s\n",
3261 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3262 nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
3263 return True;
3267 /* -------------- REV/REV16/REV32/RBIT -------------- */
3268 /* 31 30 28 20 15 11 9 4
3270 1 10 11010110 00000 0000 11 n d (1) REV Xd, Xn
3271 0 10 11010110 00000 0000 10 n d (2) REV Wd, Wn
3273 1 10 11010110 00000 0000 00 n d (3) RBIT Xd, Xn
3274 0 10 11010110 00000 0000 00 n d (4) RBIT Wd, Wn
3276 1 10 11010110 00000 0000 01 n d (5) REV16 Xd, Xn
3277 0 10 11010110 00000 0000 01 n d (6) REV16 Wd, Wn
3279 1 10 11010110 00000 0000 10 n d (7) REV32 Xd, Xn
3281 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3282 && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
3283 UInt b31 = INSN(31,31);
3284 UInt opc = INSN(11,10);
3286 UInt ix = 0;
3287 /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1;
3288 else if (b31 == 0 && opc == BITS2(1,0)) ix = 2;
3289 else if (b31 == 1 && opc == BITS2(0,0)) ix = 3;
3290 else if (b31 == 0 && opc == BITS2(0,0)) ix = 4;
3291 else if (b31 == 1 && opc == BITS2(0,1)) ix = 5;
3292 else if (b31 == 0 && opc == BITS2(0,1)) ix = 6;
3293 else if (b31 == 1 && opc == BITS2(1,0)) ix = 7;
3294 if (ix >= 1 && ix <= 7) {
3295 Bool is64 = ix == 1 || ix == 3 || ix == 5 || ix == 7;
3296 UInt nn = INSN(9,5);
3297 UInt dd = INSN(4,0);
3298 IRTemp src = newTemp(Ity_I64);
3299 IRTemp dst = IRTemp_INVALID;
3300 IRTemp (*math)(IRTemp) = NULL;
3301 switch (ix) {
3302 case 1: case 2: math = math_BYTESWAP64; break;
3303 case 3: case 4: math = math_BITSWAP64; break;
3304 case 5: case 6: math = math_USHORTSWAP64; break;
3305 case 7: math = math_UINTSWAP64; break;
3306 default: vassert(0);
3308 const HChar* names[7]
3309 = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
3310 const HChar* nm = names[ix-1];
3311 vassert(math);
3312 if (ix == 6) {
3313 /* This has to be special cased, since the logic below doesn't
3314 handle it correctly. */
3315 assign(src, getIReg64orZR(nn));
3316 dst = math(src);
3317 putIReg64orZR(dd,
3318 unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
3319 } else if (is64) {
3320 assign(src, getIReg64orZR(nn));
3321 dst = math(src);
3322 putIReg64orZR(dd, mkexpr(dst));
3323 } else {
3324 assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
3325 dst = math(src);
3326 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3328 DIP("%s %s, %s\n", nm,
3329 nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
3330 return True;
3332 /* else fall through */
3335 /* -------------------- CLZ/CLS -------------------- */
3336 /* 30 28 24 20 15 9 4
3337 sf 10 1101 0110 00000 00010 0 n d CLZ Rd, Rn
3338 sf 10 1101 0110 00000 00010 1 n d CLS Rd, Rn
3340 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3341 && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
3342 Bool is64 = INSN(31,31) == 1;
3343 Bool isCLS = INSN(10,10) == 1;
3344 UInt nn = INSN(9,5);
3345 UInt dd = INSN(4,0);
3346 IRTemp src = newTemp(Ity_I64);
3347 IRTemp srcZ = newTemp(Ity_I64);
3348 IRTemp dst = newTemp(Ity_I64);
3349 /* Get the argument, widened out to 64 bit */
3350 if (is64) {
3351 assign(src, getIReg64orZR(nn));
3352 } else {
3353 assign(src, binop(Iop_Shl64,
3354 unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
3356 /* If this is CLS, mash the arg around accordingly */
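/* Note (added): for k >= 1, bit k of srcZ is src[k] ^ src[k-1], and
   bit 0 is zero.  So the number of leading zeroes of srcZ is the
   number of bits below the sign bit that are copies of it, which is
   exactly CLS; the all-equal case (srcZ == 0) is handled by the ITE
   below. */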
3357 if (isCLS) {
3358 IRExpr* one = mkU8(1);
3359 assign(srcZ,
3360 binop(Iop_Xor64,
3361 binop(Iop_Shl64, mkexpr(src), one),
3362 binop(Iop_Shl64, binop(Iop_Shr64, mkexpr(src), one), one)));
3363 } else {
3364 assign(srcZ, mkexpr(src));
3366 /* And compute CLZ. */
3367 if (is64) {
3368 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3369 mkU64(isCLS ? 63 : 64),
3370 unop(Iop_Clz64, mkexpr(srcZ))));
3371 putIReg64orZR(dd, mkexpr(dst));
3372 } else {
3373 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3374 mkU64(isCLS ? 31 : 32),
3375 unop(Iop_Clz64, mkexpr(srcZ))));
3376 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3378 DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
3379 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
3380 return True;
3383 /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
3384 /* 30 28 20 15 11 9 4
3385 sf 00 1101 0110 m 0010 00 n d LSLV Rd,Rn,Rm
3386 sf 00 1101 0110 m 0010 01 n d LSRV Rd,Rn,Rm
3387 sf 00 1101 0110 m 0010 10 n d ASRV Rd,Rn,Rm
3388 sf 00 1101 0110 m 0010 11 n d RORV Rd,Rn,Rm
3390 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3391 && INSN(15,12) == BITS4(0,0,1,0)) {
3392 Bool is64 = INSN(31,31) == 1;
3393 UInt mm = INSN(20,16);
3394 UInt op = INSN(11,10);
3395 UInt nn = INSN(9,5);
3396 UInt dd = INSN(4,0);
3397 IRType ty = is64 ? Ity_I64 : Ity_I32;
3398 IRTemp srcL = newTemp(ty);
3399 IRTemp srcR = newTemp(Ity_I64);
3400 IRTemp res = newTemp(ty);
3401 IROp iop = Iop_INVALID;
3402 assign(srcL, getIRegOrZR(is64, nn));
3403 assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
3404 mkU64(is64 ? 63 : 31)));
3405 if (op < 3) {
3406 // LSLV, LSRV, ASRV
3407 switch (op) {
3408 case BITS2(0,0): iop = mkSHL(ty); break;
3409 case BITS2(0,1): iop = mkSHR(ty); break;
3410 case BITS2(1,0): iop = mkSAR(ty); break;
3411 default: vassert(0);
3413 assign(res, binop(iop, mkexpr(srcL),
3414 unop(Iop_64to8, mkexpr(srcR))));
3415 } else {
3416 // RORV
3417 IROp opSHL = mkSHL(ty);
3418 IROp opSHR = mkSHR(ty);
3419 IROp opOR = mkOR(ty);
3420 IRExpr* width = mkU64(is64 ? 64: 32);
3421 assign(
3422 res,
3423 IRExpr_ITE(
3424 binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
3425 mkexpr(srcL),
3426 binop(opOR,
3427 binop(opSHL,
3428 mkexpr(srcL),
3429 unop(Iop_64to8, binop(Iop_Sub64, width,
3430 mkexpr(srcR)))),
3431 binop(opSHR,
3432 mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
3435 putIRegOrZR(is64, dd, mkexpr(res));
3436 vassert(op < 4);
3437 const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
3438 DIP("%s %s, %s, %s\n",
3439 names[op], nameIRegOrZR(is64,dd),
3440 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
3441 return True;
3444 /* -------------------- SDIV/UDIV -------------------- */
3445 /* 30 28 20 15 10 9 4
3446 sf 00 1101 0110 m 00001 1 n d SDIV Rd,Rn,Rm
3447 sf 00 1101 0110 m 00001 0 n d UDIV Rd,Rn,Rm
3449 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3450 && INSN(15,11) == BITS5(0,0,0,0,1)) {
3451 Bool is64 = INSN(31,31) == 1;
3452 UInt mm = INSN(20,16);
3453 Bool isS = INSN(10,10) == 1;
3454 UInt nn = INSN(9,5);
3455 UInt dd = INSN(4,0);
3456 if (isS) {
3457 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
3458 getIRegOrZR(is64, nn),
3459 getIRegOrZR(is64, mm)));
3460 } else {
3461 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
3462 getIRegOrZR(is64, nn),
3463 getIRegOrZR(is64, mm)));
3465 DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
3466 nameIRegOrZR(is64, dd),
3467 nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
3468 return True;
3471 /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
3472 /* 31 23 20 15 14 9 4
3473 1001 1011 101 m 0 a n d UMADDL Xd,Wn,Wm,Xa
3474 1001 1011 001 m 0 a n d SMADDL Xd,Wn,Wm,Xa
3475 1001 1011 101 m 1 a n d UMSUBL Xd,Wn,Wm,Xa
3476 1001 1011 001 m 1 a n d SMSUBL Xd,Wn,Wm,Xa
3477 with operation
3478 Xd = Xa +/- (Wn *u/s Wm)
3480 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
3481 Bool isU = INSN(23,23) == 1;
3482 UInt mm = INSN(20,16);
3483 Bool isAdd = INSN(15,15) == 0;
3484 UInt aa = INSN(14,10);
3485 UInt nn = INSN(9,5);
3486 UInt dd = INSN(4,0);
3487 IRTemp wN = newTemp(Ity_I32);
3488 IRTemp wM = newTemp(Ity_I32);
3489 IRTemp xA = newTemp(Ity_I64);
3490 IRTemp muld = newTemp(Ity_I64);
3491 IRTemp res = newTemp(Ity_I64);
3492 assign(wN, getIReg32orZR(nn));
3493 assign(wM, getIReg32orZR(mm));
3494 assign(xA, getIReg64orZR(aa));
3495 assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
3496 mkexpr(wN), mkexpr(wM)));
3497 assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
3498 mkexpr(xA), mkexpr(muld)));
3499 putIReg64orZR(dd, mkexpr(res));
3500 DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
3501 nameIReg64orZR(dd), nameIReg32orZR(nn),
3502 nameIReg32orZR(mm), nameIReg64orZR(aa));
3503 return True;
3506 /* -------------------- CRC32/CRC32C -------------------- */
3507 /* 31 30 20 15 11 9 4
3508 sf 00 1101 0110 m 0100 sz n d CRC32<sz> Wd, Wn, Wm|Xm
3509 sf 00 1101 0110 m 0101 sz n d CRC32C<sz> Wd, Wn, Wm|Xm
3511 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3512 && INSN(15,13) == BITS3(0,1,0)) {
3513 UInt bitSF = INSN(31,31);
3514 UInt mm = INSN(20,16);
3515 UInt bitC = INSN(12,12);
3516 UInt sz = INSN(11,10);
3517 UInt nn = INSN(9,5);
3518 UInt dd = INSN(4,0);
3519 vassert(sz >= 0 && sz <= 3);
3520 if ((bitSF == 0 && sz <= BITS2(1,0))
3521 || (bitSF == 1 && sz == BITS2(1,1))) {
3522 UInt ix = (bitC == 1 ? 4 : 0) | sz;
3523 void* helpers[8]
3524 = { &arm64g_calc_crc32b, &arm64g_calc_crc32h,
3525 &arm64g_calc_crc32w, &arm64g_calc_crc32x,
3526 &arm64g_calc_crc32cb, &arm64g_calc_crc32ch,
3527 &arm64g_calc_crc32cw, &arm64g_calc_crc32cx };
3528 const HChar* hNames[8]
3529 = { "arm64g_calc_crc32b", "arm64g_calc_crc32h",
3530 "arm64g_calc_crc32w", "arm64g_calc_crc32x",
3531 "arm64g_calc_crc32cb", "arm64g_calc_crc32ch",
3532 "arm64g_calc_crc32cw", "arm64g_calc_crc32cx" };
3533 const HChar* iNames[8]
3534 = { "crc32b", "crc32h", "crc32w", "crc32x",
3535 "crc32cb", "crc32ch", "crc32cw", "crc32cx" };
3537 IRTemp srcN = newTemp(Ity_I64);
3538 assign(srcN, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
3540 IRTemp srcM = newTemp(Ity_I64);
3541 IRExpr* at64 = getIReg64orZR(mm);
3542 switch (sz) {
3543 case BITS2(0,0):
3544 assign(srcM, binop(Iop_And64, at64, mkU64(0xFF))); break;
3545 case BITS2(0,1):
3546 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFF))); break;
3547 case BITS2(1,0):
3548 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFFFFFF))); break;
3549 case BITS2(1,1):
3550 assign(srcM, at64); break;
3551 default:
3552 vassert(0);
3555 vassert(ix >= 0 && ix <= 7);
3557 putIReg64orZR(
3559 unop(Iop_32Uto64,
3560 unop(Iop_64to32,
3561 mkIRExprCCall(Ity_I64, 0/*regparm*/,
3562 hNames[ix], helpers[ix],
3563 mkIRExprVec_2(mkexpr(srcN),
3564 mkexpr(srcM))))));
3566 DIP("%s %s, %s, %s\n", iNames[ix],
3567 nameIReg32orZR(dd),
3568 nameIReg32orZR(nn), nameIRegOrZR(bitSF == 1, mm));
3569 return True;
3571 /* fall through */
3574 vex_printf("ARM64 front end: data_processing_register\n");
3575 return False;
3576 # undef INSN
3580 /*------------------------------------------------------------*/
3581 /*--- Math helpers for vector interleave/deinterleave ---*/
3582 /*------------------------------------------------------------*/
3584 #define EX(_tmp) \
3585 mkexpr(_tmp)
3586 #define SL(_hi128,_lo128,_nbytes) \
3587 ( (_nbytes) == 0 \
3588 ? (_lo128) \
3589 : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
3590 #define ROR(_v128,_nbytes) \
3591 SL((_v128),(_v128),(_nbytes))
3592 #define ROL(_v128,_nbytes) \
3593 SL((_v128),(_v128),16-(_nbytes))
3594 #define SHR(_v128,_nbytes) \
3595 binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
3596 #define SHL(_v128,_nbytes) \
3597 binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
3598 #define ILO64x2(_argL,_argR) \
3599 binop(Iop_InterleaveLO64x2,(_argL),(_argR))
3600 #define IHI64x2(_argL,_argR) \
3601 binop(Iop_InterleaveHI64x2,(_argL),(_argR))
3602 #define ILO32x4(_argL,_argR) \
3603 binop(Iop_InterleaveLO32x4,(_argL),(_argR))
3604 #define IHI32x4(_argL,_argR) \
3605 binop(Iop_InterleaveHI32x4,(_argL),(_argR))
3606 #define ILO16x8(_argL,_argR) \
3607 binop(Iop_InterleaveLO16x8,(_argL),(_argR))
3608 #define IHI16x8(_argL,_argR) \
3609 binop(Iop_InterleaveHI16x8,(_argL),(_argR))
3610 #define ILO8x16(_argL,_argR) \
3611 binop(Iop_InterleaveLO8x16,(_argL),(_argR))
3612 #define IHI8x16(_argL,_argR) \
3613 binop(Iop_InterleaveHI8x16,(_argL),(_argR))
3614 #define CEV32x4(_argL,_argR) \
3615 binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
3616 #define COD32x4(_argL,_argR) \
3617 binop(Iop_CatOddLanes32x4,(_argL),(_argR))
3618 #define COD16x8(_argL,_argR) \
3619 binop(Iop_CatOddLanes16x8,(_argL),(_argR))
3620 #define COD8x16(_argL,_argR) \
3621 binop(Iop_CatOddLanes8x16,(_argL),(_argR))
3622 #define CEV8x16(_argL,_argR) \
3623 binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
3624 #define AND(_arg1,_arg2) \
3625 binop(Iop_AndV128,(_arg1),(_arg2))
3626 #define OR2(_arg1,_arg2) \
3627 binop(Iop_OrV128,(_arg1),(_arg2))
3628 #define OR3(_arg1,_arg2,_arg3) \
3629 binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
3630 #define OR4(_arg1,_arg2,_arg3,_arg4) \
3631 binop(Iop_OrV128, \
3632 binop(Iop_OrV128,(_arg1),(_arg2)), \
3633 binop(Iop_OrV128,(_arg3),(_arg4)))
3636 /* Do interleaving for 1 128 bit vector, for ST1 insns. */
3637 static
3638 void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
3639 UInt laneSzBlg2, IRTemp u0 )
3641 assign(*i0, mkexpr(u0));
3645 /* Do interleaving for 2 128 bit vectors, for ST2 insns. */
3646 static
3647 void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
3648 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
3650 /* This is pretty easy, since we have primitives directly to
3651 hand. */
3652 if (laneSzBlg2 == 3) {
3653 // 64x2
3654 // u1 == B1 B0, u0 == A1 A0
3655 // i1 == B1 A1, i0 == B0 A0
3656 assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
3657 assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
3658 return;
3660 if (laneSzBlg2 == 2) {
3661 // 32x4
3662 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
3663 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
3664 assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
3665 assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
3666 return;
3668 if (laneSzBlg2 == 1) {
3669 // 16x8
3670 // u1 == B{7..0}, u0 == A{7..0}
3671 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
3672 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
3673 assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
3674 assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
3675 return;
3677 if (laneSzBlg2 == 0) {
3678 // 8x16
3679 // u1 == B{f..0}, u0 == A{f..0}
3680 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
3681 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
3682 assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
3683 assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
3684 return;
3686 /*NOTREACHED*/
3687 vassert(0);
3691 /* Do interleaving for 3 128 bit vectors, for ST3 insns. */
3692 static
3693 void math_INTERLEAVE3_128(
3694 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
3695 UInt laneSzBlg2,
3696 IRTemp u0, IRTemp u1, IRTemp u2 )
3698 if (laneSzBlg2 == 3) {
3699 // 64x2
3700 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
3701 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
3702 assign(*i2, IHI64x2( EX(u2), EX(u1) ));
3703 assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
3704 assign(*i0, ILO64x2( EX(u1), EX(u0) ));
3705 return;
3708 if (laneSzBlg2 == 2) {
3709 // 32x4
3710 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
3711 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
3712 // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
3713 IRTemp p0 = newTempV128();
3714 IRTemp p1 = newTempV128();
3715 IRTemp p2 = newTempV128();
3716 IRTemp c1100 = newTempV128();
3717 IRTemp c0011 = newTempV128();
3718 IRTemp c0110 = newTempV128();
3719 assign(c1100, mkV128(0xFF00));
3720 assign(c0011, mkV128(0x00FF));
3721 assign(c0110, mkV128(0x0FF0));
3722 // First interleave them at 64x2 granularity,
3723 // generating partial ("p") values.
3724 math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
3725 // And more shuffling around for the final answer
3726 assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
3727 AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
3728 assign(*i1, OR3( SHL(EX(p2),12),
3729 AND(EX(p1),EX(c0110)),
3730 SHR(EX(p0),12) ));
3731 assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
3732 AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
3733 return;
3736 if (laneSzBlg2 == 1) {
3737 // 16x8
3738 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
3739 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
3740 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
3742 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
3743 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
3744 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
3746 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
3747 // i1 == A5 C4 B4 A4 C3 B3 A3 C2
3748 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
3749 IRTemp p0 = newTempV128();
3750 IRTemp p1 = newTempV128();
3751 IRTemp p2 = newTempV128();
3752 IRTemp c1000 = newTempV128();
3753 IRTemp c0100 = newTempV128();
3754 IRTemp c0010 = newTempV128();
3755 IRTemp c0001 = newTempV128();
3756 assign(c1000, mkV128(0xF000));
3757 assign(c0100, mkV128(0x0F00));
3758 assign(c0010, mkV128(0x00F0));
3759 assign(c0001, mkV128(0x000F));
3760 // First interleave them at 32x4 granularity,
3761 // generating partial ("p") values.
3762 math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
3763 // And more shuffling around for the final answer
3764 assign(*i2,
3765 OR4( AND( IHI16x8( EX(p2), ROL(EX(p2),4) ), EX(c1000) ),
3766 AND( IHI16x8( ROL(EX(p2),6), EX(p2) ), EX(c0100) ),
3767 AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
3768 AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
3770 assign(*i1,
3771 OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
3772 AND( IHI16x8( EX(p1), ROL(EX(p1),4) ), EX(c0100) ),
3773 AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
3774 AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
3776 assign(*i0,
3777 OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
3778 AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
3779 AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
3780 AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
3782 return;
3785 if (laneSzBlg2 == 0) {
3786 // 8x16. It doesn't seem worth the hassle of first doing a
3787 // 16x8 interleave, so just generate all 24 partial results
3788 // directly :-(
3789 // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
3790 // i2 == Cf Bf Af Ce .. Bb Ab Ca
3791 // i1 == Ba Aa C9 B9 .. A6 C5 B5
3792 // i0 == A5 C4 B4 A4 .. C0 B0 A0
3794 IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
3795 IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
3796 IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
3797 IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
3798 IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
3799 IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
3800 IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
3801 IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
3802 IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();
3804 // eg XXXX(qqq, CC, 0xF, BB, 0xA) sets qqq to be a vector
3805 // of the form 14 bytes junk : CC[0xF] : BB[0xA]
3807 # define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
3808 IRTemp t_##_tempName = newTempV128(); \
3809 assign(t_##_tempName, \
3810 ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
3811 ROR(EX(_srcVec2),(_srcShift2)) ) )
3813 // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
3814 IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;
3816 // The slicing and reassembly are done as interleavedly as possible,
3817 // so as to minimise the demand for registers in the back end, which
3818 // was observed to be a problem in testing.
3820 XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
3821 XXXX(AfCe, AA, 0xf, CC, 0xe);
3822 assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));
3824 XXXX(BeAe, BB, 0xe, AA, 0xe);
3825 XXXX(CdBd, CC, 0xd, BB, 0xd);
3826 assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
3827 assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));
3829 XXXX(AdCc, AA, 0xd, CC, 0xc);
3830 XXXX(BcAc, BB, 0xc, AA, 0xc);
3831 assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));
3833 XXXX(CbBb, CC, 0xb, BB, 0xb);
3834 XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
3835 assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
3836 assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
3837 assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));
3839 XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
3840 XXXX(C9B9, CC, 0x9, BB, 0x9);
3841 assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));
3843 XXXX(A9C8, AA, 0x9, CC, 0x8);
3844 XXXX(B8A8, BB, 0x8, AA, 0x8);
3845 assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
3846 assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));
3848 XXXX(C7B7, CC, 0x7, BB, 0x7);
3849 XXXX(A7C6, AA, 0x7, CC, 0x6);
3850 assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));
3852 XXXX(B6A6, BB, 0x6, AA, 0x6);
3853 XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
3854 assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
3855 assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
3856 assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));
3858 XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
3859 XXXX(B4A4, BB, 0x4, AA, 0x4);
3860 assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));
3862 XXXX(C3B3, CC, 0x3, BB, 0x3);
3863 XXXX(A3C2, AA, 0x3, CC, 0x2);
3864 assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
3865 assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));
3867 XXXX(B2A2, BB, 0x2, AA, 0x2);
3868 XXXX(C1B1, CC, 0x1, BB, 0x1);
3869 assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));
3871 XXXX(A1C0, AA, 0x1, CC, 0x0);
3872 XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
3873 assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
3874 assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
3875 assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));
3877 # undef XXXX
3878 return;
3881 /*NOTREACHED*/
3882 vassert(0);
3886 /* Do interleaving for 4 128 bit vectors, for ST4 insns. */
3887 static
3888 void math_INTERLEAVE4_128(
3889 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
3890 UInt laneSzBlg2,
3891 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
3893 if (laneSzBlg2 == 3) {
3894 // 64x2
3895 assign(*i0, ILO64x2(EX(u1), EX(u0)));
3896 assign(*i1, ILO64x2(EX(u3), EX(u2)));
3897 assign(*i2, IHI64x2(EX(u1), EX(u0)));
3898 assign(*i3, IHI64x2(EX(u3), EX(u2)));
3899 return;
3901 if (laneSzBlg2 == 2) {
3902 // 32x4
3903 // First, interleave at the 64-bit lane size.
3904 IRTemp p0 = newTempV128();
3905 IRTemp p1 = newTempV128();
3906 IRTemp p2 = newTempV128();
3907 IRTemp p3 = newTempV128();
3908 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
3909 // And interleave (cat) at the 32 bit size.
3910 assign(*i0, CEV32x4(EX(p1), EX(p0)));
3911 assign(*i1, COD32x4(EX(p1), EX(p0)));
3912 assign(*i2, CEV32x4(EX(p3), EX(p2)));
3913 assign(*i3, COD32x4(EX(p3), EX(p2)));
3914 return;
3916 if (laneSzBlg2 == 1) {
3917 // 16x8
3918 // First, interleave at the 32-bit lane size.
3919 IRTemp p0 = newTempV128();
3920 IRTemp p1 = newTempV128();
3921 IRTemp p2 = newTempV128();
3922 IRTemp p3 = newTempV128();
3923 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
3924 // And rearrange within each vector, to get the right 16 bit lanes.
3925 assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
3926 assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
3927 assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
3928 assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
3929 return;
3931 if (laneSzBlg2 == 0) {
3932 // 8x16
3933 // First, interleave at the 16-bit lane size.
3934 IRTemp p0 = newTempV128();
3935 IRTemp p1 = newTempV128();
3936 IRTemp p2 = newTempV128();
3937 IRTemp p3 = newTempV128();
3938 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
3939 // And rearrange within each vector, to get the right 8 bit lanes.
3940 assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
3941 assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
3942 assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
3943 assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
3944 return;
3946 /*NOTREACHED*/
3947 vassert(0);
3951 /* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
3952 static
3953 void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
3954 UInt laneSzBlg2, IRTemp i0 )
3956 assign(*u0, mkexpr(i0));
3960 /* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
3961 static
3962 void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
3963 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
3965 /* This is pretty easy, since we have primitives directly to
3966 hand. */
3967 if (laneSzBlg2 == 3) {
3968 // 64x2
3969 // i1 == B1 A1, i0 == B0 A0
3970 // u1 == B1 B0, u0 == A1 A0
3971 assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
3972 assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
3973 return;
3975 if (laneSzBlg2 == 2) {
3976 // 32x4
3977 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
3978 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
3979 assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
3980 assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
3981 return;
3983 if (laneSzBlg2 == 1) {
3984 // 16x8
3985 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
3986 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
3987 // u1 == B{7..0}, u0 == A{7..0}
3988 assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
3989 assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0)));
3990 return;
3992 if (laneSzBlg2 == 0) {
3993 // 8x16
3994 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
3995 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
3996 // u1 == B{f..0}, u0 == A{f..0}
3997 assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
3998 assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0)));
3999 return;
4001 /*NOTREACHED*/
4002 vassert(0);
4006 /* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
4007 static
4008 void math_DEINTERLEAVE3_128(
4009 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4010 UInt laneSzBlg2,
4011 IRTemp i0, IRTemp i1, IRTemp i2 )
4013 if (laneSzBlg2 == 3) {
4014 // 64x2
4015 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
4016 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
4017 assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1) ));
4018 assign(*u1, ILO64x2( EX(i2), ROL(EX(i0),8) ));
4019 assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0) ));
4020 return;
4023 if (laneSzBlg2 == 2) {
4024 // 32x4
4025 // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
4026 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
4027 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
4028 IRTemp t_a1c0b0a0 = newTempV128();
4029 IRTemp t_a2c1b1a1 = newTempV128();
4030 IRTemp t_a3c2b2a2 = newTempV128();
4031 IRTemp t_a0c3b3a3 = newTempV128();
4032 IRTemp p0 = newTempV128();
4033 IRTemp p1 = newTempV128();
4034 IRTemp p2 = newTempV128();
4035 // Compute some intermediate values.
4036 assign(t_a1c0b0a0, EX(i0));
4037 assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
4038 assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
4039 assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
4040 // First deinterleave into lane-pairs
4041 assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
4042 assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
4043 IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
4044 assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
4045 // Then deinterleave at 64x2 granularity.
4046 math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
4047 return;
4050 if (laneSzBlg2 == 1) {
4051 // 16x8
4052 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
4053 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
4054 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
4056 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
4057 // i1 == A5 C4 B4 A4 C3 B3 A3 C2
4058 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
4060 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
4061 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
4062 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
4064 IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
4065 s0 = s1 = s2 = s3
4066 = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
4067 newTempsV128_4(&s0, &s1, &s2, &s3);
4068 newTempsV128_4(&t0, &t1, &t2, &t3);
4069 newTempsV128_4(&p0, &p1, &p2, &c00111111);
4071 // s0 == b2a2 c1b1a1 c0b0a0
4072 // s1 == b4a4 c3b3a3 c2b2a2
4073 // s2 == b6a6 c5b5a5 c4b4a4
4074 // s3 == b0a0 c7b7a7 c6b6a6
4075 assign(s0, EX(i0));
4076 assign(s1, SL(EX(i1),EX(i0),6*2));
4077 assign(s2, SL(EX(i2),EX(i1),4*2));
4078 assign(s3, SL(EX(i0),EX(i2),2*2));
4080 // t0 == 0 0 c1c0 b1b0 a1a0
4081 // t1 == 0 0 c3c2 b3b2 a3a2
4082 // t2 == 0 0 c5c4 b5b4 a5a4
4083 // t3 == 0 0 c7c6 b7b6 a7a6
4084 assign(c00111111, mkV128(0x0FFF));
4085 assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
4086 assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
4087 assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
4088 assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));
4090 assign(p0, OR2(EX(t0), SHL(EX(t1),6*2)));
4091 assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
4092 assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));
4094 // Then deinterleave at 32x4 granularity.
4095 math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
4096 return;
4099 if (laneSzBlg2 == 0) {
4100 // 8x16. This is the same scheme as for 16x8, with twice the
4101 // number of intermediate values.
4103 // u2 == C{f..0}
4104 // u1 == B{f..0}
4105 // u0 == A{f..0}
4107 // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
4108 // i1 == BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
4109 // i0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4111 // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
4112 // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
4113 // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
4115 IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
4116 t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
4117 s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
4118 = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
4119 = IRTemp_INVALID;
4120 newTempsV128_4(&s0, &s1, &s2, &s3);
4121 newTempsV128_4(&s4, &s5, &s6, &s7);
4122 newTempsV128_4(&t0, &t1, &t2, &t3);
4123 newTempsV128_4(&t4, &t5, &t6, &t7);
4124 newTempsV128_4(&p0, &p1, &p2, &cMASK);
4126 // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4127 // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
4128 // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
4129 // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
4130 // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
4131 // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
4132 // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
4133 // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
4134 assign(s0, SL(EX(i1),EX(i0), 0));
4135 assign(s1, SL(EX(i1),EX(i0), 6));
4136 assign(s2, SL(EX(i1),EX(i0),12));
4137 assign(s3, SL(EX(i2),EX(i1), 2));
4138 assign(s4, SL(EX(i2),EX(i1), 8));
4139 assign(s5, SL(EX(i2),EX(i1),14));
4140 assign(s6, SL(EX(i0),EX(i2), 4));
4141 assign(s7, SL(EX(i0),EX(i2),10));
4143 // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
4144 // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
4145 // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
4146 // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
4147 // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
4148 // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
4149 // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
4150 // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
4151 assign(cMASK, mkV128(0x003F));
4152 assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
4153 assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
4154 assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
4155 assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
4156 assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
4157 assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
4158 assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
4159 assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));
4161 assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
4162 assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
4163 SHL(EX(t3),2), SHR(EX(t2),4) ));
4164 assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));
4166 // Then deinterleave at 16x8 granularity.
4167 math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
4168 return;
4171 /*NOTREACHED*/
4172 vassert(0);
4176 /* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
4177 static
4178 void math_DEINTERLEAVE4_128(
4179 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4180 UInt laneSzBlg2,
4181 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4183 if (laneSzBlg2 == 3) {
4184 // 64x2
4185 assign(*u0, ILO64x2(EX(i2), EX(i0)));
4186 assign(*u1, IHI64x2(EX(i2), EX(i0)));
4187 assign(*u2, ILO64x2(EX(i3), EX(i1)));
4188 assign(*u3, IHI64x2(EX(i3), EX(i1)));
4189 return;
4191 if (laneSzBlg2 == 2) {
4192 // 32x4
4193 IRTemp p0 = newTempV128();
4194 IRTemp p2 = newTempV128();
4195 IRTemp p1 = newTempV128();
4196 IRTemp p3 = newTempV128();
4197 assign(p0, ILO32x4(EX(i1), EX(i0)));
4198 assign(p1, IHI32x4(EX(i1), EX(i0)));
4199 assign(p2, ILO32x4(EX(i3), EX(i2)));
4200 assign(p3, IHI32x4(EX(i3), EX(i2)));
4201 // And now do what we did for the 64-bit case.
4202 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
4203 return;
4205 if (laneSzBlg2 == 1) {
4206 // 16x8
4207 // Deinterleave into 32-bit chunks, then do as the 32-bit case.
4208 IRTemp p0 = newTempV128();
4209 IRTemp p1 = newTempV128();
4210 IRTemp p2 = newTempV128();
4211 IRTemp p3 = newTempV128();
4212 assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
4213 assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
4214 assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
4215 assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
4216 // From here on is like the 32 bit case.
4217 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
4218 return;
4220 if (laneSzBlg2 == 0) {
4221 // 8x16
4222 // Deinterleave into 16-bit chunks, then do as the 16-bit case.
4223 IRTemp p0 = newTempV128();
4224 IRTemp p1 = newTempV128();
4225 IRTemp p2 = newTempV128();
4226 IRTemp p3 = newTempV128();
4227 assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
4228 ILO8x16(EX(i0),ROL(EX(i0),4)) ));
4229 assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
4230 ILO8x16(EX(i1),ROL(EX(i1),4)) ));
4231 assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
4232 ILO8x16(EX(i2),ROL(EX(i2),4)) ));
4233 assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
4234 ILO8x16(EX(i3),ROL(EX(i3),4)) ));
4235 // From here on is like the 16 bit case.
4236 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
4237 return;
4239 /*NOTREACHED*/
4240 vassert(0);
4244 /* Wrappers that use the full-width (de)interleavers to do half-width
4245 (de)interleaving. The scheme is to clone each input lane in the
4246 lower half of each incoming value, do a full width (de)interleave
4247 at the next lane size up, and remove every other lane of the
4248 result. The returned values may have any old junk in the upper
4249 64 bits -- the caller must ignore that. */
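/* A minimal worked example of that scheme (editorial illustration, using
   hypothetical lane values): for math_INTERLEAVE2_64 with 32-bit lanes
   (laneSzBlg2 == 2), suppose the low halves hold
      u0 = .. .. A1 A0   and   u1 = .. .. B1 B0.
   The doubler Iop_InterleaveLO32x4 clones each low lane, giving
      du0 = A1 A1 A0 A0  and   du1 = B1 B1 B0 B0.
   A full-width interleave at 64x2 then produces
      di0 = B0 B0 A0 A0  and   di1 = B1 B1 A1 A1,
   and the halver Iop_CatEvenLanes32x4 keeps every other lane, so the low
   halves of the results are
      i0 = .. .. B0 A0   and   i1 = .. .. B1 A1,
   which is the required 64-bit ST2 interleave. */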
4251 /* Helper function -- get doubling and narrowing operations. */
4252 static
4253 void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
4254 /*OUT*/IROp* halver,
4255 UInt laneSzBlg2 )
4257 switch (laneSzBlg2) {
4258 case 2:
4259 *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
4260 break;
4261 case 1:
4262 *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
4263 break;
4264 case 0:
4265 *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
4266 break;
4267 default:
4268 vassert(0);
4272 /* Do interleaving for 1 64 bit vector, for ST1 insns. */
4273 static
4274 void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
4275 UInt laneSzBlg2, IRTemp u0 )
4277 assign(*i0, mkexpr(u0));
4281 /* Do interleaving for 2 64 bit vectors, for ST2 insns. */
4282 static
4283 void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
4284 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
4286 if (laneSzBlg2 == 3) {
4287 // 1x64, degenerate case
4288 assign(*i0, EX(u0));
4289 assign(*i1, EX(u1));
4290 return;
4293 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4294 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4295 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4297 IRTemp du0 = newTempV128();
4298 IRTemp du1 = newTempV128();
4299 assign(du0, binop(doubler, EX(u0), EX(u0)));
4300 assign(du1, binop(doubler, EX(u1), EX(u1)));
4301 IRTemp di0 = newTempV128();
4302 IRTemp di1 = newTempV128();
4303 math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
4304 assign(*i0, binop(halver, EX(di0), EX(di0)));
4305 assign(*i1, binop(halver, EX(di1), EX(di1)));
4309 /* Do interleaving for 3 64 bit vectors, for ST3 insns. */
4310 static
4311 void math_INTERLEAVE3_64(
4312 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
4313 UInt laneSzBlg2,
4314 IRTemp u0, IRTemp u1, IRTemp u2 )
4316 if (laneSzBlg2 == 3) {
4317 // 1x64, degenerate case
4318 assign(*i0, EX(u0));
4319 assign(*i1, EX(u1));
4320 assign(*i2, EX(u2));
4321 return;
4324 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4325 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4326 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4328 IRTemp du0 = newTempV128();
4329 IRTemp du1 = newTempV128();
4330 IRTemp du2 = newTempV128();
4331 assign(du0, binop(doubler, EX(u0), EX(u0)));
4332 assign(du1, binop(doubler, EX(u1), EX(u1)));
4333 assign(du2, binop(doubler, EX(u2), EX(u2)));
4334 IRTemp di0 = newTempV128();
4335 IRTemp di1 = newTempV128();
4336 IRTemp di2 = newTempV128();
4337 math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
4338 assign(*i0, binop(halver, EX(di0), EX(di0)));
4339 assign(*i1, binop(halver, EX(di1), EX(di1)));
4340 assign(*i2, binop(halver, EX(di2), EX(di2)));
4344 /* Do interleaving for 4 64 bit vectors, for ST4 insns. */
4345 static
4346 void math_INTERLEAVE4_64(
4347 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
4348 UInt laneSzBlg2,
4349 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
4351 if (laneSzBlg2 == 3) {
4352 // 1x64, degenerate case
4353 assign(*i0, EX(u0));
4354 assign(*i1, EX(u1));
4355 assign(*i2, EX(u2));
4356 assign(*i3, EX(u3));
4357 return;
4360 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4361 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4362 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4364 IRTemp du0 = newTempV128();
4365 IRTemp du1 = newTempV128();
4366 IRTemp du2 = newTempV128();
4367 IRTemp du3 = newTempV128();
4368 assign(du0, binop(doubler, EX(u0), EX(u0)));
4369 assign(du1, binop(doubler, EX(u1), EX(u1)));
4370 assign(du2, binop(doubler, EX(u2), EX(u2)));
4371 assign(du3, binop(doubler, EX(u3), EX(u3)));
4372 IRTemp di0 = newTempV128();
4373 IRTemp di1 = newTempV128();
4374 IRTemp di2 = newTempV128();
4375 IRTemp di3 = newTempV128();
4376 math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
4377 laneSzBlg2 + 1, du0, du1, du2, du3);
4378 assign(*i0, binop(halver, EX(di0), EX(di0)));
4379 assign(*i1, binop(halver, EX(di1), EX(di1)));
4380 assign(*i2, binop(halver, EX(di2), EX(di2)));
4381 assign(*i3, binop(halver, EX(di3), EX(di3)));
4385 /* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
4386 static
4387 void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
4388 UInt laneSzBlg2, IRTemp i0 )
4390 assign(*u0, mkexpr(i0));
4394 /* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
4395 static
4396 void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
4397 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
4399 if (laneSzBlg2 == 3) {
4400 // 1x64, degenerate case
4401 assign(*u0, EX(i0));
4402 assign(*u1, EX(i1));
4403 return;
4406 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4407 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4408 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4410 IRTemp di0 = newTempV128();
4411 IRTemp di1 = newTempV128();
4412 assign(di0, binop(doubler, EX(i0), EX(i0)));
4413 assign(di1, binop(doubler, EX(i1), EX(i1)));
4415 IRTemp du0 = newTempV128();
4416 IRTemp du1 = newTempV128();
4417 math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
4418 assign(*u0, binop(halver, EX(du0), EX(du0)));
4419 assign(*u1, binop(halver, EX(du1), EX(du1)));
4423 /* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
4424 static
4425 void math_DEINTERLEAVE3_64(
4426 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4427 UInt laneSzBlg2,
4428 IRTemp i0, IRTemp i1, IRTemp i2 )
4430 if (laneSzBlg2 == 3) {
4431 // 1x64, degenerate case
4432 assign(*u0, EX(i0));
4433 assign(*u1, EX(i1));
4434 assign(*u2, EX(i2));
4435 return;
4438 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4439 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4440 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4442 IRTemp di0 = newTempV128();
4443 IRTemp di1 = newTempV128();
4444 IRTemp di2 = newTempV128();
4445 assign(di0, binop(doubler, EX(i0), EX(i0)));
4446 assign(di1, binop(doubler, EX(i1), EX(i1)));
4447 assign(di2, binop(doubler, EX(i2), EX(i2)));
4448 IRTemp du0 = newTempV128();
4449 IRTemp du1 = newTempV128();
4450 IRTemp du2 = newTempV128();
4451 math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
4452 assign(*u0, binop(halver, EX(du0), EX(du0)));
4453 assign(*u1, binop(halver, EX(du1), EX(du1)));
4454 assign(*u2, binop(halver, EX(du2), EX(du2)));
4458 /* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
4459 static
4460 void math_DEINTERLEAVE4_64(
4461 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4462 UInt laneSzBlg2,
4463 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4465 if (laneSzBlg2 == 3) {
4466 // 1x64, degenerate case
4467 assign(*u0, EX(i0));
4468 assign(*u1, EX(i1));
4469 assign(*u2, EX(i2));
4470 assign(*u3, EX(i3));
4471 return;
4474 vassert(laneSzBlg2 >= 0 && laneSzBlg2 <= 2);
4475 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4476 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4478 IRTemp di0 = newTempV128();
4479 IRTemp di1 = newTempV128();
4480 IRTemp di2 = newTempV128();
4481 IRTemp di3 = newTempV128();
4482 assign(di0, binop(doubler, EX(i0), EX(i0)));
4483 assign(di1, binop(doubler, EX(i1), EX(i1)));
4484 assign(di2, binop(doubler, EX(i2), EX(i2)));
4485 assign(di3, binop(doubler, EX(i3), EX(i3)));
4486 IRTemp du0 = newTempV128();
4487 IRTemp du1 = newTempV128();
4488 IRTemp du2 = newTempV128();
4489 IRTemp du3 = newTempV128();
4490 math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
4491 laneSzBlg2 + 1, di0, di1, di2, di3);
4492 assign(*u0, binop(halver, EX(du0), EX(du0)));
4493 assign(*u1, binop(halver, EX(du1), EX(du1)));
4494 assign(*u2, binop(halver, EX(du2), EX(du2)));
4495 assign(*u3, binop(halver, EX(du3), EX(du3)));
4499 #undef EX
4500 #undef SL
4501 #undef ROR
4502 #undef ROL
4503 #undef SHR
4504 #undef SHL
4505 #undef ILO64x2
4506 #undef IHI64x2
4507 #undef ILO32x4
4508 #undef IHI32x4
4509 #undef ILO16x8
4510 #undef IHI16x8
4511 #undef ILO8x16
4512 #undef IHI8x16
4513 #undef CEV32x4
4514 #undef COD32x4
4515 #undef COD16x8
4516 #undef COD8x16
4517 #undef CEV8x16
4518 #undef AND
4519 #undef OR2
4520 #undef OR3
4521 #undef OR4
4524 /*------------------------------------------------------------*/
4525 /*--- Load and Store instructions ---*/
4526 /*------------------------------------------------------------*/
4528 /* Generate the EA for a "reg + reg" style amode. This is done from
4529 parts of the insn, but for the sake of sanity checking it takes the whole
4530 insn. This appears to depend on insn[15:12], with opt=insn[15:13]
4531 and S=insn[12]:
4533 The possible forms, along with their opt:S values, are:
4534 011:0 Xn|SP + Xm
4535 111:0 Xn|SP + Xm
4536 011:1 Xn|SP + Xm * transfer_szB
4537 111:1 Xn|SP + Xm * transfer_szB
4538 010:0 Xn|SP + 32Uto64(Wm)
4539 010:1 Xn|SP + 32Uto64(Wm) * transfer_szB
4540 110:0 Xn|SP + 32Sto64(Wm)
4541 110:1 Xn|SP + 32Sto64(Wm) * transfer_szB
4543 Rm is insn[20:16]. Rn is insn[9:5]. Rt is insn[4:0]. Log2 of
4544 the transfer size is insn[23,31,30]. For integer loads/stores,
4545 insn[23] is zero, hence szLg2 can be at most 3 in such cases.
4547 If the decoding fails, it returns IRTemp_INVALID.
4549 isInt is True iff this decoding is for transfers to/from integer
4550 registers. If False it is for transfers to/from vector registers.
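   A worked decode example (editorial illustration): for
   LDR X1, [X2, X3, lsl #3] the fields are opt:S == 011:1 and szLg2 == 3,
   so the code below builds
      rhs = binop(Iop_Shl64, getIReg64orZR(3), mkU8(3))
   and returns X2 + (X3 << 3); for LDR W1, [X2, W3, sxtw] the fields are
   opt:S == 110:0 and the returned EA is X2 + 32Sto64(W3).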
4552 static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
4554 UInt optS = SLICE_UInt(insn, 15, 12);
4555 UInt mm = SLICE_UInt(insn, 20, 16);
4556 UInt nn = SLICE_UInt(insn, 9, 5);
4557 UInt szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
4558 | SLICE_UInt(insn, 31, 30); // Log2 of the size
4560 buf[0] = 0;
4562 /* Sanity checks, that this really is a load/store insn. */
4563 if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
4564 goto fail;
4566 if (isInt
4567 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
4568 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
4569 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
4570 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
4571 goto fail;
4573 if (!isInt
4574 && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
4575 goto fail;
4577 /* Throw out non-verified but possibly valid cases. */
4578 switch (szLg2) {
4579 case BITS3(0,0,0): break; // 8 bit, valid for both int and vec
4580 case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
4581 case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
4582 case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
4583 case BITS3(1,0,0): // can only ever be valid for the vector case
4584 if (isInt) goto fail; else break;
4585 case BITS3(1,0,1): // these sizes are never valid
4586 case BITS3(1,1,0):
4587 case BITS3(1,1,1): goto fail;
4589 default: vassert(0);
4592 IRExpr* rhs = NULL;
4593 switch (optS) {
4594 case BITS4(1,1,1,0): goto fail; //ATC
4595 case BITS4(0,1,1,0):
4596 rhs = getIReg64orZR(mm);
4597 vex_sprintf(buf, "[%s, %s]",
4598 nameIReg64orZR(nn), nameIReg64orZR(mm));
4599 break;
4600 case BITS4(1,1,1,1): goto fail; //ATC
4601 case BITS4(0,1,1,1):
4602 rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
4603 vex_sprintf(buf, "[%s, %s lsl %u]",
4604 nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
4605 break;
4606 case BITS4(0,1,0,0):
4607 rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
4608 vex_sprintf(buf, "[%s, %s uxtx]",
4609 nameIReg64orZR(nn), nameIReg32orZR(mm));
4610 break;
4611 case BITS4(0,1,0,1):
4612 rhs = binop(Iop_Shl64,
4613 unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
4614 vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
4615 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4616 break;
4617 case BITS4(1,1,0,0):
4618 rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
4619 vex_sprintf(buf, "[%s, %s sxtx]",
4620 nameIReg64orZR(nn), nameIReg32orZR(mm));
4621 break;
4622 case BITS4(1,1,0,1):
4623 rhs = binop(Iop_Shl64,
4624 unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
4625 vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
4626 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4627 break;
4628 default:
4629 /* The rest appear to be genuinely invalid */
4630 goto fail;
4633 vassert(rhs);
4634 IRTemp res = newTemp(Ity_I64);
4635 assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
4636 return res;
4638 fail:
4639 vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
4640 return IRTemp_INVALID;
4644 /* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
4645 bits of DATAE :: Ity_I64. */
4646 static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
4648 IRExpr* addrE = mkexpr(addr);
4649 switch (szB) {
4650 case 8:
4651 storeLE(addrE, dataE);
4652 break;
4653 case 4:
4654 storeLE(addrE, unop(Iop_64to32, dataE));
4655 break;
4656 case 2:
4657 storeLE(addrE, unop(Iop_64to16, dataE));
4658 break;
4659 case 1:
4660 storeLE(addrE, unop(Iop_64to8, dataE));
4661 break;
4662 default:
4663 vassert(0);
4668 /* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
4669 placing the result in an Ity_I64 temporary. */
4670 static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
4672 IRTemp res = newTemp(Ity_I64);
4673 IRExpr* addrE = mkexpr(addr);
4674 switch (szB) {
4675 case 8:
4676 assign(res, loadLE(Ity_I64,addrE));
4677 break;
4678 case 4:
4679 assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
4680 break;
4681 case 2:
4682 assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
4683 break;
4684 case 1:
4685 assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
4686 break;
4687 default:
4688 vassert(0);
4690 return res;
4694 /* Generate a "standard 7" name, from bitQ and size. But also
4695 allow ".1d" since that's occasionally useful. */
4696 static
4697 const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
4699 vassert(bitQ <= 1 && size <= 3);
4700 const HChar* nms[8]
4701 = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
4702 UInt ix = (bitQ << 2) | size;
4703 vassert(ix < 8);
4704 return nms[ix];
4708 static
4709 Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4710 const VexAbiInfo* abiinfo
4713 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
4715 /* ------------ LDR,STR (immediate, uimm12) ----------- */
4716 /* uimm12 is scaled by the transfer size
4718 31 29 26 21 9 4
4719 | | | | | |
4720 11 111 00100 imm12 nn tt STR Xt, [Xn|SP, #imm12 * 8]
4721 11 111 00101 imm12 nn tt LDR Xt, [Xn|SP, #imm12 * 8]
4723 10 111 00100 imm12 nn tt STR Wt, [Xn|SP, #imm12 * 4]
4724 10 111 00101 imm12 nn tt LDR Wt, [Xn|SP, #imm12 * 4]
4726 01 111 00100 imm12 nn tt STRH Wt, [Xn|SP, #imm12 * 2]
4727 01 111 00101 imm12 nn tt LDRH Wt, [Xn|SP, #imm12 * 2]
4729 00 111 00100 imm12 nn tt STRB Wt, [Xn|SP, #imm12 * 1]
4730 00 111 00101 imm12 nn tt LDRB Wt, [Xn|SP, #imm12 * 1]
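      A worked example (editorial illustration): LDR X1, [SP, #16] has a
      transfer size of 8 bytes, so the encoded imm12 is 2 and the decoder
      below computes offs = imm12 * szB = 2 * 8 = 16.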
4732 if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
4733 UInt szLg2 = INSN(31,30);
4734 UInt szB = 1 << szLg2;
4735 Bool isLD = INSN(22,22) == 1;
4736 UInt offs = INSN(21,10) * szB;
4737 UInt nn = INSN(9,5);
4738 UInt tt = INSN(4,0);
4739 IRTemp ta = newTemp(Ity_I64);
4740 assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
4741 if (nn == 31) { /* FIXME generate stack alignment check */ }
4742 vassert(szLg2 < 4);
4743 if (isLD) {
4744 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
4745 } else {
4746 gen_narrowing_store(szB, ta, getIReg64orZR(tt));
4748 const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
4749 const HChar* st_name[4] = { "strb", "strh", "str", "str" };
4750 DIP("%s %s, [%s, #%u]\n",
4751 (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
4752 nameIReg64orSP(nn), offs);
4753 return True;
4756 /* ------------ LDUR,STUR (immediate, simm9) ----------- */
4758 31 29 26 20 11 9 4
4759 | | | | | | |
4760 (at-Rn-then-Rn=EA) | | |
4761 sz 111 00000 0 imm9 01 Rn Rt STR Rt, [Xn|SP], #simm9
4762 sz 111 00001 0 imm9 01 Rn Rt LDR Rt, [Xn|SP], #simm9
4764 (at-EA-then-Rn=EA)
4765 sz 111 00000 0 imm9 11 Rn Rt STR Rt, [Xn|SP, #simm9]!
4766 sz 111 00001 0 imm9 11 Rn Rt LDR Rt, [Xn|SP, #simm9]!
4768 (at-EA)
4769 sz 111 00000 0 imm9 00 Rn Rt STR Rt, [Xn|SP, #simm9]
4770 sz 111 00001 0 imm9 00 Rn Rt LDR Rt, [Xn|SP, #simm9]
4772 simm9 is unscaled.
4774 The case 'wback && Rn == Rt && Rt != 31' is disallowed. In the
4775 load case this is because it would create two competing values for
4776 Rt. In the store case the reason is unclear, but the spec
4777 disallows it anyway.
4779 Stores are narrowing, loads are unsigned widening. sz encodes
4780 the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
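      A worked example (editorial illustration): STR X0, [SP, #-16]! has
      imm9 == 0x1F0, which sign-extends to -16; bits [11:10] are 11, so
      the store goes to EA == SP-16 and SP is written back with EA.  For
      STR X0, [SP], #-16 the bits are 01, so the store goes to the old SP
      and SP becomes SP-16 afterwards; with bits 00 there is no writeback
      and the access is simply at SP-16.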
4782 if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
4783 == BITS9(1,1,1, 0,0,0,0,0, 0)) {
4784 UInt szLg2 = INSN(31,30);
4785 UInt szB = 1 << szLg2;
4786 Bool isLoad = INSN(22,22) == 1;
4787 UInt imm9 = INSN(20,12);
4788 UInt nn = INSN(9,5);
4789 UInt tt = INSN(4,0);
4790 Bool wBack = INSN(10,10) == 1;
4791 UInt how = INSN(11,10);
4792 if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
4793 /* undecodable; fall through */
4794 } else {
4795 if (nn == 31) { /* FIXME generate stack alignment check */ }
4797 // Compute the transfer address TA and the writeback address WA.
4798 IRTemp tRN = newTemp(Ity_I64);
4799 assign(tRN, getIReg64orSP(nn));
4800 IRTemp tEA = newTemp(Ity_I64);
4801 Long simm9 = (Long)sx_to_64(imm9, 9);
4802 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
4804 IRTemp tTA = newTemp(Ity_I64);
4805 IRTemp tWA = newTemp(Ity_I64);
4806 switch (how) {
4807 case BITS2(0,1):
4808 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4809 case BITS2(1,1):
4810 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4811 case BITS2(0,0):
4812 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4813 default:
4814 vassert(0); /* NOTREACHED */
4817 /* Normally rN would be updated after the transfer. However, in
4818 the special cases typified by
4819 str x30, [sp,#-16]!
4820 str w1, [sp,#-32]!
4821 it is necessary to update SP before the transfer, (1)
4822 because Memcheck will otherwise complain about a write
4823 below the stack pointer, and (2) because the segfault
4824 stack extension mechanism will otherwise extend the stack
4825 only down to SP before the instruction, which might not be
4826 far enough, if the -16/-32 offset takes the actual access
4827 address to the next page.
4829 Bool earlyWBack
4830 = wBack && simm9 < 0 && (szB == 8 || szB == 4)
4831 && how == BITS2(1,1) && nn == 31 && !isLoad;
4833 if (wBack && earlyWBack)
4834 putIReg64orSP(nn, mkexpr(tEA));
4836 if (isLoad) {
4837 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
4838 } else {
4839 gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
4842 if (wBack && !earlyWBack)
4843 putIReg64orSP(nn, mkexpr(tEA));
4845 const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
4846 const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
4847 const HChar* fmt_str = NULL;
4848 switch (how) {
4849 case BITS2(0,1):
4850 fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
4851 break;
4852 case BITS2(1,1):
4853 fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
4854 break;
4855 case BITS2(0,0):
4856 fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
4857 break;
4858 default:
4859 vassert(0);
4861 DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
4862 nameIRegOrZR(szB == 8, tt),
4863 nameIReg64orSP(nn), simm9);
4864 return True;
4868 /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
4869 /* L==1 => mm==LD
4870 L==0 => mm==ST
4871 x==0 => 32 bit transfers, and zero extended loads
4872 x==1 => 64 bit transfers
4873 simm7 is scaled by the (single-register) transfer size
4875 (at-Rn-then-Rn=EA)
4876 x0 101 0001 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP], #imm
4878 (at-EA-then-Rn=EA)
4879 x0 101 0011 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]!
4881 (at-EA)
4882 x0 101 0010 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]
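      A worked example (editorial illustration): stp x29, x30, [sp, #-112]!
      has x == 1, so each register is 8 bytes and the encoded imm7 is -14;
      the decoder below rescales it to simm7 = 8 * -14 = -112.  Because the
      offset is negative, the form is pre-indexed, Rn is SP and it is a
      store, this case takes the early-writeback path described further
      down.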
4884 UInt insn_30_23 = INSN(30,23);
4885 if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1)
4886 || insn_30_23 == BITS8(0,1,0,1,0,0,1,1)
4887 || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) {
4888 UInt bL = INSN(22,22);
4889 UInt bX = INSN(31,31);
4890 UInt bWBack = INSN(23,23);
4891 UInt rT1 = INSN(4,0);
4892 UInt rN = INSN(9,5);
4893 UInt rT2 = INSN(14,10);
4894 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
4895 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
4896 || (bL && rT1 == rT2)) {
4897 /* undecodable; fall through */
4898 } else {
4899 if (rN == 31) { /* FIXME generate stack alignment check */ }
4901 // Compute the transfer address TA and the writeback address WA.
4902 IRTemp tRN = newTemp(Ity_I64);
4903 assign(tRN, getIReg64orSP(rN));
4904 IRTemp tEA = newTemp(Ity_I64);
4905 simm7 = (bX ? 8 : 4) * simm7;
4906 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
4908 IRTemp tTA = newTemp(Ity_I64);
4909 IRTemp tWA = newTemp(Ity_I64);
4910 switch (INSN(24,23)) {
4911 case BITS2(0,1):
4912 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4913 case BITS2(1,1):
4914 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4915 case BITS2(1,0):
4916 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4917 default:
4918 vassert(0); /* NOTREACHED */
4921 /* Normally rN would be updated after the transfer. However, in
4922 the special case typified by
4923 stp x29, x30, [sp,#-112]!
4924 it is necessary to update SP before the transfer, (1)
4925 because Memcheck will otherwise complain about a write
4926 below the stack pointer, and (2) because the segfault
4927 stack extension mechanism will otherwise extend the stack
4928 only down to SP before the instruction, which might not be
4929 far enough, if the -112 offset takes the actual access
4930 address to the next page.
4932 Bool earlyWBack
4933 = bWBack && simm7 < 0
4934 && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0;
4936 if (bWBack && earlyWBack)
4937 putIReg64orSP(rN, mkexpr(tEA));
4939 /**/ if (bL == 1 && bX == 1) {
4940 // 64 bit load
4941 putIReg64orZR(rT1, loadLE(Ity_I64,
4942 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
4943 putIReg64orZR(rT2, loadLE(Ity_I64,
4944 binop(Iop_Add64,mkexpr(tTA),mkU64(8))));
4945 } else if (bL == 1 && bX == 0) {
4946 // 32 bit load
4947 putIReg32orZR(rT1, loadLE(Ity_I32,
4948 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
4949 putIReg32orZR(rT2, loadLE(Ity_I32,
4950 binop(Iop_Add64,mkexpr(tTA),mkU64(4))));
4951 } else if (bL == 0 && bX == 1) {
4952 // 64 bit store
4953 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
4954 getIReg64orZR(rT1));
4955 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)),
4956 getIReg64orZR(rT2));
4957 } else {
4958 vassert(bL == 0 && bX == 0);
4959 // 32 bit store
4960 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
4961 getIReg32orZR(rT1));
4962 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)),
4963 getIReg32orZR(rT2));
4966 if (bWBack && !earlyWBack)
4967 putIReg64orSP(rN, mkexpr(tEA));
4969 const HChar* fmt_str = NULL;
4970 switch (INSN(24,23)) {
4971 case BITS2(0,1):
4972 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
4973 break;
4974 case BITS2(1,1):
4975 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
4976 break;
4977 case BITS2(1,0):
4978 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
4979 break;
4980 default:
4981 vassert(0);
4983 DIP(fmt_str, bL == 0 ? "st" : "ld",
4984 nameIRegOrZR(bX == 1, rT1),
4985 nameIRegOrZR(bX == 1, rT2),
4986 nameIReg64orSP(rN), simm7);
4987 return True;
4991 /* -------- LDPSW (immediate, simm7) (INT REGS) -------- */
4992 /* Does 32 bit transfers which are sign extended to 64 bits.
4993 simm7 is scaled by the (single-register) transfer size
4995 (at-Rn-then-Rn=EA)
4996 01 101 0001 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP], #imm
4998 (at-EA-then-Rn=EA)
4999 01 101 0011 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]!
5001 (at-EA)
5002 01 101 0010 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]
5004 UInt insn_31_22 = INSN(31,22);
5005 if (insn_31_22 == BITS10(0,1,1,0,1,0,0,0,1,1)
5006 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,1,1)
5007 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,0,1)) {
5008 UInt bWBack = INSN(23,23);
5009 UInt rT1 = INSN(4,0);
5010 UInt rN = INSN(9,5);
5011 UInt rT2 = INSN(14,10);
5012 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5013 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
5014 || (rT1 == rT2)) {
5015 /* undecodable; fall through */
5016 } else {
5017 if (rN == 31) { /* FIXME generate stack alignment check */ }
5019 // Compute the transfer address TA and the writeback address WA.
5020 IRTemp tRN = newTemp(Ity_I64);
5021 assign(tRN, getIReg64orSP(rN));
5022 IRTemp tEA = newTemp(Ity_I64);
5023 simm7 = 4 * simm7;
5024 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5026 IRTemp tTA = newTemp(Ity_I64);
5027 IRTemp tWA = newTemp(Ity_I64);
5028 switch (INSN(24,23)) {
5029 case BITS2(0,1):
5030 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5031 case BITS2(1,1):
5032 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5033 case BITS2(1,0):
5034 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5035 default:
5036 vassert(0); /* NOTREACHED */
5039 // 32 bit load, sign extended to 64 bits
5040 putIReg64orZR(rT1, unop(Iop_32Sto64,
5041 loadLE(Ity_I32, binop(Iop_Add64,
5042 mkexpr(tTA),
5043 mkU64(0)))));
5044 putIReg64orZR(rT2, unop(Iop_32Sto64,
5045 loadLE(Ity_I32, binop(Iop_Add64,
5046 mkexpr(tTA),
5047 mkU64(4)))));
5048 if (bWBack)
5049 putIReg64orSP(rN, mkexpr(tEA));
5051 const HChar* fmt_str = NULL;
5052 switch (INSN(24,23)) {
5053 case BITS2(0,1):
5054 fmt_str = "ldpsw %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5055 break;
5056 case BITS2(1,1):
5057 fmt_str = "ldpsw %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5058 break;
5059 case BITS2(1,0):
5060 fmt_str = "ldpsw %s, %s, [%s, #%lld] (at-Rn)\n";
5061 break;
5062 default:
5063 vassert(0);
5065 DIP(fmt_str, nameIReg64orZR(rT1),
5066 nameIReg64orZR(rT2),
5067 nameIReg64orSP(rN), simm7);
5068 return True;
5072 /* ---------------- LDR (literal, int reg) ---------------- */
5073 /* 31 29 23 4
5074 00 011 000 imm19 Rt LDR Wt, [PC + sxTo64(imm19 << 2)]
5075 01 011 000 imm19 Rt LDR Xt, [PC + sxTo64(imm19 << 2)]
5076 10 011 000 imm19 Rt LDRSW Xt, [PC + sxTo64(imm19 << 2)]
5077 11 011 000 imm19 Rt prefetch [PC + sxTo64(imm19 << 2)]
5078 Just handles the first two cases for now.
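      A worked example (editorial illustration): if this instruction sits
      at 0x400000 with imm19 == 4, then sx_to_64(imm19 << 2, 21) == 16 and
      the load comes from ea == 0x400010; with imm19 == 0x7FFFF the 21-bit
      value 0x1FFFFC sign-extends to -4, giving ea == 0x3FFFFC.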
5080 if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
5081 UInt imm19 = INSN(23,5);
5082 UInt rT = INSN(4,0);
5083 UInt bX = INSN(30,30);
5084 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5085 if (bX) {
5086 putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea)));
5087 } else {
5088 putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea)));
5090 DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea);
5091 return True;
5094 /* -------------- {LD,ST}R (integer register) --------------- */
5095 /* 31 29 20 15 12 11 9 4
5096 | | | | | | | |
5097 11 111000011 Rm option S 10 Rn Rt LDR Xt, [Xn|SP, R<m>{ext/sh}]
5098 10 111000011 Rm option S 10 Rn Rt LDR Wt, [Xn|SP, R<m>{ext/sh}]
5099 01 111000011 Rm option S 10 Rn Rt LDRH Wt, [Xn|SP, R<m>{ext/sh}]
5100 00 111000011 Rm option S 10 Rn Rt LDRB Wt, [Xn|SP, R<m>{ext/sh}]
5102 11 111000001 Rm option S 10 Rn Rt STR Xt, [Xn|SP, R<m>{ext/sh}]
5103 10 111000001 Rm option S 10 Rn Rt STR Wt, [Xn|SP, R<m>{ext/sh}]
5104 01 111000001 Rm option S 10 Rn Rt STRH Wt, [Xn|SP, R<m>{ext/sh}]
5105 00 111000001 Rm option S 10 Rn Rt STRB Wt, [Xn|SP, R<m>{ext/sh}]
5107 if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
5108 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5109 HChar dis_buf[64];
5110 UInt szLg2 = INSN(31,30);
5111 Bool isLD = INSN(22,22) == 1;
5112 UInt tt = INSN(4,0);
5113 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5114 if (ea != IRTemp_INVALID) {
5115 switch (szLg2) {
5116 case 3: /* 64 bit */
5117 if (isLD) {
5118 putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea)));
5119 DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf);
5120 } else {
5121 storeLE(mkexpr(ea), getIReg64orZR(tt));
5122 DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf);
5124 break;
5125 case 2: /* 32 bit */
5126 if (isLD) {
5127 putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea)));
5128 DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf);
5129 } else {
5130 storeLE(mkexpr(ea), getIReg32orZR(tt));
5131 DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf);
5133 break;
5134 case 1: /* 16 bit */
5135 if (isLD) {
5136 putIReg64orZR(tt, unop(Iop_16Uto64,
5137 loadLE(Ity_I16, mkexpr(ea))));
5138 DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5139 } else {
5140 storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt)));
5141 DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5143 break;
5144 case 0: /* 8 bit */
5145 if (isLD) {
5146 putIReg64orZR(tt, unop(Iop_8Uto64,
5147 loadLE(Ity_I8, mkexpr(ea))));
5148 DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf);
5149 } else {
5150 storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt)));
5151 DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5153 break;
5154 default:
5155 vassert(0);
5157 return True;
5161 /* -------------- LDRS{B,H,W} (uimm12) -------------- */
5162 /* 31 29 26 23 21 9 4
5163 10 111 001 10 imm12 n t LDRSW Xt, [Xn|SP, #pimm12 * 4]
5164 01 111 001 1x imm12 n t LDRSH Rt, [Xn|SP, #pimm12 * 2]
5165 00 111 001 1x imm12 n t LDRSB Rt, [Xn|SP, #pimm12 * 1]
5166 where
5167 Rt is Wt when x==1, Xt when x==0
5169 if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
5170 /* Further checks on bits 31:30 and 22 */
5171 Bool valid = False;
5172 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5173 case BITS3(1,0,0):
5174 case BITS3(0,1,0): case BITS3(0,1,1):
5175 case BITS3(0,0,0): case BITS3(0,0,1):
5176 valid = True;
5177 break;
5179 if (valid) {
5180 UInt szLg2 = INSN(31,30);
5181 UInt bitX = INSN(22,22);
5182 UInt imm12 = INSN(21,10);
5183 UInt nn = INSN(9,5);
5184 UInt tt = INSN(4,0);
5185 UInt szB = 1 << szLg2;
5186 IRExpr* ea = binop(Iop_Add64,
5187 getIReg64orSP(nn), mkU64(imm12 * szB));
5188 switch (szB) {
5189 case 4:
5190 vassert(bitX == 0);
5191 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea)));
5192 DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt),
5193 nameIReg64orSP(nn), imm12 * szB);
5194 break;
5195 case 2:
5196 if (bitX == 1) {
5197 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea)));
5198 } else {
5199 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea)));
5201 DIP("ldrsh %s, [%s, #%u]\n",
5202 nameIRegOrZR(bitX == 0, tt),
5203 nameIReg64orSP(nn), imm12 * szB);
5204 break;
5205 case 1:
5206 if (bitX == 1) {
5207 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea)));
5208 } else {
5209 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea)));
5211 DIP("ldrsb %s, [%s, #%u]\n",
5212 nameIRegOrZR(bitX == 0, tt),
5213 nameIReg64orSP(nn), imm12 * szB);
5214 break;
5215 default:
5216 vassert(0);
5218 return True;
5220 /* else fall through */
5223 /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
5224 /* (at-Rn-then-Rn=EA)
5225 31 29 23 21 20 11 9 4
5226 00 111 000 1x 0 imm9 01 n t LDRSB Rt, [Xn|SP], #simm9
5227 01 111 000 1x 0 imm9 01 n t LDRSH Rt, [Xn|SP], #simm9
5228 10 111 000 10 0 imm9 01 n t LDRSW Xt, [Xn|SP], #simm9
5230 (at-EA-then-Rn=EA)
5231 00 111 000 1x 0 imm9 11 n t LDRSB Rt, [Xn|SP, #simm9]!
5232 01 111 000 1x 0 imm9 11 n t LDRSH Rt, [Xn|SP, #simm9]!
5233 10 111 000 10 0 imm9 11 n t LDRSW Xt, [Xn|SP, #simm9]!
5234 where
5235 Rt is Wt when x==1, Xt when x==0
5236 transfer-at-Rn when [11]==0, at EA when [11]==1
5238 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5239 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5240 /* Further checks on bits 31:30 and 22 */
5241 Bool valid = False;
5242 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5243 case BITS3(1,0,0): // LDRSW Xt
5244 case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
5245 case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
5246 valid = True;
5247 break;
5249 if (valid) {
5250 UInt szLg2 = INSN(31,30);
5251 UInt imm9 = INSN(20,12);
5252 Bool atRN = INSN(11,11) == 0;
5253 UInt nn = INSN(9,5);
5254 UInt tt = INSN(4,0);
5255 IRTemp tRN = newTemp(Ity_I64);
5256 IRTemp tEA = newTemp(Ity_I64);
5257 IRTemp tTA = IRTemp_INVALID;
5258 ULong simm9 = sx_to_64(imm9, 9);
5259 Bool is64 = INSN(22,22) == 0;
5260 assign(tRN, getIReg64orSP(nn));
5261 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5262 tTA = atRN ? tRN : tEA;
5263 HChar ch = '?';
5264 /* There are 5 cases:
5265 byte load, SX to 64
5266 byte load, SX to 32, ZX to 64
5267 halfword load, SX to 64
5268 halfword load, SX to 32, ZX to 64
5269 word load, SX to 64
5270 The ifs below handle them in the listed order.
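         A worked example (editorial illustration): sign-extending the
         byte 0x80, LDRSB Wt yields Wt == 0xFFFFFF80, and the 32-bit write
         zero-extends that to Xt == 0x00000000FFFFFF80, whereas LDRSB Xt
         yields Xt == 0xFFFFFFFFFFFFFF80.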
5272 if (szLg2 == 0) {
5273 ch = 'b';
5274 if (is64) {
5275 putIReg64orZR(tt, unop(Iop_8Sto64,
5276 loadLE(Ity_I8, mkexpr(tTA))));
5277 } else {
5278 putIReg32orZR(tt, unop(Iop_8Sto32,
5279 loadLE(Ity_I8, mkexpr(tTA))));
5282 else if (szLg2 == 1) {
5283 ch = 'h';
5284 if (is64) {
5285 putIReg64orZR(tt, unop(Iop_16Sto64,
5286 loadLE(Ity_I16, mkexpr(tTA))));
5287 } else {
5288 putIReg32orZR(tt, unop(Iop_16Sto32,
5289 loadLE(Ity_I16, mkexpr(tTA))));
5292 else if (szLg2 == 2 && is64) {
5293 ch = 'w';
5294 putIReg64orZR(tt, unop(Iop_32Sto64,
5295 loadLE(Ity_I32, mkexpr(tTA))));
5297 else {
5298 vassert(0);
5300 putIReg64orSP(nn, mkexpr(tEA));
5301 DIP(atRN ? "ldrs%c %s, [%s], #%llu\n" : "ldrs%c %s, [%s, #%llu]!\n",
5302 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), simm9);
5303 return True;
5305 /* else fall through */
5308 /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */
5309 /* 31 29 23 21 20 11 9 4
5310 00 111 000 1x 0 imm9 00 n t LDURSB Rt, [Xn|SP, #simm9]
5311 01 111 000 1x 0 imm9 00 n t LDURSH Rt, [Xn|SP, #simm9]
5312 10 111 000 10 0 imm9 00 n t LDURSW Xt, [Xn|SP, #simm9]
5313 where
5314 Rt is Wt when x==1, Xt when x==0
5316 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5317 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5318 /* Further checks on bits 31:30 and 22 */
5319 Bool valid = False;
5320 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5321 case BITS3(1,0,0): // LDURSW Xt
5322 case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
5323 case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
5324 valid = True;
5325 break;
5327 if (valid) {
5328 UInt szLg2 = INSN(31,30);
5329 UInt imm9 = INSN(20,12);
5330 UInt nn = INSN(9,5);
5331 UInt tt = INSN(4,0);
5332 IRTemp tRN = newTemp(Ity_I64);
5333 IRTemp tEA = newTemp(Ity_I64);
5334 ULong simm9 = sx_to_64(imm9, 9);
5335 Bool is64 = INSN(22,22) == 0;
5336 assign(tRN, getIReg64orSP(nn));
5337 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5338 HChar ch = '?';
5339 /* There are 5 cases:
5340 byte load, SX to 64
5341 byte load, SX to 32, ZX to 64
5342 halfword load, SX to 64
5343 halfword load, SX to 32, ZX to 64
5344 word load, SX to 64
5345 The ifs below handle them in the listed order.
5347 if (szLg2 == 0) {
5348 ch = 'b';
5349 if (is64) {
5350 putIReg64orZR(tt, unop(Iop_8Sto64,
5351 loadLE(Ity_I8, mkexpr(tEA))));
5352 } else {
5353 putIReg32orZR(tt, unop(Iop_8Sto32,
5354 loadLE(Ity_I8, mkexpr(tEA))));
5357 else if (szLg2 == 1) {
5358 ch = 'h';
5359 if (is64) {
5360 putIReg64orZR(tt, unop(Iop_16Sto64,
5361 loadLE(Ity_I16, mkexpr(tEA))));
5362 } else {
5363 putIReg32orZR(tt, unop(Iop_16Sto32,
5364 loadLE(Ity_I16, mkexpr(tEA))));
5367 else if (szLg2 == 2 && is64) {
5368 ch = 'w';
5369 putIReg64orZR(tt, unop(Iop_32Sto64,
5370 loadLE(Ity_I32, mkexpr(tEA))));
5372 else {
5373 vassert(0);
5375 DIP("ldurs%c %s, [%s, #%lld]\n",
5376 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
5377 return True;
5379 /* else fall through */
5382 /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
5383 /* L==1 => mm==LD
5384 L==0 => mm==ST
5385 sz==00 => 32 bit (S) transfers
5386 sz==01 => 64 bit (D) transfers
5387 sz==10 => 128 bit (Q) transfers
5388 sz==11 isn't allowed
5389 simm7 is scaled by the (single-register) transfer size
5391 31 29 26 22 21 14 9 4
5393 sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm]
5394 (at-EA, with nontemporal hint)
5396 sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
5397 (at-Rn-then-Rn=EA)
5399 sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]
5400 (at-EA)
5402 sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
5403 (at-EA-then-Rn=EA)
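      A worked example (editorial illustration): stp q0, q1, [sp, #-512]!
      has sz == 10, so each register is szB = 4 << 2 = 16 bytes; the
      encoded imm7 is -32 and the decoder rescales it to
      simm7 = 16 * -32 = -512.  The two Q registers are then transferred
      at [EA] and [EA + 16], with SP updated before the stores as
      explained in the comment below.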
5405 if (INSN(29,25) == BITS5(1,0,1,1,0)) {
5406 UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
5407 Bool isLD = INSN(22,22) == 1;
5408 Bool wBack = INSN(23,23) == 1;
5409 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5410 UInt tt2 = INSN(14,10);
5411 UInt nn = INSN(9,5);
5412 UInt tt1 = INSN(4,0);
5413 if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) {
5414 /* undecodable; fall through */
5415 } else {
5416 if (nn == 31) { /* FIXME generate stack alignment check */ }
5418 // Compute the transfer address TA and the writeback address WA.
5419 UInt szB = 4 << szSlg2; /* szB is the per-register size */
5420 IRTemp tRN = newTemp(Ity_I64);
5421 assign(tRN, getIReg64orSP(nn));
5422 IRTemp tEA = newTemp(Ity_I64);
5423 simm7 = szB * simm7;
5424 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5426 IRTemp tTA = newTemp(Ity_I64);
5427 IRTemp tWA = newTemp(Ity_I64);
5428 switch (INSN(24,23)) {
5429 case BITS2(0,1):
5430 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5431 case BITS2(1,1):
5432 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5433 case BITS2(1,0):
5434 case BITS2(0,0):
5435 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5436 default:
5437 vassert(0); /* NOTREACHED */
5440 IRType ty = Ity_INVALID;
5441 switch (szB) {
5442 case 4: ty = Ity_F32; break;
5443 case 8: ty = Ity_F64; break;
5444 case 16: ty = Ity_V128; break;
5445 default: vassert(0);
5448 /* Normally rN would be updated after the transfer. However, in
5449 the special cases typified by
5450 stp q0, q1, [sp,#-512]!
5451 stp d0, d1, [sp,#-512]!
5452 stp s0, s1, [sp,#-512]!
5453 it is necessary to update SP before the transfer, (1)
5454 because Memcheck will otherwise complain about a write
5455 below the stack pointer, and (2) because the segfault
5456 stack extension mechanism will otherwise extend the stack
5457 only down to SP before the instruction, which might not be
5458 far enough, if the -512 offset takes the actual access
5459 address to the next page.
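Concretely (illustrative): for "stp q0, q1, [sp, #-512]!" with SP
initially 0x1000, the two stores go to 0xE00 and 0xE10. Updating SP
to 0xE00 before the stores keeps both accesses at or above the new SP,
so Memcheck and the stack-extension logic accept them.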
5461 Bool earlyWBack
5462 = wBack && simm7 < 0
5463 && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD;
5465 if (wBack && earlyWBack)
5466 putIReg64orSP(nn, mkexpr(tEA));
5468 if (isLD) {
5469 if (szB < 16) {
5470 putQReg128(tt1, mkV128(0x0000));
5472 putQRegLO(tt1,
5473 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0))));
5474 if (szB < 16) {
5475 putQReg128(tt2, mkV128(0x0000));
5477 putQRegLO(tt2,
5478 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB))));
5479 } else {
5480 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)),
5481 getQRegLO(tt1, ty));
5482 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)),
5483 getQRegLO(tt2, ty));
5486 if (wBack && !earlyWBack)
5487 putIReg64orSP(nn, mkexpr(tEA));
5489 const HChar* fmt_str = NULL;
5490 switch (INSN(24,23)) {
5491 case BITS2(0,1):
5492 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5493 break;
5494 case BITS2(1,1):
5495 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5496 break;
5497 case BITS2(1,0):
5498 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5499 break;
5500 case BITS2(0,0):
5501 fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
5502 break;
5503 default:
5504 vassert(0);
5506 DIP(fmt_str, isLD ? "ld" : "st",
5507 nameQRegLO(tt1, ty), nameQRegLO(tt2, ty),
5508 nameIReg64orSP(nn), simm7);
5509 return True;
5513 /* -------------- {LD,ST}R (vector register) --------------- */
5514 /* 31 29 23 20 15 12 11 9 4
5515 | | | | | | | | |
5516 00 111100 011 Rm option S 10 Rn Rt LDR Bt, [Xn|SP, R<m>{ext/sh}]
5517 01 111100 011 Rm option S 10 Rn Rt LDR Ht, [Xn|SP, R<m>{ext/sh}]
5518 10 111100 011 Rm option S 10 Rn Rt LDR St, [Xn|SP, R<m>{ext/sh}]
5519 11 111100 011 Rm option S 10 Rn Rt LDR Dt, [Xn|SP, R<m>{ext/sh}]
5520 00 111100 111 Rm option S 10 Rn Rt LDR Qt, [Xn|SP, R<m>{ext/sh}]
5522 00 111100 001 Rm option S 10 Rn Rt STR Bt, [Xn|SP, R<m>{ext/sh}]
5523 01 111100 001 Rm option S 10 Rn Rt STR Ht, [Xn|SP, R<m>{ext/sh}]
5524 10 111100 001 Rm option S 10 Rn Rt STR St, [Xn|SP, R<m>{ext/sh}]
5525 11 111100 001 Rm option S 10 Rn Rt STR Dt, [Xn|SP, R<m>{ext/sh}]
5526 00 111100 101 Rm option S 10 Rn Rt STR Qt, [Xn|SP, R<m>{ext/sh}]
5528 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5529 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5530 HChar dis_buf[64];
5531 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5532 Bool isLD = INSN(22,22) == 1;
5533 UInt tt = INSN(4,0);
5534 if (szLg2 > 4) goto after_LDR_STR_vector_register;
5535 IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
5536 if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
5537 switch (szLg2) {
5538 case 0: /* 8 bit */
5539 if (isLD) {
5540 putQReg128(tt, mkV128(0x0000));
5541 putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea)));
5542 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5543 } else {
5544 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8));
5545 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5547 break;
5548 case 1:
5549 if (isLD) {
5550 putQReg128(tt, mkV128(0x0000));
5551 putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea)));
5552 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5553 } else {
5554 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16));
5555 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5557 break;
5558 case 2: /* 32 bit */
5559 if (isLD) {
5560 putQReg128(tt, mkV128(0x0000));
5561 putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea)));
5562 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5563 } else {
5564 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32));
5565 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5567 break;
5568 case 3: /* 64 bit */
5569 if (isLD) {
5570 putQReg128(tt, mkV128(0x0000));
5571 putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea)));
5572 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5573 } else {
5574 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64));
5575 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5577 break;
5578 case 4:
5579 if (isLD) {
5580 putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
5581 DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
5582 } else {
5583 storeLE(mkexpr(ea), getQReg128(tt));
5584 DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
5586 break;
5587 default:
5588 vassert(0);
5590 return True;
5592 after_LDR_STR_vector_register:
5594 /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
5595 /* 31 29 22 20 15 12 11 9 4
5596 | | | | | | | | |
5597 10 1110001 01 Rm opt S 10 Rn Rt LDRSW Xt, [Xn|SP, R<m>{ext/sh}]
5599 01 1110001 01 Rm opt S 10 Rn Rt LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
5600 01 1110001 11 Rm opt S 10 Rn Rt LDRSH Wt, [Xn|SP, R<m>{ext/sh}]
5602 00 1110001 01 Rm opt S 10 Rn Rt LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
5603 00 1110001 11 Rm opt S 10 Rn Rt LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
5605 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5606 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5607 HChar dis_buf[64];
5608 UInt szLg2 = INSN(31,30);
5609 Bool sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64
5610 UInt tt = INSN(4,0);
5611 if (szLg2 == 3) goto after_LDRS_integer_register;
5612 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5613 if (ea == IRTemp_INVALID) goto after_LDRS_integer_register;
5614 /* Enumerate the 5 variants explicitly. */
5615 if (szLg2 == 2/*32 bit*/ && sxTo64) {
5616 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea))));
5617 DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf);
5618 return True;
5620 else
5621 if (szLg2 == 1/*16 bit*/) {
5622 if (sxTo64) {
5623 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea))));
5624 DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf);
5625 } else {
5626 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea))));
5627 DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5629 return True;
5631 else
5632 if (szLg2 == 0/*8 bit*/) {
5633 if (sxTo64) {
5634 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea))));
5635 DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf);
5636 } else {
5637 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea))));
5638 DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5640 return True;
5642 /* else it's an invalid combination */
5644 after_LDRS_integer_register:
5646 /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
5647 /* This is the Unsigned offset variant only. The Post-Index and
5648 Pre-Index variants are below.
5650 31 29 23 21 9 4
5651 00 111 101 01 imm12 n t LDR Bt, [Xn|SP + imm12 * 1]
5652 01 111 101 01 imm12 n t LDR Ht, [Xn|SP + imm12 * 2]
5653 10 111 101 01 imm12 n t LDR St, [Xn|SP + imm12 * 4]
5654 11 111 101 01 imm12 n t LDR Dt, [Xn|SP + imm12 * 8]
5655 00 111 101 11 imm12 n t LDR Qt, [Xn|SP + imm12 * 16]
5657 00 111 101 00 imm12 n t STR Bt, [Xn|SP + imm12 * 1]
5658 01 111 101 00 imm12 n t STR Ht, [Xn|SP + imm12 * 2]
5659 10 111 101 00 imm12 n t STR St, [Xn|SP + imm12 * 4]
5660 11 111 101 00 imm12 n t STR Dt, [Xn|SP + imm12 * 8]
5661 00 111 101 10 imm12 n t STR Qt, [Xn|SP + imm12 * 16]
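For instance, "ldr q2, [x0, #48]" has szLg2 == 4, so the encoded imm12
is 3 and the code below forms pimm12 = 3 << 4 = 48; B/H/S/D accesses
scale the same field by 1, 2, 4 or 8 respectively.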
5663 if (INSN(29,24) == BITS6(1,1,1,1,0,1)
5664 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
5665 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5666 Bool isLD = INSN(22,22) == 1;
5667 UInt pimm12 = INSN(21,10) << szLg2;
5668 UInt nn = INSN(9,5);
5669 UInt tt = INSN(4,0);
5670 IRTemp tEA = newTemp(Ity_I64);
5671 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5672 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12)));
5673 if (isLD) {
5674 if (szLg2 < 4) {
5675 putQReg128(tt, mkV128(0x0000));
5677 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5678 } else {
5679 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5681 DIP("%s %s, [%s, #%u]\n",
5682 isLD ? "ldr" : "str",
5683 nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12);
5684 return True;
5687 /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
5688 /* These are the Post-Index and Pre-Index variants.
5690 31 29 23 20 11 9 4
5691 (at-Rn-then-Rn=EA)
5692 00 111 100 01 0 imm9 01 n t LDR Bt, [Xn|SP], #simm
5693 01 111 100 01 0 imm9 01 n t LDR Ht, [Xn|SP], #simm
5694 10 111 100 01 0 imm9 01 n t LDR St, [Xn|SP], #simm
5695 11 111 100 01 0 imm9 01 n t LDR Dt, [Xn|SP], #simm
5696 00 111 100 11 0 imm9 01 n t LDR Qt, [Xn|SP], #simm
5698 (at-EA-then-Rn=EA)
5699 00 111 100 01 0 imm9 11 n t LDR Bt, [Xn|SP, #simm]!
5700 01 111 100 01 0 imm9 11 n t LDR Ht, [Xn|SP, #simm]!
5701 10 111 100 01 0 imm9 11 n t LDR St, [Xn|SP, #simm]!
5702 11 111 100 01 0 imm9 11 n t LDR Dt, [Xn|SP, #simm]!
5703 00 111 100 11 0 imm9 11 n t LDR Qt, [Xn|SP, #simm]!
5705 Stores are the same except with bit 22 set to 0.
5707 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5708 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5709 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5710 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5711 Bool isLD = INSN(22,22) == 1;
5712 UInt imm9 = INSN(20,12);
5713 Bool atRN = INSN(11,11) == 0;
5714 UInt nn = INSN(9,5);
5715 UInt tt = INSN(4,0);
5716 IRTemp tRN = newTemp(Ity_I64);
5717 IRTemp tEA = newTemp(Ity_I64);
5718 IRTemp tTA = IRTemp_INVALID;
5719 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5720 ULong simm9 = sx_to_64(imm9, 9);
5721 assign(tRN, getIReg64orSP(nn));
5722 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5723 tTA = atRN ? tRN : tEA;
5725 /* Do early writeback for the cases typified by
5726 str d8, [sp, #-32]!
5727 str d10, [sp, #-128]!
5728 str q1, [sp, #-32]!
5729 for the same reasons as described in a similar comment in the
5730 "LDP,STP (immediate, simm7) (FP&VEC)" case just above.
5732 Bool earlyWBack
5733 = !atRN && !isLD && (ty == Ity_F64 || ty == Ity_V128)
5734 && nn == 31 && ((Long)simm9) < 0;
5736 if (earlyWBack)
5737 putIReg64orSP(nn, mkexpr(tEA));
5739 if (isLD) {
5740 if (szLg2 < 4) {
5741 putQReg128(tt, mkV128(0x0000));
5743 putQRegLO(tt, loadLE(ty, mkexpr(tTA)));
5744 } else {
5745 storeLE(mkexpr(tTA), getQRegLO(tt, ty));
5748 if (!earlyWBack)
5749 putIReg64orSP(nn, mkexpr(tEA));
5751 DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
5752 isLD ? "ldr" : "str",
5753 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5754 return True;
5757 /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
5758 /* 31 29 23 20 11 9 4
5759 00 111 100 01 0 imm9 00 n t LDR Bt, [Xn|SP, #simm]
5760 01 111 100 01 0 imm9 00 n t LDR Ht, [Xn|SP, #simm]
5761 10 111 100 01 0 imm9 00 n t LDR St, [Xn|SP, #simm]
5762 11 111 100 01 0 imm9 00 n t LDR Dt, [Xn|SP, #simm]
5763 00 111 100 11 0 imm9 00 n t LDR Qt, [Xn|SP, #simm]
5765 00 111 100 00 0 imm9 00 n t STR Bt, [Xn|SP, #simm]
5766 01 111 100 00 0 imm9 00 n t STR Ht, [Xn|SP, #simm]
5767 10 111 100 00 0 imm9 00 n t STR St, [Xn|SP, #simm]
5768 11 111 100 00 0 imm9 00 n t STR Dt, [Xn|SP, #simm]
5769 00 111 100 10 0 imm9 00 n t STR Qt, [Xn|SP, #simm]
5771 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5772 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5773 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5774 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5775 Bool isLD = INSN(22,22) == 1;
5776 UInt imm9 = INSN(20,12);
5777 UInt nn = INSN(9,5);
5778 UInt tt = INSN(4,0);
5779 ULong simm9 = sx_to_64(imm9, 9);
5780 IRTemp tEA = newTemp(Ity_I64);
5781 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5782 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
5783 if (isLD) {
5784 if (szLg2 < 4) {
5785 putQReg128(tt, mkV128(0x0000));
5787 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5788 } else {
5789 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5791 DIP("%s %s, [%s, #%lld]\n",
5792 isLD ? "ldur" : "stur",
5793 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5794 return True;
5797 /* ---------------- LDR (literal, SIMD&FP) ---------------- */
5798 /* 31 29 23 4
5799 00 011 100 imm19 t LDR St, [PC + sxTo64(imm19 << 2)]
5800 01 011 100 imm19 t LDR Dt, [PC + sxTo64(imm19 << 2)]
5801 10 011 100 imm19 t LDR Qt, [PC + sxTo64(imm19 << 2)]
5803 if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
5804 UInt szB = 4 << INSN(31,30);
5805 UInt imm19 = INSN(23,5);
5806 UInt tt = INSN(4,0);
5807 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5808 IRType ty = preferredVectorSubTypeFromSize(szB);
5809 putQReg128(tt, mkV128(0x0000));
5810 putQRegLO(tt, loadLE(ty, mkU64(ea)));
5811 DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
5812 return True;
5815 /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg) ------ */
5816 /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs) ------ */
5817 /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs) ------ */
5818 /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs) ------ */
5819 /* 31 29 26 22 21 20 15 11 9 4
5821 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
5822 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
5824 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
5825 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
5827 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
5828 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
5830 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
5831 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
5833 T = defined by Q and sz in the normal way
5834 step = if m == 11111 then transfer-size else Xm
5835 xx = case L of 1 -> LD ; 0 -> ST
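Illustrative example: "ld2 {v4.8h, v5.8h}, [x0], #32" has q == 1,
sz == 01, opc == 1000 (so nRegs == 2) and m == 11111. The two 16-byte
loads below are de-interleaved so that even-numbered halfwords from
memory end up in v4 and odd-numbered ones in v5, and the post-index
writeback steps Xn by xferSzB == 32.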
5837 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
5838 && INSN(21,21) == 0) {
5839 Bool bitQ = INSN(30,30);
5840 Bool isPX = INSN(23,23) == 1;
5841 Bool isLD = INSN(22,22) == 1;
5842 UInt mm = INSN(20,16);
5843 UInt opc = INSN(15,12);
5844 UInt sz = INSN(11,10);
5845 UInt nn = INSN(9,5);
5846 UInt tt = INSN(4,0);
5847 Bool isQ = bitQ == 1;
5848 Bool is1d = sz == BITS2(1,1) && !isQ;
5849 UInt nRegs = 0;
5850 switch (opc) {
5851 case BITS4(0,0,0,0): nRegs = 4; break;
5852 case BITS4(0,1,0,0): nRegs = 3; break;
5853 case BITS4(1,0,0,0): nRegs = 2; break;
5854 case BITS4(0,1,1,1): nRegs = 1; break;
5855 default: break;
5858 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
5859 If we see it, set nRegs to 0 so as to cause the next conditional
5860 to fail. */
5861 if (!isPX && mm != 0)
5862 nRegs = 0;
5864 if (nRegs == 1 /* .1d is allowed */
5865 || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
5867 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
5869 /* Generate the transfer address (TA) and if necessary the
5870 writeback address (WB) */
5871 IRTemp tTA = newTemp(Ity_I64);
5872 assign(tTA, getIReg64orSP(nn));
5873 if (nn == 31) { /* FIXME generate stack alignment check */ }
5874 IRTemp tWB = IRTemp_INVALID;
5875 if (isPX) {
5876 tWB = newTemp(Ity_I64);
5877 assign(tWB, binop(Iop_Add64,
5878 mkexpr(tTA),
5879 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
5880 : getIReg64orZR(mm)));
5883 /* -- BEGIN generate the transfers -- */
5885 IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
5886 u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
5887 switch (nRegs) {
5888 case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
5889 case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
5890 case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
5891 case 1: u0 = newTempV128(); i0 = newTempV128(); break;
5892 default: vassert(0);
5895 /* -- Multiple 128 or 64 bit stores -- */
5896 if (!isLD) {
5897 switch (nRegs) {
5898 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
5899 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
5900 case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
5901 case 1: assign(u0, getQReg128((tt+0) % 32)); break;
5902 default: vassert(0);
5904 switch (nRegs) {
5905 case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
5906 (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
5907 break;
5908 case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
5909 (&i0, &i1, &i2, sz, u0, u1, u2);
5910 break;
5911 case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
5912 (&i0, &i1, sz, u0, u1);
5913 break;
5914 case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
5915 (&i0, sz, u0);
5916 break;
5917 default: vassert(0);
5919 # define MAYBE_NARROW_TO_64(_expr) \
5920 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
5921 UInt step = isQ ? 16 : 8;
5922 switch (nRegs) {
5923 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
5924 MAYBE_NARROW_TO_64(mkexpr(i3)) );
5925 /* fallthru */
5926 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
5927 MAYBE_NARROW_TO_64(mkexpr(i2)) );
5928 /* fallthru */
5929 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
5930 MAYBE_NARROW_TO_64(mkexpr(i1)) );
5931 /* fallthru */
5932 case 1: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
5933 MAYBE_NARROW_TO_64(mkexpr(i0)) );
5934 break;
5935 default: vassert(0);
5937 # undef MAYBE_NARROW_TO_64
5940 /* -- Multiple 128 or 64 bit loads -- */
5941 else /* isLD */ {
5942 UInt step = isQ ? 16 : 8;
5943 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
5944 # define MAYBE_WIDEN_FROM_64(_expr) \
5945 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
5946 switch (nRegs) {
5947 case 4:
5948 assign(i3, MAYBE_WIDEN_FROM_64(
5949 loadLE(loadTy,
5950 binop(Iop_Add64, mkexpr(tTA),
5951 mkU64(3 * step)))));
5952 /* fallthru */
5953 case 3:
5954 assign(i2, MAYBE_WIDEN_FROM_64(
5955 loadLE(loadTy,
5956 binop(Iop_Add64, mkexpr(tTA),
5957 mkU64(2 * step)))));
5958 /* fallthru */
5959 case 2:
5960 assign(i1, MAYBE_WIDEN_FROM_64(
5961 loadLE(loadTy,
5962 binop(Iop_Add64, mkexpr(tTA),
5963 mkU64(1 * step)))));
5964 /* fallthru */
5965 case 1:
5966 assign(i0, MAYBE_WIDEN_FROM_64(
5967 loadLE(loadTy,
5968 binop(Iop_Add64, mkexpr(tTA),
5969 mkU64(0 * step)))));
5970 break;
5971 default:
5972 vassert(0);
5974 # undef MAYBE_WIDEN_FROM_64
5975 switch (nRegs) {
5976 case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
5977 (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
5978 break;
5979 case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
5980 (&u0, &u1, &u2, sz, i0, i1, i2);
5981 break;
5982 case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
5983 (&u0, &u1, sz, i0, i1);
5984 break;
5985 case 1: (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
5986 (&u0, sz, i0);
5987 break;
5988 default: vassert(0);
5990 switch (nRegs) {
5991 case 4: putQReg128( (tt+3) % 32,
5992 math_MAYBE_ZERO_HI64(bitQ, u3));
5993 /* fallthru */
5994 case 3: putQReg128( (tt+2) % 32,
5995 math_MAYBE_ZERO_HI64(bitQ, u2));
5996 /* fallthru */
5997 case 2: putQReg128( (tt+1) % 32,
5998 math_MAYBE_ZERO_HI64(bitQ, u1));
5999 /* fallthru */
6000 case 1: putQReg128( (tt+0) % 32,
6001 math_MAYBE_ZERO_HI64(bitQ, u0));
6002 break;
6003 default: vassert(0);
6007 /* -- END generate the transfers -- */
6009 /* Do the writeback, if necessary */
6010 if (isPX) {
6011 putIReg64orSP(nn, mkexpr(tWB));
6014 HChar pxStr[20];
6015 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6016 if (isPX) {
6017 if (mm == BITS5(1,1,1,1,1))
6018 vex_sprintf(pxStr, ", #%u", xferSzB);
6019 else
6020 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6022 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6023 DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
6024 isLD ? "ld" : "st", nRegs,
6025 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6026 pxStr);
6028 return True;
6030 /* else fall through */
6033 /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs) ------ */
6034 /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs) ------ */
6035 /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs) ------ */
6036 /* 31 29 26 22 21 20 15 11 9 4
6038 0q 001 1000 L 0 00000 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP]
6039 0q 001 1001 L 0 m 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP], step
6041 0q 001 1000 L 0 00000 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP]
6042 0q 001 1001 L 0 m 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP], step
6044 0q 001 1000 L 0 00000 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP]
6045 0q 001 1001 L 0 m 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP], step
6047 T = defined by Q and sz in the normal way
6048 step = if m == 11111 then transfer-size else Xm
6049 xx = case L of 1 -> LD ; 0 -> ST
6051 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6052 && INSN(21,21) == 0) {
6053 Bool bitQ = INSN(30,30);
6054 Bool isPX = INSN(23,23) == 1;
6055 Bool isLD = INSN(22,22) == 1;
6056 UInt mm = INSN(20,16);
6057 UInt opc = INSN(15,12);
6058 UInt sz = INSN(11,10);
6059 UInt nn = INSN(9,5);
6060 UInt tt = INSN(4,0);
6061 Bool isQ = bitQ == 1;
6062 UInt nRegs = 0;
6063 switch (opc) {
6064 case BITS4(0,0,1,0): nRegs = 4; break;
6065 case BITS4(0,1,1,0): nRegs = 3; break;
6066 case BITS4(1,0,1,0): nRegs = 2; break;
6067 default: break;
6070 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6071 If we see it, set nRegs to 0 so as to cause the next conditional
6072 to fail. */
6073 if (!isPX && mm != 0)
6074 nRegs = 0;
6076 if (nRegs >= 2 && nRegs <= 4) {
6078 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
6080 /* Generate the transfer address (TA) and if necessary the
6081 writeback address (WB) */
6082 IRTemp tTA = newTemp(Ity_I64);
6083 assign(tTA, getIReg64orSP(nn));
6084 if (nn == 31) { /* FIXME generate stack alignment check */ }
6085 IRTemp tWB = IRTemp_INVALID;
6086 if (isPX) {
6087 tWB = newTemp(Ity_I64);
6088 assign(tWB, binop(Iop_Add64,
6089 mkexpr(tTA),
6090 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6091 : getIReg64orZR(mm)));
6094 /* -- BEGIN generate the transfers -- */
6096 IRTemp u0, u1, u2, u3;
6097 u0 = u1 = u2 = u3 = IRTemp_INVALID;
6098 switch (nRegs) {
6099 case 4: u3 = newTempV128(); /* fallthru */
6100 case 3: u2 = newTempV128(); /* fallthru */
6101 case 2: u1 = newTempV128();
6102 u0 = newTempV128(); break;
6103 default: vassert(0);
6106 /* -- Multiple 128 or 64 bit stores -- */
6107 if (!isLD) {
6108 switch (nRegs) {
6109 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
6110 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
6111 case 2: assign(u1, getQReg128((tt+1) % 32));
6112 assign(u0, getQReg128((tt+0) % 32)); break;
6113 default: vassert(0);
6115 # define MAYBE_NARROW_TO_64(_expr) \
6116 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6117 UInt step = isQ ? 16 : 8;
6118 switch (nRegs) {
6119 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
6120 MAYBE_NARROW_TO_64(mkexpr(u3)) );
6121 /* fallthru */
6122 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
6123 MAYBE_NARROW_TO_64(mkexpr(u2)) );
6124 /* fallthru */
6125 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
6126 MAYBE_NARROW_TO_64(mkexpr(u1)) );
6127 storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
6128 MAYBE_NARROW_TO_64(mkexpr(u0)) );
6129 break;
6130 default: vassert(0);
6132 # undef MAYBE_NARROW_TO_64
6135 /* -- Multiple 128 or 64 bit loads -- */
6136 else /* isLD */ {
6137 UInt step = isQ ? 16 : 8;
6138 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
6139 # define MAYBE_WIDEN_FROM_64(_expr) \
6140 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6141 switch (nRegs) {
6142 case 4:
6143 assign(u3, MAYBE_WIDEN_FROM_64(
6144 loadLE(loadTy,
6145 binop(Iop_Add64, mkexpr(tTA),
6146 mkU64(3 * step)))));
6147 /* fallthru */
6148 case 3:
6149 assign(u2, MAYBE_WIDEN_FROM_64(
6150 loadLE(loadTy,
6151 binop(Iop_Add64, mkexpr(tTA),
6152 mkU64(2 * step)))));
6153 /* fallthru */
6154 case 2:
6155 assign(u1, MAYBE_WIDEN_FROM_64(
6156 loadLE(loadTy,
6157 binop(Iop_Add64, mkexpr(tTA),
6158 mkU64(1 * step)))));
6159 assign(u0, MAYBE_WIDEN_FROM_64(
6160 loadLE(loadTy,
6161 binop(Iop_Add64, mkexpr(tTA),
6162 mkU64(0 * step)))));
6163 break;
6164 default:
6165 vassert(0);
6167 # undef MAYBE_WIDEN_FROM_64
6168 switch (nRegs) {
6169 case 4: putQReg128( (tt+3) % 32,
6170 math_MAYBE_ZERO_HI64(bitQ, u3));
6171 /* fallthru */
6172 case 3: putQReg128( (tt+2) % 32,
6173 math_MAYBE_ZERO_HI64(bitQ, u2));
6174 /* fallthru */
6175 case 2: putQReg128( (tt+1) % 32,
6176 math_MAYBE_ZERO_HI64(bitQ, u1));
6177 putQReg128( (tt+0) % 32,
6178 math_MAYBE_ZERO_HI64(bitQ, u0));
6179 break;
6180 default: vassert(0);
6184 /* -- END generate the transfers -- */
6186 /* Do the writeback, if necessary */
6187 if (isPX) {
6188 putIReg64orSP(nn, mkexpr(tWB));
6191 HChar pxStr[20];
6192 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6193 if (isPX) {
6194 if (mm == BITS5(1,1,1,1,1))
6195 vex_sprintf(pxStr, ", #%u", xferSzB);
6196 else
6197 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6199 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6200 DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
6201 isLD ? "ld" : "st",
6202 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6203 pxStr);
6205 return True;
6207 /* else fall through */
6210 /* ---------- LD1R (single structure, replicate) ---------- */
6211 /* ---------- LD2R (single structure, replicate) ---------- */
6212 /* ---------- LD3R (single structure, replicate) ---------- */
6213 /* ---------- LD4R (single structure, replicate) ---------- */
6214 /* 31 29 22 20 15 11 9 4
6215 0q 001 1010 10 00000 110 0 sz n t LD1R {Vt.T}, [Xn|SP]
6216 0q 001 1011 10 m 110 0 sz n t LD1R {Vt.T}, [Xn|SP], step
6218 0q 001 1010 11 00000 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP]
6219 0q 001 1011 11 m 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP], step
6221 0q 001 1010 10 00000 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP]
6222 0q 001 1011 10 m 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP], step
6224 0q 001 1010 11 00000 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP]
6225 0q 001 1011 11 m 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP], step
6227 step = if m == 11111 then transfer-size else Xm
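For example, "ld1r {v0.4s}, [x1]" (q == 1, sz == 10, nRegs == 1) loads
one 32-bit element and the code below duplicates it into all four S
lanes of v0 via math_DUP_TO_V128; with q == 0 only the lower 64 bits
are kept, the upper half being cleared by math_MAYBE_ZERO_HI64.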
6229 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
6230 && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
6231 && INSN(12,12) == 0) {
6232 UInt bitQ = INSN(30,30);
6233 Bool isPX = INSN(23,23) == 1;
6234 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6235 UInt mm = INSN(20,16);
6236 UInt sz = INSN(11,10);
6237 UInt nn = INSN(9,5);
6238 UInt tt = INSN(4,0);
6240 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6241 if (isPX || mm == 0) {
6243 IRType ty = integerIRTypeOfSize(1 << sz);
6245 UInt laneSzB = 1 << sz;
6246 UInt xferSzB = laneSzB * nRegs;
6248 /* Generate the transfer address (TA) and if necessary the
6249 writeback address (WB) */
6250 IRTemp tTA = newTemp(Ity_I64);
6251 assign(tTA, getIReg64orSP(nn));
6252 if (nn == 31) { /* FIXME generate stack alignment check */ }
6253 IRTemp tWB = IRTemp_INVALID;
6254 if (isPX) {
6255 tWB = newTemp(Ity_I64);
6256 assign(tWB, binop(Iop_Add64,
6257 mkexpr(tTA),
6258 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6259 : getIReg64orZR(mm)));
6262 /* Do the writeback, if necessary */
6263 if (isPX) {
6264 putIReg64orSP(nn, mkexpr(tWB));
6267 IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
6268 e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
6269 switch (nRegs) {
6270 case 4:
6271 e3 = newTemp(ty);
6272 assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6273 mkU64(3 * laneSzB))));
6274 v3 = math_DUP_TO_V128(e3, ty);
6275 putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
6276 /* fallthrough */
6277 case 3:
6278 e2 = newTemp(ty);
6279 assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6280 mkU64(2 * laneSzB))));
6281 v2 = math_DUP_TO_V128(e2, ty);
6282 putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
6283 /* fallthrough */
6284 case 2:
6285 e1 = newTemp(ty);
6286 assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6287 mkU64(1 * laneSzB))));
6288 v1 = math_DUP_TO_V128(e1, ty);
6289 putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
6290 /* fallthrough */
6291 case 1:
6292 e0 = newTemp(ty);
6293 assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6294 mkU64(0 * laneSzB))));
6295 v0 = math_DUP_TO_V128(e0, ty);
6296 putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
6297 break;
6298 default:
6299 vassert(0);
6302 HChar pxStr[20];
6303 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6304 if (isPX) {
6305 if (mm == BITS5(1,1,1,1,1))
6306 vex_sprintf(pxStr, ", #%u", xferSzB);
6307 else
6308 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6310 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6311 DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
6312 nRegs,
6313 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6314 pxStr);
6316 return True;
6318 /* else fall through */
6321 /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
6322 /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
6323 /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
6324 /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
6325 /* 31 29 22 21 20 15 11 9 4
6326 0q 001 1010 L 0 00000 xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP]
6327 0q 001 1011 L 0 m xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP], step
6329 0q 001 1010 L 1 00000 xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP]
6330 0q 001 1011 L 1 m xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP], step
6332 0q 001 1010 L 0 00000 xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP]
6333 0q 001 1011 L 0 m xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP], step
6335 0q 001 1010 L 1 00000 xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP]
6336 0q 001 1011 L 1 m xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP], step
6338 step = if m == 11111 then transfer-size else Xm
6339 op = case L of 1 -> LD ; 0 -> ST
6341 laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
6342 01:b:b:b0 -> 2, bbb
6343 10:b:b:00 -> 4, bb
6344 10:b:0:01 -> 8, b
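Decode example: "ld1 {v2.s}[3], [x0]" has xx == 10, sz == 00, q == 1
and S == 1, giving xx_q_S_sz == 0x2C in the switch below, hence
laneSzB == 4 and ix == 3; the load then writes only lane 3 of v2.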
6346 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
6347 UInt bitQ = INSN(30,30);
6348 Bool isPX = INSN(23,23) == 1;
6349 Bool isLD = INSN(22,22) == 1;
6350 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6351 UInt mm = INSN(20,16);
6352 UInt xx = INSN(15,14);
6353 UInt bitS = INSN(12,12);
6354 UInt sz = INSN(11,10);
6355 UInt nn = INSN(9,5);
6356 UInt tt = INSN(4,0);
6358 Bool valid = True;
6360 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6361 if (!isPX && mm != 0)
6362 valid = False;
6364 UInt laneSzB = 0; /* invalid */
6365 UInt ix = 16; /* invalid */
6367 UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
6368 switch (xx_q_S_sz) {
6369 case 0x00: case 0x01: case 0x02: case 0x03:
6370 case 0x04: case 0x05: case 0x06: case 0x07:
6371 case 0x08: case 0x09: case 0x0A: case 0x0B:
6372 case 0x0C: case 0x0D: case 0x0E: case 0x0F:
6373 laneSzB = 1; ix = xx_q_S_sz & 0xF;
6374 break;
6375 case 0x10: case 0x12: case 0x14: case 0x16:
6376 case 0x18: case 0x1A: case 0x1C: case 0x1E:
6377 laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
6378 break;
6379 case 0x20: case 0x24: case 0x28: case 0x2C:
6380 laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
6381 break;
6382 case 0x21: case 0x29:
6383 laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
6384 break;
6385 default:
6386 break;
6389 if (valid && laneSzB != 0) {
6391 IRType ty = integerIRTypeOfSize(laneSzB);
6392 UInt xferSzB = laneSzB * nRegs;
6394 /* Generate the transfer address (TA) and if necessary the
6395 writeback address (WB) */
6396 IRTemp tTA = newTemp(Ity_I64);
6397 assign(tTA, getIReg64orSP(nn));
6398 if (nn == 31) { /* FIXME generate stack alignment check */ }
6399 IRTemp tWB = IRTemp_INVALID;
6400 if (isPX) {
6401 tWB = newTemp(Ity_I64);
6402 assign(tWB, binop(Iop_Add64,
6403 mkexpr(tTA),
6404 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6405 : getIReg64orZR(mm)));
6408 /* Do the writeback, if necessary */
6409 if (isPX) {
6410 putIReg64orSP(nn, mkexpr(tWB));
6413 switch (nRegs) {
6414 case 4: {
6415 IRExpr* addr
6416 = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
6417 if (isLD) {
6418 putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
6419 } else {
6420 storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
6422 /* fallthrough */
6424 case 3: {
6425 IRExpr* addr
6426 = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
6427 if (isLD) {
6428 putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
6429 } else {
6430 storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
6432 /* fallthrough */
6434 case 2: {
6435 IRExpr* addr
6436 = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
6437 if (isLD) {
6438 putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
6439 } else {
6440 storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
6442 /* fallthrough */
6444 case 1: {
6445 IRExpr* addr
6446 = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
6447 if (isLD) {
6448 putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
6449 } else {
6450 storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
6452 break;
6454 default:
6455 vassert(0);
6458 HChar pxStr[20];
6459 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6460 if (isPX) {
6461 if (mm == BITS5(1,1,1,1,1))
6462 vex_sprintf(pxStr, ", #%u", xferSzB);
6463 else
6464 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6466 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6467 DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
6468 isLD ? "ld" : "st", nRegs,
6469 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
6470 ix, nameIReg64orSP(nn), pxStr);
6472 return True;
6474 /* else fall through */
6477 /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
6478 /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
6479 /* 31 29 23 20 14 9 4
6480 sz 001000 010 11111 0 11111 n t LDX{R,RH,RB} Rt, [Xn|SP]
6481 sz 001000 010 11111 1 11111 n t LDAX{R,RH,RB} Rt, [Xn|SP]
6482 sz 001000 000 s 0 11111 n t STX{R,RH,RB} Ws, Rt, [Xn|SP]
6483 sz 001000 000 s 1 11111 n t STLX{R,RH,RB} Ws, Rt, [Xn|SP]
6485 /* For the "standard" implementation we pass through the LL and SC to
6486 the host. For the "fallback" implementation, for details see
6487 https://bugs.kde.org/show_bug.cgi?id=344524 and
6488 https://bugs.kde.org/show_bug.cgi?id=369459,
6489 but in short:
6491 LoadLinked(addr)
6492 gs.LLsize = load_size // 1, 2, 4 or 8
6493 gs.LLaddr = addr
6494 gs.LLdata = zeroExtend(*addr)
6496 StoreCond(addr, data)
6497 tmp_LLsize = gs.LLsize
6498 gs.LLsize = 0 // "no transaction"
6499 if tmp_LLsize != store_size -> fail
6500 if addr != gs.LLaddr -> fail
6501 if zeroExtend(*addr) != gs.LLdata -> fail
6502 cas_ok = CAS(store_size, addr, gs.LLdata -> data)
6503 if !cas_ok -> fail
6504 succeed
6506 When thread scheduled
6507 gs.LLsize = 0 // "no transaction"
6508 (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
6509 has to do this bit)
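Illustrative guest sequence (a typical atomic increment):
   loop: ldxr x0, [x2]      ; gs.LLsize = 8, gs.LLaddr = x2, gs.LLdata = *x2
         add  x0, x0, #1
         stxr w1, x0, [x2]  ; size/addr/data checks, then CAS
         cbnz w1, loop
Under the fallback scheme the STXR reports success (w1 == 0) only if
all the checks and the CAS pass; any intervening change to *x2 makes
the data check or the CAS fail, w1 is set to 1, and the guest retries.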
6511 if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
6512 && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
6513 && INSN(14,10) == BITS5(1,1,1,1,1)) {
6514 UInt szBlg2 = INSN(31,30);
6515 Bool isLD = INSN(22,22) == 1;
6516 Bool isAcqOrRel = INSN(15,15) == 1;
6517 UInt ss = INSN(20,16);
6518 UInt nn = INSN(9,5);
6519 UInt tt = INSN(4,0);
6521 vassert(szBlg2 < 4);
6522 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6523 IRType ty = integerIRTypeOfSize(szB);
6524 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6526 IRTemp ea = newTemp(Ity_I64);
6527 assign(ea, getIReg64orSP(nn));
6528 /* FIXME generate check that ea is szB-aligned */
6530 if (isLD && ss == BITS5(1,1,1,1,1)) {
6531 IRTemp res = newTemp(ty);
6532 if (abiinfo->guest__use_fallback_LLSC) {
6533 // Do the load first so we don't update any guest state
6534 // if it faults.
6535 IRTemp loaded_data64 = newTemp(Ity_I64);
6536 assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
6537 stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
6538 stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6539 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
6540 putIReg64orZR(tt, mkexpr(loaded_data64));
6541 } else {
6542 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
6543 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6545 if (isAcqOrRel) {
6546 stmt(IRStmt_MBE(Imbe_Fence));
6548 DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6549 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6550 abiinfo->guest__use_fallback_LLSC
6551 ? "(fallback implementation)" : "");
6552 return True;
6554 if (!isLD) {
6555 if (isAcqOrRel) {
6556 stmt(IRStmt_MBE(Imbe_Fence));
6558 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6559 if (abiinfo->guest__use_fallback_LLSC) {
6560 // This is really ugly, since we don't have any way to do
6561 // proper if-then-else. First, set up as if the SC failed,
6562 // and jump forwards if it really has failed.
6564 // Continuation address
6565 IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
6567 // "the SC failed". Any non-zero value means failure.
6568 putIReg64orZR(ss, mkU64(1));
6570 IRTemp tmp_LLsize = newTemp(Ity_I64);
6571 assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
6572 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
6574 // Fail if no or wrong-size transaction
6575 vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
6576 stmt( IRStmt_Exit(
6577 binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
6578 Ijk_Boring, nia, OFFB_PC
6580 // Fail if the address doesn't match the LL address
6581 stmt( IRStmt_Exit(
6582 binop(Iop_CmpNE64, mkexpr(ea),
6583 IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
6584 Ijk_Boring, nia, OFFB_PC
6586 // Fail if the data doesn't match the LL data
6587 IRTemp llsc_data64 = newTemp(Ity_I64);
6588 assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
6589 stmt( IRStmt_Exit(
6590 binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
6591 mkexpr(llsc_data64)),
6592 Ijk_Boring, nia, OFFB_PC
6594 // Try to CAS the new value in.
6595 IRTemp old = newTemp(ty);
6596 IRTemp expd = newTemp(ty);
6597 assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
6598 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6599 Iend_LE, mkexpr(ea),
6600 /*expdHi*/NULL, mkexpr(expd),
6601 /*dataHi*/NULL, data
6602 )));
6603 // Fail if the CAS failed (viz, old != expd)
6604 stmt( IRStmt_Exit(
6605 binop(Iop_CmpNE64,
6606 widenUto64(ty, mkexpr(old)),
6607 widenUto64(ty, mkexpr(expd))),
6608 Ijk_Boring, nia, OFFB_PC
6610 // Otherwise we succeeded (!)
6611 putIReg64orZR(ss, mkU64(0));
6612 } else {
6613 IRTemp res = newTemp(Ity_I1);
6614 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
6615 /* IR semantics: res is 1 if store succeeds, 0 if it fails.
6616 Need to set rS to 1 on failure, 0 on success. */
6617 putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
6618 mkU64(1)));
6620 DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6621 nameIRegOrZR(False, ss),
6622 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6623 abiinfo->guest__use_fallback_LLSC
6624 ? "(fallback implementation)" : "");
6625 return True;
6627 /* else fall through */
6630 /* ------------------ LDA{R,RH,RB} ------------------ */
6631 /* ------------------ STL{R,RH,RB} ------------------ */
6632 /* 31 29 23 20 14 9 4
6633 sz 001000 110 11111 1 11111 n t LDAR<sz> Rt, [Xn|SP]
6634 sz 001000 100 11111 1 11111 n t STLR<sz> Rt, [Xn|SP]
6636 if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
6637 && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
6638 UInt szBlg2 = INSN(31,30);
6639 Bool isLD = INSN(22,22) == 1;
6640 UInt nn = INSN(9,5);
6641 UInt tt = INSN(4,0);
6643 vassert(szBlg2 < 4);
6644 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6645 IRType ty = integerIRTypeOfSize(szB);
6646 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6648 IRTemp ea = newTemp(Ity_I64);
6649 assign(ea, getIReg64orSP(nn));
6650 /* FIXME generate check that ea is szB-aligned */
6652 if (isLD) {
6653 IRTemp res = newTemp(ty);
6654 assign(res, loadLE(ty, mkexpr(ea)));
6655 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6656 stmt(IRStmt_MBE(Imbe_Fence));
6657 DIP("lda%s %s, [%s]\n", suffix[szBlg2],
6658 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6659 } else {
6660 stmt(IRStmt_MBE(Imbe_Fence));
6661 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6662 storeLE(mkexpr(ea), data);
6663 DIP("stl%s %s, [%s]\n", suffix[szBlg2],
6664 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
6666 return True;
6669 /* The PRFM cases that follow possibly allow Rt values (the
6670 prefetch operation) which are not allowed by the documentation.
6671 This should be looked into. */
6672 /* ------------------ PRFM (immediate) ------------------ */
6673 /* 31 21 9 4
6674 11 111 00110 imm12 n t PRFM prfop=Rt, [Xn|SP, #pimm]
6676 if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
6677 UInt imm12 = INSN(21,10);
6678 UInt nn = INSN(9,5);
6679 UInt tt = INSN(4,0);
6680 /* Generating any IR here is pointless, except for documentation
6681 purposes, as it will get optimised away later. */
6682 IRTemp ea = newTemp(Ity_I64);
6683 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
6684 DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
6685 return True;
6688 /* ------------------ PRFM (register) ------------------ */
6689 /* 31 29 22 20 15 12 11 9 4
6690 11 1110001 01 Rm opt S 10 Rn Rt PRFM prfop=Rt, [Xn|SP, R<m>{ext/sh}]
6692 if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
6693 && INSN(11,10) == BITS2(1,0)) {
6694 HChar dis_buf[64];
6695 UInt tt = INSN(4,0);
6696 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
6697 if (ea != IRTemp_INVALID) {
6698 /* No actual code to generate. */
6699 DIP("prfm prfop=%u, %s\n", tt, dis_buf);
6700 return True;
6704 /* ------------------ PRFM (unscaled offset) ------------------ */
6705 /* 31 29 22 20 11 9 4
6706 11 1110001 00 imm9 00 Rn Rt PRFM prfop=Rt, [Xn|SP, #simm]
6708 if (INSN(31,21) == BITS11(1,1, 1,1,1,0,0,0,1, 0,0)
6709 && INSN(11,10) == BITS2(0,0)) {
6710 ULong imm9 = INSN(20,12);
6711 UInt nn = INSN(9,5);
6712 UInt tt = INSN(4,0);
6713 ULong offset = sx_to_64(imm9, 9);
6714 IRTemp ea = newTemp(Ity_I64);
6715 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offset)));
6716 /* No actual code to generate. */
6717 DIP("prfum prfop=%u, [%s, #0x%llx]\n", tt, nameIReg64orSP(nn), offset);
6718 return True;
6721 vex_printf("ARM64 front end: load_store\n");
6722 return False;
6723 # undef INSN
6727 /*------------------------------------------------------------*/
6728 /*--- Control flow and misc instructions ---*/
6729 /*------------------------------------------------------------*/
6731 static
6732 Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
6733 const VexArchInfo* archinfo,
6734 const VexAbiInfo* abiinfo)
6736 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
6738 /* ---------------------- B cond ----------------------- */
6739 /* 31 24 4 3
6740 0101010 0 imm19 0 cond */
6741 if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
6742 UInt cond = INSN(3,0);
6743 ULong uimm64 = INSN(23,5) << 2;
6744 Long simm64 = (Long)sx_to_64(uimm64, 21);
6745 vassert(dres->whatNext == Dis_Continue);
6746 vassert(dres->len == 4);
6747 vassert(dres->continueAt == 0);
6748 vassert(dres->jk_StopHere == Ijk_INVALID);
6749 stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
6750 Ijk_Boring,
6751 IRConst_U64(guest_PC_curr_instr + simm64),
6752 OFFB_PC) );
6753 putPC(mkU64(guest_PC_curr_instr + 4));
6754 dres->whatNext = Dis_StopHere;
6755 dres->jk_StopHere = Ijk_Boring;
6756 DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
6757 return True;
6760 /* -------------------- B{L} uncond -------------------- */
6761 if (INSN(30,26) == BITS5(0,0,1,0,1)) {
6762 /* 000101 imm26 B (PC + sxTo64(imm26 << 2))
6763 100101 imm26 BL (PC + sxTo64(imm26 << 2))
6765 UInt bLink = INSN(31,31);
6766 ULong uimm64 = INSN(25,0) << 2;
6767 Long simm64 = (Long)sx_to_64(uimm64, 28);
6768 if (bLink) {
6769 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
6771 putPC(mkU64(guest_PC_curr_instr + simm64));
6772 dres->whatNext = Dis_StopHere;
6773 dres->jk_StopHere = Ijk_Call;
6774 DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
6775 guest_PC_curr_instr + simm64);
6776 return True;
6779 /* --------------------- B{L} reg --------------------- */
6780 /* 31 24 22 20 15 9 4
6781 1101011 00 10 11111 000000 nn 00000 RET Rn
6782 1101011 00 01 11111 000000 nn 00000 CALL Rn
6783 1101011 00 00 11111 000000 nn 00000 JMP Rn
6785 if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
6786 && INSN(20,16) == BITS5(1,1,1,1,1)
6787 && INSN(15,10) == BITS6(0,0,0,0,0,0)
6788 && INSN(4,0) == BITS5(0,0,0,0,0)) {
6789 UInt branch_type = INSN(22,21);
6790 UInt nn = INSN(9,5);
6791 if (branch_type == BITS2(1,0) /* RET */) {
6792 putPC(getIReg64orZR(nn));
6793 dres->whatNext = Dis_StopHere;
6794 dres->jk_StopHere = Ijk_Ret;
6795 DIP("ret %s\n", nameIReg64orZR(nn));
6796 return True;
6798 if (branch_type == BITS2(0,1) /* CALL */) {
6799 IRTemp dst = newTemp(Ity_I64);
6800 assign(dst, getIReg64orZR(nn));
6801 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
6802 putPC(mkexpr(dst));
6803 dres->whatNext = Dis_StopHere;
6804 dres->jk_StopHere = Ijk_Call;
6805 DIP("blr %s\n", nameIReg64orZR(nn));
6806 return True;
6808 if (branch_type == BITS2(0,0) /* JMP */) {
6809 putPC(getIReg64orZR(nn));
6810 dres->whatNext = Dis_StopHere;
6811 dres->jk_StopHere = Ijk_Boring;
6812 DIP("jmp %s\n", nameIReg64orZR(nn));
6813 return True;
6817 /* -------------------- CB{N}Z -------------------- */
6818 /* sf 011 010 1 imm19 Rt CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
6819 sf 011 010 0 imm19 Rt CBZ Xt|Wt, (PC + sxTo64(imm19 << 2))
6821 if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
6822 Bool is64 = INSN(31,31) == 1;
6823 Bool bIfZ = INSN(24,24) == 0;
6824 ULong uimm64 = INSN(23,5) << 2;
6825 UInt rT = INSN(4,0);
6826 Long simm64 = (Long)sx_to_64(uimm64, 21);
6827 IRExpr* cond = NULL;
6828 if (is64) {
6829 cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
6830 getIReg64orZR(rT), mkU64(0));
6831 } else {
6832 cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
6833 getIReg32orZR(rT), mkU32(0));
6835 stmt( IRStmt_Exit(cond,
6836 Ijk_Boring,
6837 IRConst_U64(guest_PC_curr_instr + simm64),
6838 OFFB_PC) );
6839 putPC(mkU64(guest_PC_curr_instr + 4));
6840 dres->whatNext = Dis_StopHere;
6841 dres->jk_StopHere = Ijk_Boring;
6842 DIP("cb%sz %s, 0x%llx\n",
6843 bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
6844 guest_PC_curr_instr + simm64);
6845 return True;
6848 /* -------------------- TB{N}Z -------------------- */
6849 /* 31 30 24 23 18 5 4
6850 b5 011 011 1 b40 imm14 t TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
6851 b5 011 011 0 b40 imm14 t TBZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
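For example, "tbnz x3, #33, label" encodes b5 == 1 and b40 == 00001,
so bitNo == 33 below and the generated condition tests
((X3 >> 33) & 1) != 0.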
6853 if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
6854 UInt b5 = INSN(31,31);
6855 Bool bIfZ = INSN(24,24) == 0;
6856 UInt b40 = INSN(23,19);
6857 UInt imm14 = INSN(18,5);
6858 UInt tt = INSN(4,0);
6859 UInt bitNo = (b5 << 5) | b40;
6860 ULong uimm64 = imm14 << 2;
6861 Long simm64 = sx_to_64(uimm64, 16);
6862 IRExpr* cond
6863 = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
6864 binop(Iop_And64,
6865 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
6866 mkU64(1)),
6867 mkU64(0));
6868 stmt( IRStmt_Exit(cond,
6869 Ijk_Boring,
6870 IRConst_U64(guest_PC_curr_instr + simm64),
6871 OFFB_PC) );
6872 putPC(mkU64(guest_PC_curr_instr + 4));
6873 dres->whatNext = Dis_StopHere;
6874 dres->jk_StopHere = Ijk_Boring;
6875 DIP("tb%sz %s, #%u, 0x%llx\n",
6876 bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
6877 guest_PC_curr_instr + simm64);
6878 return True;
6881 /* -------------------- SVC -------------------- */
6882 /* 11010100 000 imm16 000 01
6883 Don't bother with anything except the imm16==0 case.
6885 if (INSN(31,0) == 0xD4000001) {
6886 putPC(mkU64(guest_PC_curr_instr + 4));
6887 dres->whatNext = Dis_StopHere;
6888 dres->jk_StopHere = Ijk_Sys_syscall;
6889 DIP("svc #0\n");
6890 return True;
6893 /* ------------------ M{SR,RS} ------------------ */
6894 /* ---- Cases for TPIDR_EL0 ----
6895 0xD51BD0 010 Rt MSR tpidr_el0, rT
6896 0xD53BD0 010 Rt MRS rT, tpidr_el0
6898 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
6899 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
6900 Bool toSys = INSN(21,21) == 0;
6901 UInt tt = INSN(4,0);
6902 if (toSys) {
6903 stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) );
6904 DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt));
6905 } else {
6906 putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 ));
6907 DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt));
6909 return True;
6911 /* ---- Cases for FPCR ----
6912 0xD51B44 000 Rt MSR fpcr, rT
6913 0xD53B44 000 Rt MRS rT, fpcr
6915 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
6916 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
6917 Bool toSys = INSN(21,21) == 0;
6918 UInt tt = INSN(4,0);
6919 if (toSys) {
6920 stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) );
6921 DIP("msr fpcr, %s\n", nameIReg64orZR(tt));
6922 } else {
6923 putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32));
6924 DIP("mrs %s, fpcr\n", nameIReg64orZR(tt));
6926 return True;
6928 /* ---- Cases for FPSR ----
6929 0xD51B44 001 Rt MSR fpsr, rT
6930 0xD53B44 001 Rt MRS rT, fpsr
6931 The only part of this we model is FPSR.QC. All other bits
6932 are ignored when writing to it and RAZ when reading from it.
6934 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
6935 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
6936 Bool toSys = INSN(21,21) == 0;
6937 UInt tt = INSN(4,0);
6938 if (toSys) {
6939 /* Just deal with FPSR.QC. Make up a V128 value which is
6940 zero if Xt[27] is zero and any other value if Xt[27] is
6941 nonzero. */
6942 IRTemp qc64 = newTemp(Ity_I64);
6943 assign(qc64, binop(Iop_And64,
6944 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(27)),
6945 mkU64(1)));
6946 IRExpr* qcV128 = binop(Iop_64HLtoV128, mkexpr(qc64), mkexpr(qc64));
6947 stmt( IRStmt_Put( OFFB_QCFLAG, qcV128 ) );
6948 DIP("msr fpsr, %s\n", nameIReg64orZR(tt));
6949 } else {
6950 /* Generate a value which is all zeroes except for bit 27,
6951 which must be zero if QCFLAG is all zeroes and one otherwise. */
6952 IRTemp qcV128 = newTempV128();
6953 assign(qcV128, IRExpr_Get( OFFB_QCFLAG, Ity_V128 ));
6954 IRTemp qc64 = newTemp(Ity_I64);
6955 assign(qc64, binop(Iop_Or64, unop(Iop_V128HIto64, mkexpr(qcV128)),
6956 unop(Iop_V128to64, mkexpr(qcV128))));
6957 IRExpr* res = binop(Iop_Shl64,
6958 unop(Iop_1Uto64,
6959 binop(Iop_CmpNE64, mkexpr(qc64), mkU64(0))),
6960 mkU8(27));
6961 putIReg64orZR(tt, res);
6962 DIP("mrs %s, fpsr\n", nameIReg64orZR(tt));
6964 return True;
6966 /* ---- Cases for NZCV ----
6967 D51B42 000 Rt MSR nzcv, rT
6968 D53B42 000 Rt MRS rT, nzcv
6969 The only parts of NZCV that actually exist are bits 31:28, which
6970 are the N Z C and V bits themselves. Hence the flags thunk provides
6971 all the state we need.
6973 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
6974 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
6975 Bool toSys = INSN(21,21) == 0;
6976 UInt tt = INSN(4,0);
6977 if (toSys) {
6978 IRTemp t = newTemp(Ity_I64);
6979 assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL)));
6980 setFlags_COPY(t);
6981 DIP("msr %s, nzcv\n", nameIReg32orZR(tt));
6982 } else {
6983 IRTemp res = newTemp(Ity_I64);
6984 assign(res, mk_arm64g_calculate_flags_nzcv());
6985 putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res)));
6986 DIP("mrs %s, nzcv\n", nameIReg64orZR(tt));
6988 return True;
6990 /* ---- Cases for DCZID_EL0 ----
6991 Don't support arbitrary reads and writes to this register. Just
6992 return the value 16, which indicates that the DC ZVA instruction
6993 is not permitted, so we don't have to emulate it.
6994 D5 3B 00 111 Rt MRS rT, dczid_el0
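(Bit 4 of DCZID_EL0 is the DZP "prohibited" flag, so returning 1 << 4
advertises that DC ZVA must not be used.)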
6996 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
6997 UInt tt = INSN(4,0);
6998 putIReg64orZR(tt, mkU64(1<<4));
6999 DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt));
7000 return True;
7002 /* ---- Cases for CTR_EL0 ----
7003 We just handle reads, and make up a value from the D and I line
7004 sizes in the VexArchInfo we are given, and patch in the following
7005 fields that the Foundation model gives ("natively"):
7006 CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
7007 D5 3B 00 001 Rt MRS rT, ctr_el0
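Worked example: with 64-byte D and I cache lines both *MinLine_lg2_szB
fields are 6, so the value formed below is
0x8440c000 | (4 << 16) | 4 == 0x8444c004, i.e. DminLine = IminLine = 4
(2^4 32-bit words = 64 bytes) plus the fixed CWG/ERG/L1Ip fields above.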
7009 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
7010 UInt tt = INSN(4,0);
7011 /* Need to generate a value from dMinLine_lg2_szB and
7012 iMinLine_lg2_szB. The value in the register is in 32-bit
7013 units, so need to subtract 2 from the values in the
7014 VexArchInfo. We can assume that the values here are valid --
7015 disInstr_ARM64 checks them -- so there's no need to deal with
7016 out-of-range cases. */
7017 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7018 && archinfo->arm64_dMinLine_lg2_szB <= 17
7019 && archinfo->arm64_iMinLine_lg2_szB >= 2
7020 && archinfo->arm64_iMinLine_lg2_szB <= 17);
7021 UInt val
7022 = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16)
7023 | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0);
7024 putIReg64orZR(tt, mkU64(val));
7025 DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt));
7026 return True;
7028 /* ---- Cases for CNTVCT_EL0 ----
7029 This is a timestamp counter of some sort. Support reads of it only
7030 by passing through to the host.
7031 D5 3B E0 010 Rt MRS Xt, cntvct_el0
7033 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
7034 UInt tt = INSN(4,0);
7035 IRTemp val = newTemp(Ity_I64);
7036 IRExpr** args = mkIRExprVec_0();
7037 IRDirty* d = unsafeIRDirty_1_N (
7038 val,
7039 0/*regparms*/,
7040 "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
7041 &arm64g_dirtyhelper_MRS_CNTVCT_EL0,
7042 args
7044 /* execute the dirty call, dumping the result in val. */
7045 stmt( IRStmt_Dirty(d) );
7046 putIReg64orZR(tt, mkexpr(val));
7047 DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt));
7048 return True;
7050 /* ---- Cases for CNTFRQ_EL0 ----
7051 This is always RO at EL0, so it's safe to pass through to the host.
7052 D5 3B E0 000 Rt MRS Xt, cntfrq_el0
7054 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE000) {
7055 UInt tt = INSN(4,0);
7056 IRTemp val = newTemp(Ity_I64);
7057 IRExpr** args = mkIRExprVec_0();
7058 IRDirty* d = unsafeIRDirty_1_N (
7059 val,
7060 0/*regparms*/,
7061 "arm64g_dirtyhelper_MRS_CNTFRQ_EL0",
7062 &arm64g_dirtyhelper_MRS_CNTFRQ_EL0,
7063 args
7065 /* execute the dirty call, dumping the result in val. */
7066 stmt( IRStmt_Dirty(d) );
7067 putIReg64orZR(tt, mkexpr(val));
7068 DIP("mrs %s, cntfrq_el0\n", nameIReg64orZR(tt));
7069 return True;
7072 /* ------------------ IC_IVAU ------------------ */
7073 /* D5 0B 75 001 Rt ic ivau, rT
7075 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
7076 /* We will always be provided with a valid iMinLine value. */
7077 vassert(archinfo->arm64_iMinLine_lg2_szB >= 2
7078 && archinfo->arm64_iMinLine_lg2_szB <= 17);
7079 /* Round the requested address, in rT, down to the start of the
7080 containing block. */
7081 UInt tt = INSN(4,0);
7082 ULong lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB;
7083 IRTemp addr = newTemp(Ity_I64);
7084 assign( addr, binop( Iop_And64,
7085 getIReg64orZR(tt),
7086 mkU64(~(lineszB - 1))) );
7087 /* Set the invalidation range, request exit-and-invalidate, with
7088 continuation at the next instruction. */
7089 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7090 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
7091 /* be paranoid ... */
7092 stmt( IRStmt_MBE(Imbe_Fence) );
7093 putPC(mkU64( guest_PC_curr_instr + 4 ));
7094 dres->whatNext = Dis_StopHere;
7095 dres->jk_StopHere = Ijk_InvalICache;
7096 DIP("ic ivau, %s\n", nameIReg64orZR(tt));
7097 return True;
7100 /* ------------------ DC_CVAU ------------------ */
7101 /* D5 0B 7B 001 Rt dc cvau, rT
7103 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20) {
7104 /* Exactly the same scheme as for IC IVAU, except we observe the
7105 dMinLine size, and request an Ijk_FlushDCache instead of
7106 Ijk_InvalICache. */
7107 /* We will always be provided with a valid dMinLine value. */
7108 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7109 && archinfo->arm64_dMinLine_lg2_szB <= 17);
7110 /* Round the requested address, in rT, down to the start of the
7111 containing block. */
7112 UInt tt = INSN(4,0);
7113 ULong lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB;
7114 IRTemp addr = newTemp(Ity_I64);
7115 assign( addr, binop( Iop_And64,
7116 getIReg64orZR(tt),
7117 mkU64(~(lineszB - 1))) );
7118 /* Set the flush range, request exit-and-flush, with
7119 continuation at the next instruction. */
7120 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7121 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
7122 /* be paranoid ... */
7123 stmt( IRStmt_MBE(Imbe_Fence) );
7124 putPC(mkU64( guest_PC_curr_instr + 4 ));
7125 dres->whatNext = Dis_StopHere;
7126 dres->jk_StopHere = Ijk_FlushDCache;
7127 DIP("dc cvau, %s\n", nameIReg64orZR(tt));
7128 return True;
7131 /* ------------------ ISB, DMB, DSB ------------------ */
7132 /* 31 21 11 7 6 4
7133 11010 10100 0 00 011 0011 CRm 1 01 11111 DMB opt
7134 11010 10100 0 00 011 0011 CRm 1 00 11111 DSB opt
7135 11010 10100 0 00 011 0011 CRm 1 10 11111 ISB opt
7137 if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
7138 && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
7139 && INSN(7,7) == 1
7140 && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
7141 UInt opc = INSN(6,5);
7142 UInt CRm = INSN(11,8);
7143 vassert(opc <= 2 && CRm <= 15);
7144 stmt(IRStmt_MBE(Imbe_Fence));
7145 const HChar* opNames[3]
7146 = { "dsb", "dmb", "isb" };
7147 const HChar* howNames[16]
7148 = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
7149 "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
7150 DIP("%s %s\n", opNames[opc], howNames[CRm]);
7151 return True;
7154 /* -------------------- NOP -------------------- */
7155 if (INSN(31,0) == 0xD503201F) {
7156 DIP("nop\n");
7157 return True;
7160 /* -------------------- BRK -------------------- */
7161 /* 31 23 20 4
7162 1101 0100 001 imm16 00000 BRK #imm16
7164 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
7165 && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
7166 UInt imm16 = INSN(20,5);
7167 /* Request SIGTRAP and then restart of this insn. */
7168 putPC(mkU64(guest_PC_curr_instr + 0));
7169 dres->whatNext = Dis_StopHere;
7170 dres->jk_StopHere = Ijk_SigTRAP;
7171 DIP("brk #%u\n", imm16);
7172 return True;
7175 /* ------------------- YIELD ------------------- */
7176 /* 31 23 15 7
7177 1101 0101 0000 0011 0010 0000 0011 1111
7179 if (INSN(31,0) == 0xD503203F) {
7180 /* Request yield followed by continuation at the next insn. */
7181 putPC(mkU64(guest_PC_curr_instr + 4));
7182 dres->whatNext = Dis_StopHere;
7183 dres->jk_StopHere = Ijk_Yield;
7184 DIP("yield\n");
7185 return True;
7188 /* -------------------- HINT ------------------- */
7189 /* 31 23 15 11 4 3
7190 1101 0101 0000 0011 0010 imm7 1 1111
7191 Catch otherwise unhandled HINT instructions - any
7192 like YIELD which are explicitly handled should go
7193 above this case.
7195 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,1)
7196 && INSN(23,16) == BITS8(0,0,0,0,0,0,1,1)
7197 && INSN(15,12) == BITS4(0,0,1,0)
7198 && INSN(4,0) == BITS5(1,1,1,1,1)) {
7199 UInt imm7 = INSN(11,5);
7200 DIP("hint #%u\n", imm7);
7201 return True;
7204 /* ------------------- CLREX ------------------ */
7205 /* 31 23 15 11 7
7206 1101 0101 0000 0011 0011 m 0101 1111 CLREX CRm
7207 CRm is apparently ignored.
7209 if ((INSN(31,0) & 0xFFFFF0FF) == 0xD503305F) {
7210 UInt mm = INSN(11,8);
7211 /* AFAICS, this simply cancels a (all?) reservations made by a
7212 (any?) preceding LDREX(es). Arrange to hand it through to
7213 the back end. */
7214 if (abiinfo->guest__use_fallback_LLSC) {
7215 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
7216 } else {
7217 stmt( IRStmt_MBE(Imbe_CancelReservation) );
7219 DIP("clrex #%u\n", mm);
7220 return True;
7223 vex_printf("ARM64 front end: branch_etc\n");
7224 return False;
7225 # undef INSN
7229 /*------------------------------------------------------------*/
7230 /*--- SIMD and FP instructions: helper functions ---*/
7231 /*------------------------------------------------------------*/
7233 /* Some constructors for interleave/deinterleave expressions. */
7235 static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7236 // returns a0 b0
7237 return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
7240 static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
7241 // returns a1 b1
7242 return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
7245 static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7246 // returns a2 a0 b2 b0
7247 return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
7250 static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
7251 // returns a3 a1 b3 b1
7252 return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
7255 static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
7256 // returns a1 b1 a0 b0
7257 return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
7260 static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
7261 // returns a3 b3 a2 b2
7262 return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
7265 static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7266 // returns a6 a4 a2 a0 b6 b4 b2 b0
7267 return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7270 static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7271 // returns a7 a5 a3 a1 b7 b5 b3 b1
7272 return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
7275 static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7276 // returns a3 b3 a2 b2 a1 b1 a0 b0
7277 return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
7280 static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
7281 // returns a7 b7 a6 b6 a5 b5 a4 b4
7282 return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
7285 static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
7286 IRTemp bFEDCBA9876543210 ) {
7287 // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
7288 return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
7289 mkexpr(bFEDCBA9876543210));
7292 static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
7293 IRTemp bFEDCBA9876543210 ) {
7294 // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
7295 return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
7296 mkexpr(bFEDCBA9876543210));
7299 static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
7300 IRTemp bFEDCBA9876543210 ) {
7301 // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
7302 return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
7303 mkexpr(bFEDCBA9876543210));
7306 static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
7307 IRTemp bFEDCBA9876543210 ) {
7308 // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
7309 return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
7310 mkexpr(bFEDCBA9876543210));
7313 /* Generate N copies of |bit| in the bottom of a ULong. */
7314 static ULong Replicate ( ULong bit, Int N )
7316 vassert(bit <= 1 && N >= 1 && N < 64);
7317 if (bit == 0) {
7318 return 0;
7319 } else {
7320 /* Careful. This won't work for N == 64. */
7321 return (1ULL << N) - 1;
7325 static ULong Replicate32x2 ( ULong bits32 )
7327 vassert(0 == (bits32 & ~0xFFFFFFFFULL));
7328 return (bits32 << 32) | bits32;
7331 static ULong Replicate16x4 ( ULong bits16 )
7333 vassert(0 == (bits16 & ~0xFFFFULL));
7334 return Replicate32x2((bits16 << 16) | bits16);
7337 static ULong Replicate8x8 ( ULong bits8 )
7339 vassert(0 == (bits8 & ~0xFFULL));
7340 return Replicate16x4((bits8 << 8) | bits8);
7343 /* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
7344 |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
7345 is 64. In the former case, the upper 32 bits of the returned value
7346 are guaranteed to be zero. */
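/* A worked example: imm8 == 0x70 encodes +1.0, and expands to
   0x3F800000 when N == 32 and to 0x3FF0000000000000 when N == 64. */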
7347 static ULong VFPExpandImm ( ULong imm8, Int N )
7349 vassert(imm8 <= 0xFF);
7350 vassert(N == 32 || N == 64);
7351 Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2.
7352 Int F = N - E - 1;
7353 ULong imm8_6 = (imm8 >> 6) & 1;
7354 /* sign: 1 bit */
7355 /* exp: E bits */
7356 /* frac: F bits */
7357 ULong sign = (imm8 >> 7) & 1;
7358 ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
7359 ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
7360 vassert(sign < (1ULL << 1));
7361 vassert(exp < (1ULL << E));
7362 vassert(frac < (1ULL << F));
7363 vassert(1 + E + F == N);
7364 ULong res = (sign << (E+F)) | (exp << F) | frac;
7365 return res;
7368 /* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
7369 This might fail, as indicated by the returned Bool. Page 2530 of
7370 the manual. */
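/* For example, op == 0, cmode == 0, imm8 == 0xAB gives
   imm64 == 0x000000AB000000AB, and op == 0, cmode == 14 gives
   Replicate8x8(imm8), i.e. imm8 copied into all eight byte lanes. */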
7371 static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
7372 UInt op, UInt cmode, UInt imm8 )
7374 vassert(op <= 1);
7375 vassert(cmode <= 15);
7376 vassert(imm8 <= 255);
7378 *res = 0; /* will overwrite iff returning True */
7380 ULong imm64 = 0;
7381 Bool testimm8 = False;
7383 switch (cmode >> 1) {
7384 case 0:
7385 testimm8 = False; imm64 = Replicate32x2(imm8); break;
7386 case 1:
7387 testimm8 = True; imm64 = Replicate32x2(imm8 << 8); break;
7388 case 2:
7389 testimm8 = True; imm64 = Replicate32x2(imm8 << 16); break;
7390 case 3:
7391 testimm8 = True; imm64 = Replicate32x2(imm8 << 24); break;
7392 case 4:
7393 testimm8 = False; imm64 = Replicate16x4(imm8); break;
7394 case 5:
7395 testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
7396 case 6:
7397 testimm8 = True;
7398 if ((cmode & 1) == 0)
7399 imm64 = Replicate32x2((imm8 << 8) | 0xFF);
7400 else
7401 imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
7402 break;
7403 case 7:
7404 testimm8 = False;
7405 if ((cmode & 1) == 0 && op == 0)
7406 imm64 = Replicate8x8(imm8);
7407 if ((cmode & 1) == 0 && op == 1) {
7408 imm64 = 0; imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
7409 imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
7410 imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
7411 imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
7412 imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
7413 imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
7414 imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
7415 imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
7417 if ((cmode & 1) == 1 && op == 0) {
7418 ULong imm8_7 = (imm8 >> 7) & 1;
7419 ULong imm8_6 = (imm8 >> 6) & 1;
7420 ULong imm8_50 = imm8 & 63;
7421 ULong imm32 = (imm8_7 << (1 + 5 + 6 + 19))
7422 | ((imm8_6 ^ 1) << (5 + 6 + 19))
7423 | (Replicate(imm8_6, 5) << (6 + 19))
7424 | (imm8_50 << 19);
7425 imm64 = Replicate32x2(imm32);
7427 if ((cmode & 1) == 1 && op == 1) {
7428 // imm64 = imm8<7>:NOT(imm8<6>)
7429 // :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
7430 ULong imm8_7 = (imm8 >> 7) & 1;
7431 ULong imm8_6 = (imm8 >> 6) & 1;
7432 ULong imm8_50 = imm8 & 63;
7433 imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
7434 | (Replicate(imm8_6, 8) << 54)
7435 | (imm8_50 << 48);
7437 break;
7438 default:
7439 vassert(0);
7442 if (testimm8 && imm8 == 0)
7443 return False;
7445 *res = imm64;
7446 return True;
7449 /* Help a bit for decoding laneage for vector operations that can be
7450 of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
7451 and SZ bits, typically for vector floating point. */
7452 static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI, /*OUT*/IRType* tyF,
7453 /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
7454 /*OUT*/const HChar** arrSpec,
7455 Bool bitQ, Bool bitSZ )
7457 vassert(bitQ == True || bitQ == False);
7458 vassert(bitSZ == True || bitSZ == False);
7459 if (bitQ && bitSZ) { // 2x64
7460 if (tyI) *tyI = Ity_I64;
7461 if (tyF) *tyF = Ity_F64;
7462 if (nLanes) *nLanes = 2;
7463 if (zeroUpper) *zeroUpper = False;
7464 if (arrSpec) *arrSpec = "2d";
7465 return True;
7467 if (bitQ && !bitSZ) { // 4x32
7468 if (tyI) *tyI = Ity_I32;
7469 if (tyF) *tyF = Ity_F32;
7470 if (nLanes) *nLanes = 4;
7471 if (zeroUpper) *zeroUpper = False;
7472 if (arrSpec) *arrSpec = "4s";
7473 return True;
7475 if (!bitQ && !bitSZ) { // 2x32
7476 if (tyI) *tyI = Ity_I32;
7477 if (tyF) *tyF = Ity_F32;
7478 if (nLanes) *nLanes = 2;
7479 if (zeroUpper) *zeroUpper = True;
7480 if (arrSpec) *arrSpec = "2s";
7481 return True;
7483 // Else impliedly 1x64, which isn't allowed.
7484 return False;
7487 /* Helper for decoding laneage for shift-style vector operations
7488 that involve an immediate shift amount. */
7489 static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
7490 UInt immh, UInt immb )
7492 vassert(immh < (1<<4));
7493 vassert(immb < (1<<3));
7494 UInt immhb = (immh << 3) | immb;
7495 if (immh & 8) {
7496 if (shift) *shift = 128 - immhb;
7497 if (szBlg2) *szBlg2 = 3;
7498 return True;
7500 if (immh & 4) {
7501 if (shift) *shift = 64 - immhb;
7502 if (szBlg2) *szBlg2 = 2;
7503 return True;
7505 if (immh & 2) {
7506 if (shift) *shift = 32 - immhb;
7507 if (szBlg2) *szBlg2 = 1;
7508 return True;
7510 if (immh & 1) {
7511 if (shift) *shift = 16 - immhb;
7512 if (szBlg2) *szBlg2 = 0;
7513 return True;
7515 return False;
7518 /* Generate IR to fold all lanes of the V128 value in 'src' as
7519 characterised by the operator 'op', and return the result in the
7520 bottom bits of a V128, with all other bits set to zero. */
7521 static IRTemp math_FOLDV ( IRTemp src, IROp op )
7523 /* The basic idea is to use repeated applications of Iop_CatEven*
7524 and Iop_CatOdd* operators to 'src' so as to clone each lane into
7525 a complete vector. Then fold all those vectors with 'op' and
7526 zero out all but the least significant lane. */
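   /* Sketch of the 8x16 case below: four levels of Cat{Even,Odd}Lanes
      turn 'src' into 16 vectors, each having one original byte lane
      cloned into every lane; 15 applications of 'op' then fold those
      down to one vector, whose lane 0 holds the folded result. */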
7527 switch (op) {
7528 case Iop_Min8Sx16: case Iop_Min8Ux16:
7529 case Iop_Max8Sx16: case Iop_Max8Ux16: case Iop_Add8x16: {
7530 /* NB: temp naming here is misleading -- the naming is for 8
7531 lanes of 16 bit, whereas what is being operated on is 16
7532 lanes of 8 bits. */
7533 IRTemp x76543210 = src;
7534 IRTemp x76547654 = newTempV128();
7535 IRTemp x32103210 = newTempV128();
7536 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
7537 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
7538 IRTemp x76767676 = newTempV128();
7539 IRTemp x54545454 = newTempV128();
7540 IRTemp x32323232 = newTempV128();
7541 IRTemp x10101010 = newTempV128();
7542 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
7543 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
7544 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
7545 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
7546 IRTemp x77777777 = newTempV128();
7547 IRTemp x66666666 = newTempV128();
7548 IRTemp x55555555 = newTempV128();
7549 IRTemp x44444444 = newTempV128();
7550 IRTemp x33333333 = newTempV128();
7551 IRTemp x22222222 = newTempV128();
7552 IRTemp x11111111 = newTempV128();
7553 IRTemp x00000000 = newTempV128();
7554 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
7555 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
7556 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
7557 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
7558 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
7559 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
7560 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
7561 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
7562 /* Naming not misleading after here. */
7563 IRTemp xAllF = newTempV128();
7564 IRTemp xAllE = newTempV128();
7565 IRTemp xAllD = newTempV128();
7566 IRTemp xAllC = newTempV128();
7567 IRTemp xAllB = newTempV128();
7568 IRTemp xAllA = newTempV128();
7569 IRTemp xAll9 = newTempV128();
7570 IRTemp xAll8 = newTempV128();
7571 IRTemp xAll7 = newTempV128();
7572 IRTemp xAll6 = newTempV128();
7573 IRTemp xAll5 = newTempV128();
7574 IRTemp xAll4 = newTempV128();
7575 IRTemp xAll3 = newTempV128();
7576 IRTemp xAll2 = newTempV128();
7577 IRTemp xAll1 = newTempV128();
7578 IRTemp xAll0 = newTempV128();
7579 assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777));
7580 assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777));
7581 assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666));
7582 assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666));
7583 assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555));
7584 assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555));
7585 assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444));
7586 assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444));
7587 assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333));
7588 assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333));
7589 assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222));
7590 assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222));
7591 assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111));
7592 assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111));
7593 assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000));
7594 assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000));
7595 IRTemp maxFE = newTempV128();
7596 IRTemp maxDC = newTempV128();
7597 IRTemp maxBA = newTempV128();
7598 IRTemp max98 = newTempV128();
7599 IRTemp max76 = newTempV128();
7600 IRTemp max54 = newTempV128();
7601 IRTemp max32 = newTempV128();
7602 IRTemp max10 = newTempV128();
7603 assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE)));
7604 assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC)));
7605 assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA)));
7606 assign(max98, binop(op, mkexpr(xAll9), mkexpr(xAll8)));
7607 assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6)));
7608 assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4)));
7609 assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2)));
7610 assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0)));
7611 IRTemp maxFEDC = newTempV128();
7612 IRTemp maxBA98 = newTempV128();
7613 IRTemp max7654 = newTempV128();
7614 IRTemp max3210 = newTempV128();
7615 assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC)));
7616 assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98)));
7617 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
7618 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7619 IRTemp maxFEDCBA98 = newTempV128();
7620 IRTemp max76543210 = newTempV128();
7621 assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98)));
7622 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
7623 IRTemp maxAllLanes = newTempV128();
7624 assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98),
7625 mkexpr(max76543210)));
7626 IRTemp res = newTempV128();
7627 assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes)));
7628 return res;
7630 case Iop_Min16Sx8: case Iop_Min16Ux8:
7631 case Iop_Max16Sx8: case Iop_Max16Ux8: case Iop_Add16x8: {
7632 IRTemp x76543210 = src;
7633 IRTemp x76547654 = newTempV128();
7634 IRTemp x32103210 = newTempV128();
7635 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
7636 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
7637 IRTemp x76767676 = newTempV128();
7638 IRTemp x54545454 = newTempV128();
7639 IRTemp x32323232 = newTempV128();
7640 IRTemp x10101010 = newTempV128();
7641 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
7642 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
7643 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
7644 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
7645 IRTemp x77777777 = newTempV128();
7646 IRTemp x66666666 = newTempV128();
7647 IRTemp x55555555 = newTempV128();
7648 IRTemp x44444444 = newTempV128();
7649 IRTemp x33333333 = newTempV128();
7650 IRTemp x22222222 = newTempV128();
7651 IRTemp x11111111 = newTempV128();
7652 IRTemp x00000000 = newTempV128();
7653 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
7654 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
7655 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
7656 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
7657 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
7658 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
7659 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
7660 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
7661 IRTemp max76 = newTempV128();
7662 IRTemp max54 = newTempV128();
7663 IRTemp max32 = newTempV128();
7664 IRTemp max10 = newTempV128();
7665 assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666)));
7666 assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444)));
7667 assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222)));
7668 assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000)));
7669 IRTemp max7654 = newTempV128();
7670 IRTemp max3210 = newTempV128();
7671 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
7672 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7673 IRTemp max76543210 = newTempV128();
7674 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
7675 IRTemp res = newTempV128();
7676 assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210)));
7677 return res;
7679 case Iop_Max32Fx4: case Iop_Min32Fx4:
7680 case Iop_Min32Sx4: case Iop_Min32Ux4:
7681 case Iop_Max32Sx4: case Iop_Max32Ux4: case Iop_Add32x4: {
7682 IRTemp x3210 = src;
7683 IRTemp x3232 = newTempV128();
7684 IRTemp x1010 = newTempV128();
7685 assign(x3232, mk_CatOddLanes64x2 (x3210, x3210));
7686 assign(x1010, mk_CatEvenLanes64x2(x3210, x3210));
7687 IRTemp x3333 = newTempV128();
7688 IRTemp x2222 = newTempV128();
7689 IRTemp x1111 = newTempV128();
7690 IRTemp x0000 = newTempV128();
7691 assign(x3333, mk_CatOddLanes32x4 (x3232, x3232));
7692 assign(x2222, mk_CatEvenLanes32x4(x3232, x3232));
7693 assign(x1111, mk_CatOddLanes32x4 (x1010, x1010));
7694 assign(x0000, mk_CatEvenLanes32x4(x1010, x1010));
7695 IRTemp max32 = newTempV128();
7696 IRTemp max10 = newTempV128();
7697 assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222)));
7698 assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000)));
7699 IRTemp max3210 = newTempV128();
7700 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
7701 IRTemp res = newTempV128();
7702 assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
7703 return res;
7705 case Iop_Add64x2: {
7706 IRTemp x10 = src;
7707 IRTemp x00 = newTempV128();
7708 IRTemp x11 = newTempV128();
7709 assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
7710 assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
7711 IRTemp max10 = newTempV128();
7712 assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
7713 IRTemp res = newTempV128();
7714 assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
7715 return res;
7717 default:
7718 vassert(0);
7723 /* Generate IR for TBL and TBX. This deals with the 128 bit case
7724 only. */
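/* For example, with len == 1 (a two-register table) a source byte of
   0x11 selects byte 1 of tab[1], whereas a source byte of 0x20 or
   above is out of range: TBL then produces zero in that lane and TBX
   keeps the existing destination lane, both via |oor_values|. */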
7725 static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
7726 IRTemp oor_values )
7728 vassert(len >= 0 && len <= 3);
7730 /* Generate some useful constants as concisely as possible. */
7731 IRTemp half15 = newTemp(Ity_I64);
7732 assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
7733 IRTemp half16 = newTemp(Ity_I64);
7734 assign(half16, mkU64(0x1010101010101010ULL));
7736 /* A zero vector */
7737 IRTemp allZero = newTempV128();
7738 assign(allZero, mkV128(0x0000));
7739 /* A vector containing 15 in each 8-bit lane */
7740 IRTemp all15 = newTempV128();
7741 assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
7742 /* A vector containing 16 in each 8-bit lane */
7743 IRTemp all16 = newTempV128();
7744 assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
7745 /* A vector containing 32 in each 8-bit lane */
7746 IRTemp all32 = newTempV128();
7747 assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
7748 /* A vector containing 48 in each 8-bit lane */
7749 IRTemp all48 = newTempV128();
7750 assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
7751 /* A vector containing 64 in each 8-bit lane */
7752 IRTemp all64 = newTempV128();
7753 assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));
7755 /* Group the 16/32/48/64 vectors so as to be indexable. */
7756 IRTemp allXX[4] = { all16, all32, all48, all64 };
7758 /* Compute the result for each table vector, with zeroes in places
7759 where the index values are out of range, and OR them into the
7760 running vector. */
7761 IRTemp running_result = newTempV128();
7762 assign(running_result, mkV128(0));
7764 UInt tabent;
7765 for (tabent = 0; tabent <= len; tabent++) {
7766 vassert(tabent >= 0 && tabent < 4);
7767 IRTemp bias = newTempV128();
7768 assign(bias,
7769 mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
7770 IRTemp biased_indices = newTempV128();
7771 assign(biased_indices,
7772 binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
7773 IRTemp valid_mask = newTempV128();
7774 assign(valid_mask,
7775 binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
7776 IRTemp safe_biased_indices = newTempV128();
7777 assign(safe_biased_indices,
7778 binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
7779 IRTemp results_or_junk = newTempV128();
7780 assign(results_or_junk,
7781 binop(Iop_Perm8x16, mkexpr(tab[tabent]),
7782 mkexpr(safe_biased_indices)));
7783 IRTemp results_or_zero = newTempV128();
7784 assign(results_or_zero,
7785 binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
7786 /* And OR that into the running result. */
7787 IRTemp tmp = newTempV128();
7788 assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
7789 mkexpr(running_result)));
7790 running_result = tmp;
7793 /* So now running_result holds the overall result where the indices
7794 are in range, and zero in out-of-range lanes. Now we need to
7795 compute an overall validity mask and use this to copy in the
7796 lanes in the oor_values for out of range indices. This is
7797 unnecessary for TBL but will get folded out by iropt, so we lean
7798 on that and generate the same code for TBL and TBX here. */
7799 IRTemp overall_valid_mask = newTempV128();
7800 assign(overall_valid_mask,
7801 binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
7802 IRTemp result = newTempV128();
7803 assign(result,
7804 binop(Iop_OrV128,
7805 mkexpr(running_result),
7806 binop(Iop_AndV128,
7807 mkexpr(oor_values),
7808 unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
7809 return result;
7813 /* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
7814 an op which takes two I64s and produces a V128. That is, a widening
7815 operator. Generate IR which applies |opI64x2toV128| to either the
7816 lower (if |is2| is False) or upper (if |is2| is True) halves of
7817 |argL| and |argR|, and return the value in a new IRTemp.
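   For example, with |opI64x2toV128| == Iop_Mull32Sx2, is2 == False
   multiplies the lower 2x32 lanes (the "smull" form) and is2 == True
   multiplies the upper 2x32 lanes (the "smull2" form).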
7819 static
7820 IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
7821 IRExpr* argL, IRExpr* argR )
7823 IRTemp res = newTempV128();
7824 IROp slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
7825 assign(res, binop(opI64x2toV128, unop(slice, argL),
7826 unop(slice, argR)));
7827 return res;
7831 /* Generate signed/unsigned absolute difference vector IR. */
7832 static
7833 IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
7835 vassert(size <= 3);
7836 IRTemp argL = newTempV128();
7837 IRTemp argR = newTempV128();
7838 IRTemp msk = newTempV128();
7839 IRTemp res = newTempV128();
7840 assign(argL, argLE);
7841 assign(argR, argRE);
7842 assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
7843 mkexpr(argL), mkexpr(argR)));
7844 assign(res,
7845 binop(Iop_OrV128,
7846 binop(Iop_AndV128,
7847 binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
7848 mkexpr(msk)),
7849 binop(Iop_AndV128,
7850 binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
7851 unop(Iop_NotV128, mkexpr(msk)))));
7852 return res;
7856 /* Generate IR that takes a V128 and sign- or zero-widens
7857 either the lower or upper set of lanes to twice-as-wide,
7858 resulting in a new V128 value. */
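/* For example, zero-widening the lower 32x2 lanes of [s3 s2 s1 s0]:
   InterleaveLO32x4(src, src) gives [s1 s1 s0 s0], and ShrN64x2 by 32
   then gives s1 and s0 each zero-extended to 64 bits. */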
7859 static
7860 IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
7861 UInt sizeNarrow, IRExpr* srcE )
7863 IRTemp src = newTempV128();
7864 IRTemp res = newTempV128();
7865 assign(src, srcE);
7866 switch (sizeNarrow) {
7867 case X10:
7868 assign(res,
7869 binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
7870 binop(fromUpperHalf ? Iop_InterleaveHI32x4
7871 : Iop_InterleaveLO32x4,
7872 mkexpr(src),
7873 mkexpr(src)),
7874 mkU8(32)));
7875 break;
7876 case X01:
7877 assign(res,
7878 binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
7879 binop(fromUpperHalf ? Iop_InterleaveHI16x8
7880 : Iop_InterleaveLO16x8,
7881 mkexpr(src),
7882 mkexpr(src)),
7883 mkU8(16)));
7884 break;
7885 case X00:
7886 assign(res,
7887 binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
7888 binop(fromUpperHalf ? Iop_InterleaveHI8x16
7889 : Iop_InterleaveLO8x16,
7890 mkexpr(src),
7891 mkexpr(src)),
7892 mkU8(8)));
7893 break;
7894 default:
7895 vassert(0);
7897 return res;
7901 /* Generate IR that takes a V128 and sign- or zero-widens
7902 either the even or odd lanes to twice-as-wide,
7903 resulting in a new V128 value. */
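/* For example, sign-widening the even 16-bit lanes: shift each 32-bit
   pair left by 16 (moving the even lane to the top), then arithmetic
   shift right by 16, leaving that lane sign-extended to 32 bits; for
   the odd lanes only the right shift is needed. */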
7904 static
7905 IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
7906 UInt sizeNarrow, IRExpr* srcE )
7908 IRTemp src = newTempV128();
7909 IRTemp res = newTempV128();
7910 IROp opSAR = mkVecSARN(sizeNarrow+1);
7911 IROp opSHR = mkVecSHRN(sizeNarrow+1);
7912 IROp opSHL = mkVecSHLN(sizeNarrow+1);
7913 IROp opSxR = zWiden ? opSHR : opSAR;
7914 UInt amt = 0;
7915 switch (sizeNarrow) {
7916 case X10: amt = 32; break;
7917 case X01: amt = 16; break;
7918 case X00: amt = 8; break;
7919 default: vassert(0);
7921 assign(src, srcE);
7922 if (fromOdd) {
7923 assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
7924 } else {
7925 assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
7926 mkU8(amt)));
7928 return res;
7932 /* Generate IR that takes two V128s and narrows (takes lower half)
7933 of each lane, producing a single V128 value. */
7934 static
7935 IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
7937 IRTemp res = newTempV128();
7938 assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
7939 mkexpr(argHi), mkexpr(argLo)));
7940 return res;
7944 /* Return a temp which holds the vector dup of the lane of width
7945 (1 << size) obtained from src[laneNo]. */
7946 static
7947 IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
7949 vassert(size <= 3);
7950 /* Normalise |laneNo| so it is of the form
7951 x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
7952 This puts the bits we want to inspect at constant offsets
7953 regardless of the value of |size|.
7955 UInt ix = laneNo << size;
7956 vassert(ix <= 15);
7957 IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
7958 switch (size) {
7959 case 0: /* B */
7960 ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
7961 /* fallthrough */
7962 case 1: /* H */
7963 ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
7964 /* fallthrough */
7965 case 2: /* S */
7966 ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
7967 /* fallthrough */
7968 case 3: /* D */
7969 ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
7970 break;
7971 default:
7972 vassert(0);
7974 IRTemp res = newTempV128();
7975 assign(res, src);
7976 Int i;
7977 for (i = 3; i >= 0; i--) {
7978 if (ops[i] == Iop_INVALID)
7979 break;
7980 IRTemp tmp = newTempV128();
7981 assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
7982 res = tmp;
7984 return res;
7988 /* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
7989 selector encoded as shown below. Return a new V128 holding the
7990 selected lane from |srcV| dup'd out to V128, and also return the
7991 lane number, log2 of the lane size in bytes, and width-character via
7992 *laneNo, *laneSzLg2 and *laneCh respectively. It may be that imm5
7993 is an invalid selector, in which case return
7994 IRTemp_INVALID, 0, 0 and '?' respectively.
7996 imm5 = xxxx1 signifies .b[xxxx]
7997 = xxx10 .h[xxx]
7998 = xx100 .s[xx]
7999 = x1000 .d[x]
8000 otherwise invalid
8002 static
8003 IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
8004 /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
8005 IRExpr* srcV, UInt imm5 )
8007 *laneNo = 0;
8008 *laneSzLg2 = 0;
8009 *laneCh = '?';
8011 if (imm5 & 1) {
8012 *laneNo = (imm5 >> 1) & 15;
8013 *laneSzLg2 = 0;
8014 *laneCh = 'b';
8016 else if (imm5 & 2) {
8017 *laneNo = (imm5 >> 2) & 7;
8018 *laneSzLg2 = 1;
8019 *laneCh = 'h';
8021 else if (imm5 & 4) {
8022 *laneNo = (imm5 >> 3) & 3;
8023 *laneSzLg2 = 2;
8024 *laneCh = 's';
8026 else if (imm5 & 8) {
8027 *laneNo = (imm5 >> 4) & 1;
8028 *laneSzLg2 = 3;
8029 *laneCh = 'd';
8031 else {
8032 /* invalid */
8033 return IRTemp_INVALID;
8036 return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
8040 /* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
8041 static
8042 IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
8044 IRType ty = Ity_INVALID;
8045 IRTemp rcS = IRTemp_INVALID;
8046 switch (size) {
8047 case X01:
8048 vassert(imm <= 0xFFFFULL);
8049 ty = Ity_I16;
8050 rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
8051 break;
8052 case X10:
8053 vassert(imm <= 0xFFFFFFFFULL);
8054 ty = Ity_I32;
8055 rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
8056 break;
8057 case X11:
8058 ty = Ity_I64;
8059 rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
8060 default:
8061 vassert(0);
8063 IRTemp rcV = math_DUP_TO_V128(rcS, ty);
8064 return rcV;
8068 /* Let |new64| be a V128 in which only the lower 64 bits are interesting,
8069 and the upper can contain any value -- it is ignored. If |is2| is False,
8070 generate IR to put |new64| in the lower half of vector reg |dd| and zero
8071 the upper half. If |is2| is True, generate IR to put |new64| in the upper
8072 half of vector reg |dd| and leave the lower half unchanged. This
8073 simulates the behaviour of the "foo/foo2" instructions in which the
8074 destination is half the width of sources, for example addhn/addhn2.
8076 static
8077 void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
8079 if (is2) {
8080 /* Get the old contents of Vdd, zero its upper half, and OR in
8081 |new64| as the new upper half. */
8082 IRTemp t_zero_oldLO = newTempV128();
8083 assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
8084 IRTemp t_newHI_zero = newTempV128();
8085 assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
8086 mkV128(0x0000)));
8087 IRTemp res = newTempV128();
8088 assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
8089 mkexpr(t_newHI_zero)));
8090 putQReg128(dd, mkexpr(res));
8091 } else {
8092 /* This is simple. */
8093 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
8098 /* Compute vector SQABS at lane size |size| for |srcE|, returning
8099 the q result in |*qabs| and the normal result in |*nabs|. */
8100 static
8101 void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
8102 IRExpr* srcE, UInt size )
8104 IRTemp src, mask, maskn, nsub, qsub;
8105 src = mask = maskn = nsub = qsub = IRTemp_INVALID;
8106 newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
8107 assign(src, srcE);
8108 assign(mask, binop(mkVecCMPGTS(size), mkV128(0x0000), mkexpr(src)));
8109 assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
8110 assign(nsub, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
8111 assign(qsub, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
8112 assign(*nabs, binop(Iop_OrV128,
8113 binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
8114 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
8115 assign(*qabs, binop(Iop_OrV128,
8116 binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
8117 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
8121 /* Compute vector SQNEG at lane size |size| for |srcE|, returning
8122 the q result in |*qneg| and the normal result in |*nneg|. */
8123 static
8124 void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
8125 IRExpr* srcE, UInt size )
8127 IRTemp src = IRTemp_INVALID;
8128 newTempsV128_3(&src, nneg, qneg);
8129 assign(src, srcE);
8130 assign(*nneg, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
8131 assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
8135 /* Zero all except the least significant lane of |srcE|, where |size|
8136 indicates the lane size in the usual way. */
8137 static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
8139 vassert(size < 4);
8140 IRTemp t = newTempV128();
8141 assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
8142 return t;
8146 /* Generate IR to compute vector widening MULL from either the lower
8147 (is2==False) or upper (is2==True) halves of vecN and vecM. The
8148 widening multiplies are unsigned when isU==True and signed when
8149 isU==False. |size| is the narrow lane size indication. Optionally,
8150 the product may be added to or subtracted from vecD, at the wide lane
8151 size. This happens when |mas| is 'a' (add) or 's' (sub). When |mas|
8152 is 'm' (only multiply) then the accumulate part does not happen, and
8153 |vecD| is expected to == IRTemp_INVALID.
8155 Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
8156 are allowed. The result is returned in a new IRTemp, which is
8157 returned in *res. */
8158 static
8159 void math_MULL_ACC ( /*OUT*/IRTemp* res,
8160 Bool is2, Bool isU, UInt size, HChar mas,
8161 IRTemp vecN, IRTemp vecM, IRTemp vecD )
8163 vassert(res && *res == IRTemp_INVALID);
8164 vassert(size <= 2);
8165 vassert(mas == 'm' || mas == 'a' || mas == 's');
8166 if (mas == 'm') vassert(vecD == IRTemp_INVALID);
8167 IROp mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
8168 IROp accOp = (mas == 'a') ? mkVecADD(size+1)
8169 : (mas == 's' ? mkVecSUB(size+1)
8170 : Iop_INVALID);
8171 IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
8172 mkexpr(vecN), mkexpr(vecM));
8173 *res = newTempV128();
8174 assign(*res, mas == 'm' ? mkexpr(mul)
8175 : binop(accOp, mkexpr(vecD), mkexpr(mul)));
8179 /* Same as math_MULL_ACC, except the multiply is signed widening,
8180 the multiplied value is then doubled, before being added to or
8181 subtracted from the accumulated value. And everything is
8182 saturated. In all cases, saturation residuals are returned
8183 via (sat1q, sat1n), and in the accumulate cases,
8184 via (sat2q, sat2n) too. All results are returned in new temporaries.
8185 In the no-accumulate case, *sat2q and *sat2n are never instantiated,
8186 so the caller can tell this has happened. */
8187 static
8188 void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
8189 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8190 /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
8191 Bool is2, UInt size, HChar mas,
8192 IRTemp vecN, IRTemp vecM, IRTemp vecD )
8194 vassert(size <= 2);
8195 vassert(mas == 'm' || mas == 'a' || mas == 's');
8196 /* Compute
8197 sat1q = vecN.D[is2] *sq vecM.d[is2] *q 2
8198 sat1n = vecN.D[is2] *s vecM.d[is2] * 2
8199 IOW take either the low or high halves of vecN and vecM, signed widen,
8200 multiply, double that, and signedly saturate. Also compute the same
8201 but without saturation.
8203 vassert(sat2q && *sat2q == IRTemp_INVALID);
8204 vassert(sat2n && *sat2n == IRTemp_INVALID);
8205 newTempsV128_3(sat1q, sat1n, res);
8206 IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
8207 mkexpr(vecN), mkexpr(vecM));
8208 IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
8209 mkexpr(vecN), mkexpr(vecM));
8210 assign(*sat1q, mkexpr(tq));
8211 assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
8213 /* If there is no accumulation, the final result is sat1q,
8214 and there's no assignment to sat2q or sat2n. */
8215 if (mas == 'm') {
8216 assign(*res, mkexpr(*sat1q));
8217 return;
8220 /* Compute
8221 sat2q = vecD +sq/-sq sat1q
8222 sat2n = vecD +/- sat1n
8223 result = sat2q
8225 newTempsV128_2(sat2q, sat2n);
8226 assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
8227 mkexpr(vecD), mkexpr(*sat1q)));
8228 assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
8229 mkexpr(vecD), mkexpr(*sat1n)));
8230 assign(*res, mkexpr(*sat2q));
8234 /* Generate IR for widening signed vector multiplies. The operands
8235 have their lane width signedly widened, and they are then multiplied
8236 at the wider width, returning results in two new IRTemps. */
8237 static
8238 void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
8239 UInt sizeNarrow, IRTemp argL, IRTemp argR )
8241 vassert(sizeNarrow <= 2);
8242 newTempsV128_2(resHI, resLO);
8243 IRTemp argLhi = newTemp(Ity_I64);
8244 IRTemp argLlo = newTemp(Ity_I64);
8245 IRTemp argRhi = newTemp(Ity_I64);
8246 IRTemp argRlo = newTemp(Ity_I64);
8247 assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
8248 assign(argLlo, unop(Iop_V128to64, mkexpr(argL)));
8249 assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
8250 assign(argRlo, unop(Iop_V128to64, mkexpr(argR)));
8251 IROp opMulls = mkVecMULLS(sizeNarrow);
8252 assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
8253 assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
8257 /* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
8258 double that, possibly add a rounding constant (R variants), and take
8259 the high half. */
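/* A scalar illustration at the 16-bit lane size: for inputs 0x4000
   (+0.5 in Q15), 2 * 0x4000 * 0x4000 = 0x20000000, whose high half is
   0x2000 (+0.25).  SQRDMULH also adds the rounding constant 1 << 15
   before the high half is taken.  The only input pair that saturates
   is 0x8000 * 0x8000, which produces 0x7FFF. */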
8260 static
8261 void math_SQDMULH ( /*OUT*/IRTemp* res,
8262 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
8263 Bool isR, UInt size, IRTemp vN, IRTemp vM )
8265 vassert(size == X01 || size == X10); /* s or h only */
8267 newTempsV128_3(res, sat1q, sat1n);
8269 IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
8270 math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
8272 IRTemp addWide = mkVecADD(size+1);
8274 if (isR) {
8275 assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8277 Int rcShift = size == X01 ? 15 : 31;
8278 IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
8279 assign(*sat1n,
8280 binop(mkVecCATODDLANES(size),
8281 binop(addWide,
8282 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8283 mkexpr(roundConst)),
8284 binop(addWide,
8285 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
8286 mkexpr(roundConst))));
8287 } else {
8288 assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
8290 assign(*sat1n,
8291 binop(mkVecCATODDLANES(size),
8292 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
8293 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
8296 assign(*res, mkexpr(*sat1q));
8300 /* Generate IR for SQSHL, UQSHL, SQSHLU by imm. Put the result in
8301 a new temp in *res, and the Q difference pair in new temps in
8302 *qDiff1 and *qDiff2 respectively. |nm| denotes which of the
8303 three operations it is. */
8304 static
8305 void math_QSHL_IMM ( /*OUT*/IRTemp* res,
8306 /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2,
8307 IRTemp src, UInt size, UInt shift, const HChar* nm )
8309 vassert(size <= 3);
8310 UInt laneBits = 8 << size;
8311 vassert(shift < laneBits);
8312 newTempsV128_3(res, qDiff1, qDiff2);
8313 IRTemp z128 = newTempV128();
8314 assign(z128, mkV128(0x0000));
8316 /* UQSHL */
8317 if (vex_streq(nm, "uqshl")) {
8318 IROp qop = mkVecQSHLNSATUU(size);
8319 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8320 if (shift == 0) {
8321 /* No shift means no saturation. */
8322 assign(*qDiff1, mkexpr(z128));
8323 assign(*qDiff2, mkexpr(z128));
8324 } else {
8325 /* Saturation has occurred if any of the shifted-out bits are
8326 nonzero. We get the shifted-out bits by right-shifting the
8327 original value. */
8328 UInt rshift = laneBits - shift;
8329 vassert(rshift >= 1 && rshift < laneBits);
8330 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8331 assign(*qDiff2, mkexpr(z128));
8333 return;
8336 /* SQSHL */
8337 if (vex_streq(nm, "sqshl")) {
8338 IROp qop = mkVecQSHLNSATSS(size);
8339 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8340 if (shift == 0) {
8341 /* No shift means no saturation. */
8342 assign(*qDiff1, mkexpr(z128));
8343 assign(*qDiff2, mkexpr(z128));
8344 } else {
8345 /* Saturation has occurred if any of the shifted-out bits are
8346 different from the top bit of the original value. */
8347 UInt rshift = laneBits - 1 - shift;
8348 vassert(rshift >= 0 && rshift < laneBits-1);
8349 /* qDiff1 is the shifted out bits, and the top bit of the original
8350 value, preceded by zeroes. */
8351 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8352 /* qDiff2 is the top bit of the original value, cloned the
8353 correct number of times. */
8354 assign(*qDiff2, binop(mkVecSHRN(size),
8355 binop(mkVecSARN(size), mkexpr(src),
8356 mkU8(laneBits-1)),
8357 mkU8(rshift)));
8358 /* This also succeeds in comparing the top bit of the original
8359 value to itself, which is a bit stupid, but not wrong. */
8361 return;
8364 /* SQSHLU */
8365 if (vex_streq(nm, "sqshlu")) {
8366 IROp qop = mkVecQSHLNSATSU(size);
8367 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
8368 if (shift == 0) {
8369 /* If there's no shift, saturation depends on the top bit
8370 of the source. */
8371 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(laneBits-1)));
8372 assign(*qDiff2, mkexpr(z128));
8373 } else {
8374 /* Saturation has occurred if any of the shifted-out bits are
8375 nonzero. We get the shifted-out bits by right-shifting the
8376 original value. */
8377 UInt rshift = laneBits - shift;
8378 vassert(rshift >= 1 && rshift < laneBits);
8379 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
8380 assign(*qDiff2, mkexpr(z128));
8382 return;
8385 vassert(0);
8389 /* Generate IR to do SRHADD and URHADD. */
8390 static
8391 IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
8393 /* Generate this:
8394 (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
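      This equals (A + B + 1) >> 1 but cannot overflow the lane: for
      example, in unsigned 8-bit lanes with A = 255 and B = 254 it
      gives 127 + 127 + ((1 + 0 + 1) >> 1) = 255, whereas computing
      A + B + 1 directly would wrap.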
8396 vassert(size <= 3);
8397 IROp opSHR = isU ? mkVecSHRN(size) : mkVecSARN(size);
8398 IROp opADD = mkVecADD(size);
8399 /* The only tricky bit is to generate the correct vector 1 constant. */
8400 const ULong ones64[4]
8401 = { 0x0101010101010101ULL, 0x0001000100010001ULL,
8402 0x0000000100000001ULL, 0x0000000000000001ULL };
8403 IRTemp imm64 = newTemp(Ity_I64);
8404 assign(imm64, mkU64(ones64[size]));
8405 IRTemp vecOne = newTempV128();
8406 assign(vecOne, binop(Iop_64HLtoV128, mkexpr(imm64), mkexpr(imm64)));
8407 IRTemp scaOne = newTemp(Ity_I8);
8408 assign(scaOne, mkU8(1));
8409 IRTemp res = newTempV128();
8410 assign(res,
8411 binop(opADD,
8412 binop(opSHR, mkexpr(aa), mkexpr(scaOne)),
8413 binop(opADD,
8414 binop(opSHR, mkexpr(bb), mkexpr(scaOne)),
8415 binop(opSHR,
8416 binop(opADD,
8417 binop(opADD,
8418 binop(Iop_AndV128, mkexpr(aa),
8419 mkexpr(vecOne)),
8420 binop(Iop_AndV128, mkexpr(bb),
8421 mkexpr(vecOne))
8423 mkexpr(vecOne)
8425 mkexpr(scaOne)
8430 return res;
8434 /* QCFLAG tracks the SIMD sticky saturation status. Update the status
8435 thusly: if, after application of |opZHI| to both |qres| and |nres|,
8436 they have the same value, leave QCFLAG unchanged. Otherwise, set it
8437 (implicitly) to 1. |opZHI| may only be one of the Iop_ZeroHIxxofV128
8438 operators, or Iop_INVALID, in which case |qres| and |nres| are used
8439 unmodified. The presence of |opZHI| means this function can be used to
8440 generate QCFLAG update code for both scalar and vector SIMD operations.
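   In effect QCFLAG is ORed with (qres ^ nres), restricted to the lanes
   selected by |opZHI|, so it becomes and stays nonzero as soon as any
   saturating result differs from its non-saturating counterpart.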
8442 static
8443 void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
8445 IRTemp diff = newTempV128();
8446 IRTemp oldQCFLAG = newTempV128();
8447 IRTemp newQCFLAG = newTempV128();
8448 if (opZHI == Iop_INVALID) {
8449 assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
8450 } else {
8451 vassert(opZHI == Iop_ZeroHI64ofV128
8452 || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
8453 assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
8455 assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
8456 assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
8457 stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
8461 /* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
8462 are used unmodified, hence suitable for QCFLAG updates for whole-vector
8463 operations. */
8464 static
8465 void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
8467 updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
8471 /* Generate IR to rearrange two vector values in a way which is useful
8472 for doing S/D add-pair etc operations. There are 3 cases:
8474 2d: [m1 m0] [n1 n0] --> [m1 n1] [m0 n0]
8476 4s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [m3 m1 n3 n1] [m2 m0 n2 n0]
8478 2s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [0 0 m1 n1] [0 0 m0 n0]
8480 The cases are distinguished as follows:
8481 isD == True, bitQ == 1 => 2d
8482 isD == False, bitQ == 1 => 4s
8483 isD == False, bitQ == 0 => 2s
8485 static
8486 void math_REARRANGE_FOR_FLOATING_PAIRWISE (
8487 /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
8488 IRTemp vecM, IRTemp vecN, Bool isD, UInt bitQ
8491 vassert(rearrL && *rearrL == IRTemp_INVALID);
8492 vassert(rearrR && *rearrR == IRTemp_INVALID);
8493 *rearrL = newTempV128();
8494 *rearrR = newTempV128();
8495 if (isD) {
8496 // 2d case
8497 vassert(bitQ == 1);
8498 assign(*rearrL, binop(Iop_InterleaveHI64x2, mkexpr(vecM), mkexpr(vecN)));
8499 assign(*rearrR, binop(Iop_InterleaveLO64x2, mkexpr(vecM), mkexpr(vecN)));
8501 else if (!isD && bitQ == 1) {
8502 // 4s case
8503 assign(*rearrL, binop(Iop_CatOddLanes32x4, mkexpr(vecM), mkexpr(vecN)));
8504 assign(*rearrR, binop(Iop_CatEvenLanes32x4, mkexpr(vecM), mkexpr(vecN)));
8505 } else {
8506 // 2s case
8507 vassert(!isD && bitQ == 0);
8508 IRTemp m1n1m0n0 = newTempV128();
8509 IRTemp m0n0m1n1 = newTempV128();
8510 assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
8511 mkexpr(vecM), mkexpr(vecN)));
8512 assign(m0n0m1n1, triop(Iop_SliceV128,
8513 mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8)));
8514 assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0)));
8515 assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1)));
8520 /* Returns 2.0 ^ (-n) for n in 1 .. 64 */
8521 static Double two_to_the_minus ( Int n )
8523 if (n == 1) return 0.5;
8524 vassert(n >= 2 && n <= 64);
8525 Int half = n / 2;
8526 return two_to_the_minus(half) * two_to_the_minus(n - half);
8530 /* Returns 2.0 ^ n for n in 1 .. 64 */
8531 static Double two_to_the_plus ( Int n )
8533 if (n == 1) return 2.0;
8534 vassert(n >= 2 && n <= 64);
8535 Int half = n / 2;
8536 return two_to_the_plus(half) * two_to_the_plus(n - half);
8540 /*------------------------------------------------------------*/
8541 /*--- SIMD and FP instructions ---*/
8542 /*------------------------------------------------------------*/
8544 static
8545 Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
8547 /* 31 29 23 21 20 15 14 10 9 4
8548 0 q 101110 op2 0 m 0 imm4 0 n d
8549 Decode fields: op2
8551 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8552 if (INSN(31,31) != 0
8553 || INSN(29,24) != BITS6(1,0,1,1,1,0)
8554 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
8555 return False;
8557 UInt bitQ = INSN(30,30);
8558 UInt op2 = INSN(23,22);
8559 UInt mm = INSN(20,16);
8560 UInt imm4 = INSN(14,11);
8561 UInt nn = INSN(9,5);
8562 UInt dd = INSN(4,0);
8564 if (op2 == BITS2(0,0)) {
8565 /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
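      /* EXT concatenates Vm:Vn (Vm as the high half) and extracts 16
         bytes starting at byte |imm4| of Vn; e.g. imm4 == 3 yields
         bytes m2 m1 m0 n15 .. n3.  Iop_SliceV128(hi, lo, n) performs
         exactly this extraction at byte offset n. */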
8566 IRTemp sHi = newTempV128();
8567 IRTemp sLo = newTempV128();
8568 IRTemp res = newTempV128();
8569 assign(sHi, getQReg128(mm));
8570 assign(sLo, getQReg128(nn));
8571 if (bitQ == 1) {
8572 if (imm4 == 0) {
8573 assign(res, mkexpr(sLo));
8574 } else {
8575 vassert(imm4 >= 1 && imm4 <= 15);
8576 assign(res, triop(Iop_SliceV128,
8577 mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
8579 putQReg128(dd, mkexpr(res));
8580 DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
8581 } else {
8582 if (imm4 >= 8) return False;
8583 if (imm4 == 0) {
8584 assign(res, mkexpr(sLo));
8585 } else {
8586 vassert(imm4 >= 1 && imm4 <= 7);
8587 IRTemp hi64lo64 = newTempV128();
8588 assign(hi64lo64, binop(Iop_InterleaveLO64x2,
8589 mkexpr(sHi), mkexpr(sLo)));
8590 assign(res, triop(Iop_SliceV128,
8591 mkexpr(hi64lo64), mkexpr(hi64lo64), mkU8(imm4)));
8593 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
8594 DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
8596 return True;
8599 return False;
8600 # undef INSN
8604 static
8605 Bool dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult* dres, UInt insn)
8607 /* 31 29 23 21 20 15 14 12 11 9 4
8608 0 q 001110 op2 0 m 0 len op 00 n d
8609 Decode fields: op2,len,op
8611 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8612 if (INSN(31,31) != 0
8613 || INSN(29,24) != BITS6(0,0,1,1,1,0)
8614 || INSN(21,21) != 0
8615 || INSN(15,15) != 0
8616 || INSN(11,10) != BITS2(0,0)) {
8617 return False;
8619 UInt bitQ = INSN(30,30);
8620 UInt op2 = INSN(23,22);
8621 UInt mm = INSN(20,16);
8622 UInt len = INSN(14,13);
8623 UInt bitOP = INSN(12,12);
8624 UInt nn = INSN(9,5);
8625 UInt dd = INSN(4,0);
8627 if (op2 == X00) {
8628 /* -------- 00,xx,0 TBL, xx register table -------- */
8629 /* -------- 00,xx,1 TBX, xx register table -------- */
8630 /* 31 28 20 15 14 12 9 4
8631 0q0 01110 000 m 0 len 000 n d TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
8632 0q0 01110 000 m 0 len 100 n d TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
8633 where Ta = 16b(q=1) or 8b(q=0)
8635 Bool isTBX = bitOP == 1;
8636 /* The out-of-range values to use. */
8637 IRTemp oor_values = newTempV128();
8638 assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
8639 /* src value */
8640 IRTemp src = newTempV128();
8641 assign(src, getQReg128(mm));
8642 /* The table values */
8643 IRTemp tab[4];
8644 UInt i;
8645 for (i = 0; i <= len; i++) {
8646 vassert(i < 4);
8647 tab[i] = newTempV128();
8648 assign(tab[i], getQReg128((nn + i) % 32));
8650 IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
8651 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8652 const HChar* Ta = bitQ == 1 ? "16b" : "8b";
8653 const HChar* nm = isTBX ? "tbx" : "tbl";
8654 DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
8655 nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
8656 return True;
8659 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8660 return False;
8661 # undef INSN
8665 static
8666 Bool dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult* dres, UInt insn)
8668 /* 31 29 23 21 20 15 14 11 9 4
8669 0 q 001110 size 0 m 0 opcode 10 n d
8670 Decode fields: opcode
8672 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8673 if (INSN(31,31) != 0
8674 || INSN(29,24) != BITS6(0,0,1,1,1,0)
8675 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(11,10) != BITS2(1,0)) {
8676 return False;
8678 UInt bitQ = INSN(30,30);
8679 UInt size = INSN(23,22);
8680 UInt mm = INSN(20,16);
8681 UInt opcode = INSN(14,12);
8682 UInt nn = INSN(9,5);
8683 UInt dd = INSN(4,0);
8685 if (opcode == BITS3(0,0,1) || opcode == BITS3(1,0,1)) {
8686 /* -------- 001 UZP1 std7_std7_std7 -------- */
8687 /* -------- 101 UZP2 std7_std7_std7 -------- */
8688 if (bitQ == 0 && size == X11) return False; // implied 1d case
8689 Bool isUZP1 = opcode == BITS3(0,0,1);
8690 IROp op = isUZP1 ? mkVecCATEVENLANES(size)
8691 : mkVecCATODDLANES(size);
8692 IRTemp preL = newTempV128();
8693 IRTemp preR = newTempV128();
8694 IRTemp res = newTempV128();
8695 if (bitQ == 0) {
8696 assign(preL, binop(Iop_InterleaveLO64x2, getQReg128(mm),
8697 getQReg128(nn)));
8698 assign(preR, mkexpr(preL));
8699 } else {
8700 assign(preL, getQReg128(mm));
8701 assign(preR, getQReg128(nn));
8703 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
8704 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8705 const HChar* nm = isUZP1 ? "uzp1" : "uzp2";
8706 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8707 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8708 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8709 return True;
8712 if (opcode == BITS3(0,1,0) || opcode == BITS3(1,1,0)) {
8713 /* -------- 010 TRN1 std7_std7_std7 -------- */
8714 /* -------- 110 TRN2 std7_std7_std7 -------- */
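      /* For the 4s arrangement, TRN1 produces [m2 n2 m0 n0] and TRN2
         produces [m3 n3 m1 n1] (lanes 3..0); below this is built as
         INTERLEAVEHI(CatEven/OddLanes(m,m), CatEven/OddLanes(n,n)). */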
8715 if (bitQ == 0 && size == X11) return False; // implied 1d case
8716 Bool isTRN1 = opcode == BITS3(0,1,0);
8717 IROp op1 = isTRN1 ? mkVecCATEVENLANES(size)
8718 : mkVecCATODDLANES(size);
8719 IROp op2 = mkVecINTERLEAVEHI(size);
8720 IRTemp srcM = newTempV128();
8721 IRTemp srcN = newTempV128();
8722 IRTemp res = newTempV128();
8723 assign(srcM, getQReg128(mm));
8724 assign(srcN, getQReg128(nn));
8725 assign(res, binop(op2, binop(op1, mkexpr(srcM), mkexpr(srcM)),
8726 binop(op1, mkexpr(srcN), mkexpr(srcN))));
8727 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8728 const HChar* nm = isTRN1 ? "trn1" : "trn2";
8729 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8730 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8731 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8732 return True;
8735 if (opcode == BITS3(0,1,1) || opcode == BITS3(1,1,1)) {
8736 /* -------- 011 ZIP1 std7_std7_std7 -------- */
8737 /* -------- 111 ZIP2 std7_std7_std7 -------- */
8738 if (bitQ == 0 && size == X11) return False; // implied 1d case
8739 Bool isZIP1 = opcode == BITS3(0,1,1);
8740 IROp op = isZIP1 ? mkVecINTERLEAVELO(size)
8741 : mkVecINTERLEAVEHI(size);
8742 IRTemp preL = newTempV128();
8743 IRTemp preR = newTempV128();
8744 IRTemp res = newTempV128();
8745 if (bitQ == 0 && !isZIP1) {
8746 IRTemp z128 = newTempV128();
8747 assign(z128, mkV128(0x0000));
8748 // preL = Vm shifted left 32 bits
8749 // preR = Vn shifted left 32 bits
8750 assign(preL, triop(Iop_SliceV128,
8751 getQReg128(mm), mkexpr(z128), mkU8(12)));
8752 assign(preR, triop(Iop_SliceV128,
8753 getQReg128(nn), mkexpr(z128), mkU8(12)));
8755 } else {
8756 assign(preL, getQReg128(mm));
8757 assign(preR, getQReg128(nn));
8759 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
8760 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8761 const HChar* nm = isZIP1 ? "zip1" : "zip2";
8762 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8763 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
8764 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
8765 return True;
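/* Illustrative note, not part of the decoder: reference semantics of
   ZIP1/ZIP2 for 32-bit lanes, as a plain-C sketch (ref_zip_32 is an
   invented name; 'hi' is 0 for zip1 and 1 for zip2).

      static void ref_zip_32 ( UInt* d, const UInt* n, const UInt* m,
                               UInt lanes, UInt hi )
      {
         UInt base = hi ? lanes/2 : 0;
         for (UInt i = 0; i < lanes/2; i++) {
            d[2*i]   = n[base + i];
            d[2*i+1] = m[base + i];
         }
      }

   The Q=0 zip2 special case above works because shifting each 64-bit
   operand left by 32 bits moves its upper 32 bits (the half zip2 needs)
   into the low part of the register's upper 64-bit half, so the generic
   128-bit INTERLEAVEHI then delivers the 64-bit zip2 result in its low
   64 bits. */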
8768 return False;
8769 # undef INSN
8773 static
8774 Bool dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult* dres, UInt insn)
8776 /* 31 28 23 21 16 11 9 4
8777 0 q u 01110 size 11000 opcode 10 n d
8778 Decode fields: u,size,opcode
8780 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8781 if (INSN(31,31) != 0
8782 || INSN(28,24) != BITS5(0,1,1,1,0)
8783 || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
8784 return False;
8786 UInt bitQ = INSN(30,30);
8787 UInt bitU = INSN(29,29);
8788 UInt size = INSN(23,22);
8789 UInt opcode = INSN(16,12);
8790 UInt nn = INSN(9,5);
8791 UInt dd = INSN(4,0);
8793 if (opcode == BITS5(0,0,0,1,1)) {
8794 /* -------- 0,xx,00011 SADDLV -------- */
8795 /* -------- 1,xx,00011 UADDLV -------- */
8796 /* size is the narrow size */
8797 if (size == X11 || (size == X10 && bitQ == 0)) return False;
8798 Bool isU = bitU == 1;
8799 IRTemp src = newTempV128();
8800 assign(src, getQReg128(nn));
8801 /* The basic plan is to widen the lower half, and if Q = 1,
8802 the upper half too. Add them together (if Q = 1), and in
8803 either case fold with add at twice the lane width.
8805 IRExpr* widened
8806 = mkexpr(math_WIDEN_LO_OR_HI_LANES(
8807 isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
8808 if (bitQ == 1) {
8809 widened
8810 = binop(mkVecADD(size+1),
8811 widened,
8812 mkexpr(math_WIDEN_LO_OR_HI_LANES(
8813 isU, True/*fromUpperHalf*/, size, mkexpr(src)))
8816 /* Now fold. */
8817 IRTemp tWi = newTempV128();
8818 assign(tWi, widened);
8819 IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
8820 putQReg128(dd, mkexpr(res));
8821 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8822 const HChar ch = "hsd"[size]; /* dest lane is the widened size */
8823 DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
8824 nameQReg128(dd), ch, nameQReg128(nn), arr);
8825 return True;
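/* Worked example, illustrative only: uaddlv h0, v1.8b with
   v1.8b = {1,2,3,4,5,6,7,8}.  The low 8 bytes are widened to eight 16-bit
   lanes {1..8} (Q=0, so there is no upper half to add in), and math_FOLDV
   then sums all eight lanes with 16-bit adds: 1+2+...+8 = 36.
   Architecturally h0 = 36 and the rest of the destination register is
   cleared. */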
8828 UInt ix = 0;
8829 /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
8830 else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
8831 else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
8832 /**/
8833 if (ix != 0) {
8834 /* -------- 0,xx,01010: SMAXV -------- (1) */
8835 /* -------- 1,xx,01010: UMAXV -------- (2) */
8836 /* -------- 0,xx,11010: SMINV -------- (3) */
8837 /* -------- 1,xx,11010: UMINV -------- (4) */
8838 /* -------- 0,xx,11011: ADDV -------- (5) */
8839 vassert(ix >= 1 && ix <= 5);
8840 if (size == X11) return False; // 1d,2d cases not allowed
8841 if (size == X10 && bitQ == 0) return False; // 2s case not allowed
8842 const IROp opMAXS[3]
8843 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
8844 const IROp opMAXU[3]
8845 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
8846 const IROp opMINS[3]
8847 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
8848 const IROp opMINU[3]
8849 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
8850 const IROp opADD[3]
8851 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4 };
8852 vassert(size < 3);
8853 IROp op = Iop_INVALID;
8854 const HChar* nm = NULL;
8855 switch (ix) {
8856 case 1: op = opMAXS[size]; nm = "smaxv"; break;
8857 case 2: op = opMAXU[size]; nm = "umaxv"; break;
8858 case 3: op = opMINS[size]; nm = "sminv"; break;
8859 case 4: op = opMINU[size]; nm = "uminv"; break;
8860 case 5: op = opADD[size]; nm = "addv"; break;
8861 default: vassert(0);
8863 vassert(op != Iop_INVALID && nm != NULL);
8864 IRTemp tN1 = newTempV128();
8865 assign(tN1, getQReg128(nn));
8866 /* If Q == 0, we're just folding lanes in the lower half of
8867 the value. In which case, copy the lower half of the
8868 source into the upper half, so we can then treat it the
8869 same as the full width case. Except for the addition case,
8870 in which we have to zero out the upper half. */
8871 IRTemp tN2 = newTempV128();
8872 assign(tN2, bitQ == 0
8873 ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
8874 : mk_CatEvenLanes64x2(tN1,tN1))
8875 : mkexpr(tN1));
8876 IRTemp res = math_FOLDV(tN2, op);
8877 if (res == IRTemp_INVALID)
8878 return False; /* means math_FOLDV
8879 doesn't handle this case yet */
8880 putQReg128(dd, mkexpr(res));
8881 const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
8882 IRType laneTy = tys[size];
8883 const HChar* arr = nameArr_Q_SZ(bitQ, size);
8884 DIP("%s %s, %s.%s\n", nm,
8885 nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
8886 return True;
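/* Illustrative note, not part of the decoder: the across-lanes fold above
   computes the obvious per-lane reduction; e.g. for UMAXV over byte lanes
   a plain-C reference would be (ref_umaxv is an invented name):

      static UChar ref_umaxv ( const UChar* n, UInt lanes )
      {
         UChar r = n[0];
         for (UInt i = 1; i < lanes; i++)
            if (n[i] > r) r = n[i];
         return r;
      }

   The Q=0 handling works because duplicating the low 64 bits into the high
   64 bits leaves a max/min reduction unchanged, whereas for ADDV the high
   half must be zeroed so that it contributes nothing to the sum. */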
8889 if ((size == X00 || size == X10)
8890 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
8891 /* -------- 0,00,01100: FMAXNMV s_4s -------- */
8892 /* -------- 0,10,01100: FMINNMV s_4s -------- */
8893 /* -------- 1,00,01111: FMAXV s_4s -------- */
8894 /* -------- 1,10,01111: FMINV s_4s -------- */
8895 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
8896 if (bitQ == 0) return False; // Only 4s is allowed
8897 Bool isMIN = (size & 2) == 2;
8898 Bool isNM = opcode == BITS5(0,1,1,0,0);
8899 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
8900 IRTemp src = newTempV128();
8901 assign(src, getQReg128(nn));
8902 IRTemp res = math_FOLDV(src, opMXX);
8903 putQReg128(dd, mkexpr(res));
8904 DIP("%s%sv s%u, v%u.4s\n",
8905 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
8906 return True;
8909 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8910 return False;
8911 # undef INSN
8915 static
8916 Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
8918 /* 31 28 20 15 14 10 9 4
8919 0 q op 01110000 imm5 0 imm4 1 n d
8920 Decode fields: q,op,imm4
8922 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
8923 if (INSN(31,31) != 0
8924 || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
8925 || INSN(15,15) != 0 || INSN(10,10) != 1) {
8926 return False;
8928 UInt bitQ = INSN(30,30);
8929 UInt bitOP = INSN(29,29);
8930 UInt imm5 = INSN(20,16);
8931 UInt imm4 = INSN(14,11);
8932 UInt nn = INSN(9,5);
8933 UInt dd = INSN(4,0);
8935 /* -------- x,0,0000: DUP (element, vector) -------- */
8936 /* 31 28 20 15 9 4
8937 0q0 01110000 imm5 000001 n d DUP Vd.T, Vn.Ts[index]
8939 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
8940 UInt laneNo = 0;
8941 UInt laneSzLg2 = 0;
8942 HChar laneCh = '?';
8943 IRTemp res = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
8944 getQReg128(nn), imm5);
8945 if (res == IRTemp_INVALID)
8946 return False;
8947 if (bitQ == 0 && laneSzLg2 == X11)
8948 return False; /* .1d case */
8949 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
8950 const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
8951 DIP("dup %s.%s, %s.%c[%u]\n",
8952 nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
8953 return True;
8956 /* -------- x,0,0001: DUP (general, vector) -------- */
8957 /* 31 28 20 15 9 4
8958 0q0 01110000 imm5 0 0001 1 n d DUP Vd.T, Rn
8959 Q=0 writes 64, Q=1 writes 128
8960 imm5: xxxx1 8B(q=0) or 16b(q=1), R=W
8961 xxx10 4H(q=0) or 8H(q=1), R=W
8962 xx100 2S(q=0) or 4S(q=1), R=W
8963 x1000 Invalid(q=0) or 2D(q=1), R=X
8964 x0000 Invalid(q=0) or Invalid(q=1)
8965 Require op=0, imm4=0001
8967 if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
8968 Bool isQ = bitQ == 1;
8969 IRTemp w0 = newTemp(Ity_I64);
8970 const HChar* arT = "??";
8971 IRType laneTy = Ity_INVALID;
8972 if (imm5 & 1) {
8973 arT = isQ ? "16b" : "8b";
8974 laneTy = Ity_I8;
8975 assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
8977 else if (imm5 & 2) {
8978 arT = isQ ? "8h" : "4h";
8979 laneTy = Ity_I16;
8980 assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
8982 else if (imm5 & 4) {
8983 arT = isQ ? "4s" : "2s";
8984 laneTy = Ity_I32;
8985 assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
8987 else if ((imm5 & 8) && isQ) {
8988 arT = "2d";
8989 laneTy = Ity_I64;
8990 assign(w0, getIReg64orZR(nn));
8992 else {
8993 /* invalid; leave laneTy unchanged. */
8995 /* */
8996 if (laneTy != Ity_INVALID) {
8997 IRTemp w1 = math_DUP_TO_64(w0, laneTy);
8998 putQReg128(dd, binop(Iop_64HLtoV128,
8999 isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
9000 DIP("dup %s.%s, %s\n",
9001 nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
9002 return True;
9004 /* invalid */
9005 return False;
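/* Illustrative note, not part of the decoder: math_DUP_TO_64 simply
   replicates the lane value across a 64-bit word; for 16-bit lanes a
   plain-C sketch would be (ref_dup16_to_64 is an invented name):

      static ULong ref_dup16_to_64 ( UShort x )
      {
         ULong lane = (ULong)x;
         return lane | (lane << 16) | (lane << 32) | (lane << 48);
      }

   So dup v0.4h, w1 with w1 = 0x1234 writes
   v0 = 0x0000000000000000'1234123412341234 (Q=0, upper half zeroed), and
   dup v0.8h, w1 writes the replicated value to both 64-bit halves. */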
9008 /* -------- 1,0,0011: INS (general) -------- */
9009 /* 31 28 20 15 9 4
9010 010 01110000 imm5 000111 n d INS Vd.Ts[ix], Rn
9011 where Ts,ix = case imm5 of xxxx1 -> B, xxxx
9012 xxx10 -> H, xxx
9013 xx100 -> S, xx
9014 x1000 -> D, x
9016 if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
9017 HChar ts = '?';
9018 UInt laneNo = 16;
9019 IRExpr* src = NULL;
9020 if (imm5 & 1) {
9021 src = unop(Iop_64to8, getIReg64orZR(nn));
9022 laneNo = (imm5 >> 1) & 15;
9023 ts = 'b';
9025 else if (imm5 & 2) {
9026 src = unop(Iop_64to16, getIReg64orZR(nn));
9027 laneNo = (imm5 >> 2) & 7;
9028 ts = 'h';
9030 else if (imm5 & 4) {
9031 src = unop(Iop_64to32, getIReg64orZR(nn));
9032 laneNo = (imm5 >> 3) & 3;
9033 ts = 's';
9035 else if (imm5 & 8) {
9036 src = getIReg64orZR(nn);
9037 laneNo = (imm5 >> 4) & 1;
9038 ts = 'd';
9040 /* */
9041 if (src) {
9042 vassert(laneNo < 16);
9043 putQRegLane(dd, laneNo, src);
9044 DIP("ins %s.%c[%u], %s\n",
9045 nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
9046 return True;
9048 /* invalid */
9049 return False;
9052 /* -------- x,0,0101: SMOV -------- */
9053 /* -------- x,0,0111: UMOV -------- */
9054 /* 31 28 20 15 9 4
9055 0q0 01110 000 imm5 001111 n d UMOV Xd/Wd, Vn.Ts[index]
9056 0q0 01110 000 imm5 001011 n d SMOV Xd/Wd, Vn.Ts[index]
9057 dest is Xd when q==1, Wd when q==0
9058 UMOV:
9059 Ts,index,ops = case q:imm5 of
9060 0:xxxx1 -> B, xxxx, 8Uto64
9061 1:xxxx1 -> invalid
9062 0:xxx10 -> H, xxx, 16Uto64
9063 1:xxx10 -> invalid
9064 0:xx100 -> S, xx, 32Uto64
9065 1:xx100 -> invalid
9066 1:x1000 -> D, x, copy64
9067 other -> invalid
9068 SMOV:
9069 Ts,index,ops = case q:imm5 of
9070 0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
9071 1:xxxx1 -> B, xxxx, 8Sto64
9072 0:xxx10 -> H, xxx, (32Uto64 . 16Sto32)
9073 1:xxx10 -> H, xxx, 16Sto64
9074 0:xx100 -> invalid
9075 1:xx100 -> S, xx, 32Sto64
9076 1:x1000 -> invalid
9077 other -> invalid
9079 if (bitOP == 0 && (imm4 == BITS4(0,1,0,1) || imm4 == BITS4(0,1,1,1))) {
9080 Bool isU = (imm4 & 2) == 2;
9081 const HChar* arTs = "??";
9082 UInt laneNo = 16; /* invalid */
9083 // Setting 'res' to non-NULL determines valid/invalid
9084 IRExpr* res = NULL;
9085 if (!bitQ && (imm5 & 1)) { // 0:xxxx1
9086 laneNo = (imm5 >> 1) & 15;
9087 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9088 res = isU ? unop(Iop_8Uto64, lane)
9089 : unop(Iop_32Uto64, unop(Iop_8Sto32, lane));
9090 arTs = "b";
9092 else if (bitQ && (imm5 & 1)) { // 1:xxxx1
9093 laneNo = (imm5 >> 1) & 15;
9094 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
9095 res = isU ? NULL
9096 : unop(Iop_8Sto64, lane);
9097 arTs = "b";
9099 else if (!bitQ && (imm5 & 2)) { // 0:xxx10
9100 laneNo = (imm5 >> 2) & 7;
9101 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9102 res = isU ? unop(Iop_16Uto64, lane)
9103 : unop(Iop_32Uto64, unop(Iop_16Sto32, lane));
9104 arTs = "h";
9106 else if (bitQ && (imm5 & 2)) { // 1:xxx10
9107 laneNo = (imm5 >> 2) & 7;
9108 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
9109 res = isU ? NULL
9110 : unop(Iop_16Sto64, lane);
9111 arTs = "h";
9113 else if (!bitQ && (imm5 & 4)) { // 0:xx100
9114 laneNo = (imm5 >> 3) & 3;
9115 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9116 res = isU ? unop(Iop_32Uto64, lane)
9117 : NULL;
9118 arTs = "s";
9120 else if (bitQ && (imm5 & 4)) { // 1:xx100
9121 laneNo = (imm5 >> 3) & 3;
9122 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
9123 res = isU ? NULL
9124 : unop(Iop_32Sto64, lane);
9125 arTs = "s";
9127 else if (bitQ && (imm5 & 8)) { // 1:x1000
9128 laneNo = (imm5 >> 4) & 1;
9129 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64);
9130 res = isU ? lane
9131 : NULL;
9132 arTs = "d";
9134 /* */
9135 if (res) {
9136 vassert(laneNo < 16);
9137 putIReg64orZR(dd, res);
9138 DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's',
9139 nameIRegOrZR(bitQ == 1, dd),
9140 nameQReg128(nn), arTs, laneNo);
9141 return True;
9143 /* invalid */
9144 return False;
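/* Worked example, illustrative only: with v1.h[2] = 0xFFFE (-2),
      smov w0, v1.h[2]  ->  16Sto32 gives 0xFFFFFFFE, 32Uto64 gives
                            0x00000000FFFFFFFE, so w0 = 0xFFFFFFFE and the
                            upper half of x0 is cleared;
      smov x0, v1.h[2]  ->  16Sto64 gives 0xFFFFFFFFFFFFFFFE;
      umov w0, v1.h[2]  ->  16Uto64 gives 0x000000000000FFFE.
   This matches the architectural rule that a write to Wd zeroes the upper
   32 bits of Xd. */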
9147 /* -------- 1,1,xxxx: INS (element) -------- */
9148 /* 31 28 20 14 9 4
9149 011 01110000 imm5 0 imm4 n d INS Vd.Ts[ix1], Vn.Ts[ix2]
9150 where Ts,ix1,ix2
9151 = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
9152 xxx10 -> H, xxx, imm4[3:1]
9153 xx100 -> S, xx, imm4[3:2]
9154 x1000 -> D, x, imm4[3:3]
9156 if (bitQ == 1 && bitOP == 1) {
9157 HChar ts = '?';
9158 IRType ity = Ity_INVALID;
9159 UInt ix1 = 16;
9160 UInt ix2 = 16;
9161 if (imm5 & 1) {
9162 ts = 'b';
9163 ity = Ity_I8;
9164 ix1 = (imm5 >> 1) & 15;
9165 ix2 = (imm4 >> 0) & 15;
9167 else if (imm5 & 2) {
9168 ts = 'h';
9169 ity = Ity_I16;
9170 ix1 = (imm5 >> 2) & 7;
9171 ix2 = (imm4 >> 1) & 7;
9173 else if (imm5 & 4) {
9174 ts = 's';
9175 ity = Ity_I32;
9176 ix1 = (imm5 >> 3) & 3;
9177 ix2 = (imm4 >> 2) & 3;
9179 else if (imm5 & 8) {
9180 ts = 'd';
9181 ity = Ity_I64;
9182 ix1 = (imm5 >> 4) & 1;
9183 ix2 = (imm4 >> 3) & 1;
9185 /* */
9186 if (ity != Ity_INVALID) {
9187 vassert(ix1 < 16);
9188 vassert(ix2 < 16);
9189 putQRegLane(dd, ix1, getQRegLane(nn, ix2, ity));
9190 DIP("ins %s.%c[%u], %s.%c[%u]\n",
9191 nameQReg128(dd), ts, ix1, nameQReg128(nn), ts, ix2);
9192 return True;
9194 /* invalid */
9195 return False;
9198 return False;
9199 # undef INSN
9203 static
9204 Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
9206 /* 31 28 18 15 11 9 4
9207 0q op 01111 00000 abc cmode 01 defgh d
9208 Decode fields: q,op,cmode
9209 Bit 11 is really "o2", but it is always zero.
9211 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9212 if (INSN(31,31) != 0
9213 || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
9214 || INSN(11,10) != BITS2(0,1)) {
9215 return False;
9217 UInt bitQ = INSN(30,30);
9218 UInt bitOP = INSN(29,29);
9219 UInt cmode = INSN(15,12);
9220 UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
9221 UInt dd = INSN(4,0);
9223 ULong imm64lo = 0;
9224 UInt op_cmode = (bitOP << 4) | cmode;
9225 Bool ok = False;
9226 Bool isORR = False;
9227 Bool isBIC = False;
9228 Bool isMOV = False;
9229 Bool isMVN = False;
9230 Bool isFMOV = False;
9231 switch (op_cmode) {
9232 /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
9233 /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
9234 /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
9235 /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
9236 case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
9237 case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
9238 ok = True; isMOV = True; break;
9240 /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
9241 /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
9242 /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
9243 /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
9244 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
9245 case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
9246 ok = True; isORR = True; break;
9248 /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
9249 /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
9250 case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
9251 ok = True; isMOV = True; break;
9253 /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
9254 /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
9255 case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
9256 ok = True; isORR = True; break;
9258 /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
9259 /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
9260 case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
9261 ok = True; isMOV = True; break;
9263 /* -------- x,0,1110 MOVI 8-bit -------- */
9264 case BITS5(0,1,1,1,0):
9265 ok = True; isMOV = True; break;
9267 /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
9268 case BITS5(0,1,1,1,1): // 0:1111
9269 ok = True; isFMOV = True; break;
9271 /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
9272 /* -------- x,1,0010 MVNI 32-bit shifted imm -------- */
9273 /* -------- x,1,0100 MVNI 32-bit shifted imm -------- */
9274 /* -------- x,1,0110 MVNI 32-bit shifted imm -------- */
9275 case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
9276 case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
9277 ok = True; isMVN = True; break;
9279 /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
9280 /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
9281 /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
9282 /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
9283 case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
9284 case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
9285 ok = True; isBIC = True; break;
9287 /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
9288 /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
9289 case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
9290 ok = True; isMVN = True; break;
9292 /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
9293 /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
9294 case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
9295 ok = True; isBIC = True; break;
9297 /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
9298 /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
9299 case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
9300 ok = True; isMVN = True; break;
9302 /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
9303 /* -------- 1,1,1110 MOVI 64-bit vector -------- */
9304 case BITS5(1,1,1,1,0):
9305 ok = True; isMOV = True; break;
9307 /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
9308 case BITS5(1,1,1,1,1): // 1:1111
9309 ok = bitQ == 1; isFMOV = True; break;
9311 default:
9312 break;
9314 if (ok) {
9315 vassert(1 == (isMOV ? 1 : 0) + (isMVN ? 1 : 0)
9316 + (isORR ? 1 : 0) + (isBIC ? 1 : 0) + (isFMOV ? 1 : 0));
9317 ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, abcdefgh);
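/* Illustrative note: per the architecture's AdvSIMDExpandImm() pseudocode,
   the 8 immediate bits abcdefgh are expanded under control of op:cmode.
   Two hedged examples, not an exhaustive list:
      op=0, cmode=1110, abcdefgh=0x5A            (MOVI 8-bit)
         -> imm64lo = 0x5A5A5A5A5A5A5A5A          (byte replicated 8 times)
      op=1, cmode=1110, abcdefgh=0xB1 (10110001)  (MOVI 64-bit)
         -> each bit becomes a byte of 0x00 or 0xFF:
            imm64lo = 0xFF00FFFF000000FF. */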
9319 if (ok) {
9320 if (isORR || isBIC) {
9321 ULong inv
9322 = isORR ? 0ULL : ~0ULL;
9323 IRExpr* immV128
9324 = binop(Iop_64HLtoV128, mkU64(inv ^ imm64lo), mkU64(inv ^ imm64lo));
9325 IRExpr* res
9326 = binop(isORR ? Iop_OrV128 : Iop_AndV128, getQReg128(dd), immV128);
9327 const HChar* nm = isORR ? "orr" : "bic";
9328 if (bitQ == 0) {
9329 putQReg128(dd, unop(Iop_ZeroHI64ofV128, res));
9330 DIP("%s %s.1d, %016llx\n", nm, nameQReg128(dd), imm64lo);
9331 } else {
9332 putQReg128(dd, res);
9333 DIP("%s %s.2d, #0x%016llx'%016llx\n", nm,
9334 nameQReg128(dd), imm64lo, imm64lo);
9337 else if (isMOV || isMVN || isFMOV) {
9338 if (isMVN) imm64lo = ~imm64lo;
9339 ULong imm64hi = bitQ == 0 ? 0 : imm64lo;
9340 IRExpr* immV128 = binop(Iop_64HLtoV128, mkU64(imm64hi),
9341 mkU64(imm64lo));
9342 putQReg128(dd, immV128);
9343 DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi, imm64lo);
9345 return True;
9347 /* else fall through */
9349 return False;
9350 # undef INSN
9354 static
9355 Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
9357 /* 31 28 20 15 14 10 9 4
9358 01 op 11110000 imm5 0 imm4 1 n d
9359 Decode fields: op,imm4
9361 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9362 if (INSN(31,30) != BITS2(0,1)
9363 || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
9364 || INSN(15,15) != 0 || INSN(10,10) != 1) {
9365 return False;
9367 UInt bitOP = INSN(29,29);
9368 UInt imm5 = INSN(20,16);
9369 UInt imm4 = INSN(14,11);
9370 UInt nn = INSN(9,5);
9371 UInt dd = INSN(4,0);
9373 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
9374 /* -------- 0,0000 DUP (element, scalar) -------- */
9375 IRTemp w0 = newTemp(Ity_I64);
9376 const HChar* arTs = "??";
9377 IRType laneTy = Ity_INVALID;
9378 UInt laneNo = 16; /* invalid */
9379 if (imm5 & 1) {
9380 arTs = "b";
9381 laneNo = (imm5 >> 1) & 15;
9382 laneTy = Ity_I8;
9383 assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
9385 else if (imm5 & 2) {
9386 arTs = "h";
9387 laneNo = (imm5 >> 2) & 7;
9388 laneTy = Ity_I16;
9389 assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
9391 else if (imm5 & 4) {
9392 arTs = "s";
9393 laneNo = (imm5 >> 3) & 3;
9394 laneTy = Ity_I32;
9395 assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
9397 else if (imm5 & 8) {
9398 arTs = "d";
9399 laneNo = (imm5 >> 4) & 1;
9400 laneTy = Ity_I64;
9401 assign(w0, getQRegLane(nn, laneNo, laneTy));
9403 else {
9404 /* invalid; leave laneTy unchanged. */
9406 /* */
9407 if (laneTy != Ity_INVALID) {
9408 vassert(laneNo < 16);
9409 putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
9410 DIP("dup %s, %s.%s[%u]\n",
9411 nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
9412 return True;
9414 /* else fall through */
9417 return False;
9418 # undef INSN
9422 static
9423 Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn)
9425 /* 31 28 23 21 16 11 9 4
9426 01 u 11110 sz 11000 opcode 10 n d
9427 Decode fields: u,sz,opcode
9429 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9430 if (INSN(31,30) != BITS2(0,1)
9431 || INSN(28,24) != BITS5(1,1,1,1,0)
9432 || INSN(21,17) != BITS5(1,1,0,0,0)
9433 || INSN(11,10) != BITS2(1,0)) {
9434 return False;
9436 UInt bitU = INSN(29,29);
9437 UInt sz = INSN(23,22);
9438 UInt opcode = INSN(16,12);
9439 UInt nn = INSN(9,5);
9440 UInt dd = INSN(4,0);
9442 if (bitU == 0 && sz == X11 && opcode == BITS5(1,1,0,1,1)) {
9443 /* -------- 0,11,11011 ADDP d_2d -------- */
9444 IRTemp xy = newTempV128();
9445 IRTemp xx = newTempV128();
9446 assign(xy, getQReg128(nn));
9447 assign(xx, binop(Iop_InterleaveHI64x2, mkexpr(xy), mkexpr(xy)));
9448 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
9449 binop(Iop_Add64x2, mkexpr(xy), mkexpr(xx))));
9450 DIP("addp d%u, %s.2d\n", dd, nameQReg128(nn));
9451 return True;
9454 if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
9455 /* -------- 1,00,01101 ADDP s_2s -------- */
9456 /* -------- 1,01,01101 ADDP d_2d -------- */
9457 Bool isD = sz == X01;
9458 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9459 IROp opADD = mkVecADDF(isD ? 3 : 2);
9460 IRTemp src = newTempV128();
9461 IRTemp argL = newTempV128();
9462 IRTemp argR = newTempV128();
9463 assign(src, getQReg128(nn));
9464 assign(argL, unop(opZHI, mkexpr(src)));
9465 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9466 mkU8(isD ? 8 : 4))));
9467 putQReg128(dd, unop(opZHI,
9468 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
9469 mkexpr(argL), mkexpr(argR))));
9470 DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
9471 return True;
9474 if (bitU == 1
9475 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
9476 /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
9477 /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
9478 /* -------- 1,0x,01111 FMAXP d_2d, s_2s -------- */
9479 /* -------- 1,1x,01111 FMINP d_2d, s_2s -------- */
9480 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
9481 Bool isD = (sz & 1) == 1;
9482 Bool isMIN = (sz & 2) == 2;
9483 Bool isNM = opcode == BITS5(0,1,1,0,0);
9484 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
9485 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
9486 IRTemp src = newTempV128();
9487 IRTemp argL = newTempV128();
9488 IRTemp argR = newTempV128();
9489 assign(src, getQReg128(nn));
9490 assign(argL, unop(opZHI, mkexpr(src)));
9491 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
9492 mkU8(isD ? 8 : 4))));
9493 putQReg128(dd, unop(opZHI,
9494 binop(opMXX, mkexpr(argL), mkexpr(argR))));
9495 HChar c = isD ? 'd' : 's';
9496 DIP("%s%sp %c%u, v%u.2%c\n",
9497 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", c, dd, nn, c);
9498 return True;
9501 return False;
9502 # undef INSN
9506 static
9507 Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
9509 /* 31 28 22 18 15 10 9 4
9510 01 u 111110 immh immb opcode 1 n d
9511 Decode fields: u,immh,opcode
9513 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9514 if (INSN(31,30) != BITS2(0,1)
9515 || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
9516 return False;
9518 UInt bitU = INSN(29,29);
9519 UInt immh = INSN(22,19);
9520 UInt immb = INSN(18,16);
9521 UInt opcode = INSN(15,11);
9522 UInt nn = INSN(9,5);
9523 UInt dd = INSN(4,0);
9524 UInt immhb = (immh << 3) | immb;
9526 if ((immh & 8) == 8
9527 && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
9528 /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
9529 /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
9530 /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
9531 /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
9532 Bool isU = bitU == 1;
9533 Bool isAcc = opcode == BITS5(0,0,0,1,0);
9534 UInt sh = 128 - immhb;
9535 vassert(sh >= 1 && sh <= 64);
9536 IROp op = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
9537 IRExpr* src = getQReg128(nn);
9538 IRTemp shf = newTempV128();
9539 IRTemp res = newTempV128();
9540 if (sh == 64 && isU) {
9541 assign(shf, mkV128(0x0000));
9542 } else {
9543 UInt nudge = 0;
9544 if (sh == 64) {
9545 vassert(!isU);
9546 nudge = 1;
9548 assign(shf, binop(op, src, mkU8(sh - nudge)));
9550 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
9551 : mkexpr(shf));
9552 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9553 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
9554 : (isU ? "ushr" : "sshr");
9555 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
9556 return True;
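/* Note on the sh == 64 'nudge' above: the vector shift-by-immediate IROps
   only take amounts smaller than the lane width, so a signed shift by 64
   is issued as a shift by 63.  An arithmetic right shift by 63 already
   replicates the sign bit into every bit position, which is the
   architectural result of sshr #64.  An unsigned shift by 64 (ushr #64)
   simply yields zero, and is handled separately above. */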
9559 if ((immh & 8) == 8
9560 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
9561 /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
9562 /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
9563 /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
9564 /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
9565 Bool isU = bitU == 1;
9566 Bool isAcc = opcode == BITS5(0,0,1,1,0);
9567 UInt sh = 128 - immhb;
9568 vassert(sh >= 1 && sh <= 64);
9569 IROp op = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
9570 vassert(sh >= 1 && sh <= 64);
9571 IRExpr* src = getQReg128(nn);
9572 IRTemp imm8 = newTemp(Ity_I8);
9573 assign(imm8, mkU8((UChar)(-sh)));
9574 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
9575 IRTemp shf = newTempV128();
9576 IRTemp res = newTempV128();
9577 assign(shf, binop(op, src, amt));
9578 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
9579 : mkexpr(shf));
9580 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9581 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
9582 : (isU ? "urshr" : "srshr");
9583 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
9584 return True;
9587 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
9588 /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
9589 UInt sh = 128 - immhb;
9590 vassert(sh >= 1 && sh <= 64);
9591 if (sh == 64) {
9592 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
9593 } else {
9594 /* sh is in range 1 .. 63 */
9595 ULong nmask = (ULong)(((Long)0x8000000000000000ULL) >> (sh-1));
9596 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
9597 IRTemp res = newTempV128();
9598 assign(res, binop(Iop_OrV128,
9599 binop(Iop_AndV128, getQReg128(dd), nmaskV),
9600 binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
9601 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9603 DIP("sri d%u, d%u, #%u\n", dd, nn, sh);
9604 return True;
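/* Worked example, illustrative only: sri d0, d1, #8.  Here
   nmask = 0x8000000000000000 >> (8-1), arithmetically, = 0xFF00000000000000,
   so the result is (d0 & 0xFF00000000000000) | (d1 >> 8): the top 8 bits of
   the destination are kept and the shifted-in source fills the remaining
   56 bits.  For sh == 64 nothing of the source is inserted, so the low 64
   bits are left unchanged and only the upper half of the vector register is
   cleared. */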
9607 if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
9608 /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
9609 UInt sh = immhb - 64;
9610 vassert(sh >= 0 && sh < 64);
9611 putQReg128(dd,
9612 unop(Iop_ZeroHI64ofV128,
9613 sh == 0 ? getQReg128(nn)
9614 : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
9615 DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
9616 return True;
9619 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
9620 /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
9621 UInt sh = immhb - 64;
9622 vassert(sh >= 0 && sh < 64);
9623 if (sh == 0) {
9624 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(nn)));
9625 } else {
9626 /* sh is in range 1 .. 63 */
9627 ULong nmask = (1ULL << sh) - 1;
9628 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
9629 IRTemp res = newTempV128();
9630 assign(res, binop(Iop_OrV128,
9631 binop(Iop_AndV128, getQReg128(dd), nmaskV),
9632 binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
9633 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9635 DIP("sli d%u, d%u, #%u\n", dd, nn, sh);
9636 return True;
9639 if (opcode == BITS5(0,1,1,1,0)
9640 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
9641 /* -------- 0,01110 SQSHL #imm -------- */
9642 /* -------- 1,01110 UQSHL #imm -------- */
9643 /* -------- 1,01100 SQSHLU #imm -------- */
9644 UInt size = 0;
9645 UInt shift = 0;
9646 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
9647 if (!ok) return False;
9648 vassert(size >= 0 && size <= 3);
9649 /* The shift encoding has opposite sign for the leftwards case.
9650 Adjust shift to compensate. */
9651 UInt lanebits = 8 << size;
9652 shift = lanebits - shift;
9653 vassert(shift >= 0 && shift < lanebits);
9654 const HChar* nm = NULL;
9655 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
9656 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
9657 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
9658 else vassert(0);
9659 IRTemp qDiff1 = IRTemp_INVALID;
9660 IRTemp qDiff2 = IRTemp_INVALID;
9661 IRTemp res = IRTemp_INVALID;
9662 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
9663 /* This relies on the fact that the zeroed out lanes generate zeroed
9664 result lanes and don't saturate, so there's no point in trimming
9665 the resulting res, qDiff1 or qDiff2 values. */
9666 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
9667 putQReg128(dd, mkexpr(res));
9668 updateQCFLAGwithDifference(qDiff1, qDiff2);
9669 const HChar arr = "bhsd"[size];
9670 DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
9671 return True;
9674 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
9675 || (bitU == 1
9676 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
9677 /* -------- 0,10010 SQSHRN #imm -------- */
9678 /* -------- 1,10010 UQSHRN #imm -------- */
9679 /* -------- 0,10011 SQRSHRN #imm -------- */
9680 /* -------- 1,10011 UQRSHRN #imm -------- */
9681 /* -------- 1,10000 SQSHRUN #imm -------- */
9682 /* -------- 1,10001 SQRSHRUN #imm -------- */
9683 UInt size = 0;
9684 UInt shift = 0;
9685 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
9686 if (!ok || size == X11) return False;
9687 vassert(size >= X00 && size <= X10);
9688 vassert(shift >= 1 && shift <= (8 << size));
9689 const HChar* nm = "??";
9690 IROp op = Iop_INVALID;
9691 /* Decide on the name and the operation. */
9692 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
9693 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
9695 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
9696 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
9698 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
9699 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
9701 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
9702 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
9704 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
9705 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
9707 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
9708 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
9710 else vassert(0);
9711 /* Compute the result (Q, shifted value) pair. */
9712 IRTemp src128 = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size+1, getQReg128(nn));
9713 IRTemp pair = newTempV128();
9714 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
9715 /* Update the result reg */
9716 IRTemp res64in128 = newTempV128();
9717 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
9718 putQReg128(dd, mkexpr(res64in128));
9719 /* Update the Q flag. */
9720 IRTemp q64q64 = newTempV128();
9721 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
9722 IRTemp z128 = newTempV128();
9723 assign(z128, mkV128(0x0000));
9724 updateQCFLAGwithDifference(q64q64, z128);
9725 /* */
9726 const HChar arrNarrow = "bhsd"[size];
9727 const HChar arrWide = "bhsd"[size+1];
9728 DIP("%s %c%u, %c%u, #%u\n", nm, arrNarrow, dd, arrWide, nn, shift);
9729 return True;
9732 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,0,0)) {
9733 /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
9734 /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
9735 UInt size = 0;
9736 UInt fbits = 0;
9737 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
9738 /* The following holds because immh is never zero. */
9739 vassert(ok);
9740 /* The following holds because immh >= 0100. */
9741 vassert(size == X10 || size == X11);
9742 Bool isD = size == X11;
9743 Bool isU = bitU == 1;
9744 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
9745 Double scale = two_to_the_minus(fbits);
9746 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
9747 : IRExpr_Const(IRConst_F32( (Float)scale ));
9748 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
9749 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
9750 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
9751 IRType tyF = isD ? Ity_F64 : Ity_F32;
9752 IRType tyI = isD ? Ity_I64 : Ity_I32;
9753 IRTemp src = newTemp(tyI);
9754 IRTemp res = newTemp(tyF);
9755 IRTemp rm = mk_get_IR_rounding_mode();
9756 assign(src, getQRegLane(nn, 0, tyI));
9757 assign(res, triop(opMUL, mkexpr(rm),
9758 binop(opCVT, mkexpr(rm), mkexpr(src)), scaleE));
9759 putQRegLane(dd, 0, mkexpr(res));
9760 if (!isD) {
9761 putQRegLane(dd, 1, mkU32(0));
9763 putQRegLane(dd, 1, mkU64(0));
9764 const HChar ch = isD ? 'd' : 's';
9765 DIP("%s %c%u, %c%u, #%u\n", isU ? "ucvtf" : "scvtf",
9766 ch, dd, ch, nn, fbits);
9767 return True;
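/* Worked example, illustrative only: scvtf s0, s1, #4 where the low 32
   bits of q1 hold the integer 40.  The integer is first converted to
   floating point and then multiplied by 2^-4 (scale = two_to_the_minus(4)),
   giving 40 * 0.0625 = 2.5 in s0; the rest of q0 is cleared. */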
9770 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,1,1)) {
9771 /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
9772 /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
9773 UInt size = 0;
9774 UInt fbits = 0;
9775 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
9776 /* The following holds because immh is never zero. */
9777 vassert(ok);
9778 /* The following holds because immh >= 0100. */
9779 vassert(size == X10 || size == X11);
9780 Bool isD = size == X11;
9781 Bool isU = bitU == 1;
9782 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
9783 Double scale = two_to_the_plus(fbits);
9784 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
9785 : IRExpr_Const(IRConst_F32( (Float)scale ));
9786 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
9787 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
9788 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
9789 IRType tyF = isD ? Ity_F64 : Ity_F32;
9790 IRType tyI = isD ? Ity_I64 : Ity_I32;
9791 IRTemp src = newTemp(tyF);
9792 IRTemp res = newTemp(tyI);
9793 IRTemp rm = newTemp(Ity_I32);
9794 assign(src, getQRegLane(nn, 0, tyF));
9795 assign(rm, mkU32(Irrm_ZERO));
9796 assign(res, binop(opCVT, mkexpr(rm),
9797 triop(opMUL, mkexpr(rm), mkexpr(src), scaleE)));
9798 putQRegLane(dd, 0, mkexpr(res));
9799 if (!isD) {
9800 putQRegLane(dd, 1, mkU32(0));
9802 putQRegLane(dd, 1, mkU64(0));
9803 const HChar ch = isD ? 'd' : 's';
9804 DIP("%s %c%u, %c%u, #%u\n", isU ? "fcvtzu" : "fcvtzs",
9805 ch, dd, ch, nn, fbits);
9806 return True;
9809 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9810 return False;
9811 # undef INSN
9815 static
9816 Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
9818 /* 31 29 28 23 21 20 15 11 9 4
9819 01 U 11110 size 1 m opcode 00 n d
9820 Decode fields: u,opcode
9822 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9823 if (INSN(31,30) != BITS2(0,1)
9824 || INSN(28,24) != BITS5(1,1,1,1,0)
9825 || INSN(21,21) != 1
9826 || INSN(11,10) != BITS2(0,0)) {
9827 return False;
9829 UInt bitU = INSN(29,29);
9830 UInt size = INSN(23,22);
9831 UInt mm = INSN(20,16);
9832 UInt opcode = INSN(15,12);
9833 UInt nn = INSN(9,5);
9834 UInt dd = INSN(4,0);
9835 vassert(size < 4);
9837 if (bitU == 0
9838 && (opcode == BITS4(1,1,0,1)
9839 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
9840 /* -------- 0,1101 SQDMULL -------- */ // 0 (ks)
9841 /* -------- 0,1001 SQDMLAL -------- */ // 1
9842 /* -------- 0,1011 SQDMLSL -------- */ // 2
9843 /* Widens, and size refers to the narrowed lanes. */
9844 UInt ks = 3;
9845 switch (opcode) {
9846 case BITS4(1,1,0,1): ks = 0; break;
9847 case BITS4(1,0,0,1): ks = 1; break;
9848 case BITS4(1,0,1,1): ks = 2; break;
9849 default: vassert(0);
9851 vassert(ks >= 0 && ks <= 2);
9852 if (size == X00 || size == X11) return False;
9853 vassert(size <= 2);
9854 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
9855 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
9856 newTempsV128_3(&vecN, &vecM, &vecD);
9857 assign(vecN, getQReg128(nn));
9858 assign(vecM, getQReg128(mm));
9859 assign(vecD, getQReg128(dd));
9860 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
9861 False/*!is2*/, size, "mas"[ks],
9862 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
9863 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
9864 putQReg128(dd, unop(opZHI, mkexpr(res)));
9865 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
9866 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
9867 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
9868 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
9870 const HChar* nm = ks == 0 ? "sqdmull"
9871 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
9872 const HChar arrNarrow = "bhsd"[size];
9873 const HChar arrWide = "bhsd"[size+1];
9874 DIP("%s %c%u, %c%u, %c%u\n",
9875 nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
9876 return True;
9879 return False;
9880 # undef INSN
9884 static
9885 Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
9887 /* 31 29 28 23 21 20 15 10 9 4
9888 01 U 11110 size 1 m opcode 1 n d
9889 Decode fields: u,size,opcode
9891 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9892 if (INSN(31,30) != BITS2(0,1)
9893 || INSN(28,24) != BITS5(1,1,1,1,0)
9894 || INSN(21,21) != 1
9895 || INSN(10,10) != 1) {
9896 return False;
9898 UInt bitU = INSN(29,29);
9899 UInt size = INSN(23,22);
9900 UInt mm = INSN(20,16);
9901 UInt opcode = INSN(15,11);
9902 UInt nn = INSN(9,5);
9903 UInt dd = INSN(4,0);
9904 vassert(size < 4);
9906 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
9907 /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
9908 /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
9909 /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
9910 /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
9911 Bool isADD = opcode == BITS5(0,0,0,0,1);
9912 Bool isU = bitU == 1;
9913 IROp qop = Iop_INVALID;
9914 IROp nop = Iop_INVALID;
9915 if (isADD) {
9916 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
9917 nop = mkVecADD(size);
9918 } else {
9919 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
9920 nop = mkVecSUB(size);
9922 IRTemp argL = newTempV128();
9923 IRTemp argR = newTempV128();
9924 IRTemp qres = newTempV128();
9925 IRTemp nres = newTempV128();
9926 assign(argL, getQReg128(nn));
9927 assign(argR, getQReg128(mm));
9928 assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
9929 size, binop(qop, mkexpr(argL), mkexpr(argR)))));
9930 assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
9931 size, binop(nop, mkexpr(argL), mkexpr(argR)))));
9932 putQReg128(dd, mkexpr(qres));
9933 updateQCFLAGwithDifference(qres, nres);
9934 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
9935 : (isU ? "uqsub" : "sqsub");
9936 const HChar arr = "bhsd"[size];
9937 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
9938 return True;
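/* Illustrative note: the QC (cumulative saturation) flag is derived by also
   evaluating the plain wrapping op and comparing the two results; e.g. for
   UQADD on byte lanes a plain-C sketch would be (ref_uqadd8 is an invented
   name):

      static UChar ref_uqadd8 ( UChar a, UChar b, Bool* qc )
      {
         UInt  sum       = (UInt)a + (UInt)b;
         UChar wrapped   = (UChar)sum;
         UChar saturated = sum > 255 ? 255 : (UChar)sum;
         if (saturated != wrapped) *qc = True;
         return saturated;
      }

   E.g. uqadd b0, b1, b2 with b1 = 200, b2 = 100: the wrapping sum is 44,
   the saturating sum is 255, they differ, so QC is set. */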
9941 if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
9942 /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
9943 /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
9944 Bool isGT = bitU == 0;
9945 IRExpr* argL = getQReg128(nn);
9946 IRExpr* argR = getQReg128(mm);
9947 IRTemp res = newTempV128();
9948 assign(res,
9949 isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
9950 : binop(Iop_CmpGT64Ux2, argL, argR));
9951 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9952 DIP("%s %s, %s, %s\n",isGT ? "cmgt" : "cmhi",
9953 nameQRegLO(dd, Ity_I64),
9954 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
9955 return True;
9958 if (size == X11 && opcode == BITS5(0,0,1,1,1)) {
9959 /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
9960 /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
9961 Bool isGE = bitU == 0;
9962 IRExpr* argL = getQReg128(nn);
9963 IRExpr* argR = getQReg128(mm);
9964 IRTemp res = newTempV128();
9965 assign(res,
9966 isGE ? unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL))
9967 : unop(Iop_NotV128, binop(Iop_CmpGT64Ux2, argR, argL)));
9968 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9969 DIP("%s %s, %s, %s\n", isGE ? "cmge" : "cmhs",
9970 nameQRegLO(dd, Ity_I64),
9971 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
9972 return True;
9975 if (size == X11 && (opcode == BITS5(0,1,0,0,0)
9976 || opcode == BITS5(0,1,0,1,0))) {
9977 /* -------- 0,xx,01000 SSHL d_d_d -------- */
9978 /* -------- 0,xx,01010 SRSHL d_d_d -------- */
9979 /* -------- 1,xx,01000 USHL d_d_d -------- */
9980 /* -------- 1,xx,01010 URSHL d_d_d -------- */
9981 Bool isU = bitU == 1;
9982 Bool isR = opcode == BITS5(0,1,0,1,0);
9983 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
9984 : (isU ? mkVecSHU(size) : mkVecSHS(size));
9985 IRTemp res = newTempV128();
9986 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
9987 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9988 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
9989 : (isU ? "ushl" : "sshl");
9990 DIP("%s %s, %s, %s\n", nm,
9991 nameQRegLO(dd, Ity_I64),
9992 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
9993 return True;
9996 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
9997 /* -------- 0,xx,01001 SQSHL std4_std4_std4 -------- */
9998 /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
9999 /* -------- 1,xx,01001 UQSHL std4_std4_std4 -------- */
10000 /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
10001 Bool isU = bitU == 1;
10002 Bool isR = opcode == BITS5(0,1,0,1,1);
10003 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
10004 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
10005 /* This is a bit tricky. Since we're only interested in the lowest
10006 lane of the result, we zero out all the rest in the operands, so
10007 as to ensure that other lanes don't pollute the returned Q value.
10008 This works because it means, for the lanes we don't care about, we
10009 are shifting zero by zero, which can never saturate. */
10010 IRTemp res256 = newTemp(Ity_V256);
10011 IRTemp resSH = newTempV128();
10012 IRTemp resQ = newTempV128();
10013 IRTemp zero = newTempV128();
10014 assign(
10015 res256,
10016 binop(op,
10017 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
10018 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
10019 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
10020 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
10021 assign(zero, mkV128(0x0000));
10022 putQReg128(dd, mkexpr(resSH));
10023 updateQCFLAGwithDifference(resQ, zero);
10024 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
10025 : (isU ? "uqshl" : "sqshl");
10026 const HChar arr = "bhsd"[size];
10027 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10028 return True;
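/* Worked example, illustrative only: uqshl b0, b1, b2 with b1 = 0x40 and
   b2 = 2.  The true value 0x40 << 2 = 0x100 does not fit in 8 bits, so the
   result is 0xFF and QC is set.  The zero-all-other-lanes trick above
   guarantees that lanes 1..15 of the (result, Q) pair compute 0 << 0,
   which can never saturate and so cannot disturb the Q comparison against
   zero. */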
10031 if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
10032 /* -------- 0,11,10000 ADD d_d_d -------- */
10033 /* -------- 1,11,10000 SUB d_d_d -------- */
10034 Bool isSUB = bitU == 1;
10035 IRTemp res = newTemp(Ity_I64);
10036 assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64,
10037 getQRegLane(nn, 0, Ity_I64),
10038 getQRegLane(mm, 0, Ity_I64)));
10039 putQRegLane(dd, 0, mkexpr(res));
10040 putQRegLane(dd, 1, mkU64(0));
10041 DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add",
10042 nameQRegLO(dd, Ity_I64),
10043 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10044 return True;
10047 if (size == X11 && opcode == BITS5(1,0,0,0,1)) {
10048 /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
10049 /* -------- 1,11,10001 CMEQ d_d_d -------- */ // ==
10050 Bool isEQ = bitU == 1;
10051 IRExpr* argL = getQReg128(nn);
10052 IRExpr* argR = getQReg128(mm);
10053 IRTemp res = newTempV128();
10054 assign(res,
10055 isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
10056 : unop(Iop_NotV128, binop(Iop_CmpEQ64x2,
10057 binop(Iop_AndV128, argL, argR),
10058 mkV128(0x0000))));
10059 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10060 DIP("%s %s, %s, %s\n", isEQ ? "cmeq" : "cmtst",
10061 nameQRegLO(dd, Ity_I64),
10062 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10063 return True;
10066 if (opcode == BITS5(1,0,1,1,0)) {
10067 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
10068 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
10069 if (size == X00 || size == X11) return False;
10070 Bool isR = bitU == 1;
10071 IRTemp res, sat1q, sat1n, vN, vM;
10072 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
10073 newTempsV128_2(&vN, &vM);
10074 assign(vN, getQReg128(nn));
10075 assign(vM, getQReg128(mm));
10076 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
10077 putQReg128(dd,
10078 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
10079 updateQCFLAGwithDifference(
10080 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
10081 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
10082 const HChar arr = "bhsd"[size];
10083 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
10084 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10085 return True;
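/* Worked example, illustrative only, for the halfword variant:
   sqdmulh h0, h1, h2 computes saturate((2 * h1 * h2) >> 16), and sqrdmulh
   additionally adds 0x8000 before the shift (rounding).  E.g. with
   h1 = h2 = 0x4000 (16384): 2*16384*16384 = 0x20000000, so the result is
   0x2000.  The one saturating case is h1 = h2 = 0x8000 (-32768), whose
   doubled product overflows 16 bits after the shift; the result is 0x7FFF
   and QC is set. */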
10088 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
10089 /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
10090 IRType ity = size == X11 ? Ity_F64 : Ity_F32;
10091 IRTemp res = newTemp(ity);
10092 assign(res, unop(mkABSF(ity),
10093 triop(mkSUBF(ity),
10094 mkexpr(mk_get_IR_rounding_mode()),
10095 getQRegLO(nn,ity), getQRegLO(mm,ity))));
10096 putQReg128(dd, mkV128(0x0000));
10097 putQRegLO(dd, mkexpr(res));
10098 DIP("fabd %s, %s, %s\n",
10099 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10100 return True;
10103 if (bitU == 0 && size <= X01 && opcode == BITS5(1,1,0,1,1)) {
10104 /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
10105 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
10106 IRType ity = size == X01 ? Ity_F64 : Ity_F32;
10107 IRTemp res = newTemp(ity);
10108 assign(res, triop(mkMULF(ity),
10109 mkexpr(mk_get_IR_rounding_mode()),
10110 getQRegLO(nn,ity), getQRegLO(mm,ity)));
10111 putQReg128(dd, mkV128(0x0000));
10112 putQRegLO(dd, mkexpr(res));
10113 DIP("fmulx %s, %s, %s\n",
10114 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10115 return True;
10118 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
10119 /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
10120 /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
10121 Bool isD = size == X01;
10122 IRType ity = isD ? Ity_F64 : Ity_F32;
10123 Bool isGE = bitU == 1;
10124 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
10125 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
10126 IRTemp res = newTempV128();
10127 assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
10128 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
10129 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10130 mkexpr(res))));
10131 DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
10132 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10133 return True;
10136 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
10137 /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
10138 Bool isD = size == X11;
10139 IRType ity = isD ? Ity_F64 : Ity_F32;
10140 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
10141 IRTemp res = newTempV128();
10142 assign(res, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
10143 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10144 mkexpr(res))));
10145 DIP("%s %s, %s, %s\n", "fcmgt",
10146 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10147 return True;
10150 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
10151 /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
10152 /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
10153 Bool isD = (size & 1) == 1;
10154 IRType ity = isD ? Ity_F64 : Ity_F32;
10155 Bool isGT = (size & 2) == 2;
10156 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
10157 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
10158 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
10159 IRTemp res = newTempV128();
10160 assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
10161 unop(opABS, getQReg128(nn)))); // swapd
10162 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10163 mkexpr(res))));
10164 DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
10165 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
10166 return True;
10169 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
10170 /* -------- 0,0x,11111: FRECPS d_d_d, s_s_s -------- */
10171 /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
10172 Bool isSQRT = (size & 2) == 2;
10173 Bool isD = (size & 1) == 1;
10174 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
10175 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
10176 IRTemp res = newTempV128();
10177 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
10178 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10179 mkexpr(res))));
10180 HChar c = isD ? 'd' : 's';
10181 DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
10182 c, dd, c, nn, c, mm);
10183 return True;
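/* Illustrative note: per the ARM ARM, FRECPS(a,b) computes 2.0 - a*b and
   FRSQRTS(a,b) computes (3.0 - a*b) / 2.0.  These are the Newton-Raphson
   refinement steps used with FRECPE/FRSQRTE: a reciprocal estimate x of
   1/a is improved by
      x' = x * FRECPS(a, x) = x * (2 - a*x)
   and a reciprocal-square-root estimate by
      x' = x * FRSQRTS(a*x, x) = x * (3 - a*x*x) / 2. */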
10186 return False;
10187 # undef INSN
10191 static
10192 Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
10194 /* 31 29 28 23 21 16 11 9 4
10195 01 U 11110 size 10000 opcode 10 n d
10196 Decode fields: u,size,opcode
10198 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10199 if (INSN(31,30) != BITS2(0,1)
10200 || INSN(28,24) != BITS5(1,1,1,1,0)
10201 || INSN(21,17) != BITS5(1,0,0,0,0)
10202 || INSN(11,10) != BITS2(1,0)) {
10203 return False;
10205 UInt bitU = INSN(29,29);
10206 UInt size = INSN(23,22);
10207 UInt opcode = INSN(16,12);
10208 UInt nn = INSN(9,5);
10209 UInt dd = INSN(4,0);
10210 vassert(size < 4);
10212 if (opcode == BITS5(0,0,0,1,1)) {
10213 /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
10214 /* -------- 1,xx,00011: USQADD std4_std4 -------- */
10215 /* These are a bit tricky (to say the least). See comments on
10216 the vector variants (in dis_AdvSIMD_two_reg_misc) below for
10217 details. */
10218 Bool isUSQADD = bitU == 1;
10219 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
10220 : mkVecQADDEXTUSSATSS(size);
10221 IROp nop = mkVecADD(size);
10222 IRTemp argL = newTempV128();
10223 IRTemp argR = newTempV128();
10224 assign(argL, getQReg128(nn));
10225 assign(argR, getQReg128(dd));
10226 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10227 size, binop(qop, mkexpr(argL), mkexpr(argR)));
10228 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10229 size, binop(nop, mkexpr(argL), mkexpr(argR)));
10230 putQReg128(dd, mkexpr(qres));
10231 updateQCFLAGwithDifference(qres, nres);
10232 const HChar arr = "bhsd"[size];
10233 DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
10234 return True;
10237 if (opcode == BITS5(0,0,1,1,1)) {
10238 /* -------- 0,xx,00111 SQABS std4_std4 -------- */
10239 /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
10240 Bool isNEG = bitU == 1;
10241 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
10242 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
10243 getQReg128(nn), size );
10244 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
10245 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
10246 putQReg128(dd, mkexpr(qres));
10247 updateQCFLAGwithDifference(qres, nres);
10248 const HChar arr = "bhsd"[size];
10249 DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
10250 return True;
10253 if (size == X11 && opcode == BITS5(0,1,0,0,0)) {
10254 /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
10255 /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
10256 Bool isGT = bitU == 0;
10257 IRExpr* argL = getQReg128(nn);
10258 IRExpr* argR = mkV128(0x0000);
10259 IRTemp res = newTempV128();
10260 assign(res, isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
10261 : unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL)));
10262 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10263 DIP("cm%s d%u, d%u, #0\n", isGT ? "gt" : "ge", dd, nn);
10264 return True;
10267 if (size == X11 && opcode == BITS5(0,1,0,0,1)) {
10268 /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
10269 /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
10270 Bool isEQ = bitU == 0;
10271 IRExpr* argL = getQReg128(nn);
10272 IRExpr* argR = mkV128(0x0000);
10273 IRTemp res = newTempV128();
10274 assign(res, isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
10275 : unop(Iop_NotV128,
10276 binop(Iop_CmpGT64Sx2, argL, argR)));
10277 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10278 DIP("cm%s d%u, d%u, #0\n", isEQ ? "eq" : "le", dd, nn);
10279 return True;
10282 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,0)) {
10283 /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
10284 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10285 binop(Iop_CmpGT64Sx2, mkV128(0x0000),
10286 getQReg128(nn))));
10287 DIP("cm%s d%u, d%u, #0\n", "lt", dd, nn);
10288 return True;
10291 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
10292 /* -------- 0,11,01011 ABS d_d -------- */
10293 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10294 unop(Iop_Abs64x2, getQReg128(nn))));
10295 DIP("abs d%u, d%u\n", dd, nn);
10296 return True;
10299 if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
10300 /* -------- 1,11,01011 NEG d_d -------- */
10301 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10302 binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
10303 DIP("neg d%u, d%u\n", dd, nn);
10304 return True;
10307 UInt ix = 0; /*INVALID*/
10308 if (size >= X10) {
10309 switch (opcode) {
10310 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
10311 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
10312 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
10313 default: break;
10316 if (ix > 0) {
10317 /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
10318 /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
10319 /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
10320 /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
10321 /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
10322 Bool isD = size == X11;
10323 IRType ity = isD ? Ity_F64 : Ity_F32;
10324 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
10325 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
10326 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
10327 IROp opCmp = Iop_INVALID;
10328 Bool swap = False;
10329 const HChar* nm = "??";
10330 switch (ix) {
10331 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
10332 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
10333 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
10334 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
10335 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
10336 default: vassert(0);
10338 IRExpr* zero = mkV128(0x0000);
10339 IRTemp res = newTempV128();
10340 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
10341 : binop(opCmp, getQReg128(nn), zero));
10342 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10343 mkexpr(res))));
10345 DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
10346 return True;
10349 if (opcode == BITS5(1,0,1,0,0)
10350 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
10351 /* -------- 0,xx,10100: SQXTN -------- */
10352 /* -------- 1,xx,10100: UQXTN -------- */
10353 /* -------- 1,xx,10010: SQXTUN -------- */
10354 if (size == X11) return False;
10355 vassert(size < 3);
10356 IROp opN = Iop_INVALID;
10357 Bool zWiden = True;
10358 const HChar* nm = "??";
10359 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
10360 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
10362 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
10363 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
10365 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
10366 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
10368 else vassert(0);
10369 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10370 size+1, getQReg128(nn));
10371 IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10372 size, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
10373 putQReg128(dd, mkexpr(resN));
10374 /* This widens zero lanes to zero, and compares it against zero, so all
10375 of the non-participating lanes make no contribution to the
10376 Q flag state. */
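      /* Saturation is detected by re-widening the narrowed result and
         comparing it against the original source lane: if the round trip
         does not reproduce the source, the value was clipped, and
         updateQCFLAGwithDifference below then sets QC. */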
10377 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
10378 size, mkexpr(resN));
10379 updateQCFLAGwithDifference(src, resW);
10380 const HChar arrNarrow = "bhsd"[size];
10381 const HChar arrWide = "bhsd"[size+1];
10382 DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn);
10383 return True;
10386 if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
10387 /* -------- 1,01,10110 FCVTXN s_d -------- */
10388 /* Using Irrm_NEAREST here isn't right. The docs say "round to
10389 odd" but I don't know what that really means. */
10390 putQRegLO(dd,
10391 binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
10392 getQRegLO(nn, Ity_F64)));
10393 putQRegLane(dd, 1, mkU32(0));
10394 putQRegLane(dd, 1, mkU64(0));
10395 DIP("fcvtxn s%u, d%u\n", dd, nn);
10396 return True;
10399 ix = 0; /*INVALID*/
10400 switch (opcode) {
10401 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
10402 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
10403 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
10404 default: break;
10406 if (ix > 0) {
10407 /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
10408 /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
10409 /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
10410 /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
10411 /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
10412 /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
10413 /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
10414 /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
10415 /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
10416 /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
10417 Bool isD = (size & 1) == 1;
10418 IRType tyF = isD ? Ity_F64 : Ity_F32;
10419 IRType tyI = isD ? Ity_I64 : Ity_I32;
10420 IRRoundingMode irrm = 8; /*impossible*/
10421 HChar ch = '?';
10422 switch (ix) {
10423 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
10424 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
10425 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
10426 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
10427 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
10428 default: vassert(0);
10430 IROp cvt = Iop_INVALID;
10431 if (bitU == 1) {
10432 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
10433 } else {
10434 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
10436 IRTemp src = newTemp(tyF);
10437 IRTemp res = newTemp(tyI);
10438 assign(src, getQRegLane(nn, 0, tyF));
10439 assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
10440 putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
10441 if (!isD) {
10442 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
10444 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
10445 HChar sOrD = isD ? 'd' : 's';
10446 DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
10447 sOrD, dd, sOrD, nn);
10448 return True;
10451 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
10452 /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
10453 /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
10454 Bool isU = bitU == 1;
10455 Bool isD = (size & 1) == 1;
10456 IRType tyI = isD ? Ity_I64 : Ity_I32;
10457 IROp iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
10458 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
10459 IRTemp rm = mk_get_IR_rounding_mode();
10460 putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
10461 if (!isD) {
10462 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
10464 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
10465 HChar c = isD ? 'd' : 's';
10466 DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
10467 return True;
10470 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
10471 /* -------- 0,1x,11101: FRECPE d_d, s_s -------- */
10472 /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
10473 Bool isSQRT = bitU == 1;
10474 Bool isD = (size & 1) == 1;
10475 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
10476 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
10477 IRTemp resV = newTempV128();
10478 assign(resV, unop(op, getQReg128(nn)));
10479 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
10480 mkexpr(resV))));
10481 HChar c = isD ? 'd' : 's';
10482 DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
10483 return True;
10486 if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
10487 /* -------- 0,1x,11111: FRECPX d_d, s_s -------- */
10488 Bool isD = (size & 1) == 1;
10489 IRType ty = isD ? Ity_F64 : Ity_F32;
10490 IROp op = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
10491 IRTemp res = newTemp(ty);
10492 IRTemp rm = mk_get_IR_rounding_mode();
10493 assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
10494 putQReg128(dd, mkV128(0x0000));
10495 putQRegLane(dd, 0, mkexpr(res));
10496 HChar c = isD ? 'd' : 's';
10497 DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
10498 return True;
10501 return False;
10502 # undef INSN
10506 static
10507 Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
10509 /* 31 28 23 21 20 19 15 11 9 4
10510 01 U 11111 size L M m opcode H 0 n d
10511 Decode fields are: u,size,opcode
10512 M is really part of the mm register number. Individual
10513 cases need to inspect L and H though.
10515 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10516 if (INSN(31,30) != BITS2(0,1)
10517 || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) !=0) {
10518 return False;
10520 UInt bitU = INSN(29,29);
10521 UInt size = INSN(23,22);
10522 UInt bitL = INSN(21,21);
10523 UInt bitM = INSN(20,20);
10524 UInt mmLO4 = INSN(19,16);
10525 UInt opcode = INSN(15,12);
10526 UInt bitH = INSN(11,11);
10527 UInt nn = INSN(9,5);
10528 UInt dd = INSN(4,0);
10529 vassert(size < 4);
10530 vassert(bitH < 2 && bitM < 2 && bitL < 2);
10532 if (bitU == 0 && size >= X10
10533 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
10534 /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
10535 /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
10536 Bool isD = (size & 1) == 1;
10537 Bool isSUB = opcode == BITS4(0,1,0,1);
10538 UInt index;
10539 if (!isD) index = (bitH << 1) | bitL;
10540 else if (isD && bitL == 0) index = bitH;
10541 else return False; // sz:L == x11 => unallocated encoding
10542 vassert(index < (isD ? 2 : 4));
10543 IRType ity = isD ? Ity_F64 : Ity_F32;
10544 IRTemp elem = newTemp(ity);
10545 UInt mm = (bitM << 4) | mmLO4;
10546 assign(elem, getQRegLane(mm, index, ity));
10547 IRTemp dupd = math_DUP_TO_V128(elem, ity);
10548 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
10549 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
10550 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
10551 IRTemp rm = mk_get_IR_rounding_mode();
10552 IRTemp t1 = newTempV128();
10553 IRTemp t2 = newTempV128();
10554 // FIXME: double rounding; use FMA primops instead
10555 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
10556 assign(t2, triop(isSUB ? opSUB : opADD,
10557 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
10558 putQReg128(dd,
10559 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
10560 mkexpr(t2))));
10561 const HChar c = isD ? 'd' : 's';
10562 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
10563 c, dd, c, nn, nameQReg128(mm), c, index);
10564 return True;
10567 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
10568 /* -------- 0,1x,1001 FMUL d_d_d[], s_s_s[] -------- */
10569 /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
10570 Bool isD = (size & 1) == 1;
10571 Bool isMULX = bitU == 1;
10572 UInt index;
10573 if (!isD) index = (bitH << 1) | bitL;
10574 else if (isD && bitL == 0) index = bitH;
10575 else return False; // sz:L == x11 => unallocated encoding
10576 vassert(index < (isD ? 2 : 4));
10577 IRType ity = isD ? Ity_F64 : Ity_F32;
10578 IRTemp elem = newTemp(ity);
10579 UInt mm = (bitM << 4) | mmLO4;
10580 assign(elem, getQRegLane(mm, index, ity));
10581 IRTemp dupd = math_DUP_TO_V128(elem, ity);
10582 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
10583 IRTemp rm = mk_get_IR_rounding_mode();
10584 IRTemp t1 = newTempV128();
10585 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
10586 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
10587 putQReg128(dd,
10588 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
10589 mkexpr(t1))));
10590 const HChar c = isD ? 'd' : 's';
10591 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
10592 c, dd, c, nn, nameQReg128(mm), c, index);
10593 return True;
10596 if (bitU == 0
10597 && (opcode == BITS4(1,0,1,1)
10598 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
10599 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
10600 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
10601 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
10602 /* Widens, and size refers to the narrowed lanes. */
10603 UInt ks = 3;
10604 switch (opcode) {
10605 case BITS4(1,0,1,1): ks = 0; break;
10606 case BITS4(0,0,1,1): ks = 1; break;
10607 case BITS4(0,1,1,1): ks = 2; break;
10608 default: vassert(0);
10610 vassert(ks >= 0 && ks <= 2);
10611 UInt mm = 32; // invalid
10612 UInt ix = 16; // invalid
10613 switch (size) {
10614 case X00:
10615 return False; // h_b_b[] case is not allowed
10616 case X01:
10617 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
10618 case X10:
10619 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
10620 case X11:
10621 return False; // q_d_d[] case is not allowed
10622 default:
10623 vassert(0);
10625 vassert(mm < 32 && ix < 16);
10626 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
10627 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
10628 newTempsV128_2(&vecN, &vecD);
10629 assign(vecN, getQReg128(nn));
10630 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
10631 assign(vecD, getQReg128(dd));
10632 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
10633 False/*!is2*/, size, "mas"[ks],
10634 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
10635 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
10636 putQReg128(dd, unop(opZHI, mkexpr(res)));
10637 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
10638 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
10639 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
10640 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
10642 const HChar* nm = ks == 0 ? "sqdmull"
10643 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
10644 const HChar arrNarrow = "bhsd"[size];
10645 const HChar arrWide = "bhsd"[size+1];
10646 DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
10647 nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
10648 return True;
10651 if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
10652 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
10653 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
10654 UInt mm = 32; // invalid
10655 UInt ix = 16; // invalid
10656 switch (size) {
10657 case X00:
10658 return False; // b case is not allowed
10659 case X01:
10660 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
10661 case X10:
10662 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
10663 case X11:
10664 return False; // q case is not allowed
10665 default:
10666 vassert(0);
10668 vassert(mm < 32 && ix < 16);
10669 Bool isR = opcode == BITS4(1,1,0,1);
10670 IRTemp res, sat1q, sat1n, vN, vM;
10671 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
10672 vN = newTempV128();
10673 assign(vN, getQReg128(nn));
10674 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
10675 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
10676 IROp opZHI = mkVecZEROHIxxOFV128(size);
10677 putQReg128(dd, unop(opZHI, mkexpr(res)));
10678 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
10679 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
10680 HChar ch = size == X01 ? 'h' : 's';
10681 DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
10682 return True;
10685 return False;
10686 # undef INSN
10690 static
10691 Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
10693 /* 31 28 22 18 15 10 9 4
10694 0 q u 011110 immh immb opcode 1 n d
10695 Decode fields: u,opcode
10697 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10698 if (INSN(31,31) != 0
10699 || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
10700 return False;
10702 UInt bitQ = INSN(30,30);
10703 UInt bitU = INSN(29,29);
10704 UInt immh = INSN(22,19);
10705 UInt immb = INSN(18,16);
10706 UInt opcode = INSN(15,11);
10707 UInt nn = INSN(9,5);
10708 UInt dd = INSN(4,0);
10710 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
10711 /* -------- 0,00000 SSHR std7_std7_#imm -------- */
10712 /* -------- 1,00000 USHR std7_std7_#imm -------- */
10713 /* -------- 0,00010 SSRA std7_std7_#imm -------- */
10714 /* -------- 1,00010 USRA std7_std7_#imm -------- */
10715 /* laneTy, shift = case immh:immb of
10716 0001:xxx -> B, SHR:8-xxx
10717 001x:xxx -> H, SHR:16-xxxx
10718 01xx:xxx -> S, SHR:32-xxxxx
10719 1xxx:xxx -> D, SHR:64-xxxxxx
10720 other -> invalid
10722 UInt size = 0;
10723 UInt shift = 0;
10724 Bool isQ = bitQ == 1;
10725 Bool isU = bitU == 1;
10726 Bool isAcc = opcode == BITS5(0,0,0,1,0);
10727 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10728 if (!ok || (bitQ == 0 && size == X11)) return False;
10729 vassert(size >= 0 && size <= 3);
10730 UInt lanebits = 8 << size;
10731 vassert(shift >= 1 && shift <= lanebits);
10732 IROp op = isU ? mkVecSHRN(size) : mkVecSARN(size);
10733 IRExpr* src = getQReg128(nn);
10734 IRTemp shf = newTempV128();
10735 IRTemp res = newTempV128();
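      /* A shift by the full lane width cannot be expressed directly, since
         the IR shift ops require an amount smaller than the lane size.
         For the unsigned case the result is simply zero; for the signed
         case, shifting by (lanebits - 1) gives the same all-sign-bits
         result, hence the 'nudge' below. */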
10736 if (shift == lanebits && isU) {
10737 assign(shf, mkV128(0x0000));
10738 } else {
10739 UInt nudge = 0;
10740 if (shift == lanebits) {
10741 vassert(!isU);
10742 nudge = 1;
10744 assign(shf, binop(op, src, mkU8(shift - nudge)));
10746 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
10747 : mkexpr(shf));
10748 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
10749 HChar laneCh = "bhsd"[size];
10750 UInt nLanes = (isQ ? 128 : 64) / lanebits;
10751 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
10752 : (isU ? "ushr" : "sshr");
10753 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
10754 nameQReg128(dd), nLanes, laneCh,
10755 nameQReg128(nn), nLanes, laneCh, shift);
10756 return True;
10759 if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
10760 /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
10761 /* -------- 1,00100 URSHR std7_std7_#imm -------- */
10762 /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
10763 /* -------- 1,00110 URSRA std7_std7_#imm -------- */
10764 /* laneTy, shift = case immh:immb of
10765 0001:xxx -> B, SHR:8-xxx
10766 001x:xxx -> H, SHR:16-xxxx
10767 01xx:xxx -> S, SHR:32-xxxxx
10768 1xxx:xxx -> D, SHR:64-xxxxxx
10769 other -> invalid
10771 UInt size = 0;
10772 UInt shift = 0;
10773 Bool isQ = bitQ == 1;
10774 Bool isU = bitU == 1;
10775 Bool isAcc = opcode == BITS5(0,0,1,1,0);
10776 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10777 if (!ok || (bitQ == 0 && size == X11)) return False;
10778 vassert(size >= 0 && size <= 3);
10779 UInt lanebits = 8 << size;
10780 vassert(shift >= 1 && shift <= lanebits);
10781 IROp op = isU ? mkVecRSHU(size) : mkVecRSHS(size);
10782 IRExpr* src = getQReg128(nn);
10783 IRTemp imm8 = newTemp(Ity_I8);
10784 assign(imm8, mkU8((UChar)(-shift)));
10785 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
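      /* For these rounding-shift ops a negative per-lane amount means a
         rightwards shift with rounding.  Replicating the negated shift
         amount into every byte of 'amt' ensures each lane, whatever its
         width, sees a shift of -shift. */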
10786 IRTemp shf = newTempV128();
10787 IRTemp res = newTempV128();
10788 assign(shf, binop(op, src, amt));
10789 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
10790 : mkexpr(shf));
10791 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
10792 HChar laneCh = "bhsd"[size];
10793 UInt nLanes = (isQ ? 128 : 64) / lanebits;
10794 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
10795 : (isU ? "urshr" : "srshr");
10796 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
10797 nameQReg128(dd), nLanes, laneCh,
10798 nameQReg128(nn), nLanes, laneCh, shift);
10799 return True;
10802 if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
10803 /* -------- 1,01000 SRI std7_std7_#imm -------- */
10804 /* laneTy, shift = case immh:immb of
10805 0001:xxx -> B, SHR:8-xxx
10806 001x:xxx -> H, SHR:16-xxxx
10807 01xx:xxx -> S, SHR:32-xxxxx
10808 1xxx:xxx -> D, SHR:64-xxxxxx
10809 other -> invalid
10811 UInt size = 0;
10812 UInt shift = 0;
10813 Bool isQ = bitQ == 1;
10814 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10815 if (!ok || (bitQ == 0 && size == X11)) return False;
10816 vassert(size >= 0 && size <= 3);
10817 UInt lanebits = 8 << size;
10818 vassert(shift >= 1 && shift <= lanebits);
10819 IRExpr* src = getQReg128(nn);
10820 IRTemp res = newTempV128();
10821 if (shift == lanebits) {
10822 assign(res, getQReg128(dd));
10823 } else {
10824 assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
10825 IRExpr* nmask = binop(mkVecSHLN(size),
10826 mkV128(0xFFFF), mkU8(lanebits - shift));
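         /* nmask selects the top 'shift' bits of each lane, so those bits
            of the existing destination are preserved and only the lower
            (lanebits - shift) bits receive the shifted-in value. */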
10827 IRTemp tmp = newTempV128();
10828 assign(tmp, binop(Iop_OrV128,
10829 mkexpr(res),
10830 binop(Iop_AndV128, getQReg128(dd), nmask)));
10831 res = tmp;
10833 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
10834 HChar laneCh = "bhsd"[size];
10835 UInt nLanes = (isQ ? 128 : 64) / lanebits;
10836 DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
10837 nameQReg128(dd), nLanes, laneCh,
10838 nameQReg128(nn), nLanes, laneCh, shift);
10839 return True;
10842 if (opcode == BITS5(0,1,0,1,0)) {
10843 /* -------- 0,01010 SHL std7_std7_#imm -------- */
10844 /* -------- 1,01010 SLI std7_std7_#imm -------- */
10845 /* laneTy, shift = case immh:immb of
10846 0001:xxx -> B, xxx
10847 001x:xxx -> H, xxxx
10848 01xx:xxx -> S, xxxxx
10849 1xxx:xxx -> D, xxxxxx
10850 other -> invalid
10852 UInt size = 0;
10853 UInt shift = 0;
10854 Bool isSLI = bitU == 1;
10855 Bool isQ = bitQ == 1;
10856 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10857 if (!ok || (bitQ == 0 && size == X11)) return False;
10858 vassert(size >= 0 && size <= 3);
10859 /* The shift encoding has opposite sign for the leftwards case.
10860 Adjust shift to compensate. */
10861 UInt lanebits = 8 << size;
10862 shift = lanebits - shift;
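      /* For example, with S lanes (lanebits == 32), immh:immb encodes
         32 + sh and decodes above as a rightwards shift of 32 - sh, so
         lanebits - shift recovers the leftwards amount sh. */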
10863 vassert(shift >= 0 && shift < lanebits);
10864 IROp op = mkVecSHLN(size);
10865 IRExpr* src = getQReg128(nn);
10866 IRTemp res = newTempV128();
10867 if (shift == 0) {
10868 assign(res, src);
10869 } else {
10870 assign(res, binop(op, src, mkU8(shift)));
10871 if (isSLI) {
10872 IRExpr* nmask = binop(mkVecSHRN(size),
10873 mkV128(0xFFFF), mkU8(lanebits - shift));
10874 IRTemp tmp = newTempV128();
10875 assign(tmp, binop(Iop_OrV128,
10876 mkexpr(res),
10877 binop(Iop_AndV128, getQReg128(dd), nmask)));
10878 res = tmp;
10881 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
10882 HChar laneCh = "bhsd"[size];
10883 UInt nLanes = (isQ ? 128 : 64) / lanebits;
10884 const HChar* nm = isSLI ? "sli" : "shl";
10885 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
10886 nameQReg128(dd), nLanes, laneCh,
10887 nameQReg128(nn), nLanes, laneCh, shift);
10888 return True;
10891 if (opcode == BITS5(0,1,1,1,0)
10892 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
10893 /* -------- 0,01110 SQSHL std7_std7_#imm -------- */
10894 /* -------- 1,01110 UQSHL std7_std7_#imm -------- */
10895 /* -------- 1,01100 SQSHLU std7_std7_#imm -------- */
10896 UInt size = 0;
10897 UInt shift = 0;
10898 Bool isQ = bitQ == 1;
10899 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10900 if (!ok || (bitQ == 0 && size == X11)) return False;
10901 vassert(size >= 0 && size <= 3);
10902 /* The shift encoding has opposite sign for the leftwards case.
10903 Adjust shift to compensate. */
10904 UInt lanebits = 8 << size;
10905 shift = lanebits - shift;
10906 vassert(shift >= 0 && shift < lanebits);
10907 const HChar* nm = NULL;
10908 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
10909 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
10910 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
10911 else vassert(0);
10912 IRTemp qDiff1 = IRTemp_INVALID;
10913 IRTemp qDiff2 = IRTemp_INVALID;
10914 IRTemp res = IRTemp_INVALID;
10915 IRTemp src = newTempV128();
10916 assign(src, getQReg128(nn));
10917 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
10918 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
10919 updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
10920 isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
10921 const HChar* arr = nameArr_Q_SZ(bitQ, size);
10922 DIP("%s %s.%s, %s.%s, #%u\n", nm,
10923 nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
10924 return True;
10927 if (bitU == 0
10928 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
10929 /* -------- 0,10000 SHRN{,2} #imm -------- */
10930 /* -------- 0,10001 RSHRN{,2} #imm -------- */
10931 /* Narrows, and size is the narrow size. */
10932 UInt size = 0;
10933 UInt shift = 0;
10934 Bool is2 = bitQ == 1;
10935 Bool isR = opcode == BITS5(1,0,0,0,1);
10936 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10937 if (!ok || size == X11) return False;
10938 vassert(shift >= 1);
10939 IRTemp t1 = newTempV128();
10940 IRTemp t2 = newTempV128();
10941 IRTemp t3 = newTempV128();
10942 assign(t1, getQReg128(nn));
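      /* For the rounding variant, adding 1 << (shift-1) to each wide lane
         before the right shift rounds the result to nearest, with ties
         going upwards. */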
10943 assign(t2, isR ? binop(mkVecADD(size+1),
10944 mkexpr(t1),
10945 mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
10946 : mkexpr(t1));
10947 assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
10948 IRTemp t4 = math_NARROW_LANES(t3, t3, size);
10949 putLO64andZUorPutHI64(is2, dd, t4);
10950 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
10951 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
10952 DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
10953 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
10954 return True;
10957 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
10958 || (bitU == 1
10959 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
10960 /* -------- 0,10010 SQSHRN{,2} #imm -------- */
10961 /* -------- 1,10010 UQSHRN{,2} #imm -------- */
10962 /* -------- 0,10011 SQRSHRN{,2} #imm -------- */
10963 /* -------- 1,10011 UQRSHRN{,2} #imm -------- */
10964 /* -------- 1,10000 SQSHRUN{,2} #imm -------- */
10965 /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
10966 UInt size = 0;
10967 UInt shift = 0;
10968 Bool is2 = bitQ == 1;
10969 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10970 if (!ok || size == X11) return False;
10971 vassert(shift >= 1 && shift <= (8 << size));
10972 const HChar* nm = "??";
10973 IROp op = Iop_INVALID;
10974 /* Decide on the name and the operation. */
10975 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
10976 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
10978 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
10979 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
10981 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
10982 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
10984 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
10985 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
10987 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
10988 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
10990 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
10991 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
10993 else vassert(0);
10994 /* Compute the result (Q, shifted value) pair. */
10995 IRTemp src128 = newTempV128();
10996 assign(src128, getQReg128(nn));
10997 IRTemp pair = newTempV128();
10998 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
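      /* These QAND*NARROW ops return a pair packed into one V128: the
         narrowed lanes in the lower 64 bits and per-lane saturation
         indications in the upper 64 bits.  The two halves are split out
         below; the lower goes to the result register and the upper feeds
         the QC flag update. */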
10999 /* Update the result reg */
11000 IRTemp res64in128 = newTempV128();
11001 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
11002 putLO64andZUorPutHI64(is2, dd, res64in128);
11003 /* Update the Q flag. */
11004 IRTemp q64q64 = newTempV128();
11005 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
11006 IRTemp z128 = newTempV128();
11007 assign(z128, mkV128(0x0000));
11008 updateQCFLAGwithDifference(q64q64, z128);
11009 /* */
11010 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11011 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11012 DIP("%s %s.%s, %s.%s, #%u\n", nm,
11013 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
11014 return True;
11017 if (opcode == BITS5(1,0,1,0,0)) {
11018 /* -------- 0,10100 SSHLL{,2} #imm -------- */
11019 /* -------- 1,10100 USHLL{,2} #imm -------- */
11020 /* 31 28 22 18 15 9 4
11021 0q0 011110 immh immb 101001 n d SSHLL Vd.Ta, Vn.Tb, #sh
11022 0q1 011110 immh immb 101001 n d USHLL Vd.Ta, Vn.Tb, #sh
11023 where Ta,Tb,sh
11024 = case immh of 1xxx -> invalid
11025 01xx -> 2d, 2s(q0)/4s(q1), immh:immb - 32 (0..31)
11026 001x -> 4s, 4h(q0)/8h(q1), immh:immb - 16 (0..15)
11027 0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8 (0..7)
11028 0000 -> AdvSIMD modified immediate (???)
11030 Bool isQ = bitQ == 1;
11031 Bool isU = bitU == 1;
11032 UInt immhb = (immh << 3) | immb;
11033 IRTemp src = newTempV128();
11034 IRTemp zero = newTempV128();
11035 IRExpr* res = NULL;
11036 UInt sh = 0;
11037 const HChar* ta = "??";
11038 const HChar* tb = "??";
11039 assign(src, getQReg128(nn));
11040 assign(zero, mkV128(0x0000));
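      /* The widening shift is done by interleaving the source with zero,
         which parks each narrow lane in the top half of a wide lane, and
         then shifting right (arithmetically for SSHLL, logically for
         USHLL) by (narrow lane width - sh).  That sign/zero extends the
         lane and applies the leftwards shift of sh in one step. */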
11041 if (immh & 8) {
11042 /* invalid; don't assign to res */
11044 else if (immh & 4) {
11045 sh = immhb - 32;
11046 vassert(sh < 32); /* so 32-sh is 1..32 */
11047 ta = "2d";
11048 tb = isQ ? "4s" : "2s";
11049 IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
11050 : mk_InterleaveLO32x4(src, zero);
11051 res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
11053 else if (immh & 2) {
11054 sh = immhb - 16;
11055 vassert(sh < 16); /* so 16-sh is 1..16 */
11056 ta = "4s";
11057 tb = isQ ? "8h" : "4h";
11058 IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
11059 : mk_InterleaveLO16x8(src, zero);
11060 res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
11062 else if (immh & 1) {
11063 sh = immhb - 8;
11064 vassert(sh < 8); /* so 8-sh is 1..8 */
11065 ta = "8h";
11066 tb = isQ ? "16b" : "8b";
11067 IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
11068 : mk_InterleaveLO8x16(src, zero);
11069 res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
11070 } else {
11071 vassert(immh == 0);
11072 /* invalid; don't assign to res */
11074 /* */
11075 if (res) {
11076 putQReg128(dd, res);
11077 DIP("%cshll%s %s.%s, %s.%s, #%u\n",
11078 isU ? 'u' : 's', isQ ? "2" : "",
11079 nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
11080 return True;
11082 return False;
11085 if (opcode == BITS5(1,1,1,0,0)) {
11086 /* -------- 0,11100 SCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
11087 /* -------- 1,11100 UCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
11088 /* If immh is of the form 00xx, the insn is invalid. */
11089 if (immh < BITS4(0,1,0,0)) return False;
11090 UInt size = 0;
11091 UInt fbits = 0;
11092 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
11093 /* The following holds because immh is never zero. */
11094 vassert(ok);
11095 /* The following holds because immh >= 0100. */
11096 vassert(size == X10 || size == X11);
11097 Bool isD = size == X11;
11098 Bool isU = bitU == 1;
11099 Bool isQ = bitQ == 1;
11100 if (isD && !isQ) return False; /* reject .1d case */
11101 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
11102 Double scale = two_to_the_minus(fbits);
11103 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
11104 : IRExpr_Const(IRConst_F32( (Float)scale ));
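      /* The fixed-point conversion is done as an ordinary int->FP
         conversion followed by a multiply by 2^-fbits, both under the
         current rounding mode. */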
11105 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
11106 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
11107 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
11108 IRType tyF = isD ? Ity_F64 : Ity_F32;
11109 IRType tyI = isD ? Ity_I64 : Ity_I32;
11110 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
11111 vassert(nLanes == 2 || nLanes == 4);
11112 for (UInt i = 0; i < nLanes; i++) {
11113 IRTemp src = newTemp(tyI);
11114 IRTemp res = newTemp(tyF);
11115 IRTemp rm = mk_get_IR_rounding_mode();
11116 assign(src, getQRegLane(nn, i, tyI));
11117 assign(res, triop(opMUL, mkexpr(rm),
11118 binop(opCVT, mkexpr(rm), mkexpr(src)),
11119 scaleE));
11120 putQRegLane(dd, i, mkexpr(res));
11122 if (!isQ) {
11123 putQRegLane(dd, 1, mkU64(0));
11125 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11126 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
11127 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
11128 return True;
11131 if (opcode == BITS5(1,1,1,1,1)) {
11132 /* -------- 0,11111 FCVTZS {2d_2d,4s_4s,2s_2s}_imm -------- */
11133 /* -------- 1,11111 FCVTZU {2d_2d,4s_4s,2s_2s}_imm -------- */
11134 /* If immh is of the form 00xx, the insn is invalid. */
11135 if (immh < BITS4(0,1,0,0)) return False;
11136 UInt size = 0;
11137 UInt fbits = 0;
11138 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
11139 /* The following holds because immh is never zero. */
11140 vassert(ok);
11141 /* The following holds because immh >= 0100. */
11142 vassert(size == X10 || size == X11);
11143 Bool isD = size == X11;
11144 Bool isU = bitU == 1;
11145 Bool isQ = bitQ == 1;
11146 if (isD && !isQ) return False; /* reject .1d case */
11147 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
11148 Double scale = two_to_the_plus(fbits);
11149 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
11150 : IRExpr_Const(IRConst_F32( (Float)scale ));
11151 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
11152 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
11153 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
11154 IRType tyF = isD ? Ity_F64 : Ity_F32;
11155 IRType tyI = isD ? Ity_I64 : Ity_I32;
11156 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
11157 vassert(nLanes == 2 || nLanes == 4);
11158 for (UInt i = 0; i < nLanes; i++) {
11159 IRTemp src = newTemp(tyF);
11160 IRTemp res = newTemp(tyI);
11161 IRTemp rm = newTemp(Ity_I32);
11162 assign(src, getQRegLane(nn, i, tyF));
11163 assign(rm, mkU32(Irrm_ZERO));
11164 assign(res, binop(opCVT, mkexpr(rm),
11165 triop(opMUL, mkexpr(rm),
11166 mkexpr(src), scaleE)));
11167 putQRegLane(dd, i, mkexpr(res));
11169 if (!isQ) {
11170 putQRegLane(dd, 1, mkU64(0));
11172 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11173 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
11174 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
11175 return True;
11178 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11179 return False;
11180 # undef INSN
11184 static
11185 Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
11187 /* 31 30 29 28 23 21 20 15 11 9 4
11188 0 Q U 01110 size 1 m opcode 00 n d
11189 Decode fields: u,opcode
11191 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11192 if (INSN(31,31) != 0
11193 || INSN(28,24) != BITS5(0,1,1,1,0)
11194 || INSN(21,21) != 1
11195 || INSN(11,10) != BITS2(0,0)) {
11196 return False;
11198 UInt bitQ = INSN(30,30);
11199 UInt bitU = INSN(29,29);
11200 UInt size = INSN(23,22);
11201 UInt mm = INSN(20,16);
11202 UInt opcode = INSN(15,12);
11203 UInt nn = INSN(9,5);
11204 UInt dd = INSN(4,0);
11205 vassert(size < 4);
11206 Bool is2 = bitQ == 1;
11208 if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
11209 /* -------- 0,0000 SADDL{2} -------- */
11210 /* -------- 1,0000 UADDL{2} -------- */
11211 /* -------- 0,0010 SSUBL{2} -------- */
11212 /* -------- 1,0010 USUBL{2} -------- */
11213 /* Widens, and size refers to the narrow lanes. */
11214 if (size == X11) return False;
11215 vassert(size <= 2);
11216 Bool isU = bitU == 1;
11217 Bool isADD = opcode == BITS4(0,0,0,0);
11218 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
11219 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11220 IRTemp res = newTempV128();
11221 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11222 mkexpr(argL), mkexpr(argR)));
11223 putQReg128(dd, mkexpr(res));
11224 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11225 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11226 const HChar* nm = isADD ? (isU ? "uaddl" : "saddl")
11227 : (isU ? "usubl" : "ssubl");
11228 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11229 nameQReg128(dd), arrWide,
11230 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11231 return True;
11234 if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
11235 /* -------- 0,0001 SADDW{2} -------- */
11236 /* -------- 1,0001 UADDW{2} -------- */
11237 /* -------- 0,0011 SSUBW{2} -------- */
11238 /* -------- 1,0011 USUBW{2} -------- */
11239 /* Widens, and size refers to the narrow lanes. */
11240 if (size == X11) return False;
11241 vassert(size <= 2);
11242 Bool isU = bitU == 1;
11243 Bool isADD = opcode == BITS4(0,0,0,1);
11244 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11245 IRTemp res = newTempV128();
11246 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11247 getQReg128(nn), mkexpr(argR)));
11248 putQReg128(dd, mkexpr(res));
11249 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11250 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11251 const HChar* nm = isADD ? (isU ? "uaddw" : "saddw")
11252 : (isU ? "usubw" : "ssubw");
11253 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11254 nameQReg128(dd), arrWide,
11255 nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
11256 return True;
11259 if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
11260 /* -------- 0,0100 ADDHN{2} -------- */
11261 /* -------- 1,0100 RADDHN{2} -------- */
11262 /* -------- 0,0110 SUBHN{2} -------- */
11263 /* -------- 1,0110 RSUBHN{2} -------- */
11264 /* Narrows, and size refers to the narrowed lanes. */
11265 if (size == X11) return False;
11266 vassert(size <= 2);
11267 const UInt shift[3] = { 8, 16, 32 };
11268 Bool isADD = opcode == BITS4(0,1,0,0);
11269 Bool isR = bitU == 1;
11270 /* Combined elements in wide lanes */
11271 IRTemp wide = newTempV128();
11272 IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
11273 getQReg128(nn), getQReg128(mm));
11274 if (isR) {
11275 wideE = binop(mkVecADD(size+1),
11276 wideE,
11277 mkexpr(math_VEC_DUP_IMM(size+1,
11278 1ULL << (shift[size]-1))));
11280 assign(wide, wideE);
11281 /* Top halves of elements, still in wide lanes */
11282 IRTemp shrd = newTempV128();
11283 assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
11284 /* Elements now compacted into lower 64 bits */
11285 IRTemp new64 = newTempV128();
11286 assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
11287 putLO64andZUorPutHI64(is2, dd, new64);
11288 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11289 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11290 const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
11291 : (isR ? "rsubhn" : "subhn");
11292 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11293 nameQReg128(dd), arrNarrow,
11294 nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
11295 return True;
11298 if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
11299 /* -------- 0,0101 SABAL{2} -------- */
11300 /* -------- 1,0101 UABAL{2} -------- */
11301 /* -------- 0,0111 SABDL{2} -------- */
11302 /* -------- 1,0111 UABDL{2} -------- */
11303 /* Widens, and size refers to the narrow lanes. */
11304 if (size == X11) return False;
11305 vassert(size <= 2);
11306 Bool isU = bitU == 1;
11307 Bool isACC = opcode == BITS4(0,1,0,1);
11308 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
11309 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
11310 IRTemp abd = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
11311 IRTemp res = newTempV128();
11312 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
11313 : mkexpr(abd));
11314 putQReg128(dd, mkexpr(res));
11315 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11316 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11317 const HChar* nm = isACC ? (isU ? "uabal" : "sabal")
11318 : (isU ? "uabdl" : "sabdl");
11319 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11320 nameQReg128(dd), arrWide,
11321 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11322 return True;
11325 if (opcode == BITS4(1,1,0,0)
11326 || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
11327 /* -------- 0,1100 SMULL{2} -------- */ // 0 (ks)
11328 /* -------- 1,1100 UMULL{2} -------- */ // 0
11329 /* -------- 0,1000 SMLAL{2} -------- */ // 1
11330 /* -------- 1,1000 UMLAL{2} -------- */ // 1
11331 /* -------- 0,1010 SMLSL{2} -------- */ // 2
11332 /* -------- 1,1010 UMLSL{2} -------- */ // 2
11333 /* Widens, and size refers to the narrow lanes. */
11334 UInt ks = 3;
11335 switch (opcode) {
11336 case BITS4(1,1,0,0): ks = 0; break;
11337 case BITS4(1,0,0,0): ks = 1; break;
11338 case BITS4(1,0,1,0): ks = 2; break;
11339 default: vassert(0);
11341 vassert(ks >= 0 && ks <= 2);
11342 if (size == X11) return False;
11343 vassert(size <= 2);
11344 Bool isU = bitU == 1;
11345 IRTemp vecN = newTempV128();
11346 IRTemp vecM = newTempV128();
11347 IRTemp vecD = newTempV128();
11348 assign(vecN, getQReg128(nn));
11349 assign(vecM, getQReg128(mm));
11350 assign(vecD, getQReg128(dd));
11351 IRTemp res = IRTemp_INVALID;
11352 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
11353 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11354 putQReg128(dd, mkexpr(res));
11355 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11356 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11357 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
11358 DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
11359 nameQReg128(dd), arrWide,
11360 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11361 return True;
11364 if (bitU == 0
11365 && (opcode == BITS4(1,1,0,1)
11366 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
11367 /* -------- 0,1101 SQDMULL{2} -------- */ // 0 (ks)
11368 /* -------- 0,1001 SQDMLAL{2} -------- */ // 1
11369 /* -------- 0,1011 SQDMLSL{2} -------- */ // 2
11370 /* Widens, and size refers to the narrow lanes. */
11371 UInt ks = 3;
11372 switch (opcode) {
11373 case BITS4(1,1,0,1): ks = 0; break;
11374 case BITS4(1,0,0,1): ks = 1; break;
11375 case BITS4(1,0,1,1): ks = 2; break;
11376 default: vassert(0);
11378 vassert(ks >= 0 && ks <= 2);
11379 if (size == X00 || size == X11) return False;
11380 vassert(size <= 2);
11381 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
11382 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
11383 newTempsV128_3(&vecN, &vecM, &vecD);
11384 assign(vecN, getQReg128(nn));
11385 assign(vecM, getQReg128(mm));
11386 assign(vecD, getQReg128(dd));
11387 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
11388 is2, size, "mas"[ks],
11389 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11390 putQReg128(dd, mkexpr(res));
11391 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
11392 updateQCFLAGwithDifference(sat1q, sat1n);
11393 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
11394 updateQCFLAGwithDifference(sat2q, sat2n);
11396 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
11397 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
11398 const HChar* nm = ks == 0 ? "sqdmull"
11399 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
11400 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
11401 nameQReg128(dd), arrWide,
11402 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11403 return True;
11406 if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
11407 /* -------- 0,1110 PMULL{2} -------- */
11408 /* Widens, and size refers to the narrow lanes. */
11409 if (size != X00 && size != X11) return False;
11410 IRTemp res = IRTemp_INVALID;
11411 IRExpr* srcN = getQReg128(nn);
11412 IRExpr* srcM = getQReg128(mm);
11413 const HChar* arrNarrow = NULL;
11414 const HChar* arrWide = NULL;
11415 if (size == X00) {
11416 res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
11417 srcN, srcM);
11418 arrNarrow = nameArr_Q_SZ(bitQ, size);
11419 arrWide = nameArr_Q_SZ(1, size+1);
11420 } else {
11421 /* The same thing as the X00 case, except we have to call
11422 a helper to do it. */
11423 vassert(size == X11);
11424 res = newTemp(Ity_V128);
11425 IROp slice
11426 = is2 ? Iop_V128HIto64 : Iop_V128to64;
11427 IRExpr** args
11428 = mkIRExprVec_3( IRExpr_VECRET(),
11429 unop(slice, srcN), unop(slice, srcM));
11430 IRDirty* di
11431 = unsafeIRDirty_1_N( res, 0/*regparms*/,
11432 "arm64g_dirtyhelper_PMULLQ",
11433 &arm64g_dirtyhelper_PMULLQ, args);
11434 stmt(IRStmt_Dirty(di));
11435 /* We can't use nameArr_Q_SZ for this because it can't deal with
11436 Q-sized (128 bit) results. Hence do it by hand. */
11437 arrNarrow = bitQ == 0 ? "1d" : "2d";
11438 arrWide = "1q";
11440 putQReg128(dd, mkexpr(res));
11441 DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
11442 nameQReg128(dd), arrWide,
11443 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
11444 return True;
11447 return False;
11448 # undef INSN
11452 static
11453 Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
11455 /* 31 30 29 28 23 21 20 15 10 9 4
11456 0 Q U 01110 size 1 m opcode 1 n d
11457 Decode fields: u,size,opcode
11459 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11460 if (INSN(31,31) != 0
11461 || INSN(28,24) != BITS5(0,1,1,1,0)
11462 || INSN(21,21) != 1
11463 || INSN(10,10) != 1) {
11464 return False;
11466 UInt bitQ = INSN(30,30);
11467 UInt bitU = INSN(29,29);
11468 UInt size = INSN(23,22);
11469 UInt mm = INSN(20,16);
11470 UInt opcode = INSN(15,11);
11471 UInt nn = INSN(9,5);
11472 UInt dd = INSN(4,0);
11473 vassert(size < 4);
11475 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
11476 /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
11477 /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
11478 /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
11479 /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
11480 if (size == X11) return False;
11481 Bool isADD = opcode == BITS5(0,0,0,0,0);
11482 Bool isU = bitU == 1;
11483 /* Widen both args out, do the math, narrow to final result. */
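      /* Because the lanes are widened first, the add/sub cannot wrap, so
         the halving (the shift right by one below) is exact. */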
11484 IRTemp argL = newTempV128();
11485 IRTemp argLhi = IRTemp_INVALID;
11486 IRTemp argLlo = IRTemp_INVALID;
11487 IRTemp argR = newTempV128();
11488 IRTemp argRhi = IRTemp_INVALID;
11489 IRTemp argRlo = IRTemp_INVALID;
11490 IRTemp resHi = newTempV128();
11491 IRTemp resLo = newTempV128();
11492 IRTemp res = IRTemp_INVALID;
11493 assign(argL, getQReg128(nn));
11494 argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
11495 argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argL));
11496 assign(argR, getQReg128(mm));
11497 argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
11498 argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argR));
11499 IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
11500 IROp opSxR = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
11501 assign(resHi, binop(opSxR,
11502 binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
11503 mkU8(1)));
11504 assign(resLo, binop(opSxR,
11505 binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
11506 mkU8(1)));
11507 res = math_NARROW_LANES ( resHi, resLo, size );
11508 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11509 const HChar* nm = isADD ? (isU ? "uhadd" : "shadd")
11510 : (isU ? "uhsub" : "shsub");
11511 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11512 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11513 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11514 return True;
11517 if (opcode == BITS5(0,0,0,1,0)) {
11518 /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
11519 /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
11520 if (bitQ == 0 && size == X11) return False; // implied 1d case
11521 Bool isU = bitU == 1;
11522 IRTemp argL = newTempV128();
11523 IRTemp argR = newTempV128();
11524 assign(argL, getQReg128(nn));
11525 assign(argR, getQReg128(mm));
11526 IRTemp res = math_RHADD(size, isU, argL, argR);
11527 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11528 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11529 DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
11530 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11531 return True;
11534 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
11535 /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
11536 /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
11537 /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
11538 /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
11539 if (bitQ == 0 && size == X11) return False; // implied 1d case
11540 Bool isADD = opcode == BITS5(0,0,0,0,1);
11541 Bool isU = bitU == 1;
11542 IROp qop = Iop_INVALID;
11543 IROp nop = Iop_INVALID;
11544 if (isADD) {
11545 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
11546 nop = mkVecADD(size);
11547 } else {
11548 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
11549 nop = mkVecSUB(size);
11551 IRTemp argL = newTempV128();
11552 IRTemp argR = newTempV128();
11553 IRTemp qres = newTempV128();
11554 IRTemp nres = newTempV128();
11555 assign(argL, getQReg128(nn));
11556 assign(argR, getQReg128(mm));
11557 assign(qres, math_MAYBE_ZERO_HI64_fromE(
11558 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
11559 assign(nres, math_MAYBE_ZERO_HI64_fromE(
11560 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
11561 putQReg128(dd, mkexpr(qres));
11562 updateQCFLAGwithDifference(qres, nres);
11563 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
11564 : (isU ? "uqsub" : "sqsub");
11565 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11566 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11567 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11568 return True;
11571 if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
11572 /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
11573 /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
11574 /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
11575 /* -------- 0,11,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
11576 Bool isORx = (size & 2) == 2;
11577 Bool invert = (size & 1) == 1;
11578 IRTemp res = newTempV128();
11579 assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
11580 getQReg128(nn),
11581 invert ? unop(Iop_NotV128, getQReg128(mm))
11582 : getQReg128(mm)));
11583 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11584 const HChar* names[4] = { "and", "bic", "orr", "orn" };
11585 const HChar* ar = bitQ == 1 ? "16b" : "8b";
11586 DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
11587 nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
11588 return True;
11591 if (bitU == 1 && opcode == BITS5(0,0,0,1,1)) {
11592 /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
11593 /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
11594 /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
11595 /* -------- 1,11,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
11596 IRTemp argD = newTempV128();
11597 IRTemp argN = newTempV128();
11598 IRTemp argM = newTempV128();
11599 assign(argD, getQReg128(dd));
11600 assign(argN, getQReg128(nn));
11601 assign(argM, getQReg128(mm));
11602 const IROp opXOR = Iop_XorV128;
11603 const IROp opAND = Iop_AndV128;
11604 const IROp opNOT = Iop_NotV128;
11605 IRTemp res = newTempV128();
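      /* All three selects use the identity
            x ^ ((x ^ y) & m)  ==  (y & m) | (x & ~m)
         i.e. take bits of y where m is set and bits of x elsewhere.
         BSL selects with the destination as the mask, BIT inserts N bits
         where M is set, and BIF where M is clear. */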
11606 switch (size) {
11607 case BITS2(0,0): /* EOR */
11608 assign(res, binop(opXOR, mkexpr(argM), mkexpr(argN)));
11609 break;
11610 case BITS2(0,1): /* BSL */
11611 assign(res, binop(opXOR, mkexpr(argM),
11612 binop(opAND,
11613 binop(opXOR, mkexpr(argM), mkexpr(argN)),
11614 mkexpr(argD))));
11615 break;
11616 case BITS2(1,0): /* BIT */
11617 assign(res, binop(opXOR, mkexpr(argD),
11618 binop(opAND,
11619 binop(opXOR, mkexpr(argD), mkexpr(argN)),
11620 mkexpr(argM))));
11621 break;
11622 case BITS2(1,1): /* BIF */
11623 assign(res, binop(opXOR, mkexpr(argD),
11624 binop(opAND,
11625 binop(opXOR, mkexpr(argD), mkexpr(argN)),
11626 unop(opNOT, mkexpr(argM)))));
11627 break;
11628 default:
11629 vassert(0);
11631 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11632 const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
11633 const HChar* arr = bitQ == 1 ? "16b" : "8b";
11634 DIP("%s %s.%s, %s.%s, %s.%s\n", nms[size],
11635 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11636 return True;
11639 if (opcode == BITS5(0,0,1,1,0)) {
11640 /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
11641 /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
11642 if (bitQ == 0 && size == X11) return False; // implied 1d case
11643 Bool isGT = bitU == 0;
11644 IRExpr* argL = getQReg128(nn);
11645 IRExpr* argR = getQReg128(mm);
11646 IRTemp res = newTempV128();
11647 assign(res,
11648 isGT ? binop(mkVecCMPGTS(size), argL, argR)
11649 : binop(mkVecCMPGTU(size), argL, argR));
11650 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11651 const HChar* nm = isGT ? "cmgt" : "cmhi";
11652 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11653 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11654 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11655 return True;
11658 if (opcode == BITS5(0,0,1,1,1)) {
11659 /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
11660 /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
11661 if (bitQ == 0 && size == X11) return False; // implied 1d case
11662 Bool isGE = bitU == 0;
11663 IRExpr* argL = getQReg128(nn);
11664 IRExpr* argR = getQReg128(mm);
11665 IRTemp res = newTempV128();
11666 assign(res,
11667 isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
11668 : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
11669 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11670 const HChar* nm = isGE ? "cmge" : "cmhs";
11671 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11672 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11673 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11674 return True;
11677 if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
11678 /* -------- 0,xx,01000 SSHL std7_std7_std7 -------- */
11679 /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
11680 /* -------- 1,xx,01000 USHL std7_std7_std7 -------- */
11681 /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
11682 if (bitQ == 0 && size == X11) return False; // implied 1d case
11683 Bool isU = bitU == 1;
11684 Bool isR = opcode == BITS5(0,1,0,1,0);
11685 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
11686 : (isU ? mkVecSHU(size) : mkVecSHS(size));
11687 IRTemp res = newTempV128();
11688 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
11689 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11690 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
11691 : (isU ? "ushl" : "sshl");
11692 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11693 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11694 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11695 return True;
11698 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
11699 /* -------- 0,xx,01001 SQSHL std7_std7_std7 -------- */
11700 /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
11701 /* -------- 1,xx,01001 UQSHL std7_std7_std7 -------- */
11702 /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
11703 if (bitQ == 0 && size == X11) return False; // implied 1d case
11704 Bool isU = bitU == 1;
11705 Bool isR = opcode == BITS5(0,1,0,1,1);
11706 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
11707 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
11708 /* This is a bit tricky. If we're only interested in the lowest 64 bits
11709 of the result (viz, bitQ == 0), then we must adjust the operands to
11710 ensure that the upper part of the result, that we don't care about,
11711 doesn't pollute the returned Q value. To do this, zero out the upper
11712 operand halves beforehand. This works because it means, for the
11713 lanes we don't care about, we are shifting zero by zero, which can
11714 never saturate. */
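      /* These ops return a 256-bit pair: the low 128 bits are the shifted
         result and the high 128 bits indicate, per lane, whether
         saturation occurred; the latter is compared against zero to
         update QC. */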
11715 IRTemp res256 = newTemp(Ity_V256);
11716 IRTemp resSH = newTempV128();
11717 IRTemp resQ = newTempV128();
11718 IRTemp zero = newTempV128();
11719 assign(res256, binop(op,
11720 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
11721 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
11722 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
11723 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
11724 assign(zero, mkV128(0x0000));
11725 putQReg128(dd, mkexpr(resSH));
11726 updateQCFLAGwithDifference(resQ, zero);
11727 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
11728 : (isU ? "uqshl" : "sqshl");
11729 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11730 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11731 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11732 return True;
11735 if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
11736 /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
11737 /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
11738 /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
11739 /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
11740 if (bitQ == 0 && size == X11) return False; // implied 1d case
11741 Bool isU = bitU == 1;
11742 Bool isMAX = (opcode & 1) == 0;
11743 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
11744 : (isU ? mkVecMINU(size) : mkVecMINS(size));
11745 IRTemp t = newTempV128();
11746 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
11747 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
11748 const HChar* nm = isMAX ? (isU ? "umax" : "smax")
11749 : (isU ? "umin" : "smin");
11750 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11751 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11752 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11753 return True;
11756 if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
11757 /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
11758 /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
11759 /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
11760 /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
11761 if (size == X11) return False; // 1d/2d cases not allowed
11762 Bool isU = bitU == 1;
11763 Bool isACC = opcode == BITS5(0,1,1,1,1);
11764 vassert(size <= 2);
11765 IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
11766 IRTemp t2 = newTempV128();
11767 assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
11768 : mkexpr(t1));
11769 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
11770 const HChar* nm = isACC ? (isU ? "uaba" : "saba")
11771 : (isU ? "uabd" : "sabd");
11772 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11773 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11774 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11775 return True;
11778 if (opcode == BITS5(1,0,0,0,0)) {
11779 /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
11780 /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
11781 if (bitQ == 0 && size == X11) return False; // implied 1d case
11782 Bool isSUB = bitU == 1;
11783 IROp op = isSUB ? mkVecSUB(size) : mkVecADD(size);
11784 IRTemp t = newTempV128();
11785 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
11786 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
11787 const HChar* nm = isSUB ? "sub" : "add";
11788 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11789 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11790 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11791 return True;
11794 if (opcode == BITS5(1,0,0,0,1)) {
11795 /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
11796 /* -------- 1,xx,10001 CMEQ std7_std7_std7 -------- */ // ==
11797 if (bitQ == 0 && size == X11) return False; // implied 1d case
11798 Bool isEQ = bitU == 1;
11799 IRExpr* argL = getQReg128(nn);
11800 IRExpr* argR = getQReg128(mm);
11801 IRTemp res = newTempV128();
11802 assign(res,
11803 isEQ ? binop(mkVecCMPEQ(size), argL, argR)
11804 : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
11805 binop(Iop_AndV128, argL, argR),
11806 mkV128(0x0000))));
11807 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11808 const HChar* nm = isEQ ? "cmeq" : "cmtst";
11809 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11810 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11811 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11812 return True;
11815 if (opcode == BITS5(1,0,0,1,0)) {
11816 /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
11817 /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
11818 if (bitQ == 0 && size == X11) return False; // implied 1d case
11819 Bool isMLS = bitU == 1;
11820 IROp opMUL = mkVecMUL(size);
11821 IROp opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
11822 IRTemp res = newTempV128();
11823 if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
11824 assign(res, binop(opADDSUB,
11825 getQReg128(dd),
11826 binop(opMUL, getQReg128(nn), getQReg128(mm))));
11827 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11828 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11829 DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
11830 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11831 return True;
11833 return False;
11836 if (opcode == BITS5(1,0,0,1,1)) {
11837 /* -------- 0,xx,10011 MUL std7_std7_std7 -------- */
11838 /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
11839 if (bitQ == 0 && size == X11) return False; // implied 1d case
11840 Bool isPMUL = bitU == 1;
11841 const IROp opsPMUL[4]
11842 = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID };
11843 IROp opMUL = isPMUL ? opsPMUL[size] : mkVecMUL(size);
11844 IRTemp res = newTempV128();
11845 if (opMUL != Iop_INVALID) {
11846 assign(res, binop(opMUL, getQReg128(nn), getQReg128(mm)));
11847 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11848 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11849 DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL ? "pmul" : "mul",
11850 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11851 return True;
11853 return False;
11856 if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
11857 /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
11858 /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
11859 /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
11860 /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
11861 if (size == X11) return False;
11862 Bool isU = bitU == 1;
11863 Bool isMAX = opcode == BITS5(1,0,1,0,0);
11864 IRTemp vN = newTempV128();
11865 IRTemp vM = newTempV128();
11866 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
11867 : (isU ? mkVecMINU(size) : mkVecMINS(size));
11868 assign(vN, getQReg128(nn));
11869 assign(vM, getQReg128(mm));
11870 IRTemp res128 = newTempV128();
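      /* Pairwise trick: concatenating the even lanes of M:N and the odd
         lanes of M:N lines up each adjacent source pair in the same lane
         position of the two operands, so a single lane-wise MIN/MAX below
         produces all the pairwise results at once. */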
11871 assign(res128,
11872 binop(op,
11873 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
11874 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
11875 /* In the half-width case, use CatEL32x4 to extract the half-width
11876 result from the full-width result. */
11877 IRExpr* res
11878 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
11879 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
11880 mkexpr(res128)))
11881 : mkexpr(res128);
11882 putQReg128(dd, res);
11883 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11884 const HChar* nm = isMAX ? (isU ? "umaxp" : "smaxp")
11885 : (isU ? "uminp" : "sminp");
11886 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11887 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11888 return True;
11891 if (opcode == BITS5(1,0,1,1,0)) {
11892 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
11893 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
11894 if (size == X00 || size == X11) return False;
11895 Bool isR = bitU == 1;
11896 IRTemp res, sat1q, sat1n, vN, vM;
11897 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
11898 newTempsV128_2(&vN, &vM);
11899 assign(vN, getQReg128(nn));
11900 assign(vM, getQReg128(mm));
11901 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
11902 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11903 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
11904 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
11905 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11906 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
11907 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
11908 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11909 return True;
11912 if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
11913 /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
11914 if (bitQ == 0 && size == X11) return False; // implied 1d case
11915 IRTemp vN = newTempV128();
11916 IRTemp vM = newTempV128();
11917 assign(vN, getQReg128(nn));
11918 assign(vM, getQReg128(mm));
11919 IRTemp res128 = newTempV128();
11920 assign(res128,
11921 binop(mkVecADD(size),
11922 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
11923 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
11924 /* In the half-width case, use CatEL32x4 to extract the half-width
11925 result from the full-width result. */
11926 IRExpr* res
11927 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
11928 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
11929 mkexpr(res128)))
11930 : mkexpr(res128);
11931 putQReg128(dd, res);
11932 const HChar* arr = nameArr_Q_SZ(bitQ, size);
11933 DIP("addp %s.%s, %s.%s, %s.%s\n",
11934 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11935 return True;
11938 if (bitU == 0
11939 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
11940 /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11941 /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11942 /* -------- 0,0x,11110 FMAX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11943 /* -------- 0,1x,11110 FMIN 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11944 /* FMAXNM, FMINNM: FIXME -- KLUDGED: when exactly one operand is a quiet NaN they should return the other operand, unlike FMAX/FMIN which propagate the NaN. */
11945 Bool isD = (size & 1) == 1;
11946 if (bitQ == 0 && isD) return False; // implied 1d case
11947 Bool isMIN = (size & 2) == 2;
11948 Bool isNM = opcode == BITS5(1,1,0,0,0);
11949 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
11950 IRTemp res = newTempV128();
11951 assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
11952 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
11953 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
11954 DIP("%s%s %s.%s, %s.%s, %s.%s\n",
11955 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
11956 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11957 return True;
11960 if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
11961 /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11962 /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11963 Bool isD = (size & 1) == 1;
11964 Bool isSUB = (size & 2) == 2;
11965 if (bitQ == 0 && isD) return False; // implied 1d case
11966 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
11967 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
11968 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
11969 IRTemp rm = mk_get_IR_rounding_mode();
11970 IRTemp t1 = newTempV128();
11971 IRTemp t2 = newTempV128();
11972 // FIXME: double rounding; use FMA primops instead
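      // (The separate multiply and add below each round once, so the result
      // can differ from a genuinely fused FMLA/FMLS in the last mantissa bit.)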
11973 assign(t1, triop(opMUL,
11974 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
11975 assign(t2, triop(isSUB ? opSUB : opADD,
11976 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
11977 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
11978 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
11979 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
11980 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
11981 return True;
11984 if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
11985 /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11986 /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
11987 Bool isD = (size & 1) == 1;
11988 Bool isSUB = (size & 2) == 2;
11989 if (bitQ == 0 && isD) return False; // implied 1d case
11990 const IROp ops[4]
11991 = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
11992 IROp op = ops[size];
11993 IRTemp rm = mk_get_IR_rounding_mode();
11994 IRTemp t1 = newTempV128();
11995 IRTemp t2 = newTempV128();
11996 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
11997 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
11998 putQReg128(dd, mkexpr(t2));
11999 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12000 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
12001 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12002 return True;
12005 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
12006 /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12007 Bool isD = (size & 1) == 1;
12008 if (bitQ == 0 && isD) return False; // implied 1d case
12009 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
12010 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
12011 IRTemp rm = mk_get_IR_rounding_mode();
12012 IRTemp t1 = newTempV128();
12013 IRTemp t2 = newTempV128();
12014 // FIXME: use Abd primop instead?
12015 assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12016 assign(t2, unop(opABS, mkexpr(t1)));
12017 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12018 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12019 DIP("fabd %s.%s, %s.%s, %s.%s\n",
12020 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12021 return True;
12024 if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
12025 /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12026 /* -------- 1,0x,11011 FMUL 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12027 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right: FMULX should return +/-2.0, not the default NaN, for 0 * infinity.
12028 Bool isD = (size & 1) == 1;
12029 Bool isMULX = bitU == 0;
12030 if (bitQ == 0 && isD) return False; // implied 1d case
12031 IRTemp rm = mk_get_IR_rounding_mode();
12032 IRTemp t1 = newTempV128();
12033 assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
12034 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12035 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12036 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12037 DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
12038 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12039 return True;
12042 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
12043 /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12044 /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12045 Bool isD = (size & 1) == 1;
12046 if (bitQ == 0 && isD) return False; // implied 1d case
12047 Bool isGE = bitU == 1;
12048 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
12049 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
12050 IRTemp t1 = newTempV128();
12051 assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
12052 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
12053 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12054 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12055 DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
12056 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12057 return True;
12060 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
12061 /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12062 Bool isD = (size & 1) == 1;
12063 if (bitQ == 0 && isD) return False; // implied 1d case
12064 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
12065 IRTemp t1 = newTempV128();
12066 assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
12067 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12068 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12069 DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
12070 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12071 return True;
12074 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
12075 /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12076 /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12077 Bool isD = (size & 1) == 1;
12078 Bool isGT = (size & 2) == 2;
12079 if (bitQ == 0 && isD) return False; // implied 1d case
12080 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
12081 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
12082 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
12083 IRTemp t1 = newTempV128();
12084 assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
12085 unop(opABS, getQReg128(nn)))); // swapd
12086 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
12087 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12088 DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
12089 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12090 return True;
12093 if (bitU == 1
12094 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
12095 /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12096 /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12097 /* -------- 1,0x,11110 FMAXP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12098 /* -------- 1,1x,11110 FMINP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12099 /* FMAXNM, FMINNM: FIXME -- KLUDGED: when exactly one operand is a quiet NaN they should return the other operand, unlike FMAX/FMIN which propagate the NaN. */
12100 Bool isD = (size & 1) == 1;
12101 if (bitQ == 0 && isD) return False; // implied 1d case
12102 Bool isMIN = (size & 2) == 2;
12103 Bool isNM = opcode == BITS5(1,1,0,0,0);
12104 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
12105 IRTemp srcN = newTempV128();
12106 IRTemp srcM = newTempV128();
12107 IRTemp preL = IRTemp_INVALID;
12108 IRTemp preR = IRTemp_INVALID;
12109 assign(srcN, getQReg128(nn));
12110 assign(srcM, getQReg128(mm));
12111 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
12112 srcM, srcN, isD, bitQ);
12113 putQReg128(
12114 dd, math_MAYBE_ZERO_HI64_fromE(
12115 bitQ,
12116 binop(opMXX, mkexpr(preL), mkexpr(preR))));
12117 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12118 DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
12119 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
12120 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12121 return True;
12124 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
12125 /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12126 Bool isD = size == X01;
12127 if (bitQ == 0 && isD) return False; // implied 1d case
12128 IRTemp srcN = newTempV128();
12129 IRTemp srcM = newTempV128();
12130 IRTemp preL = IRTemp_INVALID;
12131 IRTemp preR = IRTemp_INVALID;
12132 assign(srcN, getQReg128(nn));
12133 assign(srcM, getQReg128(mm));
12134 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR,
12135 srcM, srcN, isD, bitQ);
12136 putQReg128(
12137 dd, math_MAYBE_ZERO_HI64_fromE(
12138 bitQ,
12139 triop(mkVecADDF(isD ? 3 : 2),
12140 mkexpr(mk_get_IR_rounding_mode()),
12141 mkexpr(preL), mkexpr(preR))));
12142 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12143 DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
12144 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12145 return True;
12148 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
12149 /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12150 Bool isD = (size & 1) == 1;
12151 if (bitQ == 0 && isD) return False; // implied 1d case
12152 vassert(size <= 1);
12153 const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
12154 IROp op = ops[size];
12155 IRTemp rm = mk_get_IR_rounding_mode();
12156 IRTemp t1 = newTempV128();
12157 IRTemp t2 = newTempV128();
12158 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
12159 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
12160 putQReg128(dd, mkexpr(t2));
12161 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12162 DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
12163 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12164 return True;
12167 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
12168 /* -------- 0,0x,11111: FRECPS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12169 /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
12170 Bool isSQRT = (size & 2) == 2;
12171 Bool isD = (size & 1) == 1;
12172 if (bitQ == 0 && isD) return False; // implied 1d case
12173 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
12174 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
12175 IRTemp res = newTempV128();
12176 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
12177 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12178 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12179 DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
12180 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12181 return True;
12184 return False;
12185 # undef INSN
12189 static
12190 Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
12192 /* 31 30 29 28 23 21 16 11 9 4
12193 0 Q U 01110 size 10000 opcode 10 n d
12194 Decode fields: U,size,opcode
12196 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12197 if (INSN(31,31) != 0
12198 || INSN(28,24) != BITS5(0,1,1,1,0)
12199 || INSN(21,17) != BITS5(1,0,0,0,0)
12200 || INSN(11,10) != BITS2(1,0)) {
12201 return False;
12203 UInt bitQ = INSN(30,30);
12204 UInt bitU = INSN(29,29);
12205 UInt size = INSN(23,22);
12206 UInt opcode = INSN(16,12);
12207 UInt nn = INSN(9,5);
12208 UInt dd = INSN(4,0);
12209 vassert(size < 4);
12211 if (bitU == 0 && size <= X10 && opcode == BITS5(0,0,0,0,0)) {
12212 /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
12213 /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
12214 /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
12215 const IROp iops[3] = { Iop_Reverse8sIn64_x2,
12216 Iop_Reverse16sIn64_x2, Iop_Reverse32sIn64_x2 };
12217 vassert(size <= 2);
12218 IRTemp res = newTempV128();
12219 assign(res, unop(iops[size], getQReg128(nn)));
12220 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12221 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12222 DIP("%s %s.%s, %s.%s\n", "rev64",
12223 nameQReg128(dd), arr, nameQReg128(nn), arr);
12224 return True;
12227 if (bitU == 1 && size <= X01 && opcode == BITS5(0,0,0,0,0)) {
12228 /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
12229 /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
12230 Bool isH = size == X01;
12231 IRTemp res = newTempV128();
12232 IROp iop = isH ? Iop_Reverse16sIn32_x4 : Iop_Reverse8sIn32_x4;
12233 assign(res, unop(iop, getQReg128(nn)));
12234 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12235 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12236 DIP("%s %s.%s, %s.%s\n", "rev32",
12237 nameQReg128(dd), arr, nameQReg128(nn), arr);
12238 return True;
12241 if (bitU == 0 && size == X00 && opcode == BITS5(0,0,0,0,1)) {
12242 /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
12243 IRTemp res = newTempV128();
12244 assign(res, unop(Iop_Reverse8sIn16_x8, getQReg128(nn)));
12245 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12246 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12247 DIP("%s %s.%s, %s.%s\n", "rev16",
12248 nameQReg128(dd), arr, nameQReg128(nn), arr);
12249 return True;
12252 if (opcode == BITS5(0,0,0,1,0) || opcode == BITS5(0,0,1,1,0)) {
12253 /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
12254 /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
12255 /* -------- 0,xx,00110: SADALP std6_std6 -------- */
12256 /* -------- 1,xx,00110: UADALP std6_std6 -------- */
12257 /* Widens, and size refers to the narrow size. */
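      /* Each pair of adjacent narrow lanes is widened and summed: the even
         and the odd lanes are widened separately and then added, giving the
         pairwise-long sum in each wide lane; the ADALP forms additionally
         accumulate that sum onto the destination. */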
12258 if (size == X11) return False; // no 1d or 2d cases
12259 Bool isU = bitU == 1;
12260 Bool isACC = opcode == BITS5(0,0,1,1,0);
12261 IRTemp src = newTempV128();
12262 IRTemp sum = newTempV128();
12263 IRTemp res = newTempV128();
12264 assign(src, getQReg128(nn));
12265 assign(sum,
12266 binop(mkVecADD(size+1),
12267 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12268 isU, True/*fromOdd*/, size, mkexpr(src))),
12269 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
12270 isU, False/*!fromOdd*/, size, mkexpr(src)))));
12271 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
12272 : mkexpr(sum));
12273 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12274 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12275 const HChar* arrWide = nameArr_Q_SZ(bitQ, size+1);
12276 DIP("%s %s.%s, %s.%s\n", isACC ? (isU ? "uadalp" : "sadalp")
12277 : (isU ? "uaddlp" : "saddlp"),
12278 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
12279 return True;
12282 if (opcode == BITS5(0,0,0,1,1)) {
12283 /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
12284 /* -------- 1,xx,00011: USQADD std7_std7 -------- */
12285 if (bitQ == 0 && size == X11) return False; // implied 1d case
12286 Bool isUSQADD = bitU == 1;
12287 /* This is switched (in the US vs SU sense) deliberately.
12288 SUQADD corresponds to the ExtUSsatSS variants and
12289 USQADD corresponds to the ExtSUsatUU variants.
12290 See libvex_ir for more details. */
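      /* Rough per-lane semantics (sketch): SUQADD adds an unsigned element
         to a signed accumulator and saturates to the signed range,
            d = signed_sat( (signed)d + (unsigned)n )
         while USQADD adds a signed element to an unsigned accumulator and
         saturates to the unsigned range,
            d = unsigned_sat( (unsigned)d + (signed)n ). */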
12291 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
12292 : mkVecQADDEXTUSSATSS(size);
12293 IROp nop = mkVecADD(size);
12294 IRTemp argL = newTempV128();
12295 IRTemp argR = newTempV128();
12296 IRTemp qres = newTempV128();
12297 IRTemp nres = newTempV128();
12298 /* Because the two arguments to the addition are implicitly
12299 extended differently (one signedly, the other unsignedly) it is
12300 important to present them to the primop in the correct order. */
12301 assign(argL, getQReg128(nn));
12302 assign(argR, getQReg128(dd));
12303 assign(qres, math_MAYBE_ZERO_HI64_fromE(
12304 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
12305 assign(nres, math_MAYBE_ZERO_HI64_fromE(
12306 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
12307 putQReg128(dd, mkexpr(qres));
12308 updateQCFLAGwithDifference(qres, nres);
12309 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12310 DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
12311 nameQReg128(dd), arr, nameQReg128(nn), arr);
12312 return True;
12315 if (opcode == BITS5(0,0,1,0,0)) {
12316 /* -------- 0,xx,00100: CLS std6_std6 -------- */
12317 /* -------- 1,xx,00100: CLZ std6_std6 -------- */
12318 if (size == X11) return False; // no 1d or 2d cases
12319 const IROp opsCLS[3] = { Iop_Cls8x16, Iop_Cls16x8, Iop_Cls32x4 };
12320 const IROp opsCLZ[3] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4 };
12321 Bool isCLZ = bitU == 1;
12322 IRTemp res = newTempV128();
12323 vassert(size <= 2);
12324 assign(res, unop(isCLZ ? opsCLZ[size] : opsCLS[size], getQReg128(nn)));
12325 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12326 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12327 DIP("%s %s.%s, %s.%s\n", isCLZ ? "clz" : "cls",
12328 nameQReg128(dd), arr, nameQReg128(nn), arr);
12329 return True;
12332 if (size == X00 && opcode == BITS5(0,0,1,0,1)) {
12333 /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
12334 /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
12335 IRTemp res = newTempV128();
12336 assign(res, unop(bitU == 0 ? Iop_Cnt8x16 : Iop_NotV128, getQReg128(nn)));
12337 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12338 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12339 DIP("%s %s.%s, %s.%s\n", bitU == 0 ? "cnt" : "not",
12340 nameQReg128(dd), arr, nameQReg128(nn), arr);
12341 return True;
12344 if (bitU == 1 && size == X01 && opcode == BITS5(0,0,1,0,1)) {
12345 /* -------- 1,01,00101 RBIT 16b_16b, 8b_8b -------- */
12346 IRTemp res = newTempV128();
12347 assign(res, unop(Iop_Reverse1sIn8_x16, getQReg128(nn)));
12348 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12349 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
12350 DIP("%s %s.%s, %s.%s\n", "rbit",
12351 nameQReg128(dd), arr, nameQReg128(nn), arr);
12352 return True;
12355 if (opcode == BITS5(0,0,1,1,1)) {
12356 /* -------- 0,xx,00111 SQABS std7_std7 -------- */
12357 /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
12358 if (bitQ == 0 && size == X11) return False; // implied 1d case
12359 Bool isNEG = bitU == 1;
12360 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
12361 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
12362 getQReg128(nn), size );
12363 IRTemp qres = newTempV128(), nres = newTempV128();
12364 assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
12365 assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
12366 putQReg128(dd, mkexpr(qres));
12367 updateQCFLAGwithDifference(qres, nres);
12368 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12369 DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
12370 nameQReg128(dd), arr, nameQReg128(nn), arr);
12371 return True;
12374 if (opcode == BITS5(0,1,0,0,0)) {
12375 /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
12376 /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
12377 if (bitQ == 0 && size == X11) return False; // implied 1d case
12378 Bool isGT = bitU == 0;
12379 IRExpr* argL = getQReg128(nn);
12380 IRExpr* argR = mkV128(0x0000);
12381 IRTemp res = newTempV128();
12382 IROp opGTS = mkVecCMPGTS(size);
12383 assign(res, isGT ? binop(opGTS, argL, argR)
12384 : unop(Iop_NotV128, binop(opGTS, argR, argL)));
12385 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12386 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12387 DIP("cm%s %s.%s, %s.%s, #0\n", isGT ? "gt" : "ge",
12388 nameQReg128(dd), arr, nameQReg128(nn), arr);
12389 return True;
12392 if (opcode == BITS5(0,1,0,0,1)) {
12393 /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
12394 /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
12395 if (bitQ == 0 && size == X11) return False; // implied 1d case
12396 Bool isEQ = bitU == 0;
12397 IRExpr* argL = getQReg128(nn);
12398 IRExpr* argR = mkV128(0x0000);
12399 IRTemp res = newTempV128();
12400 assign(res, isEQ ? binop(mkVecCMPEQ(size), argL, argR)
12401 : unop(Iop_NotV128,
12402 binop(mkVecCMPGTS(size), argL, argR)));
12403 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12404 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12405 DIP("cm%s %s.%s, %s.%s, #0\n", isEQ ? "eq" : "le",
12406 nameQReg128(dd), arr, nameQReg128(nn), arr);
12407 return True;
12410 if (bitU == 0 && opcode == BITS5(0,1,0,1,0)) {
12411 /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
12412 if (bitQ == 0 && size == X11) return False; // implied 1d case
12413 IRExpr* argL = getQReg128(nn);
12414 IRExpr* argR = mkV128(0x0000);
12415 IRTemp res = newTempV128();
12416 assign(res, binop(mkVecCMPGTS(size), argR, argL));
12417 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12418 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12419 DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
12420 nameQReg128(dd), arr, nameQReg128(nn), arr);
12421 return True;
12424 if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
12425 /* -------- 0,xx,01011: ABS std7_std7 -------- */
12426 if (bitQ == 0 && size == X11) return False; // implied 1d case
12427 IRTemp res = newTempV128();
12428 assign(res, unop(mkVecABS(size), getQReg128(nn)));
12429 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12430 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12431 DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
12432 return True;
12435 if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
12436 /* -------- 1,xx,01011: NEG std7_std7 -------- */
12437 if (bitQ == 0 && size == X11) return False; // implied 1d case
12438 IRTemp res = newTempV128();
12439 assign(res, binop(mkVecSUB(size), mkV128(0x0000), getQReg128(nn)));
12440 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12441 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12442 DIP("neg %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
12443 return True;
12446 UInt ix = 0; /*INVALID*/
12447 if (size >= X10) {
12448 switch (opcode) {
12449 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
12450 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
12451 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
12452 default: break;
12455 if (ix > 0) {
12456 /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
12457 /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
12458 /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
12459 /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
12460 /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
12461 if (bitQ == 0 && size == X11) return False; // implied 1d case
12462 Bool isD = size == X11;
12463 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
12464 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
12465 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
12466 IROp opCmp = Iop_INVALID;
12467 Bool swap = False;
12468 const HChar* nm = "??";
12469 switch (ix) {
12470 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
12471 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
12472 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
12473 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
12474 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
12475 default: vassert(0);
12477 IRExpr* zero = mkV128(0x0000);
12478 IRTemp res = newTempV128();
12479 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
12480 : binop(opCmp, getQReg128(nn), zero));
12481 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12482 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12483 DIP("%s %s.%s, %s.%s, #0.0\n", nm,
12484 nameQReg128(dd), arr, nameQReg128(nn), arr);
12485 return True;
12488 if (size >= X10 && opcode == BITS5(0,1,1,1,1)) {
12489 /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
12490 /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
12491 if (bitQ == 0 && size == X11) return False; // implied 1d case
12492 Bool isFNEG = bitU == 1;
12493 IROp op = isFNEG ? (size == X10 ? Iop_Neg32Fx4 : Iop_Neg64Fx2)
12494 : (size == X10 ? Iop_Abs32Fx4 : Iop_Abs64Fx2);
12495 IRTemp res = newTempV128();
12496 assign(res, unop(op, getQReg128(nn)));
12497 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12498 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12499 DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
12500 nameQReg128(dd), arr, nameQReg128(nn), arr);
12501 return True;
12504 if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
12505 /* -------- 0,xx,10010: XTN{,2} -------- */
12506 if (size == X11) return False;
12507 vassert(size < 3);
12508 Bool is2 = bitQ == 1;
12509 IROp opN = mkVecNARROWUN(size);
12510 IRTemp resN = newTempV128();
12511 assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn))));
12512 putLO64andZUorPutHI64(is2, dd, resN);
12513 const HChar* nm = "xtn";
12514 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12515 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12516 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
12517 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12518 return True;
12521 if (opcode == BITS5(1,0,1,0,0)
12522 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
12523 /* -------- 0,xx,10100: SQXTN{,2} -------- */
12524 /* -------- 1,xx,10100: UQXTN{,2} -------- */
12525 /* -------- 1,xx,10010: SQXTUN{,2} -------- */
12526 if (size == X11) return False;
12527 vassert(size < 3);
12528 Bool is2 = bitQ == 1;
12529 IROp opN = Iop_INVALID;
12530 Bool zWiden = True;
12531 const HChar* nm = "??";
12532 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
12533 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
12535 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
12536 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
12538 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
12539 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
12541 else vassert(0);
12542 IRTemp src = newTempV128();
12543 assign(src, getQReg128(nn));
12544 IRTemp resN = newTempV128();
12545 assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
12546 putLO64andZUorPutHI64(is2, dd, resN);
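      /* Saturation detection: re-widen the narrowed result and compare it
         with the original source; any lane that differs must have been
         clipped, and that difference feeds the QC flag update below. */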
12547 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
12548 size, mkexpr(resN));
12549 updateQCFLAGwithDifference(src, resW);
12550 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12551 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12552 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
12553 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12554 return True;
12557 if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
12558 /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
12559 /* Widens, and size is the narrow size. */
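      /* SHLL widens each narrow lane and shifts it left by the narrow lane
         width. Interleaving the source with itself duplicates each narrow
         lane into both halves of a wide lane; the subsequent shift by
         (8 << size) pushes the upper copy out, leaving value << lane-width
         with zeroes below, which is the required result. */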
12560 if (size == X11) return False;
12561 Bool is2 = bitQ == 1;
12562 IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
12563 IROp opSHL = mkVecSHLN(size+1);
12564 IRTemp src = newTempV128();
12565 IRTemp res = newTempV128();
12566 assign(src, getQReg128(nn));
12567 assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
12568 mkU8(8 << size)));
12569 putQReg128(dd, mkexpr(res));
12570 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12571 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12572 DIP("shll%s %s.%s, %s.%s, #%d\n", is2 ? "2" : "",
12573 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
12574 return True;
12577 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
12578 /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
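      /* FCVTN writes the narrowed lanes to the low half of Vd and zeroes
         the high half; FCVTN2 (bitQ == 1) writes them to the high half and
         leaves the low half unchanged, hence the nLanes * bitQ + i
         destination lane index below. */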
12579 UInt nLanes = size == X00 ? 4 : 2;
12580 IRType srcTy = size == X00 ? Ity_F32 : Ity_F64;
12581 IROp opCvt = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
12582 IRTemp rm = mk_get_IR_rounding_mode();
12583 IRTemp src[nLanes];
12584 for (UInt i = 0; i < nLanes; i++) {
12585 src[i] = newTemp(srcTy);
12586 assign(src[i], getQRegLane(nn, i, srcTy));
12588 for (UInt i = 0; i < nLanes; i++) {
12589 putQRegLane(dd, nLanes * bitQ + i,
12590 binop(opCvt, mkexpr(rm), mkexpr(src[i])));
12592 if (bitQ == 0) {
12593 putQRegLane(dd, 1, mkU64(0));
12595 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12596 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
12597 DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12598 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12599 return True;
12602 if (bitU == 1 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
12603 /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
12604 /* Using Irrm_NEAREST here isn't right. FCVTXN requires "round to
12605 odd": truncate, then set the result's LSB if any discarded bits are nonzero. */
12606 IRType srcTy = Ity_F64;
12607 IROp opCvt = Iop_F64toF32;
12608 IRTemp src[2];
12609 for (UInt i = 0; i < 2; i++) {
12610 src[i] = newTemp(srcTy);
12611 assign(src[i], getQRegLane(nn, i, srcTy));
12613 for (UInt i = 0; i < 2; i++) {
12614 putQRegLane(dd, 2 * bitQ + i,
12615 binop(opCvt, mkU32(Irrm_NEAREST), mkexpr(src[i])));
12617 if (bitQ == 0) {
12618 putQRegLane(dd, 1, mkU64(0));
12620 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12621 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
12622 DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12623 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
12624 return True;
12627 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
12628 /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
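      /* FCVTL reads the low half of Vn, FCVTL2 (bitQ == 1) the high half,
         hence the nLanes * bitQ + i source lane index; the widened results
         always fill the whole of Vd. */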
12629 UInt nLanes = size == X00 ? 4 : 2;
12630 IRType srcTy = size == X00 ? Ity_F16 : Ity_F32;
12631 IROp opCvt = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
12632 IRTemp src[nLanes];
12633 for (UInt i = 0; i < nLanes; i++) {
12634 src[i] = newTemp(srcTy);
12635 assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
12637 for (UInt i = 0; i < nLanes; i++) {
12638 putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
12640 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
12641 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
12642 DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
12643 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
12644 return True;
12647 ix = 0;
12648 if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
12649 ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
12650 // = 1 + bitU[0]:size[1]:opcode[0]
12651 vassert(ix >= 1 && ix <= 8);
12652 if (ix == 7) ix = 0;
12654 if (ix > 0) {
12655 /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
12656 /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
12657 /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
12658 /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
12659 /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
12660 /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
12661 /* -------- 1,1x,11000 (apparently unassigned) (7) -------- */
12662 /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
12663 /* rm plan:
12664 FRINTN: tieeven -- !! FIXME KLUDGED !!
12665 FRINTM: -inf
12666 FRINTP: +inf
12667 FRINTZ: zero
12668 FRINTA: tieaway -- !! FIXME KLUDGED !!
12669 FRINTX: per FPCR + "exact = TRUE"
12670 FRINTI: per FPCR
12672 Bool isD = (size & 1) == 1;
12673 if (bitQ == 0 && isD) return False; // implied 1d case
12675 IRTemp irrmRM = mk_get_IR_rounding_mode();
12677 UChar ch = '?';
12678 IRTemp irrm = newTemp(Ity_I32);
12679 switch (ix) {
12680 case 1: ch = 'n'; assign(irrm, mkU32(Irrm_NEAREST)); break;
12681 case 2: ch = 'm'; assign(irrm, mkU32(Irrm_NegINF)); break;
12682 case 3: ch = 'p'; assign(irrm, mkU32(Irrm_PosINF)); break;
12683 case 4: ch = 'z'; assign(irrm, mkU32(Irrm_ZERO)); break;
12684 // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
12685 case 5: ch = 'a'; assign(irrm, mkU32(Irrm_NEAREST)); break;
12686 // FRINTX ("round to integral, exact") rounds per FPCR but should also
12687 // signal Inexact when the result differs from the input; that is not modelled here.
12688 case 6: ch = 'x'; assign(irrm, mkexpr(irrmRM)); break;
12689 case 8: ch = 'i'; assign(irrm, mkexpr(irrmRM)); break;
12690 default: vassert(0);
12693 IROp opRND = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
12694 if (isD) {
12695 for (UInt i = 0; i < 2; i++) {
12696 putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
12697 getQRegLane(nn, i, Ity_F64)));
12699 } else {
12700 UInt n = bitQ==1 ? 4 : 2;
12701 for (UInt i = 0; i < n; i++) {
12702 putQRegLane(dd, i, binop(opRND, mkexpr(irrm),
12703 getQRegLane(nn, i, Ity_F32)));
12705 if (bitQ == 0)
12706 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
12708 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12709 DIP("frint%c %s.%s, %s.%s\n", ch,
12710 nameQReg128(dd), arr, nameQReg128(nn), arr);
12711 return True;
12714 ix = 0; /*INVALID*/
12715 switch (opcode) {
12716 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
12717 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
12718 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
12719 default: break;
12721 if (ix > 0) {
12722 /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
12723 /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
12724 /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
12725 /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
12726 /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
12727 /* -------- 1,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
12728 /* -------- 1,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
12729 /* -------- 1,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
12730 /* -------- 1,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
12731 /* -------- 1,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
12732 Bool isD = (size & 1) == 1;
12733 if (bitQ == 0 && isD) return False; // implied 1d case
12735 IRRoundingMode irrm = 8; /*impossible*/
12736 HChar ch = '?';
12737 switch (ix) {
12738 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
12739 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
12740 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
12741 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
12742 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
12743 default: vassert(0);
12745 IROp cvt = Iop_INVALID;
12746 if (bitU == 1) {
12747 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
12748 } else {
12749 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
12751 if (isD) {
12752 for (UInt i = 0; i < 2; i++) {
12753 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
12754 getQRegLane(nn, i, Ity_F64)));
12756 } else {
12757 UInt n = bitQ==1 ? 4 : 2;
12758 for (UInt i = 0; i < n; i++) {
12759 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
12760 getQRegLane(nn, i, Ity_F32)));
12762 if (bitQ == 0)
12763 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
12765 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12766 DIP("fcvt%c%c %s.%s, %s.%s\n", ch, bitU == 1 ? 'u' : 's',
12767 nameQReg128(dd), arr, nameQReg128(nn), arr);
12768 return True;
12771 if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
12772 /* -------- 0,10,11100: URECPE 4s_4s, 2s_2s -------- */
12773 /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
12774 Bool isREC = bitU == 0;
12775 IROp op = isREC ? Iop_RecipEst32Ux4 : Iop_RSqrtEst32Ux4;
12776 IRTemp res = newTempV128();
12777 assign(res, unop(op, getQReg128(nn)));
12778 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12779 const HChar* nm = isREC ? "urecpe" : "ursqrte";
12780 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12781 DIP("%s %s.%s, %s.%s\n", nm,
12782 nameQReg128(dd), arr, nameQReg128(nn), arr);
12783 return True;
12786 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
12787 /* -------- 0,0x,11101: SCVTF -------- */
12788 /* -------- 1,0x,11101: UCVTF -------- */
12789 /* 31 28 22 21 15 9 4
12790 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn
12791 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn
12792 with laneage:
12793 case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
12795 Bool isQ = bitQ == 1;
12796 Bool isU = bitU == 1;
12797 Bool isF64 = (size & 1) == 1;
12798 if (isQ || !isF64) {
12799 IRType tyF = Ity_INVALID, tyI = Ity_INVALID;
12800 UInt nLanes = 0;
12801 Bool zeroHI = False;
12802 const HChar* arrSpec = NULL;
12803 Bool ok = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec,
12804 isQ, isF64 );
12805 IROp iop = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32)
12806 : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32);
12807 IRTemp rm = mk_get_IR_rounding_mode();
12808 UInt i;
12809 vassert(ok); /* the 'if' above should ensure this */
12810 for (i = 0; i < nLanes; i++) {
12811 putQRegLane(dd, i,
12812 binop(iop, mkexpr(rm), getQRegLane(nn, i, tyI)));
12814 if (zeroHI) {
12815 putQRegLane(dd, 1, mkU64(0));
12817 DIP("%ccvtf %s.%s, %s.%s\n", isU ? 'u' : 's',
12818 nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
12819 return True;
12821 /* else fall through */
12824 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
12825 /* -------- 0,1x,11101: FRECPE 2d_2d, 4s_4s, 2s_2s -------- */
12826 /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
12827 Bool isSQRT = bitU == 1;
12828 Bool isD = (size & 1) == 1;
12829 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
12830 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
12831 if (bitQ == 0 && isD) return False; // implied 1d case
12832 IRTemp resV = newTempV128();
12833 assign(resV, unop(op, getQReg128(nn)));
12834 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
12835 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12836 DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
12837 nameQReg128(dd), arr, nameQReg128(nn), arr);
12838 return True;
12841 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
12842 /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
12843 Bool isD = (size & 1) == 1;
12844 IROp op = isD ? Iop_Sqrt64Fx2 : Iop_Sqrt32Fx4;
12845 if (bitQ == 0 && isD) return False; // implied 1d case
12846 IRTemp resV = newTempV128();
12847 assign(resV, binop(op, mkexpr(mk_get_IR_rounding_mode()),
12848 getQReg128(nn)));
12849 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
12850 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
12851 DIP("%s %s.%s, %s.%s\n", "fsqrt",
12852 nameQReg128(dd), arr, nameQReg128(nn), arr);
12853 return True;
12856 return False;
12857 # undef INSN
12861 static
12862 Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
12864 /* 31 28 23 21 20 19 15 11 9 4
12865 0 Q U 01111 size L M m opcode H 0 n d
12866 Decode fields are: u,size,opcode
12867 M is really part of the mm register number. Individual
12868 cases need to inspect L and H though.
12870 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12871 if (INSN(31,31) != 0
12872 || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) !=0) {
12873 return False;
12875 UInt bitQ = INSN(30,30);
12876 UInt bitU = INSN(29,29);
12877 UInt size = INSN(23,22);
12878 UInt bitL = INSN(21,21);
12879 UInt bitM = INSN(20,20);
12880 UInt mmLO4 = INSN(19,16);
12881 UInt opcode = INSN(15,12);
12882 UInt bitH = INSN(11,11);
12883 UInt nn = INSN(9,5);
12884 UInt dd = INSN(4,0);
12885 vassert(size < 4);
12886 vassert(bitH < 2 && bitM < 2 && bitL < 2);
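   /* Element index and register number: for 16-bit lanes the index is
      H:L:M and only mmLO4 names the register, so Vm is restricted to
      v0..v15; for 32-bit lanes the index is H:L and the register is
      M:mmLO4; for 64-bit (FP) lanes the index is just H. */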
12888 if (bitU == 0 && size >= X10
12889 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
12890 /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12891 /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12892 if (bitQ == 0 && size == X11) return False; // implied 1d case
12893 Bool isD = (size & 1) == 1;
12894 Bool isSUB = opcode == BITS4(0,1,0,1);
12895 UInt index;
12896 if (!isD) index = (bitH << 1) | bitL;
12897 else if (isD && bitL == 0) index = bitH;
12898 else return False; // sz:L == x11 => unallocated encoding
12899 vassert(index < (isD ? 2 : 4));
12900 IRType ity = isD ? Ity_F64 : Ity_F32;
12901 IRTemp elem = newTemp(ity);
12902 UInt mm = (bitM << 4) | mmLO4;
12903 assign(elem, getQRegLane(mm, index, ity));
12904 IRTemp dupd = math_DUP_TO_V128(elem, ity);
12905 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
12906 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
12907 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
12908 IRTemp rm = mk_get_IR_rounding_mode();
12909 IRTemp t1 = newTempV128();
12910 IRTemp t2 = newTempV128();
12911 // FIXME: double rounding; use FMA primops instead
12912 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
12913 assign(t2, triop(isSUB ? opSUB : opADD,
12914 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
12915 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
12916 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12917 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
12918 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
12919 isD ? 'd' : 's', index);
12920 return True;
12923 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
12924 /* -------- 0,1x,1001 FMUL 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12925 /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
12926 if (bitQ == 0 && size == X11) return False; // implied 1d case
12927 Bool isD = (size & 1) == 1;
12928 Bool isMULX = bitU == 1;
12929 UInt index;
12930 if (!isD) index = (bitH << 1) | bitL;
12931 else if (isD && bitL == 0) index = bitH;
12932 else return False; // sz:L == x11 => unallocated encoding
12933 vassert(index < (isD ? 2 : 4));
12934 IRType ity = isD ? Ity_F64 : Ity_F32;
12935 IRTemp elem = newTemp(ity);
12936 UInt mm = (bitM << 4) | mmLO4;
12937 assign(elem, getQRegLane(mm, index, ity));
12938 IRTemp dupd = math_DUP_TO_V128(elem, ity);
12939 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right: FMULX should return +/-2.0, not the default NaN, for 0 * infinity.
12940 IRTemp res = newTempV128();
12941 assign(res, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
12942 mkexpr(mk_get_IR_rounding_mode()),
12943 getQReg128(nn), mkexpr(dupd)));
12944 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12945 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
12946 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
12947 isMULX ? "fmulx" : "fmul", nameQReg128(dd), arr,
12948 nameQReg128(nn), arr, nameQReg128(mm), isD ? 'd' : 's', index);
12949 return True;
12952 if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
12953 || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
12954 /* -------- 1,xx,0000 MLA s/h variants only -------- */
12955 /* -------- 1,xx,0100 MLS s/h variants only -------- */
12956 /* -------- 0,xx,1000 MUL s/h variants only -------- */
12957 Bool isMLA = opcode == BITS4(0,0,0,0);
12958 Bool isMLS = opcode == BITS4(0,1,0,0);
12959 UInt mm = 32; // invalid
12960 UInt ix = 16; // invalid
12961 switch (size) {
12962 case X00:
12963 return False; // b case is not allowed
12964 case X01:
12965 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
12966 case X10:
12967 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
12968 case X11:
12969 return False; // d case is not allowed
12970 default:
12971 vassert(0);
12973 vassert(mm < 32 && ix < 16);
12974 IROp opMUL = mkVecMUL(size);
12975 IROp opADD = mkVecADD(size);
12976 IROp opSUB = mkVecSUB(size);
12977 HChar ch = size == X01 ? 'h' : 's';
12978 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
12979 IRTemp vecD = newTempV128();
12980 IRTemp vecN = newTempV128();
12981 IRTemp res = newTempV128();
12982 assign(vecD, getQReg128(dd));
12983 assign(vecN, getQReg128(nn));
12984 IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
12985 if (isMLA || isMLS) {
12986 assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
12987 } else {
12988 assign(res, prod);
12990 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12991 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12992 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
12993 : (isMLS ? "mls" : "mul"),
12994 nameQReg128(dd), arr,
12995 nameQReg128(nn), arr, nameQReg128(dd), ch, ix);
12996 return True;
12999 if (opcode == BITS4(1,0,1,0)
13000 || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
13001 /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
13002 /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
13003 /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
13004 /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
13005 /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
13006 /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
13007 /* Widens, and size refers to the narrowed lanes. */
13008 UInt ks = 3;
13009 switch (opcode) {
13010 case BITS4(1,0,1,0): ks = 0; break;
13011 case BITS4(0,0,1,0): ks = 1; break;
13012 case BITS4(0,1,1,0): ks = 2; break;
13013 default: vassert(0);
13015 vassert(ks >= 0 && ks <= 2);
13016 Bool isU = bitU == 1;
13017 Bool is2 = bitQ == 1;
13018 UInt mm = 32; // invalid
13019 UInt ix = 16; // invalid
13020 switch (size) {
13021 case X00:
13022 return False; // h_b_b[] case is not allowed
13023 case X01:
13024 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13025 case X10:
13026 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13027 case X11:
13028 return False; // q_d_d[] case is not allowed
13029 default:
13030 vassert(0);
13032 vassert(mm < 32 && ix < 16);
13033 IRTemp vecN = newTempV128();
13034 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13035 IRTemp vecD = newTempV128();
13036 assign(vecN, getQReg128(nn));
13037 assign(vecD, getQReg128(dd));
13038 IRTemp res = IRTemp_INVALID;
13039 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
13040 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
13041 putQReg128(dd, mkexpr(res));
13042 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
13043 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13044 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13045 HChar ch = size == X01 ? 'h' : 's';
13046 DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13047 isU ? 'u' : 's', nm, is2 ? "2" : "",
13048 nameQReg128(dd), arrWide,
13049 nameQReg128(nn), arrNarrow, nameQReg128(dd), ch, ix);
13050 return True;
13053 if (bitU == 0
13054 && (opcode == BITS4(1,0,1,1)
13055 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
13056 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
13057 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
13058 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
13059 /* Widens, and size refers to the narrowed lanes. */
13060 UInt ks = 3;
13061 switch (opcode) {
13062 case BITS4(1,0,1,1): ks = 0; break;
13063 case BITS4(0,0,1,1): ks = 1; break;
13064 case BITS4(0,1,1,1): ks = 2; break;
13065 default: vassert(0);
13067 vassert(ks >= 0 && ks <= 2);
13068 Bool is2 = bitQ == 1;
13069 UInt mm = 32; // invalid
13070 UInt ix = 16; // invalid
13071 switch (size) {
13072 case X00:
13073 return False; // h_b_b[] case is not allowed
13074 case X01:
13075 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13076 case X10:
13077 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13078 case X11:
13079 return False; // q_d_d[] case is not allowed
13080 default:
13081 vassert(0);
13083 vassert(mm < 32 && ix < 16);
13084 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
13085 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
13086 newTempsV128_2(&vecN, &vecD);
13087 assign(vecN, getQReg128(nn));
13088 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13089 assign(vecD, getQReg128(dd));
13090 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
13091 is2, size, "mas"[ks],
13092 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
13093 putQReg128(dd, mkexpr(res));
13094 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
13095 updateQCFLAGwithDifference(sat1q, sat1n);
13096 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
13097 updateQCFLAGwithDifference(sat2q, sat2n);
13099 const HChar* nm = ks == 0 ? "sqdmull"
13100 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
13101 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13102 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13103 HChar ch = size == X01 ? 'h' : 's';
13104 DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
13105 nm, is2 ? "2" : "",
13106 nameQReg128(dd), arrWide,
13107 nameQReg128(nn), arrNarrow, nameQReg128(dd), ch, ix);
13108 return True;
13111 if (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1)) {
13112 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
13113 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
13114 UInt mm = 32; // invalid
13115 UInt ix = 16; // invalid
13116 switch (size) {
13117 case X00:
13118 return False; // b case is not allowed
13119 case X01:
13120 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
13121 case X10:
13122 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
13123 case X11:
13124 return False; // q case is not allowed
13125 default:
13126 vassert(0);
13128 vassert(mm < 32 && ix < 16);
13129 Bool isR = opcode == BITS4(1,1,0,1);
13130 IRTemp res, sat1q, sat1n, vN, vM;
13131 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
13132 vN = newTempV128();
13133 assign(vN, getQReg128(nn));
13134 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
13135 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
13136 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13137 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13138 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
13139 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
13140 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13141 HChar ch = size == X01 ? 'h' : 's';
13142 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
13143 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(dd), ch, ix);
13144 return True;
13147 return False;
13148 # undef INSN
13152 static
13153 Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
13155 /* 31 23 21 16 11 9 4
13156 0100 1110 size 10100 opcode 10 n d
13157 Decode fields are: size,opcode
13158 Size is always 00 in ARMv8, it appears.
13160 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13161 if (INSN(31,24) != BITS8(0,1,0,0,1,1,1,0)
13162 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
13163 return False;
13165 UInt size = INSN(23,22);
13166 UInt opcode = INSN(16,12);
13167 UInt nn = INSN(9,5);
13168 UInt dd = INSN(4,0);
13170 if (size == BITS2(0,0)
13171 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,0,1))) {
13172 /* -------- 00,00100: AESE Vd.16b, Vn.16b -------- */
13173 /* -------- 00,00101: AESD Vd.16b, Vn.16b -------- */
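      /* AESE/AESD first XOR the state (Vd) with the round key (Vn) -- the
         AddRoundKey step -- and the dirty helper then applies the
         SubBytes/ShiftRows steps (or their inverses for AESD). */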
13174 Bool isD = opcode == BITS5(0,0,1,0,1);
13175 IRTemp op1 = newTemp(Ity_V128);
13176 IRTemp op2 = newTemp(Ity_V128);
13177 IRTemp xord = newTemp(Ity_V128);
13178 IRTemp res = newTemp(Ity_V128);
13179 void* helper = isD ? &arm64g_dirtyhelper_AESD
13180 : &arm64g_dirtyhelper_AESE;
13181 const HChar* hname = isD ? "arm64g_dirtyhelper_AESD"
13182 : "arm64g_dirtyhelper_AESE";
13183 assign(op1, getQReg128(dd));
13184 assign(op2, getQReg128(nn));
13185 assign(xord, binop(Iop_XorV128, mkexpr(op1), mkexpr(op2)));
13186 IRDirty* di
13187 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13188 mkIRExprVec_3(
13189 IRExpr_VECRET(),
13190 unop(Iop_V128HIto64, mkexpr(xord)),
13191 unop(Iop_V128to64, mkexpr(xord)) ) );
13192 stmt(IRStmt_Dirty(di));
13193 putQReg128(dd, mkexpr(res));
13194 DIP("aes%c %s.16b, %s.16b\n", isD ? 'd' : 'e',
13195 nameQReg128(dd), nameQReg128(nn));
13196 return True;
13199 if (size == BITS2(0,0)
13200 && (opcode == BITS5(0,0,1,1,0) || opcode == BITS5(0,0,1,1,1))) {
13201 /* -------- 00,00110: AESMC Vd.16b, Vn.16b -------- */
13202 /* -------- 00,00111: AESIMC Vd.16b, Vn.16b -------- */
13203 Bool isI = opcode == BITS5(0,0,1,1,1);
13204 IRTemp src = newTemp(Ity_V128);
13205 IRTemp res = newTemp(Ity_V128);
13206 void* helper = isI ? &arm64g_dirtyhelper_AESIMC
13207 : &arm64g_dirtyhelper_AESMC;
13208 const HChar* hname = isI ? "arm64g_dirtyhelper_AESIMC"
13209 : "arm64g_dirtyhelper_AESMC";
13210 assign(src, getQReg128(nn));
13211 IRDirty* di
13212 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
13213 mkIRExprVec_3(
13214 IRExpr_VECRET(),
13215 unop(Iop_V128HIto64, mkexpr(src)),
13216 unop(Iop_V128to64, mkexpr(src)) ) );
13217 stmt(IRStmt_Dirty(di));
13218 putQReg128(dd, mkexpr(res));
13219 DIP("aes%s %s.16b, %s.16b\n", isI ? "imc" : "mc",
13220 nameQReg128(dd), nameQReg128(nn));
13221 return True;
13224 return False;
13225 # undef INSN
13229 static
13230 Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
13232 /* 31 28 23 21 20 15 14 11 9 4
13233 0101 1110 sz 0 m 0 opc 00 n d
13234 Decode fields are: sz,opc
13236 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13237 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0) || INSN(21,21) != 0
13238 || INSN(15,15) != 0 || INSN(11,10) != BITS2(0,0)) {
13239 return False;
13241 UInt sz = INSN(23,22);
13242 UInt mm = INSN(20,16);
13243 UInt opc = INSN(14,12);
13244 UInt nn = INSN(9,5);
13245 UInt dd = INSN(4,0);
13246 if (sz == BITS2(0,0) && opc <= BITS3(1,1,0)) {
13247 /* -------- 00,000 SHA1C Qd, Sn, Vm.4S -------- */
13248 /* -------- 00,001 SHA1P Qd, Sn, Vm.4S -------- */
13249 /* -------- 00,010 SHA1M Qd, Sn, Vm.4S -------- */
13250 /* -------- 00,011 SHA1SU0 Vd.4S, Vn.4S, Vm.4S -------- */
13251 /* -------- 00,100 SHA256H Qd, Qn, Vm.4S -------- */
13252 /* -------- 00,101 SHA256H2 Qd, Qn, Vm.4S -------- */
13253 /* -------- 00,110 SHA256SU1 Vd.4S, Vn.4S, Vm.4S -------- */
13254 vassert(opc < 7);
13255 const HChar* inames[7]
13256 = { "sha1c", "sha1p", "sha1m", "sha1su0",
13257 "sha256h", "sha256h2", "sha256su1" };
13258 void(*helpers[7])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
13259 = { &arm64g_dirtyhelper_SHA1C, &arm64g_dirtyhelper_SHA1P,
13260 &arm64g_dirtyhelper_SHA1M, &arm64g_dirtyhelper_SHA1SU0,
13261 &arm64g_dirtyhelper_SHA256H, &arm64g_dirtyhelper_SHA256H2,
13262 &arm64g_dirtyhelper_SHA256SU1 };
13263 const HChar* hnames[7]
13264 = { "arm64g_dirtyhelper_SHA1C", "arm64g_dirtyhelper_SHA1P",
13265 "arm64g_dirtyhelper_SHA1M", "arm64g_dirtyhelper_SHA1SU0",
13266 "arm64g_dirtyhelper_SHA256H", "arm64g_dirtyhelper_SHA256H2",
13267 "arm64g_dirtyhelper_SHA256SU1" };
13268 IRTemp vD = newTemp(Ity_V128);
13269 IRTemp vN = newTemp(Ity_V128);
13270 IRTemp vM = newTemp(Ity_V128);
13271 IRTemp vDhi = newTemp(Ity_I64);
13272 IRTemp vDlo = newTemp(Ity_I64);
13273 IRTemp vNhiPre = newTemp(Ity_I64);
13274 IRTemp vNloPre = newTemp(Ity_I64);
13275 IRTemp vNhi = newTemp(Ity_I64);
13276 IRTemp vNlo = newTemp(Ity_I64);
13277 IRTemp vMhi = newTemp(Ity_I64);
13278 IRTemp vMlo = newTemp(Ity_I64);
13279 assign(vD, getQReg128(dd));
13280 assign(vN, getQReg128(nn));
13281 assign(vM, getQReg128(mm));
13282 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
13283 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
13284 assign(vNhiPre, unop(Iop_V128HIto64, mkexpr(vN)));
13285 assign(vNloPre, unop(Iop_V128to64, mkexpr(vN)));
13286 assign(vMhi, unop(Iop_V128HIto64, mkexpr(vM)));
13287 assign(vMlo, unop(Iop_V128to64, mkexpr(vM)));
13288 /* Mask off any bits of the N register operand that aren't actually
13289 needed, so that Memcheck doesn't complain unnecessarily. */
13290 switch (opc) {
13291 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13292 assign(vNhi, mkU64(0));
13293 assign(vNlo, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(vNloPre))));
13294 break;
13295 case BITS3(0,1,1): case BITS3(1,0,0):
13296 case BITS3(1,0,1): case BITS3(1,1,0):
13297 assign(vNhi, mkexpr(vNhiPre));
13298 assign(vNlo, mkexpr(vNloPre));
13299 break;
13300 default:
13301 vassert(0);
13303 IRTemp res = newTemp(Ity_V128);
13304 IRDirty* di
13305 = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
13306 mkIRExprVec_7(
13307 IRExpr_VECRET(),
13308 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
13309 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
13310 stmt(IRStmt_Dirty(di));
13311 putQReg128(dd, mkexpr(res));
13312 switch (opc) {
13313 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
13314 DIP("%s q%u, s%u, v%u.4s\n", inames[opc], dd, nn, mm);
13315 break;
13316 case BITS3(0,1,1): case BITS3(1,1,0):
13317 DIP("%s v%u.4s, v%u.4s, v%u.4s\n", inames[opc], dd, nn, mm);
13318 break;
13319 case BITS3(1,0,0): case BITS3(1,0,1):
13320 DIP("%s q%u, q%u, v%u.4s\n", inames[opc], dd, nn, mm);
13321 break;
13322 default:
13323 vassert(0);
13325 return True;
13328 return False;
13329 # undef INSN
13333 static
13334 Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
13336 /* 31 28 23 21 16 11 9 4
13337 0101 1110 sz 10100 opc 10 n d
13338 Decode fields are: sz,opc
13340 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13341 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0)
13342 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
13343 return False;
13345 UInt sz = INSN(23,22);
13346 UInt opc = INSN(16,12);
13347 UInt nn = INSN(9,5);
13348 UInt dd = INSN(4,0);
13349 if (sz == BITS2(0,0) && opc <= BITS5(0,0,0,1,0)) {
13350 /* -------- 00,00000 SHA1H Sd, Sn -------- */
13351 /* -------- 00,00001 SHA1SU1 Vd.4S, Vn.4S -------- */
13352 /* -------- 00,00010 SHA256SU0 Vd.4S, Vn.4S -------- */
13353 vassert(opc < 3);
13354 const HChar* inames[3] = { "sha1h", "sha1su1", "sha256su0" };
13355 IRTemp vD = newTemp(Ity_V128);
13356 IRTemp vN = newTemp(Ity_V128);
13357 IRTemp vDhi = newTemp(Ity_I64);
13358 IRTemp vDlo = newTemp(Ity_I64);
13359 IRTemp vNhi = newTemp(Ity_I64);
13360 IRTemp vNlo = newTemp(Ity_I64);
13361 assign(vD, getQReg128(dd));
13362 assign(vN, getQReg128(nn));
13363 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
13364 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
13365 assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
13366 assign(vNlo, unop(Iop_V128to64, mkexpr(vN)));
13367 /* Mask off any bits of the N register operand that aren't actually
13368 needed, so that Memcheck doesn't complain unnecessarily. Also
13369 construct the calls, given that the helper functions don't take
13370 the same number of arguments. */
13371 IRDirty* di = NULL;
13372 IRTemp res = newTemp(Ity_V128);
13373 switch (opc) {
13374 case BITS5(0,0,0,0,0): {
13375 IRExpr* vNloMasked = unop(Iop_32Uto64,
13376 unop(Iop_64to32, mkexpr(vNlo)));
13377 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13378 "arm64g_dirtyhelper_SHA1H",
13379 &arm64g_dirtyhelper_SHA1H,
13380 mkIRExprVec_3(
13381 IRExpr_VECRET(),
13382 mkU64(0), vNloMasked) );
13383 break;
13385 case BITS5(0,0,0,0,1):
13386 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13387 "arm64g_dirtyhelper_SHA1SU1",
13388 &arm64g_dirtyhelper_SHA1SU1,
13389 mkIRExprVec_5(
13390 IRExpr_VECRET(),
13391 mkexpr(vDhi), mkexpr(vDlo),
13392 mkexpr(vNhi), mkexpr(vNlo)) );
13393 break;
13394 case BITS5(0,0,0,1,0):
13395 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
13396 "arm64g_dirtyhelper_SHA256SU0",
13397 &arm64g_dirtyhelper_SHA256SU0,
13398 mkIRExprVec_5(
13399 IRExpr_VECRET(),
13400 mkexpr(vDhi), mkexpr(vDlo),
13401 mkexpr(vNhi), mkexpr(vNlo)) );
13402 break;
13403 default:
13404 vassert(0);
13406 stmt(IRStmt_Dirty(di));
13407 putQReg128(dd, mkexpr(res));
13408 switch (opc) {
13409 case BITS5(0,0,0,0,0):
13410 DIP("%s s%u, s%u\n", inames[opc], dd, nn);
13411 break;
13412 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,0):
13413 DIP("%s v%u.4s, v%u.4s\n", inames[opc], dd, nn);
13414 break;
13415 default:
13416 vassert(0);
13418 return True;
13421 return False;
13422 # undef INSN
13426 static
13427 Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
13429 /* 31 28 23 21 20 15 13 9 4
13430 000 11110 ty 1 m op 1000 n opcode2
13431 The first 3 bits are really "M 0 S", but M and S are always zero.
13432 Decode fields are: ty,op,opcode2
13434 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13435 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13436 || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
13437 return False;
13439 UInt ty = INSN(23,22);
13440 UInt mm = INSN(20,16);
13441 UInt op = INSN(15,14);
13442 UInt nn = INSN(9,5);
13443 UInt opcode2 = INSN(4,0);
13444 vassert(ty < 4);
13446 if (ty <= X01 && op == X00
13447 && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
13448 /* -------- 0x,00,00000 FCMP d_d, s_s -------- */
13449 /* -------- 0x,00,01000 FCMP d_#0, s_#0 -------- */
13450 /* -------- 0x,00,10000 FCMPE d_d, s_s -------- */
13451 /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
13452 /* 31 23 20 15 9 4
13453 000 11110 01 1 m 00 1000 n 10 000 FCMPE Dn, Dm
13454 000 11110 01 1 00000 00 1000 n 11 000 FCMPE Dn, #0.0
13455 000 11110 01 1 m 00 1000 n 00 000 FCMP Dn, Dm
13456 000 11110 01 1 00000 00 1000 n 01 000 FCMP Dn, #0.0
13458 000 11110 00 1 m 00 1000 n 10 000 FCMPE Sn, Sm
13459 000 11110 00 1 00000 00 1000 n 11 000 FCMPE Sn, #0.0
13460 000 11110 00 1 m 00 1000 n 00 000 FCMP Sn, Sm
13461 000 11110 00 1 00000 00 1000 n 01 000 FCMP Sn, #0.0
13463 FCMPE generates Invalid Operation exn if either arg is any kind
13464 of NaN. FCMP generates Invalid Operation exn if either arg is a
13465 signalling NaN. We ignore this detail here and produce the same
13466 IR for both.
13468 Bool isD = (ty & 1) == 1;
13469 Bool isCMPE = (opcode2 & 16) == 16;
13470 Bool cmpZero = (opcode2 & 8) == 8;
13471 IRType ity = isD ? Ity_F64 : Ity_F32;
13472 Bool valid = True;
13473 if (cmpZero && mm != 0) valid = False;
13474 if (valid) {
13475 IRTemp argL = newTemp(ity);
13476 IRTemp argR = newTemp(ity);
13477 IRTemp irRes = newTemp(Ity_I32);
13478 assign(argL, getQRegLO(nn, ity));
13479 assign(argR,
13480 cmpZero
13481 ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
13482 : getQRegLO(mm, ity));
13483 assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
13484 mkexpr(argL), mkexpr(argR)));
13485 IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
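         /* The 4-bit NZCV value must end up in bits 31:28 of the word
            given to setFlags_COPY, hence the shift by 28 below.  For
            example, per the AArch64 FCMP definition a "less than" result
            is NZCV 0b1000, which becomes 0x80000000 after the shift. */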
13486 IRTemp nzcv_28x0 = newTemp(Ity_I64);
13487 assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
13488 setFlags_COPY(nzcv_28x0);
13489 DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
13490 cmpZero ? "#0.0" : nameQRegLO(mm, ity));
13491 return True;
13493 return False;
13496 return False;
13497 # undef INSN
13501 static
13502 Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn)
13504 /* 31 28 23 21 20 15 11 9 4 3
13505 000 11110 ty 1 m cond 01 n op nzcv
13506 The first 3 bits are really "M 0 S", but M and S are always zero.
13507 Decode fields are: ty,op
13509 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13510 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13511 || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
13512 return False;
13514 UInt ty = INSN(23,22);
13515 UInt mm = INSN(20,16);
13516 UInt cond = INSN(15,12);
13517 UInt nn = INSN(9,5);
13518 UInt op = INSN(4,4);
13519 UInt nzcv = INSN(3,0);
13520 vassert(ty < 4 && op <= 1);
13522 if (ty <= BITS2(0,1)) {
13523 /* -------- 00,0 FCCMP s_s -------- */
13524 /* -------- 00,1 FCCMPE s_s -------- */
13525 /* -------- 01,0 FCCMP d_d -------- */
13526 /* -------- 01,1 FCCMPE d_d -------- */
13528 /* FCCMPE generates Invalid Operation exn if either arg is any kind
13529 of NaN. FCCMP generates Invalid Operation exn if either arg is a
13530 signalling NaN. We ignore this detail here and produce the same
13531 IR for both.
13533 Bool isD = (ty & 1) == 1;
13534 Bool isCMPE = op == 1;
13535 IRType ity = isD ? Ity_F64 : Ity_F32;
13536 IRTemp argL = newTemp(ity);
13537 IRTemp argR = newTemp(ity);
13538 IRTemp irRes = newTemp(Ity_I32);
13539 assign(argL, getQRegLO(nn, ity));
13540 assign(argR, getQRegLO(mm, ity));
13541 assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
13542 mkexpr(argL), mkexpr(argR)));
13543 IRTemp condT = newTemp(Ity_I1);
13544 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
13545 IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);
13547 IRTemp nzcvT_28x0 = newTemp(Ity_I64);
13548 assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));
13550 IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);
13552 IRTemp nzcv_28x0 = newTemp(Ity_I64);
13553 assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
13554 mkexpr(nzcvT_28x0), nzcvF_28x0));
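      /* If the condition holds, the flags computed from the comparison
         are used; otherwise the literal nzcv immediate is copied in.  So,
         for example, "fccmp d0, d1, #0, eq" compares d0 with d1 when Z is
         set and otherwise just sets NZCV to 0000. */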
13555 setFlags_COPY(nzcv_28x0);
13556 DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
13557 nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
13558 return True;
13561 return False;
13562 # undef INSN
13566 static
13567 Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
13569 /* 31 23 21 20 15 11 9 5
13570 000 11110 ty 1 m cond 11 n d
13571 The first 3 bits are really "M 0 S", but M and S are always zero.
13572 Decode fields: ty
13574 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13575 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
13576 || INSN(11,10) != BITS2(1,1)) {
13577 return False;
13579 UInt ty = INSN(23,22);
13580 UInt mm = INSN(20,16);
13581 UInt cond = INSN(15,12);
13582 UInt nn = INSN(9,5);
13583 UInt dd = INSN(4,0);
13584 if (ty <= X01) {
13585 /* -------- 00: FCSEL s_s -------- */
13586       /* -------- 01: FCSEL d_d -------- */
13587 IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
13588 IRTemp srcT = newTemp(ity);
13589 IRTemp srcF = newTemp(ity);
13590 IRTemp res = newTemp(ity);
13591 assign(srcT, getQRegLO(nn, ity));
13592 assign(srcF, getQRegLO(mm, ity));
13593 assign(res, IRExpr_ITE(
13594 unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
13595 mkexpr(srcT), mkexpr(srcF)));
13596 putQReg128(dd, mkV128(0x0000));
13597 putQRegLO(dd, mkexpr(res));
13598 DIP("fcsel %s, %s, %s, %s\n",
13599 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
13600 nameCC(cond));
13601 return True;
13603 return False;
13604 # undef INSN
13608 static
13609 Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
13611 /* 31 28 23 21 20 14 9 4
13612 000 11110 ty 1 opcode 10000 n d
13613 The first 3 bits are really "M 0 S", but M and S are always zero.
13614 Decode fields: ty,opcode
13616 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13617 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13618 || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
13619 return False;
13621 UInt ty = INSN(23,22);
13622 UInt opcode = INSN(20,15);
13623 UInt nn = INSN(9,5);
13624 UInt dd = INSN(4,0);
13626 if (ty <= X01 && opcode <= BITS6(0,0,0,0,1,1)) {
13627 /* -------- 0x,000000: FMOV d_d, s_s -------- */
13628 /* -------- 0x,000001: FABS d_d, s_s -------- */
13629 /* -------- 0x,000010: FNEG d_d, s_s -------- */
13630 /* -------- 0x,000011: FSQRT d_d, s_s -------- */
13631 IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
13632 IRTemp src = newTemp(ity);
13633 IRTemp res = newTemp(ity);
13634 const HChar* nm = "??";
13635 assign(src, getQRegLO(nn, ity));
13636 switch (opcode) {
13637 case BITS6(0,0,0,0,0,0):
13638 nm = "fmov"; assign(res, mkexpr(src)); break;
13639 case BITS6(0,0,0,0,0,1):
13640 nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
13641 case BITS6(0,0,0,0,1,0):
13642             nm = "fneg"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
13643 case BITS6(0,0,0,0,1,1):
13644 nm = "fsqrt";
13645 assign(res, binop(mkSQRTF(ity),
13646 mkexpr(mk_get_IR_rounding_mode()),
13647 mkexpr(src))); break;
13648 default:
13649 vassert(0);
13651 putQReg128(dd, mkV128(0x0000));
13652 putQRegLO(dd, mkexpr(res));
13653 DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
13654 return True;
13657 if ( (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
13658 || opcode == BITS6(0,0,0,1,0,1)))
13659 || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
13660 || opcode == BITS6(0,0,0,1,0,1)))
13661 || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
13662 || opcode == BITS6(0,0,0,1,0,0)))) {
13663 /* -------- 11,000100: FCVT s_h -------- */
13664 /* -------- 11,000101: FCVT d_h -------- */
13665 /* -------- 00,000111: FCVT h_s -------- */
13666 /* -------- 00,000101: FCVT d_s -------- */
13667 /* -------- 01,000111: FCVT h_d -------- */
13668 /* -------- 01,000100: FCVT s_d -------- */
13669 /* 31 23 21 16 14 9 4
13670 000 11110 11 10001 00 10000 n d FCVT Sd, Hn
13671 --------- 11 ----- 01 --------- FCVT Dd, Hn
13672 --------- 00 ----- 11 --------- FCVT Hd, Sn
13673 --------- 00 ----- 01 --------- FCVT Dd, Sn
13674 --------- 01 ----- 11 --------- FCVT Hd, Dn
13675 --------- 01 ----- 00 --------- FCVT Sd, Dn
13676 Rounding, when dst is smaller than src, is per the FPCR.
13678 UInt b2322 = ty;
13679 UInt b1615 = opcode & BITS2(1,1);
13680 switch ((b2322 << 2) | b1615) {
13681 case BITS4(0,0,0,1): // S -> D
13682 case BITS4(1,1,0,1): { // H -> D
13683 Bool srcIsH = b2322 == BITS2(1,1);
13684 IRType srcTy = srcIsH ? Ity_F16 : Ity_F32;
13685 IRTemp res = newTemp(Ity_F64);
13686 assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
13687 getQRegLO(nn, srcTy)));
13688 putQReg128(dd, mkV128(0x0000));
13689 putQRegLO(dd, mkexpr(res));
13690 DIP("fcvt %s, %s\n",
13691 nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
13692 return True;
13694 case BITS4(0,1,0,0): // D -> S
13695 case BITS4(0,1,1,1): { // D -> H
13696 Bool dstIsH = b1615 == BITS2(1,1);
13697 IRType dstTy = dstIsH ? Ity_F16 : Ity_F32;
13698 IRTemp res = newTemp(dstTy);
13699 assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
13700 mkexpr(mk_get_IR_rounding_mode()),
13701 getQRegLO(nn, Ity_F64)));
13702 putQReg128(dd, mkV128(0x0000));
13703 putQRegLO(dd, mkexpr(res));
13704 DIP("fcvt %s, %s\n",
13705 nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
13706 return True;
13708 case BITS4(0,0,1,1): // S -> H
13709 case BITS4(1,1,0,0): { // H -> S
13710 Bool toH = b1615 == BITS2(1,1);
13711 IRType srcTy = toH ? Ity_F32 : Ity_F16;
13712 IRType dstTy = toH ? Ity_F16 : Ity_F32;
13713 IRTemp res = newTemp(dstTy);
13714 if (toH) {
13715 assign(res, binop(Iop_F32toF16,
13716 mkexpr(mk_get_IR_rounding_mode()),
13717 getQRegLO(nn, srcTy)));
13719 } else {
13720 assign(res, unop(Iop_F16toF32,
13721 getQRegLO(nn, srcTy)));
13723 putQReg128(dd, mkV128(0x0000));
13724 putQRegLO(dd, mkexpr(res));
13725 DIP("fcvt %s, %s\n",
13726 nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
13727 return True;
13729 default:
13730 break;
13732 /* else unhandled */
13733 return False;
13736 if (ty <= X01
13737 && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
13738 && opcode != BITS6(0,0,1,1,0,1)) {
13739 /* -------- 0x,001000 FRINTN d_d, s_s -------- */
13740 /* -------- 0x,001001 FRINTP d_d, s_s -------- */
13741 /* -------- 0x,001010 FRINTM d_d, s_s -------- */
13742 /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
13743 /* -------- 0x,001100 FRINTA d_d, s_s -------- */
13744 /* -------- 0x,001110 FRINTX d_d, s_s -------- */
13745 /* -------- 0x,001111 FRINTI d_d, s_s -------- */
13746 /* 31 23 21 17 14 9 4
13747 000 11110 0x 1001 111 10000 n d FRINTI Fd, Fm (round per FPCR)
13749 x==0 => S-registers, x==1 => D-registers
13750 rm (17:15) encodings:
13751 111 per FPCR (FRINTI)
13752 001 +inf (FRINTP)
13753 010 -inf (FRINTM)
13754 011 zero (FRINTZ)
13755 000 tieeven (FRINTN) -- !! FIXME KLUDGED !!
13756 100 tieaway (FRINTA) -- !! FIXME KLUDGED !!
13757 110 per FPCR + "exact = TRUE" (FRINTX)
13758 101 unallocated
13760 Bool isD = (ty & 1) == 1;
13761 UInt rm = opcode & BITS6(0,0,0,1,1,1);
13762 IRType ity = isD ? Ity_F64 : Ity_F32;
13763 IRExpr* irrmE = NULL;
13764 UChar ch = '?';
13765 switch (rm) {
13766 case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
13767 case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
13768 case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
13769 // The following is a kludge. Should be: Irrm_NEAREST_TIE_AWAY_0
13770 case BITS3(1,0,0): ch = 'a'; irrmE = mkU32(Irrm_NEAREST); break;
13771 // I am unsure about the following, due to the "integral exact"
13772 // description in the manual. What does it mean? (frintx, that is)
13773 case BITS3(1,1,0):
13774 ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
13775 case BITS3(1,1,1):
13776 ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
13777 // The following is a kludge. There's no Irrm_ value to represent
13778 // this ("to nearest, with ties to even")
13779 case BITS3(0,0,0): ch = 'n'; irrmE = mkU32(Irrm_NEAREST); break;
13780 default: break;
13782 if (irrmE) {
13783 IRTemp src = newTemp(ity);
13784 IRTemp dst = newTemp(ity);
13785 assign(src, getQRegLO(nn, ity));
13786 assign(dst, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
13787 irrmE, mkexpr(src)));
13788 putQReg128(dd, mkV128(0x0000));
13789 putQRegLO(dd, mkexpr(dst));
13790 DIP("frint%c %s, %s\n",
13791 ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
13792 return True;
13794 return False;
13797 return False;
13798 # undef INSN
13802 static
13803 Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn)
13805 /* 31 28 23 21 20 15 11 9 4
13806 000 11110 ty 1 m opcode 10 n d
13807 The first 3 bits are really "M 0 S", but M and S are always zero.
13808 Decode fields: ty, opcode
13810 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13811 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13812 || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
13813 return False;
13815 UInt ty = INSN(23,22);
13816 UInt mm = INSN(20,16);
13817 UInt opcode = INSN(15,12);
13818 UInt nn = INSN(9,5);
13819 UInt dd = INSN(4,0);
13821 if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
13822 /* ------- 0x,0000: FMUL d_d, s_s ------- */
13823 /* ------- 0x,0001: FDIV d_d, s_s ------- */
13824 /* ------- 0x,0010: FADD d_d, s_s ------- */
13825 /* ------- 0x,0011: FSUB d_d, s_s ------- */
13826 /* ------- 0x,0100: FMAX d_d, s_s ------- */
13827 /* ------- 0x,0101: FMIN d_d, s_s ------- */
13828 /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
13829 /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
13830 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
13831 IROp iop = Iop_INVALID;
13832 const HChar* nm = "???";
13833 switch (opcode) {
13834 case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
13835 case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
13836 case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
13837 case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
13838 case BITS4(0,1,0,0): nm = "fmax"; iop = mkVecMAXF(ty+2); break;
13839 case BITS4(0,1,0,1): nm = "fmin"; iop = mkVecMINF(ty+2); break;
13840 case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
13841 case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
13842 default: vassert(0);
13844 if (opcode <= BITS4(0,0,1,1)) {
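      /* FMUL/FDIV/FADD/FSUB are done as scalar ops with an explicit
         rounding mode; FMAX/FMIN/FMAXNM/FMINNM reuse the vector ops and
         then zero the unused upper lanes. */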
13845 // This is really not good code. TODO: avoid width-changing
13846 IRTemp res = newTemp(ity);
13847 assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
13848 getQRegLO(nn, ity), getQRegLO(mm, ity)));
13849 putQReg128(dd, mkV128(0));
13850 putQRegLO(dd, mkexpr(res));
13851 } else {
13852 putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
13853 binop(iop, getQReg128(nn), getQReg128(mm))));
13855 DIP("%s %s, %s, %s\n",
13856 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
13857 return True;
13860 if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
13861 /* ------- 0x,1000: FNMUL d_d, s_s ------- */
13862 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
13863 IROp iop = mkMULF(ity);
13864 IROp iopn = mkNEGF(ity);
13865 const HChar* nm = "fnmul";
13866 IRExpr* resE = unop(iopn,
13867 triop(iop, mkexpr(mk_get_IR_rounding_mode()),
13868 getQRegLO(nn, ity), getQRegLO(mm, ity)));
13869 IRTemp res = newTemp(ity);
13870 assign(res, resE);
13871 putQReg128(dd, mkV128(0));
13872 putQRegLO(dd, mkexpr(res));
13873 DIP("%s %s, %s, %s\n",
13874 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
13875 return True;
13878 return False;
13879 # undef INSN
13883 static
13884 Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
13886 /* 31 28 23 21 20 15 14 9 4
13887 000 11111 ty o1 m o0 a n d
13888 The first 3 bits are really "M 0 S", but M and S are always zero.
13889 Decode fields: ty,o1,o0
13891 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13892 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
13893 return False;
13895 UInt ty = INSN(23,22);
13896 UInt bitO1 = INSN(21,21);
13897 UInt mm = INSN(20,16);
13898 UInt bitO0 = INSN(15,15);
13899 UInt aa = INSN(14,10);
13900 UInt nn = INSN(9,5);
13901 UInt dd = INSN(4,0);
13902 vassert(ty < 4);
13904 if (ty <= X01) {
13905 /* -------- 0x,0,0 FMADD d_d_d_d, s_s_s_s -------- */
13906 /* -------- 0x,0,1 FMSUB d_d_d_d, s_s_s_s -------- */
13907 /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
13908 /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
13909 /* -------------------- F{N}M{ADD,SUB} -------------------- */
13910 /* 31 22 20 15 14 9 4 ix
13911 000 11111 0 sz 0 m 0 a n d 0 FMADD Fd,Fn,Fm,Fa
13912 000 11111 0 sz 0 m 1 a n d 1 FMSUB Fd,Fn,Fm,Fa
13913 000 11111 0 sz 1 m 0 a n d 2 FNMADD Fd,Fn,Fm,Fa
13914 000 11111 0 sz 1 m 1 a n d 3 FNMSUB Fd,Fn,Fm,Fa
13915 where Fx=Dx when sz=1, Fx=Sx when sz=0
13917 -----SPEC------ ----IMPL----
13918 fmadd a + n * m a + n * m
13919 fmsub a + (-n) * m a - n * m
13920 fnmadd (-a) + (-n) * m -(a + n * m)
13921 fnmsub (-a) + n * m -(a - n * m)
13923 Bool isD = (ty & 1) == 1;
13924 UInt ix = (bitO1 << 1) | bitO0;
13925 IRType ity = isD ? Ity_F64 : Ity_F32;
13926 IROp opADD = mkADDF(ity);
13927 IROp opSUB = mkSUBF(ity);
13928 IROp opMUL = mkMULF(ity);
13929 IROp opNEG = mkNEGF(ity);
13930 IRTemp res = newTemp(ity);
13931 IRExpr* eA = getQRegLO(aa, ity);
13932 IRExpr* eN = getQRegLO(nn, ity);
13933 IRExpr* eM = getQRegLO(mm, ity);
13934 IRExpr* rm = mkexpr(mk_get_IR_rounding_mode());
13935 IRExpr* eNxM = triop(opMUL, rm, eN, eM);
13936 switch (ix) {
13937 case 0: assign(res, triop(opADD, rm, eA, eNxM)); break;
13938 case 1: assign(res, triop(opSUB, rm, eA, eNxM)); break;
13939 case 2: assign(res, unop(opNEG, triop(opADD, rm, eA, eNxM))); break;
13940 case 3: assign(res, unop(opNEG, triop(opSUB, rm, eA, eNxM))); break;
13941 default: vassert(0);
13943 putQReg128(dd, mkV128(0x0000));
13944 putQRegLO(dd, mkexpr(res));
13945 const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
13946 DIP("%s %s, %s, %s, %s\n",
13947 names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
13948 nameQRegLO(mm, ity), nameQRegLO(aa, ity));
13949 return True;
13952 return False;
13953 # undef INSN
13957 static
13958 Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
13960 /* 31 28 23 21 20 12 9 4
13961 000 11110 ty 1 imm8 100 imm5 d
13962 The first 3 bits are really "M 0 S", but M and S are always zero.
13964 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13965 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
13966 || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
13967 return False;
13969 UInt ty = INSN(23,22);
13970 UInt imm8 = INSN(20,13);
13971 UInt imm5 = INSN(9,5);
13972 UInt dd = INSN(4,0);
13974 /* ------- 00,00000: FMOV s_imm ------- */
13975 /* ------- 01,00000: FMOV d_imm ------- */
13976 if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
13977 Bool isD = (ty & 1) == 1;
13978 ULong imm = VFPExpandImm(imm8, isD ? 64 : 32);
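      /* VFPExpandImm expands the 8-bit immediate into the standard ARM FP
         constant encoding; for instance imm8 == 0x70 should expand to 1.0
         in either precision. */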
13979 if (!isD) {
13980 vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
13982 putQReg128(dd, mkV128(0));
13983 putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
13984 DIP("fmov %s, #0x%llx\n",
13985 nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
13986 return True;
13989 return False;
13990 # undef INSN
13994 static
13995 Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
13998 /* 31 30 29 28 23 21 20 18 15 9 4
13999 sf 0 0 11110 type 0 rmode opcode scale n d
14000 The first 3 bits are really "sf 0 S", but S is always zero.
14001 Decode fields: sf,type,rmode,opcode
14003 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14004 if (INSN(30,29) != BITS2(0,0)
14005 || INSN(28,24) != BITS5(1,1,1,1,0)
14006 || INSN(21,21) != 0) {
14007 return False;
14009 UInt bitSF = INSN(31,31);
14010 UInt ty = INSN(23,22); // type
14011 UInt rm = INSN(20,19); // rmode
14012 UInt op = INSN(18,16); // opcode
14013 UInt sc = INSN(15,10); // scale
14014 UInt nn = INSN(9,5);
14015 UInt dd = INSN(4,0);
14017 if (ty <= X01 && rm == X11
14018 && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
14019 /* -------- (ix) sf ty rm opc -------- */
14020 /* -------- 0 0 00 11 000: FCVTZS w_s_#fbits -------- */
14021 /* -------- 1 0 01 11 000: FCVTZS w_d_#fbits -------- */
14022 /* -------- 2 1 00 11 000: FCVTZS x_s_#fbits -------- */
14023 /* -------- 3 1 01 11 000: FCVTZS x_d_#fbits -------- */
14025 /* -------- 4 0 00 11 001: FCVTZU w_s_#fbits -------- */
14026 /* -------- 5 0 01 11 001: FCVTZU w_d_#fbits -------- */
14027 /* -------- 6 1 00 11 001: FCVTZU x_s_#fbits -------- */
14028 /* -------- 7 1 01 11 001: FCVTZU x_d_#fbits -------- */
14029 Bool isI64 = bitSF == 1;
14030 Bool isF64 = (ty & 1) == 1;
14031 Bool isU = (op & 1) == 1;
14032 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14034 Int fbits = 64 - sc;
14035 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
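      /* The scale field holds 64 - fbits.  For example, FCVTZS Wd, Sn, #8
         is encoded with sc == 56, giving fbits == 8; the source is
         multiplied by 2^8 = 256.0 and then truncated towards zero. */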
14037 Double scale = two_to_the_plus(fbits);
14038 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14039 : IRExpr_Const(IRConst_F32( (Float)scale ));
14040 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
14042 const IROp ops[8]
14043 = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
14044 Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
14045 IRTemp irrm = newTemp(Ity_I32);
14046 assign(irrm, mkU32(Irrm_ZERO));
14048 IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
14049 IRExpr* res = binop(ops[ix], mkexpr(irrm),
14050 triop(opMUL, mkexpr(irrm), src, scaleE));
14051 putIRegOrZR(isI64, dd, res);
14053 DIP("fcvtz%c %s, %s, #%d\n",
14054 isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
14055 nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
14056 return True;
14059 /* ------ sf,ty,rm,opc ------ */
14060 /* ------ x,0x,00,010 SCVTF s/d, w/x, #fbits ------ */
14061 /* ------ x,0x,00,011 UCVTF s/d, w/x, #fbits ------ */
14062 /* (ix) sf S 28 ty rm opc 15 9 4
14063 0 0 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Wn, #fbits
14064 1 0 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Wn, #fbits
14065 2 1 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Xn, #fbits
14066 3 1 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Xn, #fbits
14068 4 0 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Wn, #fbits
14069 5 0 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Wn, #fbits
14070 6 1 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Xn, #fbits
14071 7 1 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Xn, #fbits
14073 These are signed/unsigned conversion from integer registers to
14074 FP registers, all 4 32/64-bit combinations, rounded per FPCR,
14075 scaled per |scale|.
14077 if (ty <= X01 && rm == X00
14078 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
14079 && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
14080 Bool isI64 = bitSF == 1;
14081 Bool isF64 = (ty & 1) == 1;
14082 Bool isU = (op & 1) == 1;
14083 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14085 Int fbits = 64 - sc;
14086 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
14088 Double scale = two_to_the_minus(fbits);
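      /* For example, UCVTF Sd, Wn, #4 gives fbits == 4: Wn is converted to
         F32 (rounded per the FPCR) and the result is multiplied by
         2^-4 = 0.0625.  The scaling multiply itself uses round-to-nearest. */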
14089 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
14090 : IRExpr_Const(IRConst_F32( (Float)scale ));
14091 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
14093 const IROp ops[8]
14094 = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
14095 Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
14096 IRExpr* src = getIRegOrZR(isI64, nn);
14097 IRExpr* res = (isF64 && !isI64)
14098 ? unop(ops[ix], src)
14099 : binop(ops[ix],
14100 mkexpr(mk_get_IR_rounding_mode()), src);
14101 putQReg128(dd, mkV128(0));
14102 putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));
14104 DIP("%ccvtf %s, %s, #%d\n",
14105 isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
14106 nameIRegOrZR(isI64, nn), fbits);
14107 return True;
14110 return False;
14111 # undef INSN
14115 static
14116 Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
14118 /* 31 30 29 28 23 21 20 18 15 9 4
14119 sf 0 0 11110 type 1 rmode opcode 000000 n d
14120 The first 3 bits are really "sf 0 S", but S is always zero.
14121 Decode fields: sf,type,rmode,opcode
14123 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14124 if (INSN(30,29) != BITS2(0,0)
14125 || INSN(28,24) != BITS5(1,1,1,1,0)
14126 || INSN(21,21) != 1
14127 || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
14128 return False;
14130 UInt bitSF = INSN(31,31);
14131 UInt ty = INSN(23,22); // type
14132 UInt rm = INSN(20,19); // rmode
14133 UInt op = INSN(18,16); // opcode
14134 UInt nn = INSN(9,5);
14135 UInt dd = INSN(4,0);
14137 // op = 000, 001
14138 /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
14139 /* 30 23 20 18 15 9 4
14140 sf 00 11110 0x 1 00 000 000000 n d FCVTNS Rd, Fn (round to
14141 sf 00 11110 0x 1 00 001 000000 n d FCVTNU Rd, Fn nearest)
14142 ---------------- 01 -------------- FCVTP-------- (round to +inf)
14143 ---------------- 10 -------------- FCVTM-------- (round to -inf)
14144 ---------------- 11 -------------- FCVTZ-------- (round to zero)
14145 ---------------- 00 100 ---------- FCVTAS------- (nearest, ties away)
14146 ---------------- 00 101 ---------- FCVTAU------- (nearest, ties away)
14148 Rd is Xd when sf==1, Wd when sf==0
14149 Fn is Dn when x==1, Sn when x==0
14150 20:19 carry the rounding mode, using the same encoding as FPCR
14152 if (ty <= X01
14153 && ( ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
14154 || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
14157 Bool isI64 = bitSF == 1;
14158 Bool isF64 = (ty & 1) == 1;
14159 Bool isU = (op & 1) == 1;
14160 /* Decide on the IR rounding mode to use. */
14161 IRRoundingMode irrm = 8; /*impossible*/
14162 HChar ch = '?';
14163 if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
14164 switch (rm) {
14165 case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
14166 case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
14167 case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
14168 case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
14169 default: vassert(0);
14171 } else {
14172 vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
14173 switch (rm) {
14174 case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST; break;
14175 default: vassert(0);
14178 vassert(irrm != 8);
14179 /* Decide on the conversion primop, based on the source size,
14180 dest size and signedness (8 possibilities). Case coding:
14181 F32 ->s I32 0
14182 F32 ->u I32 1
14183 F32 ->s I64 2
14184 F32 ->u I64 3
14185 F64 ->s I32 4
14186 F64 ->u I32 5
14187 F64 ->s I64 6
14188 F64 ->u I64 7
14190 UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);
14191 vassert(ix < 8);
14192 const IROp iops[8]
14193 = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
14194 Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
14195 IROp iop = iops[ix];
14196 // A bit of ATCery: bounce all cases we haven't seen an example of.
14197 if (/* F32toI32S */
14198 (iop == Iop_F32toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Sn */
14199 || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
14200 || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
14201 || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,S */
14202 /* F32toI32U */
14203 || (iop == Iop_F32toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Sn */
14204 || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
14205 || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
14206 || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,S */
14207 /* F32toI64S */
14208 || (iop == Iop_F32toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Sn */
14209 || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
14210 || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
14211 || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,S */
14212 /* F32toI64U */
14213 || (iop == Iop_F32toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Sn */
14214 || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
14215 || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
14216 || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,S */
14217 /* F64toI32S */
14218 || (iop == Iop_F64toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Dn */
14219 || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
14220 || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
14221 || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST)/* FCVT{A,N}S W,D */
14222 /* F64toI32U */
14223 || (iop == Iop_F64toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Dn */
14224 || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
14225 || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
14226 || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST)/* FCVT{A,N}U W,D */
14227 /* F64toI64S */
14228 || (iop == Iop_F64toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Dn */
14229 || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
14230 || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
14231 || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST)/* FCVT{A,N}S X,D */
14232 /* F64toI64U */
14233 || (iop == Iop_F64toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Dn */
14234 || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
14235 || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
14236 || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST)/* FCVT{A,N}U X,D */
14238 /* validated */
14239 } else {
14240 return False;
14242 IRType srcTy = isF64 ? Ity_F64 : Ity_F32;
14243 IRType dstTy = isI64 ? Ity_I64 : Ity_I32;
14244 IRTemp src = newTemp(srcTy);
14245 IRTemp dst = newTemp(dstTy);
14246 assign(src, getQRegLO(nn, srcTy));
14247 assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
14248 putIRegOrZR(isI64, dd, mkexpr(dst));
14249 DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
14250 nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
14251 return True;
14254 // op = 010, 011
14255 /* -------------- {S,U}CVTF (scalar, integer) -------------- */
14256 /* (ix) sf S 28 ty rm op 15 9 4
14257 0 0 0 0 11110 00 1 00 010 000000 n d SCVTF Sd, Wn
14258 1 0 0 0 11110 01 1 00 010 000000 n d SCVTF Dd, Wn
14259 2 1 0 0 11110 00 1 00 010 000000 n d SCVTF Sd, Xn
14260 3 1 0 0 11110 01 1 00 010 000000 n d SCVTF Dd, Xn
14262 4 0 0 0 11110 00 1 00 011 000000 n d UCVTF Sd, Wn
14263 5 0 0 0 11110 01 1 00 011 000000 n d UCVTF Dd, Wn
14264 6 1 0 0 11110 00 1 00 011 000000 n d UCVTF Sd, Xn
14265 7 1 0 0 11110 01 1 00 011 000000 n d UCVTF Dd, Xn
14267 These are signed/unsigned conversion from integer registers to
14268 FP registers, all 4 32/64-bit combinations, rounded per FPCR.
14270 if (ty <= X01 && rm == X00 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
14271 Bool isI64 = bitSF == 1;
14272 Bool isF64 = (ty & 1) == 1;
14273 Bool isU = (op & 1) == 1;
14274 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
14275 const IROp ops[8]
14276 = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
14277 Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
14278 IRExpr* src = getIRegOrZR(isI64, nn);
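      /* A 32-bit integer converts to F64 exactly, so those two cases use a
         rounding-mode-free unop; all other combinations round per the
         FPCR. */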
14279 IRExpr* res = (isF64 && !isI64)
14280 ? unop(ops[ix], src)
14281 : binop(ops[ix],
14282 mkexpr(mk_get_IR_rounding_mode()), src);
14283 putQReg128(dd, mkV128(0));
14284 putQRegLO(dd, res);
14285 DIP("%ccvtf %s, %s\n",
14286 isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
14287 nameIRegOrZR(isI64, nn));
14288 return True;
14291 // op = 110, 111
14292 /* -------- FMOV (general) -------- */
14293 /* case sf S ty rm op 15 9 4
14294 (1) 0 0 0 11110 00 1 00 111 000000 n d FMOV Sd, Wn
14295 (2) 1 0 0 11110 01 1 00 111 000000 n d FMOV Dd, Xn
14296 (3) 1 0 0 11110 10 1 01 111 000000 n d FMOV Vd.D[1], Xn
14298 (4) 0 0 0 11110 00 1 00 110 000000 n d FMOV Wd, Sn
14299 (5) 1 0 0 11110 01 1 00 110 000000 n d FMOV Xd, Dn
14300 (6) 1 0 0 11110 10 1 01 110 000000 n d FMOV Xd, Vn.D[1]
14302 if (1) {
14303 UInt ix = 0; // case
14304 if (bitSF == 0) {
14305 if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
14306 ix = 1;
14307 else
14308 if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
14309 ix = 4;
14310 } else {
14311 vassert(bitSF == 1);
14312 if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
14313 ix = 2;
14314 else
14315 if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
14316 ix = 5;
14317 else
14318 if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
14319 ix = 3;
14320 else
14321 if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
14322 ix = 6;
14324 if (ix > 0) {
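         /* Cases 1 and 2 zero the whole vector register before writing the
            low lane; case 3 (Vd.D[1]) leaves the low 64 bits of Vd
            unchanged; cases 4..6 only read the vector register. */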
14325 switch (ix) {
14326 case 1:
14327 putQReg128(dd, mkV128(0));
14328 putQRegLO(dd, getIReg32orZR(nn));
14329 DIP("fmov s%u, w%u\n", dd, nn);
14330 break;
14331 case 2:
14332 putQReg128(dd, mkV128(0));
14333 putQRegLO(dd, getIReg64orZR(nn));
14334 DIP("fmov d%u, x%u\n", dd, nn);
14335 break;
14336 case 3:
14337 putQRegHI64(dd, getIReg64orZR(nn));
14338 DIP("fmov v%u.d[1], x%u\n", dd, nn);
14339 break;
14340 case 4:
14341 putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
14342 DIP("fmov w%u, s%u\n", dd, nn);
14343 break;
14344 case 5:
14345 putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
14346 DIP("fmov x%u, d%u\n", dd, nn);
14347 break;
14348 case 6:
14349 putIReg64orZR(dd, getQRegHI64(nn));
14350 DIP("fmov x%u, v%u.d[1]\n", dd, nn);
14351 break;
14352 default:
14353 vassert(0);
14355 return True;
14357 /* undecodable; fall through */
14360 return False;
14361 # undef INSN
14365 static
14366 Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn)
14368 Bool ok;
14369 ok = dis_AdvSIMD_EXT(dres, insn);
14370 if (UNLIKELY(ok)) return True;
14371 ok = dis_AdvSIMD_TBL_TBX(dres, insn);
14372 if (UNLIKELY(ok)) return True;
14373 ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
14374 if (UNLIKELY(ok)) return True;
14375 ok = dis_AdvSIMD_across_lanes(dres, insn);
14376 if (UNLIKELY(ok)) return True;
14377 ok = dis_AdvSIMD_copy(dres, insn);
14378 if (UNLIKELY(ok)) return True;
14379 ok = dis_AdvSIMD_modified_immediate(dres, insn);
14380 if (UNLIKELY(ok)) return True;
14381 ok = dis_AdvSIMD_scalar_copy(dres, insn);
14382 if (UNLIKELY(ok)) return True;
14383 ok = dis_AdvSIMD_scalar_pairwise(dres, insn);
14384 if (UNLIKELY(ok)) return True;
14385 ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
14386 if (UNLIKELY(ok)) return True;
14387 ok = dis_AdvSIMD_scalar_three_different(dres, insn);
14388 if (UNLIKELY(ok)) return True;
14389 ok = dis_AdvSIMD_scalar_three_same(dres, insn);
14390 if (UNLIKELY(ok)) return True;
14391 ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
14392 if (UNLIKELY(ok)) return True;
14393 ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
14394 if (UNLIKELY(ok)) return True;
14395 ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
14396 if (UNLIKELY(ok)) return True;
14397 ok = dis_AdvSIMD_three_different(dres, insn);
14398 if (UNLIKELY(ok)) return True;
14399 ok = dis_AdvSIMD_three_same(dres, insn);
14400 if (UNLIKELY(ok)) return True;
14401 ok = dis_AdvSIMD_two_reg_misc(dres, insn);
14402 if (UNLIKELY(ok)) return True;
14403 ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
14404 if (UNLIKELY(ok)) return True;
14405 ok = dis_AdvSIMD_crypto_aes(dres, insn);
14406 if (UNLIKELY(ok)) return True;
14407 ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
14408 if (UNLIKELY(ok)) return True;
14409 ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
14410 if (UNLIKELY(ok)) return True;
14411 ok = dis_AdvSIMD_fp_compare(dres, insn);
14412 if (UNLIKELY(ok)) return True;
14413 ok = dis_AdvSIMD_fp_conditional_compare(dres, insn);
14414 if (UNLIKELY(ok)) return True;
14415 ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
14416 if (UNLIKELY(ok)) return True;
14417 ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
14418 if (UNLIKELY(ok)) return True;
14419 ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn);
14420 if (UNLIKELY(ok)) return True;
14421 ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
14422 if (UNLIKELY(ok)) return True;
14423 ok = dis_AdvSIMD_fp_immediate(dres, insn);
14424 if (UNLIKELY(ok)) return True;
14425 ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
14426 if (UNLIKELY(ok)) return True;
14427 ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
14428 if (UNLIKELY(ok)) return True;
14429 return False;
14433 /*------------------------------------------------------------*/
14434 /*--- Disassemble a single ARM64 instruction ---*/
14435 /*------------------------------------------------------------*/
14437 /* Disassemble a single ARM64 instruction into IR. The instruction
14438    is located at |guest_instr| and has guest IP of
14439 |guest_PC_curr_instr|, which will have been set before the call
14440 here. Returns True iff the instruction was decoded, in which case
14441 *dres will be set accordingly, or False, in which case *dres should
14442 be ignored by the caller. */
14444 static
14445 Bool disInstr_ARM64_WRK (
14446 /*MB_OUT*/DisResult* dres,
14447 Bool (*resteerOkFn) ( /*opaque*/void*, Addr ),
14448 Bool resteerCisOk,
14449 void* callback_opaque,
14450 const UChar* guest_instr,
14451 const VexArchInfo* archinfo,
14452 const VexAbiInfo* abiinfo
14455 // A macro to fish bits out of 'insn'.
14456 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14458 //ZZ DisResult dres;
14459 //ZZ UInt insn;
14460 //ZZ //Bool allow_VFP = False;
14461 //ZZ //UInt hwcaps = archinfo->hwcaps;
14462 //ZZ IRTemp condT; /* :: Ity_I32 */
14463 //ZZ UInt summary;
14464 //ZZ HChar dis_buf[128]; // big enough to hold LDMIA etc text
14465 //ZZ
14466 //ZZ /* What insn variants are we supporting today? */
14467 //ZZ //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
14468 //ZZ // etc etc
14470 /* Set result defaults. */
14471 dres->whatNext = Dis_Continue;
14472 dres->len = 4;
14473 dres->continueAt = 0;
14474 dres->jk_StopHere = Ijk_INVALID;
14475 dres->hint = Dis_HintNone;
14477 /* At least this is simple on ARM64: insns are all 4 bytes long, and
14478 4-aligned. So just fish the whole thing out of memory right now
14479 and have done. */
14480 UInt insn = getUIntLittleEndianly( guest_instr );
14482 if (0) vex_printf("insn: 0x%x\n", insn);
14484 DIP("\t(arm64) 0x%llx: ", (ULong)guest_PC_curr_instr);
14486 vassert(0 == (guest_PC_curr_instr & 3ULL));
14488 /* ----------------------------------------------------------- */
14490 /* Spot "Special" instructions (see comment at top of file). */
14492 const UChar* code = guest_instr;
14493 /* Spot the 16-byte preamble:
14494 93CC0D8C ror x12, x12, #3
14495 93CC358C ror x12, x12, #13
14496 93CCCD8C ror x12, x12, #51
14497 93CCF58C ror x12, x12, #61
14499 UInt word1 = 0x93CC0D8C;
14500 UInt word2 = 0x93CC358C;
14501 UInt word3 = 0x93CCCD8C;
14502 UInt word4 = 0x93CCF58C;
14503 if (getUIntLittleEndianly(code+ 0) == word1 &&
14504 getUIntLittleEndianly(code+ 4) == word2 &&
14505 getUIntLittleEndianly(code+ 8) == word3 &&
14506 getUIntLittleEndianly(code+12) == word4) {
14507 /* Got a "Special" instruction preamble. Which one is it? */
14508 if (getUIntLittleEndianly(code+16) == 0xAA0A014A
14509 /* orr x10,x10,x10 */) {
14510 /* X3 = client_request ( X4 ) */
14511 DIP("x3 = client_request ( x4 )\n");
14512 putPC(mkU64( guest_PC_curr_instr + 20 ));
14513 dres->jk_StopHere = Ijk_ClientReq;
14514 dres->whatNext = Dis_StopHere;
14515 return True;
14517 else
14518 if (getUIntLittleEndianly(code+16) == 0xAA0B016B
14519 /* orr x11,x11,x11 */) {
14520 /* X3 = guest_NRADDR */
14521 DIP("x3 = guest_NRADDR\n");
14522 dres->len = 20;
14523 putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
14524 return True;
14526 else
14527 if (getUIntLittleEndianly(code+16) == 0xAA0C018C
14528 /* orr x12,x12,x12 */) {
14529 /* branch-and-link-to-noredir X8 */
14530 DIP("branch-and-link-to-noredir x8\n");
14531 putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20));
14532 putPC(getIReg64orZR(8));
14533 dres->jk_StopHere = Ijk_NoRedir;
14534 dres->whatNext = Dis_StopHere;
14535 return True;
14537 else
14538 if (getUIntLittleEndianly(code+16) == 0xAA090129
14539 /* orr x9,x9,x9 */) {
14540 /* IR injection */
14541 DIP("IR injection\n");
14542 vex_inject_ir(irsb, Iend_LE);
14543 // Invalidate the current insn. The reason is that the IRop we're
14544 // injecting here can change. In which case the translation has to
14545 // be redone. For ease of handling, we simply invalidate all the
14546 // time.
14547 stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr)));
14548 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(20)));
14549 putPC(mkU64( guest_PC_curr_instr + 20 ));
14550 dres->whatNext = Dis_StopHere;
14551 dres->jk_StopHere = Ijk_InvalICache;
14552 return True;
14554 /* We don't know what it is. */
14555 return False;
14556 /*NOTREACHED*/
14560 /* ----------------------------------------------------------- */
14562 /* Main ARM64 instruction decoder starts here. */
14564 Bool ok = False;
14566 /* insn[28:25] determines the top-level grouping, so let's start
14567 off with that.
14569 For all of these dis_ARM64_ functions, we pass *dres with the
14570 normal default results "insn OK, 4 bytes long, keep decoding" so
14571 they don't need to change it. However, decodes of control-flow
14572 insns may cause *dres to change.
14574 switch (INSN(28,25)) {
14575 case BITS4(1,0,0,0): case BITS4(1,0,0,1):
14576 // Data processing - immediate
14577 ok = dis_ARM64_data_processing_immediate(dres, insn);
14578 break;
14579 case BITS4(1,0,1,0): case BITS4(1,0,1,1):
14580 // Branch, exception generation and system instructions
14581 ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo);
14582 break;
14583 case BITS4(0,1,0,0): case BITS4(0,1,1,0):
14584 case BITS4(1,1,0,0): case BITS4(1,1,1,0):
14585 // Loads and stores
14586 ok = dis_ARM64_load_store(dres, insn, abiinfo);
14587 break;
14588 case BITS4(0,1,0,1): case BITS4(1,1,0,1):
14589 // Data processing - register
14590 ok = dis_ARM64_data_processing_register(dres, insn);
14591 break;
14592 case BITS4(0,1,1,1): case BITS4(1,1,1,1):
14593 // Data processing - SIMD and floating point
14594 ok = dis_ARM64_simd_and_fp(dres, insn);
14595 break;
14596 case BITS4(0,0,0,0): case BITS4(0,0,0,1):
14597 case BITS4(0,0,1,0): case BITS4(0,0,1,1):
14598 // UNALLOCATED
14599 break;
14600 default:
14601 vassert(0); /* Can't happen */
14604 /* If the next-level down decoders failed, make sure |dres| didn't
14605 get changed. */
14606 if (!ok) {
14607 vassert(dres->whatNext == Dis_Continue);
14608 vassert(dres->len == 4);
14609 vassert(dres->continueAt == 0);
14610 vassert(dres->jk_StopHere == Ijk_INVALID);
14613 return ok;
14615 # undef INSN
14619 /*------------------------------------------------------------*/
14620 /*--- Top-level fn ---*/
14621 /*------------------------------------------------------------*/
14623 /* Disassemble a single instruction into IR. The instruction
14624 is located in host memory at &guest_code[delta]. */
14626 DisResult disInstr_ARM64 ( IRSB* irsb_IN,
14627 Bool (*resteerOkFn) ( void*, Addr ),
14628 Bool resteerCisOk,
14629 void* callback_opaque,
14630 const UChar* guest_code_IN,
14631 Long delta_IN,
14632 Addr guest_IP,
14633 VexArch guest_arch,
14634 const VexArchInfo* archinfo,
14635 const VexAbiInfo* abiinfo,
14636 VexEndness host_endness_IN,
14637 Bool sigill_diag_IN )
14639 DisResult dres;
14640 vex_bzero(&dres, sizeof(dres));
14642 /* Set globals (see top of this file) */
14643 vassert(guest_arch == VexArchARM64);
14645 irsb = irsb_IN;
14646 host_endness = host_endness_IN;
14647 guest_PC_curr_instr = (Addr64)guest_IP;
14649 /* Sanity checks */
14650 /* (x::UInt - 2) <= 15 === x >= 2 && x <= 17 (I hope) */
14651 vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
14652 vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);
14654 /* Try to decode */
14655 Bool ok = disInstr_ARM64_WRK( &dres,
14656 resteerOkFn, resteerCisOk, callback_opaque,
14657 &guest_code_IN[delta_IN],
14658 archinfo, abiinfo );
14659 if (ok) {
14660 /* All decode successes end up here. */
14661 vassert(dres.len == 4 || dres.len == 20);
14662 switch (dres.whatNext) {
14663 case Dis_Continue:
14664 putPC( mkU64(dres.len + guest_PC_curr_instr) );
14665 break;
14666 case Dis_ResteerU:
14667 case Dis_ResteerC:
14668 putPC(mkU64(dres.continueAt));
14669 break;
14670 case Dis_StopHere:
14671 break;
14672 default:
14673 vassert(0);
14675 DIP("\n");
14676 } else {
14677 /* All decode failures end up here. */
14678 if (sigill_diag_IN) {
14679 Int i, j;
14680 UChar buf[64];
14681 UInt insn
14682 = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
14683 vex_bzero(buf, sizeof(buf));
14684 for (i = j = 0; i < 32; i++) {
14685 if (i > 0) {
14686 if ((i & 7) == 0) buf[j++] = ' ';
14687 else if ((i & 3) == 0) buf[j++] = '\'';
14689 buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
14691 vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
14692 vex_printf("disInstr(arm64): %s\n", buf);
14695 /* Tell the dispatcher that this insn cannot be decoded, and so
14696 has not been executed, and (is currently) the next to be
14697 executed. PC should be up-to-date since it is made so at the
14698 start of each insn, but nevertheless be paranoid and update
14699 it again right now. */
14700 putPC( mkU64(guest_PC_curr_instr) );
14701 dres.len = 0;
14702 dres.whatNext = Dis_StopHere;
14703 dres.jk_StopHere = Ijk_NoDecode;
14704 dres.continueAt = 0;
14706 return dres;
14710 /*--------------------------------------------------------------------*/
14711 /*--- end guest_arm64_toIR.c ---*/
14712 /*--------------------------------------------------------------------*/