1 /* -*- mode: C; c-basic-offset: 3; -*- */
3 /*--------------------------------------------------------------------*/
4 /*--- begin guest_arm64_toIR.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of Valgrind, a dynamic binary instrumentation
9 framework.
11 Copyright (C) 2013-2017 OpenWorks
12 info@open-works.net
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 /* KNOWN LIMITATIONS 2014-Nov-16
32 * Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.
34 Also FP comparison "unordered" .. is implemented as normal FP
35 comparison.
37 Both should be fixed. They behave incorrectly in the presence of
38 NaNs.
40 FMULX is treated the same as FMUL. That's also not correct.
42 * Floating multiply-add (etc) insns are split into a multiply and
43 an add, and so suffer double rounding; hence sometimes the
44 least significant mantissa bit is incorrect. Fix: use the IR
45 multiply-add IROps instead.
47 * FRINTX might need updating to set the inexact-computation FPSR flag
49 * Ditto FCVTXN. No idea what "round to odd" means. This implementation
50 just rounds to nearest.
53 /* "Special" instructions.
55 This instruction decoder can decode four special instructions
56 which mean nothing natively (are no-ops as far as regs/mem are
57 concerned) but have meaning for supporting Valgrind. A special
58 instruction is flagged by a 16-byte preamble:
60 93CC0D8C 93CC358C 93CCCD8C 93CCF58C
61 (ror x12, x12, #3; ror x12, x12, #13
62 ror x12, x12, #51; ror x12, x12, #61)
64 Following that, one of the following 4 is allowed
65 (standard interpretation in parentheses):
67 AA0A014A (orr x10,x10,x10) X3 = client_request ( X4 )
68 AA0B016B (orr x11,x11,x11) X3 = guest_NRADDR
69 AA0C018C (orr x12,x12,x12) branch-and-link-to-noredir X8
70 AA090129 (orr x9,x9,x9) IR injection
72 Any other bytes following the 16-byte preamble are illegal and
73 constitute a failure in instruction decoding. This all assumes
74 that the preamble will never occur except in specific code
75 fragments designed for Valgrind to catch.
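/* For illustration only: a minimal sketch (not necessarily how the
   decoder later in this file is structured) of recognising the
   preamble described above, using a 32-bit little-endian load of
   each word:

      static Bool isSpecialPreamble ( const UChar* code )
      {
         return getUIntLittleEndianly(code +  0) == 0x93CC0D8CU
                && getUIntLittleEndianly(code +  4) == 0x93CC358CU
                && getUIntLittleEndianly(code +  8) == 0x93CCCD8CU
                && getUIntLittleEndianly(code + 12) == 0x93CCF58CU;
      }

   'isSpecialPreamble' is a hypothetical helper; getUIntLittleEndianly
   is defined further down.  If the test holds, the word at code+16 is
   compared against the ORR encodings listed above to select the
   specific request. */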
78 /* Translates ARM64 code to IR. */
80 #include "libvex_basictypes.h"
81 #include "libvex_ir.h"
82 #include "libvex.h"
83 #include "libvex_guest_arm64.h"
85 #include "main_util.h"
86 #include "main_globals.h"
87 #include "guest_generic_bb_to_IR.h"
88 #include "guest_arm64_defs.h"
91 /*------------------------------------------------------------*/
92 /*--- Globals ---*/
93 /*------------------------------------------------------------*/
95 These are set at the start of the translation of an instruction, so
96 that we don't have to pass them around endlessly. CONST means does
97 not change during translation of the instruction.
100 /* CONST: what is the host's endianness? We need to know this in
101 order to do sub-register accesses to the SIMD/FP registers
102 correctly. */
103 static VexEndness host_endness;
105 /* CONST: The guest address for the instruction currently being
106 translated. */
107 static Addr64 guest_PC_curr_instr;
109 /* MOD: The IRSB* into which we're generating code. */
110 static IRSB* irsb;
113 /*------------------------------------------------------------*/
114 /*--- Debugging output ---*/
115 /*------------------------------------------------------------*/
117 #define DIP(format, args...) \
118 if (vex_traceflags & VEX_TRACE_FE) \
119 vex_printf(format, ## args)
121 #define DIS(buf, format, args...) \
122 if (vex_traceflags & VEX_TRACE_FE) \
123 vex_sprintf(buf, format, ## args)
126 /*------------------------------------------------------------*/
127 /*--- Helper bits and pieces for deconstructing the ---*/
128 /*--- arm insn stream. ---*/
129 /*------------------------------------------------------------*/
131 /* Do a little-endian load of a 32-bit word, regardless of the
132 endianness of the underlying host. */
133 static inline UInt getUIntLittleEndianly ( const UChar* p )
135 UInt w = 0;
136 w = (w << 8) | p[3];
137 w = (w << 8) | p[2];
138 w = (w << 8) | p[1];
139 w = (w << 8) | p[0];
140 return w;
143 /* Sign extend an N-bit value up to 64 bits, by copying
144 bit N-1 into all higher positions. */
145 static ULong sx_to_64 ( ULong x, UInt n )
147 vassert(n > 1 && n < 64);
148 x <<= (64-n);
149 Long r = (Long)x;
150 r >>= (64-n);
151 return (ULong)r;
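/* Worked example (illustrative only):
      sx_to_64(0x1FF, 9) == 0xFFFFFFFFFFFFFFFFULL   (bit 8 is 1)
      sx_to_64(0x0FF, 9) == 0x00000000000000FFULL   (bit 8 is 0)
   The left shift moves bit N-1 up to bit 63, and the arithmetic
   right shift then propagates it back down, filling every position
   above bit N-1 with a copy of it. */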
154 //ZZ /* Do a little-endian load of a 16-bit word, regardless of the
155 //ZZ endianness of the underlying host. */
156 //ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
157 //ZZ {
158 //ZZ UShort w = 0;
159 //ZZ w = (w << 8) | p[1];
160 //ZZ w = (w << 8) | p[0];
161 //ZZ return w;
162 //ZZ }
163 //ZZ
164 //ZZ static UInt ROR32 ( UInt x, UInt sh ) {
165 //ZZ vassert(sh >= 0 && sh < 32);
166 //ZZ if (sh == 0)
167 //ZZ return x;
168 //ZZ else
169 //ZZ return (x << (32-sh)) | (x >> sh);
170 //ZZ }
171 //ZZ
172 //ZZ static Int popcount32 ( UInt x )
173 //ZZ {
174 //ZZ Int res = 0, i;
175 //ZZ for (i = 0; i < 32; i++) {
176 //ZZ res += (x & 1);
177 //ZZ x >>= 1;
178 //ZZ }
179 //ZZ return res;
180 //ZZ }
181 //ZZ
182 //ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
183 //ZZ {
184 //ZZ UInt mask = 1 << ix;
185 //ZZ x &= ~mask;
186 //ZZ x |= ((b << ix) & mask);
187 //ZZ return x;
188 //ZZ }
190 #define BITS2(_b1,_b0) \
191 (((_b1) << 1) | (_b0))
193 #define BITS3(_b2,_b1,_b0) \
194 (((_b2) << 2) | ((_b1) << 1) | (_b0))
196 #define BITS4(_b3,_b2,_b1,_b0) \
197 (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))
199 #define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
200 ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4) \
201 | BITS4((_b3),(_b2),(_b1),(_b0)))
203 #define BITS5(_b4,_b3,_b2,_b1,_b0) \
204 (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
205 #define BITS6(_b5,_b4,_b3,_b2,_b1,_b0) \
206 (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
207 #define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
208 (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
210 #define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
211 (((_b8) << 8) \
212 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
214 #define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
215 (((_b9) << 9) | ((_b8) << 8) \
216 | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
218 #define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
219 (((_b10) << 10) \
220 | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
222 #define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
223 (((_b11) << 11) \
224 | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))
226 #define X00 BITS2(0,0)
227 #define X01 BITS2(0,1)
228 #define X10 BITS2(1,0)
229 #define X11 BITS2(1,1)
231 // produces _uint[_bMax:_bMin]
232 #define SLICE_UInt(_uint,_bMax,_bMin) \
233 (( ((UInt)(_uint)) >> (_bMin)) \
234 & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
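/* Illustrative use of the two macro families above (a sketch, not a
   decode rule taken from this file): to ask whether bits 30:29 of an
   instruction word 'insn' hold the pattern 10b, one can write

      if (SLICE_UInt(insn, 30, 29) == BITS2(1,0)) { ... }

   SLICE_UInt extracts the field and the BITSn macros build the
   constant to compare it against. */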
237 /*------------------------------------------------------------*/
238 /*--- Helper bits and pieces for creating IR fragments. ---*/
239 /*------------------------------------------------------------*/
241 static IRExpr* mkV128 ( UShort w )
243 return IRExpr_Const(IRConst_V128(w));
246 static IRExpr* mkU64 ( ULong i )
248 return IRExpr_Const(IRConst_U64(i));
251 static IRExpr* mkU32 ( UInt i )
253 return IRExpr_Const(IRConst_U32(i));
256 static IRExpr* mkU16 ( UInt i )
258 vassert(i < 65536);
259 return IRExpr_Const(IRConst_U16(i));
262 static IRExpr* mkU8 ( UInt i )
264 vassert(i < 256);
265 return IRExpr_Const(IRConst_U8( (UChar)i ));
268 static IRExpr* mkexpr ( IRTemp tmp )
270 return IRExpr_RdTmp(tmp);
273 static IRExpr* unop ( IROp op, IRExpr* a )
275 return IRExpr_Unop(op, a);
278 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
280 return IRExpr_Binop(op, a1, a2);
283 static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
285 return IRExpr_Triop(op, a1, a2, a3);
288 static IRExpr* qop ( IROp op, IRExpr* a1, IRExpr* a2,
289 IRExpr* a3, IRExpr* a4 )
291 return IRExpr_Qop(op, a1, a2, a3, a4);
294 static IRExpr* loadLE ( IRType ty, IRExpr* addr )
296 return IRExpr_Load(Iend_LE, ty, addr);
299 /* Add a statement to the list held by "irbb". */
300 static void stmt ( IRStmt* st )
302 addStmtToIRSB( irsb, st );
305 static void assign ( IRTemp dst, IRExpr* e )
307 stmt( IRStmt_WrTmp(dst, e) );
310 static void storeLE ( IRExpr* addr, IRExpr* data )
312 stmt( IRStmt_Store(Iend_LE, addr, data) );
315 //ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
316 //ZZ {
317 //ZZ if (guardT == IRTemp_INVALID) {
318 //ZZ /* unconditional */
319 //ZZ storeLE(addr, data);
320 //ZZ } else {
321 //ZZ stmt( IRStmt_StoreG(Iend_LE, addr, data,
322 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
323 //ZZ }
324 //ZZ }
325 //ZZ
326 //ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
327 //ZZ IRExpr* addr, IRExpr* alt,
328 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
329 //ZZ {
330 //ZZ if (guardT == IRTemp_INVALID) {
331 //ZZ /* unconditional */
332 //ZZ IRExpr* loaded = NULL;
333 //ZZ switch (cvt) {
334 //ZZ case ILGop_Ident32:
335 //ZZ loaded = loadLE(Ity_I32, addr); break;
336 //ZZ case ILGop_8Uto32:
337 //ZZ loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
338 //ZZ case ILGop_8Sto32:
339 //ZZ loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
340 //ZZ case ILGop_16Uto32:
341 //ZZ loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
342 //ZZ case ILGop_16Sto32:
343 //ZZ loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
344 //ZZ default:
345 //ZZ vassert(0);
346 //ZZ }
347 //ZZ vassert(loaded != NULL);
348 //ZZ assign(dst, loaded);
349 //ZZ } else {
350 //ZZ /* Generate a guarded load into 'dst', but apply 'cvt' to the
351 //ZZ loaded data before putting the data in 'dst'. If the load
352 //ZZ does not take place, 'alt' is placed directly in 'dst'. */
353 //ZZ stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
354 //ZZ binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
355 //ZZ }
356 //ZZ }
358 /* Generate a new temporary of the given type. */
359 static IRTemp newTemp ( IRType ty )
361 vassert(isPlausibleIRType(ty));
362 return newIRTemp( irsb->tyenv, ty );
365 /* This is used in many places, so the brevity is an advantage. */
366 static IRTemp newTempV128(void)
368 return newTemp(Ity_V128);
371 /* Initialise V128 temporaries en masse. */
372 static
373 void newTempsV128_2(IRTemp* t1, IRTemp* t2)
375 vassert(t1 && *t1 == IRTemp_INVALID);
376 vassert(t2 && *t2 == IRTemp_INVALID);
377 *t1 = newTempV128();
378 *t2 = newTempV128();
381 static
382 void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
384 vassert(t1 && *t1 == IRTemp_INVALID);
385 vassert(t2 && *t2 == IRTemp_INVALID);
386 vassert(t3 && *t3 == IRTemp_INVALID);
387 *t1 = newTempV128();
388 *t2 = newTempV128();
389 *t3 = newTempV128();
392 static
393 void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
395 vassert(t1 && *t1 == IRTemp_INVALID);
396 vassert(t2 && *t2 == IRTemp_INVALID);
397 vassert(t3 && *t3 == IRTemp_INVALID);
398 vassert(t4 && *t4 == IRTemp_INVALID);
399 *t1 = newTempV128();
400 *t2 = newTempV128();
401 *t3 = newTempV128();
402 *t4 = newTempV128();
405 static
406 void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
407 IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
409 vassert(t1 && *t1 == IRTemp_INVALID);
410 vassert(t2 && *t2 == IRTemp_INVALID);
411 vassert(t3 && *t3 == IRTemp_INVALID);
412 vassert(t4 && *t4 == IRTemp_INVALID);
413 vassert(t5 && *t5 == IRTemp_INVALID);
414 vassert(t6 && *t6 == IRTemp_INVALID);
415 vassert(t7 && *t7 == IRTemp_INVALID);
416 *t1 = newTempV128();
417 *t2 = newTempV128();
418 *t3 = newTempV128();
419 *t4 = newTempV128();
420 *t5 = newTempV128();
421 *t6 = newTempV128();
422 *t7 = newTempV128();
425 //ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
426 //ZZ IRRoundingMode. */
427 //ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
428 //ZZ {
429 //ZZ return mkU32(Irrm_NEAREST);
430 //ZZ }
431 //ZZ
432 //ZZ /* Generate an expression for SRC rotated right by ROT. */
433 //ZZ static IRExpr* genROR32( IRTemp src, Int rot )
434 //ZZ {
435 //ZZ vassert(rot >= 0 && rot < 32);
436 //ZZ if (rot == 0)
437 //ZZ return mkexpr(src);
438 //ZZ return
439 //ZZ binop(Iop_Or32,
440 //ZZ binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
441 //ZZ binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
442 //ZZ }
443 //ZZ
444 //ZZ static IRExpr* mkU128 ( ULong i )
445 //ZZ {
446 //ZZ return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
447 //ZZ }
448 //ZZ
449 //ZZ /* Generate a 4-aligned version of the given expression if
450 //ZZ the given condition is true. Else return it unchanged. */
451 //ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
452 //ZZ {
453 //ZZ if (b)
454 //ZZ return binop(Iop_And32, e, mkU32(~3));
455 //ZZ else
456 //ZZ return e;
457 //ZZ }
459 /* Other IR construction helpers. */
460 static IROp mkAND ( IRType ty ) {
461 switch (ty) {
462 case Ity_I32: return Iop_And32;
463 case Ity_I64: return Iop_And64;
464 default: vpanic("mkAND");
468 static IROp mkOR ( IRType ty ) {
469 switch (ty) {
470 case Ity_I32: return Iop_Or32;
471 case Ity_I64: return Iop_Or64;
472 default: vpanic("mkOR");
476 static IROp mkXOR ( IRType ty ) {
477 switch (ty) {
478 case Ity_I32: return Iop_Xor32;
479 case Ity_I64: return Iop_Xor64;
480 default: vpanic("mkXOR");
484 static IROp mkSHL ( IRType ty ) {
485 switch (ty) {
486 case Ity_I32: return Iop_Shl32;
487 case Ity_I64: return Iop_Shl64;
488 default: vpanic("mkSHL");
492 static IROp mkSHR ( IRType ty ) {
493 switch (ty) {
494 case Ity_I32: return Iop_Shr32;
495 case Ity_I64: return Iop_Shr64;
496 default: vpanic("mkSHR");
500 static IROp mkSAR ( IRType ty ) {
501 switch (ty) {
502 case Ity_I32: return Iop_Sar32;
503 case Ity_I64: return Iop_Sar64;
504 default: vpanic("mkSAR");
508 static IROp mkNOT ( IRType ty ) {
509 switch (ty) {
510 case Ity_I32: return Iop_Not32;
511 case Ity_I64: return Iop_Not64;
512 default: vpanic("mkNOT");
516 static IROp mkADD ( IRType ty ) {
517 switch (ty) {
518 case Ity_I32: return Iop_Add32;
519 case Ity_I64: return Iop_Add64;
520 default: vpanic("mkADD");
524 static IROp mkSUB ( IRType ty ) {
525 switch (ty) {
526 case Ity_I32: return Iop_Sub32;
527 case Ity_I64: return Iop_Sub64;
528 default: vpanic("mkSUB");
532 static IROp mkADDF ( IRType ty ) {
533 switch (ty) {
534 case Ity_F16: return Iop_AddF16;
535 case Ity_F32: return Iop_AddF32;
536 case Ity_F64: return Iop_AddF64;
537 default: vpanic("mkADDF");
541 static IROp mkFMADDF ( IRType ty ) {
542 switch (ty) {
543 case Ity_F32: return Iop_MAddF32;
544 case Ity_F64: return Iop_MAddF64;
545 default: vpanic("mkFMADDF");
549 static IROp mkFMSUBF ( IRType ty ) {
550 switch (ty) {
551 case Ity_F32: return Iop_MSubF32;
552 case Ity_F64: return Iop_MSubF64;
553 default: vpanic("mkFMSUBF");
557 static IROp mkSUBF ( IRType ty ) {
558 switch (ty) {
559 case Ity_F16: return Iop_SubF16;
560 case Ity_F32: return Iop_SubF32;
561 case Ity_F64: return Iop_SubF64;
562 default: vpanic("mkSUBF");
566 static IROp mkMULF ( IRType ty ) {
567 switch (ty) {
568 case Ity_F32: return Iop_MulF32;
569 case Ity_F64: return Iop_MulF64;
570 default: vpanic("mkMULF");
574 static IROp mkDIVF ( IRType ty ) {
575 switch (ty) {
576 case Ity_F32: return Iop_DivF32;
577 case Ity_F64: return Iop_DivF64;
578 default: vpanic("mkDIVF");
582 static IROp mkNEGF ( IRType ty ) {
583 switch (ty) {
584 case Ity_F16: return Iop_NegF16;
585 case Ity_F32: return Iop_NegF32;
586 case Ity_F64: return Iop_NegF64;
587 default: vpanic("mkNEGF");
591 static IROp mkABSF ( IRType ty ) {
592 switch (ty) {
593 case Ity_F16: return Iop_AbsF16;
594 case Ity_F32: return Iop_AbsF32;
595 case Ity_F64: return Iop_AbsF64;
596 default: vpanic("mkABSF");
600 static IROp mkSQRTF ( IRType ty ) {
601 switch (ty) {
602 case Ity_F16: return Iop_SqrtF16;
603 case Ity_F32: return Iop_SqrtF32;
604 case Ity_F64: return Iop_SqrtF64;
605 default: vpanic("mkSQRTF");
609 static IROp mkVecADD ( UInt size ) {
610 const IROp ops[4]
611 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
612 vassert(size < 4);
613 return ops[size];
616 static IROp mkVecQADDU ( UInt size ) {
617 const IROp ops[4]
618 = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
619 vassert(size < 4);
620 return ops[size];
623 static IROp mkVecQADDS ( UInt size ) {
624 const IROp ops[4]
625 = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
626 vassert(size < 4);
627 return ops[size];
630 static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
631 const IROp ops[4]
632 = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
633 Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
634 vassert(size < 4);
635 return ops[size];
638 static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
639 const IROp ops[4]
640 = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
641 Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
642 vassert(size < 4);
643 return ops[size];
646 static IROp mkVecSUB ( UInt size ) {
647 const IROp ops[4]
648 = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
649 vassert(size < 4);
650 return ops[size];
653 static IROp mkVecQSUBU ( UInt size ) {
654 const IROp ops[4]
655 = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
656 vassert(size < 4);
657 return ops[size];
660 static IROp mkVecQSUBS ( UInt size ) {
661 const IROp ops[4]
662 = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
663 vassert(size < 4);
664 return ops[size];
667 static IROp mkVecSARN ( UInt size ) {
668 const IROp ops[4]
669 = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
670 vassert(size < 4);
671 return ops[size];
674 static IROp mkVecSHRN ( UInt size ) {
675 const IROp ops[4]
676 = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
677 vassert(size < 4);
678 return ops[size];
681 static IROp mkVecSHLN ( UInt size ) {
682 const IROp ops[4]
683 = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
684 vassert(size < 4);
685 return ops[size];
688 static IROp mkVecCATEVENLANES ( UInt size ) {
689 const IROp ops[4]
690 = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
691 Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
692 vassert(size < 4);
693 return ops[size];
696 static IROp mkVecCATODDLANES ( UInt size ) {
697 const IROp ops[4]
698 = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
699 Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
700 vassert(size < 4);
701 return ops[size];
704 static IROp mkVecINTERLEAVELO ( UInt size ) {
705 const IROp ops[4]
706 = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
707 Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
708 vassert(size < 4);
709 return ops[size];
712 static IROp mkVecINTERLEAVEHI ( UInt size ) {
713 const IROp ops[4]
714 = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
715 Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
716 vassert(size < 4);
717 return ops[size];
720 static IROp mkVecMAXU ( UInt size ) {
721 const IROp ops[4]
722 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
723 vassert(size < 4);
724 return ops[size];
727 static IROp mkVecMAXS ( UInt size ) {
728 const IROp ops[4]
729 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
730 vassert(size < 4);
731 return ops[size];
734 static IROp mkVecMINU ( UInt size ) {
735 const IROp ops[4]
736 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
737 vassert(size < 4);
738 return ops[size];
741 static IROp mkVecMINS ( UInt size ) {
742 const IROp ops[4]
743 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
744 vassert(size < 4);
745 return ops[size];
748 static IROp mkVecMUL ( UInt size ) {
749 const IROp ops[4]
750 = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
751 vassert(size < 3);
752 return ops[size];
755 static IROp mkVecMULLU ( UInt sizeNarrow ) {
756 const IROp ops[4]
757 = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
758 vassert(sizeNarrow < 3);
759 return ops[sizeNarrow];
762 static IROp mkVecMULLS ( UInt sizeNarrow ) {
763 const IROp ops[4]
764 = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
765 vassert(sizeNarrow < 3);
766 return ops[sizeNarrow];
769 static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
770 const IROp ops[4]
771 = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
772 vassert(sizeNarrow < 3);
773 return ops[sizeNarrow];
776 static IROp mkVecCMPEQ ( UInt size ) {
777 const IROp ops[4]
778 = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
779 vassert(size < 4);
780 return ops[size];
783 static IROp mkVecCMPGTU ( UInt size ) {
784 const IROp ops[4]
785 = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
786 vassert(size < 4);
787 return ops[size];
790 static IROp mkVecCMPGTS ( UInt size ) {
791 const IROp ops[4]
792 = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
793 vassert(size < 4);
794 return ops[size];
797 static IROp mkVecABS ( UInt size ) {
798 const IROp ops[4]
799 = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
800 vassert(size < 4);
801 return ops[size];
804 static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
805 const IROp ops[4]
806 = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
807 Iop_ZeroHI96ofV128, Iop_ZeroHI64ofV128 };
808 vassert(size < 4);
809 return ops[size];
812 static IRExpr* mkU ( IRType ty, ULong imm ) {
813 switch (ty) {
814 case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
815 case Ity_I64: return mkU64(imm);
816 default: vpanic("mkU");
820 static IROp mkVecQDMULHIS ( UInt size ) {
821 const IROp ops[4]
822 = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
823 vassert(size < 4);
824 return ops[size];
827 static IROp mkVecQRDMULHIS ( UInt size ) {
828 const IROp ops[4]
829 = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
830 vassert(size < 4);
831 return ops[size];
834 static IROp mkVecQANDUQSH ( UInt size ) {
835 const IROp ops[4]
836 = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
837 Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
838 vassert(size < 4);
839 return ops[size];
842 static IROp mkVecQANDSQSH ( UInt size ) {
843 const IROp ops[4]
844 = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
845 Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
846 vassert(size < 4);
847 return ops[size];
850 static IROp mkVecQANDUQRSH ( UInt size ) {
851 const IROp ops[4]
852 = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
853 Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
854 vassert(size < 4);
855 return ops[size];
858 static IROp mkVecQANDSQRSH ( UInt size ) {
859 const IROp ops[4]
860 = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
861 Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
862 vassert(size < 4);
863 return ops[size];
866 static IROp mkVecSHU ( UInt size ) {
867 const IROp ops[4]
868 = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
869 vassert(size < 4);
870 return ops[size];
873 static IROp mkVecSHS ( UInt size ) {
874 const IROp ops[4]
875 = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
876 vassert(size < 4);
877 return ops[size];
880 static IROp mkVecRSHU ( UInt size ) {
881 const IROp ops[4]
882 = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
883 vassert(size < 4);
884 return ops[size];
887 static IROp mkVecRSHS ( UInt size ) {
888 const IROp ops[4]
889 = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
890 vassert(size < 4);
891 return ops[size];
894 static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
895 const IROp ops[4]
896 = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
897 Iop_NarrowUn64to32x2, Iop_INVALID };
898 vassert(sizeNarrow < 4);
899 return ops[sizeNarrow];
902 static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
903 const IROp ops[4]
904 = { Iop_QNarrowUn16Sto8Ux8, Iop_QNarrowUn32Sto16Ux4,
905 Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
906 vassert(sizeNarrow < 4);
907 return ops[sizeNarrow];
910 static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
911 const IROp ops[4]
912 = { Iop_QNarrowUn16Sto8Sx8, Iop_QNarrowUn32Sto16Sx4,
913 Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
914 vassert(sizeNarrow < 4);
915 return ops[sizeNarrow];
918 static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
919 const IROp ops[4]
920 = { Iop_QNarrowUn16Uto8Ux8, Iop_QNarrowUn32Uto16Ux4,
921 Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
922 vassert(sizeNarrow < 4);
923 return ops[sizeNarrow];
926 static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
927 const IROp ops[4]
928 = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4,
929 Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
930 vassert(sizeNarrow < 4);
931 return ops[sizeNarrow];
934 static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
935 const IROp ops[4]
936 = { Iop_QandQSarNnarrow16Sto8Sx8, Iop_QandQSarNnarrow32Sto16Sx4,
937 Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
938 vassert(sizeNarrow < 4);
939 return ops[sizeNarrow];
942 static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
943 const IROp ops[4]
944 = { Iop_QandQSarNnarrow16Sto8Ux8, Iop_QandQSarNnarrow32Sto16Ux4,
945 Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
946 vassert(sizeNarrow < 4);
947 return ops[sizeNarrow];
950 static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
951 const IROp ops[4]
952 = { Iop_QandQRShrNnarrow16Uto8Ux8, Iop_QandQRShrNnarrow32Uto16Ux4,
953 Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
954 vassert(sizeNarrow < 4);
955 return ops[sizeNarrow];
958 static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
959 const IROp ops[4]
960 = { Iop_QandQRSarNnarrow16Sto8Sx8, Iop_QandQRSarNnarrow32Sto16Sx4,
961 Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
962 vassert(sizeNarrow < 4);
963 return ops[sizeNarrow];
966 static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
967 const IROp ops[4]
968 = { Iop_QandQRSarNnarrow16Sto8Ux8, Iop_QandQRSarNnarrow32Sto16Ux4,
969 Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
970 vassert(sizeNarrow < 4);
971 return ops[sizeNarrow];
974 static IROp mkVecQSHLNSATUU ( UInt size ) {
975 const IROp ops[4]
976 = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
977 Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
978 vassert(size < 4);
979 return ops[size];
982 static IROp mkVecQSHLNSATSS ( UInt size ) {
983 const IROp ops[4]
984 = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
985 Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
986 vassert(size < 4);
987 return ops[size];
990 static IROp mkVecQSHLNSATSU ( UInt size ) {
991 const IROp ops[4]
992 = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
993 Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
994 vassert(size < 4);
995 return ops[size];
998 static IROp mkVecADDF ( UInt size ) {
999 const IROp ops[4]
1000 = { Iop_INVALID, Iop_Add16Fx8, Iop_Add32Fx4, Iop_Add64Fx2 };
1001 vassert(size < 4);
1002 return ops[size];
1005 static IROp mkVecMAXF ( UInt size ) {
1006 const IROp ops[4]
1007 = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
1008 vassert(size < 4);
1009 return ops[size];
1012 static IROp mkVecMINF ( UInt size ) {
1013 const IROp ops[4]
1014 = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
1015 vassert(size < 4);
1016 return ops[size];
1019 /* Generate IR to create 'arg rotated right by imm', for sane values
1020 of 'ty' and 'imm'. */
1021 static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
1023 UInt w = 0;
1024 if (ty == Ity_I64) {
1025 w = 64;
1026 } else {
1027 vassert(ty == Ity_I32);
1028 w = 32;
1030 vassert(w != 0);
1031 vassert(imm < w);
1032 if (imm == 0) {
1033 return arg;
1035 IRTemp res = newTemp(ty);
1036 assign(res, binop(mkOR(ty),
1037 binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
1038 binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
1039 return res;
1042 /* Generate IR to set the returned temp to either all-zeroes or
1043 all ones, as a copy of arg<imm>. */
1044 static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
1046 UInt w = 0;
1047 if (ty == Ity_I64) {
1048 w = 64;
1049 } else {
1050 vassert(ty == Ity_I32);
1051 w = 32;
1053 vassert(w != 0);
1054 vassert(imm < w);
1055 IRTemp res = newTemp(ty);
1056 assign(res, binop(mkSAR(ty),
1057 binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
1058 mkU8(w - 1)));
1059 return res;
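/* Worked example (illustrative): with ty == Ity_I32 and imm == 5 the
   generated IR computes, in effect,
      res = (Int)(arg << 26) >> 31;
   that is, 0xFFFFFFFF if bit 5 of 'arg' is set and 0 otherwise. */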
1062 /* S-widen 8/16/32/64 bit int expr to 64. */
1063 static IRExpr* widenSto64 ( IRType srcTy, IRExpr* e )
1065 switch (srcTy) {
1066 case Ity_I64: return e;
1067 case Ity_I32: return unop(Iop_32Sto64, e);
1068 case Ity_I16: return unop(Iop_16Sto64, e);
1069 case Ity_I8: return unop(Iop_8Sto64, e);
1070 default: vpanic("widenSto64(arm64)");
1074 /* U-widen 8/16/32/64 bit int expr to 64. */
1075 static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
1077 switch (srcTy) {
1078 case Ity_I64: return e;
1079 case Ity_I32: return unop(Iop_32Uto64, e);
1080 case Ity_I16: return unop(Iop_16Uto64, e);
1081 case Ity_I8: return unop(Iop_8Uto64, e);
1082 default: vpanic("widenUto64(arm64)");
1086 /* Narrow 64 bit int expr to 8/16/32/64. Clearly only some
1087 of these combinations make sense. */
1088 static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
1090 switch (dstTy) {
1091 case Ity_I64: return e;
1092 case Ity_I32: return unop(Iop_64to32, e);
1093 case Ity_I16: return unop(Iop_64to16, e);
1094 case Ity_I8: return unop(Iop_64to8, e);
1095 default: vpanic("narrowFrom64(arm64)");
1100 /*------------------------------------------------------------*/
1101 /*--- Helpers for accessing guest registers. ---*/
1102 /*------------------------------------------------------------*/
1104 #define OFFB_X0 offsetof(VexGuestARM64State,guest_X0)
1105 #define OFFB_X1 offsetof(VexGuestARM64State,guest_X1)
1106 #define OFFB_X2 offsetof(VexGuestARM64State,guest_X2)
1107 #define OFFB_X3 offsetof(VexGuestARM64State,guest_X3)
1108 #define OFFB_X4 offsetof(VexGuestARM64State,guest_X4)
1109 #define OFFB_X5 offsetof(VexGuestARM64State,guest_X5)
1110 #define OFFB_X6 offsetof(VexGuestARM64State,guest_X6)
1111 #define OFFB_X7 offsetof(VexGuestARM64State,guest_X7)
1112 #define OFFB_X8 offsetof(VexGuestARM64State,guest_X8)
1113 #define OFFB_X9 offsetof(VexGuestARM64State,guest_X9)
1114 #define OFFB_X10 offsetof(VexGuestARM64State,guest_X10)
1115 #define OFFB_X11 offsetof(VexGuestARM64State,guest_X11)
1116 #define OFFB_X12 offsetof(VexGuestARM64State,guest_X12)
1117 #define OFFB_X13 offsetof(VexGuestARM64State,guest_X13)
1118 #define OFFB_X14 offsetof(VexGuestARM64State,guest_X14)
1119 #define OFFB_X15 offsetof(VexGuestARM64State,guest_X15)
1120 #define OFFB_X16 offsetof(VexGuestARM64State,guest_X16)
1121 #define OFFB_X17 offsetof(VexGuestARM64State,guest_X17)
1122 #define OFFB_X18 offsetof(VexGuestARM64State,guest_X18)
1123 #define OFFB_X19 offsetof(VexGuestARM64State,guest_X19)
1124 #define OFFB_X20 offsetof(VexGuestARM64State,guest_X20)
1125 #define OFFB_X21 offsetof(VexGuestARM64State,guest_X21)
1126 #define OFFB_X22 offsetof(VexGuestARM64State,guest_X22)
1127 #define OFFB_X23 offsetof(VexGuestARM64State,guest_X23)
1128 #define OFFB_X24 offsetof(VexGuestARM64State,guest_X24)
1129 #define OFFB_X25 offsetof(VexGuestARM64State,guest_X25)
1130 #define OFFB_X26 offsetof(VexGuestARM64State,guest_X26)
1131 #define OFFB_X27 offsetof(VexGuestARM64State,guest_X27)
1132 #define OFFB_X28 offsetof(VexGuestARM64State,guest_X28)
1133 #define OFFB_X29 offsetof(VexGuestARM64State,guest_X29)
1134 #define OFFB_X30 offsetof(VexGuestARM64State,guest_X30)
1136 #define OFFB_XSP offsetof(VexGuestARM64State,guest_XSP)
1137 #define OFFB_PC offsetof(VexGuestARM64State,guest_PC)
1139 #define OFFB_CC_OP offsetof(VexGuestARM64State,guest_CC_OP)
1140 #define OFFB_CC_DEP1 offsetof(VexGuestARM64State,guest_CC_DEP1)
1141 #define OFFB_CC_DEP2 offsetof(VexGuestARM64State,guest_CC_DEP2)
1142 #define OFFB_CC_NDEP offsetof(VexGuestARM64State,guest_CC_NDEP)
1144 #define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
1145 #define OFFB_NRADDR offsetof(VexGuestARM64State,guest_NRADDR)
1147 #define OFFB_Q0 offsetof(VexGuestARM64State,guest_Q0)
1148 #define OFFB_Q1 offsetof(VexGuestARM64State,guest_Q1)
1149 #define OFFB_Q2 offsetof(VexGuestARM64State,guest_Q2)
1150 #define OFFB_Q3 offsetof(VexGuestARM64State,guest_Q3)
1151 #define OFFB_Q4 offsetof(VexGuestARM64State,guest_Q4)
1152 #define OFFB_Q5 offsetof(VexGuestARM64State,guest_Q5)
1153 #define OFFB_Q6 offsetof(VexGuestARM64State,guest_Q6)
1154 #define OFFB_Q7 offsetof(VexGuestARM64State,guest_Q7)
1155 #define OFFB_Q8 offsetof(VexGuestARM64State,guest_Q8)
1156 #define OFFB_Q9 offsetof(VexGuestARM64State,guest_Q9)
1157 #define OFFB_Q10 offsetof(VexGuestARM64State,guest_Q10)
1158 #define OFFB_Q11 offsetof(VexGuestARM64State,guest_Q11)
1159 #define OFFB_Q12 offsetof(VexGuestARM64State,guest_Q12)
1160 #define OFFB_Q13 offsetof(VexGuestARM64State,guest_Q13)
1161 #define OFFB_Q14 offsetof(VexGuestARM64State,guest_Q14)
1162 #define OFFB_Q15 offsetof(VexGuestARM64State,guest_Q15)
1163 #define OFFB_Q16 offsetof(VexGuestARM64State,guest_Q16)
1164 #define OFFB_Q17 offsetof(VexGuestARM64State,guest_Q17)
1165 #define OFFB_Q18 offsetof(VexGuestARM64State,guest_Q18)
1166 #define OFFB_Q19 offsetof(VexGuestARM64State,guest_Q19)
1167 #define OFFB_Q20 offsetof(VexGuestARM64State,guest_Q20)
1168 #define OFFB_Q21 offsetof(VexGuestARM64State,guest_Q21)
1169 #define OFFB_Q22 offsetof(VexGuestARM64State,guest_Q22)
1170 #define OFFB_Q23 offsetof(VexGuestARM64State,guest_Q23)
1171 #define OFFB_Q24 offsetof(VexGuestARM64State,guest_Q24)
1172 #define OFFB_Q25 offsetof(VexGuestARM64State,guest_Q25)
1173 #define OFFB_Q26 offsetof(VexGuestARM64State,guest_Q26)
1174 #define OFFB_Q27 offsetof(VexGuestARM64State,guest_Q27)
1175 #define OFFB_Q28 offsetof(VexGuestARM64State,guest_Q28)
1176 #define OFFB_Q29 offsetof(VexGuestARM64State,guest_Q29)
1177 #define OFFB_Q30 offsetof(VexGuestARM64State,guest_Q30)
1178 #define OFFB_Q31 offsetof(VexGuestARM64State,guest_Q31)
1180 #define OFFB_FPCR offsetof(VexGuestARM64State,guest_FPCR)
1181 #define OFFB_QCFLAG offsetof(VexGuestARM64State,guest_QCFLAG)
1183 #define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART)
1184 #define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN)
1186 #define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
1187 #define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
1188 #define OFFB_LLSC_DATA_LO64 offsetof(VexGuestARM64State,guest_LLSC_DATA_LO64)
1189 #define OFFB_LLSC_DATA_HI64 offsetof(VexGuestARM64State,guest_LLSC_DATA_HI64)
1192 /* ---------------- Integer registers ---------------- */
1194 static Int offsetIReg64 ( UInt iregNo )
1196 /* Do we care about endianness here? We do if sub-parts of integer
1197 registers are accessed. */
1198 switch (iregNo) {
1199 case 0: return OFFB_X0;
1200 case 1: return OFFB_X1;
1201 case 2: return OFFB_X2;
1202 case 3: return OFFB_X3;
1203 case 4: return OFFB_X4;
1204 case 5: return OFFB_X5;
1205 case 6: return OFFB_X6;
1206 case 7: return OFFB_X7;
1207 case 8: return OFFB_X8;
1208 case 9: return OFFB_X9;
1209 case 10: return OFFB_X10;
1210 case 11: return OFFB_X11;
1211 case 12: return OFFB_X12;
1212 case 13: return OFFB_X13;
1213 case 14: return OFFB_X14;
1214 case 15: return OFFB_X15;
1215 case 16: return OFFB_X16;
1216 case 17: return OFFB_X17;
1217 case 18: return OFFB_X18;
1218 case 19: return OFFB_X19;
1219 case 20: return OFFB_X20;
1220 case 21: return OFFB_X21;
1221 case 22: return OFFB_X22;
1222 case 23: return OFFB_X23;
1223 case 24: return OFFB_X24;
1224 case 25: return OFFB_X25;
1225 case 26: return OFFB_X26;
1226 case 27: return OFFB_X27;
1227 case 28: return OFFB_X28;
1228 case 29: return OFFB_X29;
1229 case 30: return OFFB_X30;
1230 /* but not 31 */
1231 default: vassert(0);
1235 static Int offsetIReg64orSP ( UInt iregNo )
1237 return iregNo == 31 ? OFFB_XSP : offsetIReg64(iregNo);
1240 static const HChar* nameIReg64orZR ( UInt iregNo )
1242 vassert(iregNo < 32);
1243 static const HChar* names[32]
1244 = { "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
1245 "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
1246 "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
1247 "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
1248 return names[iregNo];
1251 static const HChar* nameIReg64orSP ( UInt iregNo )
1253 if (iregNo == 31) {
1254 return "sp";
1256 vassert(iregNo < 31);
1257 return nameIReg64orZR(iregNo);
1260 static IRExpr* getIReg64orSP ( UInt iregNo )
1262 vassert(iregNo < 32);
1263 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1266 static IRExpr* getIReg64orZR ( UInt iregNo )
1268 if (iregNo == 31) {
1269 return mkU64(0);
1271 vassert(iregNo < 31);
1272 return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
1275 static void putIReg64orSP ( UInt iregNo, IRExpr* e )
1277 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1278 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1281 static void putIReg64orZR ( UInt iregNo, IRExpr* e )
1283 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1284 if (iregNo == 31) {
1285 return;
1287 vassert(iregNo < 31);
1288 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
1291 static const HChar* nameIReg32orZR ( UInt iregNo )
1293 vassert(iregNo < 32);
1294 static const HChar* names[32]
1295 = { "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7",
1296 "w8", "w9", "w10", "w11", "w12", "w13", "w14", "w15",
1297 "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23",
1298 "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
1299 return names[iregNo];
1302 static const HChar* nameIReg32orSP ( UInt iregNo )
1304 if (iregNo == 31) {
1305 return "wsp";
1307 vassert(iregNo < 31);
1308 return nameIReg32orZR(iregNo);
1311 static IRExpr* getIReg32orSP ( UInt iregNo )
1313 vassert(iregNo < 32);
1314 return unop(Iop_64to32,
1315 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1318 static IRExpr* getIReg32orZR ( UInt iregNo )
1320 if (iregNo == 31) {
1321 return mkU32(0);
1323 vassert(iregNo < 31);
1324 return unop(Iop_64to32,
1325 IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
1328 static void putIReg32orSP ( UInt iregNo, IRExpr* e )
1330 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1331 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1334 static void putIReg32orZR ( UInt iregNo, IRExpr* e )
1336 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1337 if (iregNo == 31) {
1338 return;
1340 vassert(iregNo < 31);
1341 stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
1344 static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
1346 vassert(is64 == True || is64 == False);
1347 return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
1350 static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
1352 vassert(is64 == True || is64 == False);
1353 return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
1356 static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
1358 vassert(is64 == True || is64 == False);
1359 return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
1362 static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
1364 vassert(is64 == True || is64 == False);
1365 if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
1368 static void putPC ( IRExpr* e )
1370 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
1371 stmt( IRStmt_Put(OFFB_PC, e) );
1375 /* ---------------- Vector (Q) registers ---------------- */
1377 static Int offsetQReg128 ( UInt qregNo )
1379 /* We don't care about endianness at this point. It only becomes
1380 relevant when dealing with sections of these registers.*/
1381 switch (qregNo) {
1382 case 0: return OFFB_Q0;
1383 case 1: return OFFB_Q1;
1384 case 2: return OFFB_Q2;
1385 case 3: return OFFB_Q3;
1386 case 4: return OFFB_Q4;
1387 case 5: return OFFB_Q5;
1388 case 6: return OFFB_Q6;
1389 case 7: return OFFB_Q7;
1390 case 8: return OFFB_Q8;
1391 case 9: return OFFB_Q9;
1392 case 10: return OFFB_Q10;
1393 case 11: return OFFB_Q11;
1394 case 12: return OFFB_Q12;
1395 case 13: return OFFB_Q13;
1396 case 14: return OFFB_Q14;
1397 case 15: return OFFB_Q15;
1398 case 16: return OFFB_Q16;
1399 case 17: return OFFB_Q17;
1400 case 18: return OFFB_Q18;
1401 case 19: return OFFB_Q19;
1402 case 20: return OFFB_Q20;
1403 case 21: return OFFB_Q21;
1404 case 22: return OFFB_Q22;
1405 case 23: return OFFB_Q23;
1406 case 24: return OFFB_Q24;
1407 case 25: return OFFB_Q25;
1408 case 26: return OFFB_Q26;
1409 case 27: return OFFB_Q27;
1410 case 28: return OFFB_Q28;
1411 case 29: return OFFB_Q29;
1412 case 30: return OFFB_Q30;
1413 case 31: return OFFB_Q31;
1414 default: vassert(0);
1418 /* Write to a complete Qreg. */
1419 static void putQReg128 ( UInt qregNo, IRExpr* e )
1421 vassert(qregNo < 32);
1422 vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
1423 stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
1426 /* Read a complete Qreg. */
1427 static IRExpr* getQReg128 ( UInt qregNo )
1429 vassert(qregNo < 32);
1430 return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
1433 /* Produce the IR type for some sub-part of a vector. For 32- and 64-
1434 bit sub-parts we can choose either integer or float types, and
1435 choose float on the basis that that is the common use case and so
1436 will give least interference with Put-to-Get forwarding later
1437 on. */
1438 static IRType preferredVectorSubTypeFromSize ( UInt szB )
1440 switch (szB) {
1441 case 1: return Ity_I8;
1442 case 2: return Ity_I16;
1443 case 4: return Ity_I32; //Ity_F32;
1444 case 8: return Ity_F64;
1445 case 16: return Ity_V128;
1446 default: vassert(0);
1450 /* Find the offset of the laneNo'th lane of type laneTy in the given
1451 Qreg. Since the host is little-endian, the least significant lane
1452 has the lowest offset. */
1453 static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
1455 vassert(host_endness == VexEndnessLE);
1456 Int base = offsetQReg128(qregNo);
1457 /* Since the host is little-endian, the least significant lane
1458 will be at the lowest address. */
1459 /* Restrict this to known types, so as to avoid silently accepting
1460 stupid types. */
1461 UInt laneSzB = 0;
1462 switch (laneTy) {
1463 case Ity_I8: laneSzB = 1; break;
1464 case Ity_F16: case Ity_I16: laneSzB = 2; break;
1465 case Ity_F32: case Ity_I32: laneSzB = 4; break;
1466 case Ity_F64: case Ity_I64: laneSzB = 8; break;
1467 case Ity_V128: laneSzB = 16; break;
1468 default: break;
1470 vassert(laneSzB > 0);
1471 UInt minOff = laneNo * laneSzB;
1472 UInt maxOff = minOff + laneSzB - 1;
1473 vassert(maxOff < 16);
1474 return base + minOff;
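/* Worked example (illustrative): offsetQRegLane(7, Ity_I32, 2) is
   offsetQReg128(7) + 8, since an I32 lane is 4 bytes wide and, on a
   little-endian host, lane 2 starts 8 bytes above the register base.
   The range check then passes with maxOff == 11 < 16. */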
1477 /* Put to the least significant lane of a Qreg. */
1478 static void putQRegLO ( UInt qregNo, IRExpr* e )
1480 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1481 Int off = offsetQRegLane(qregNo, ty, 0);
1482 switch (ty) {
1483 case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
1484 case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
1485 break;
1486 default:
1487 vassert(0); // Other cases are probably invalid
1489 stmt(IRStmt_Put(off, e));
1492 /* Get from the least significant lane of a Qreg. */
1493 static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
1495 Int off = offsetQRegLane(qregNo, ty, 0);
1496 switch (ty) {
1497 case Ity_I8:
1498 case Ity_F16: case Ity_I16:
1499 case Ity_I32: case Ity_I64:
1500 case Ity_F32: case Ity_F64: case Ity_V128:
1501 break;
1502 default:
1503 vassert(0); // Other cases are ATC
1505 return IRExpr_Get(off, ty);
1508 static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
1510 static const HChar* namesQ[32]
1511 = { "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
1512 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
1513 "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23",
1514 "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
1515 static const HChar* namesD[32]
1516 = { "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
1517 "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
1518 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
1519 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
1520 static const HChar* namesS[32]
1521 = { "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
1522 "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
1523 "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
1524 "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
1525 static const HChar* namesH[32]
1526 = { "h0", "h1", "h2", "h3", "h4", "h5", "h6", "h7",
1527 "h8", "h9", "h10", "h11", "h12", "h13", "h14", "h15",
1528 "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23",
1529 "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
1530 static const HChar* namesB[32]
1531 = { "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7",
1532 "b8", "b9", "b10", "b11", "b12", "b13", "b14", "b15",
1533 "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23",
1534 "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
1535 vassert(qregNo < 32);
1536 switch (sizeofIRType(laneTy)) {
1537 case 1: return namesB[qregNo];
1538 case 2: return namesH[qregNo];
1539 case 4: return namesS[qregNo];
1540 case 8: return namesD[qregNo];
1541 case 16: return namesQ[qregNo];
1542 default: vassert(0);
1544 /*NOTREACHED*/
1547 static const HChar* nameQReg128 ( UInt qregNo )
1549 return nameQRegLO(qregNo, Ity_V128);
1552 /* Find the offset of the most significant half (8 bytes) of the given
1553 Qreg. This requires knowing the endianness of the host. */
1554 static Int offsetQRegHI64 ( UInt qregNo )
1556 return offsetQRegLane(qregNo, Ity_I64, 1);
1559 static IRExpr* getQRegHI64 ( UInt qregNo )
1561 return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
1564 static void putQRegHI64 ( UInt qregNo, IRExpr* e )
1566 IRType ty = typeOfIRExpr(irsb->tyenv, e);
1567 Int off = offsetQRegHI64(qregNo);
1568 switch (ty) {
1569 case Ity_I64: case Ity_F64:
1570 break;
1571 default:
1572 vassert(0); // Other cases are plain wrong
1574 stmt(IRStmt_Put(off, e));
1577 /* Put to a specified lane of a Qreg. */
1578 static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
1580 IRType laneTy = typeOfIRExpr(irsb->tyenv, e);
1581 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1582 switch (laneTy) {
1583 case Ity_F64: case Ity_I64:
1584 case Ity_I32: case Ity_F32:
1585 case Ity_I16: case Ity_F16:
1586 case Ity_I8:
1587 break;
1588 default:
1589 vassert(0); // Other cases are ATC
1591 stmt(IRStmt_Put(off, e));
1594 /* Get from a specified lane of a Qreg. */
1595 static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
1597 Int off = offsetQRegLane(qregNo, laneTy, laneNo);
1598 switch (laneTy) {
1599 case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
1600 case Ity_F64: case Ity_F32: case Ity_F16:
1601 break;
1602 default:
1603 vassert(0); // Other cases are ATC
1605 return IRExpr_Get(off, laneTy);
1609 //ZZ /* ---------------- Misc registers ---------------- */
1610 //ZZ
1611 //ZZ static void putMiscReg32 ( UInt gsoffset,
1612 //ZZ IRExpr* e, /* :: Ity_I32 */
1613 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
1614 //ZZ {
1615 //ZZ switch (gsoffset) {
1616 //ZZ case OFFB_FPSCR: break;
1617 //ZZ case OFFB_QFLAG32: break;
1618 //ZZ case OFFB_GEFLAG0: break;
1619 //ZZ case OFFB_GEFLAG1: break;
1620 //ZZ case OFFB_GEFLAG2: break;
1621 //ZZ case OFFB_GEFLAG3: break;
1622 //ZZ default: vassert(0); /* awaiting more cases */
1623 //ZZ }
1624 //ZZ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
1625 //ZZ
1626 //ZZ if (guardT == IRTemp_INVALID) {
1627 //ZZ /* unconditional write */
1628 //ZZ stmt(IRStmt_Put(gsoffset, e));
1629 //ZZ } else {
1630 //ZZ stmt(IRStmt_Put(
1631 //ZZ gsoffset,
1632 //ZZ IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
1633 //ZZ e, IRExpr_Get(gsoffset, Ity_I32) )
1634 //ZZ ));
1635 //ZZ }
1636 //ZZ }
1637 //ZZ
1638 //ZZ static IRTemp get_ITSTATE ( void )
1639 //ZZ {
1640 //ZZ ASSERT_IS_THUMB;
1641 //ZZ IRTemp t = newTemp(Ity_I32);
1642 //ZZ assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
1643 //ZZ return t;
1644 //ZZ }
1645 //ZZ
1646 //ZZ static void put_ITSTATE ( IRTemp t )
1647 //ZZ {
1648 //ZZ ASSERT_IS_THUMB;
1649 //ZZ stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
1650 //ZZ }
1651 //ZZ
1652 //ZZ static IRTemp get_QFLAG32 ( void )
1653 //ZZ {
1654 //ZZ IRTemp t = newTemp(Ity_I32);
1655 //ZZ assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
1656 //ZZ return t;
1657 //ZZ }
1658 //ZZ
1659 //ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
1660 //ZZ {
1661 //ZZ putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
1662 //ZZ }
1663 //ZZ
1664 //ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
1665 //ZZ Status Register) to indicate that overflow or saturation occurred.
1666 //ZZ Nb: t must be zero to denote no saturation, and any nonzero
1667 //ZZ value to indicate saturation. */
1668 //ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
1669 //ZZ {
1670 //ZZ IRTemp old = get_QFLAG32();
1671 //ZZ IRTemp nyu = newTemp(Ity_I32);
1672 //ZZ assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
1673 //ZZ put_QFLAG32(nyu, condT);
1674 //ZZ }
1677 /* ---------------- FPCR stuff ---------------- */
1679 /* Generate IR to get hold of the rounding mode bits in FPCR, and
1680 convert them to IR format. Bind the final result to the
1681 returned temp. */
1682 static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
1684 /* The ARMvfp encoding for rounding mode bits is:
1685 00 to nearest
1686 01 to +infinity
1687 10 to -infinity
1688 11 to zero
1689 We need to convert that to the IR encoding:
1690 00 to nearest (the default)
1691 10 to +infinity
1692 01 to -infinity
1693 11 to zero
1694 Which can be done by swapping bits 0 and 1.
1695 The rmode bits are at 23:22 in FPCR.
1697 IRTemp armEncd = newTemp(Ity_I32);
1698 IRTemp swapped = newTemp(Ity_I32);
1699 /* Fish FPCR[23:22] out, and slide to bottom. Doesn't matter that
1700 we don't zero out bits 24 and above, since the assignment to
1701 'swapped' will mask them out anyway. */
1702 assign(armEncd,
1703 binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
1704 /* Now swap them. */
1705 assign(swapped,
1706 binop(Iop_Or32,
1707 binop(Iop_And32,
1708 binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
1709 mkU32(2)),
1710 binop(Iop_And32,
1711 binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
1712 mkU32(1))
1714 return swapped;
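/* Worked example (illustrative only): if FPCR[23:22] == 01b ("to
   +infinity"), 'armEncd' has low bits 01b and the swap yields 10b,
   which is the IR encoding Irrm_PosINF.  A plain-C sketch of the same
   transform, assuming 'fpcr' holds the guest FPCR value:

      UInt arm = (fpcr >> 22) & 3;                    // ARM encoding
      UInt ir  = ((arm << 1) & 2) | ((arm >> 1) & 1); // swap bits 0 and 1
*/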
1718 /*------------------------------------------------------------*/
1719 /*--- Helpers for flag handling and conditional insns ---*/
1720 /*------------------------------------------------------------*/
1722 static const HChar* nameARM64Condcode ( ARM64Condcode cond )
1724 switch (cond) {
1725 case ARM64CondEQ: return "eq";
1726 case ARM64CondNE: return "ne";
1727 case ARM64CondCS: return "cs"; // or 'hs'
1728 case ARM64CondCC: return "cc"; // or 'lo'
1729 case ARM64CondMI: return "mi";
1730 case ARM64CondPL: return "pl";
1731 case ARM64CondVS: return "vs";
1732 case ARM64CondVC: return "vc";
1733 case ARM64CondHI: return "hi";
1734 case ARM64CondLS: return "ls";
1735 case ARM64CondGE: return "ge";
1736 case ARM64CondLT: return "lt";
1737 case ARM64CondGT: return "gt";
1738 case ARM64CondLE: return "le";
1739 case ARM64CondAL: return "al";
1740 case ARM64CondNV: return "nv";
1741 default: vpanic("name_ARM64Condcode");
1745 /* and a handy shorthand for it */
1746 static const HChar* nameCC ( ARM64Condcode cond ) {
1747 return nameARM64Condcode(cond);
1751 /* Build IR to calculate some particular condition from stored
1752 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1753 Ity_I64, suitable for narrowing. Although the return type is
1754 Ity_I64, the returned value is either 0 or 1. 'cond' must be
1755 :: Ity_I64 and must denote the condition to compute in
1756 bits 7:4, and be zero everywhere else.
1758 static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
1760 vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
1761 /* And 'cond' had better produce a value in which only bits 7:4 are
1762 nonzero. However, obviously we can't assert for that. */
1764 /* So what we're constructing for the first argument is
1765 "(cond << 4) | stored-operation".
1766 However, as per comments above, 'cond' must be supplied
1767 pre-shifted to this function.
1769 This pairing scheme requires that the ARM64_CC_OP_ values all fit
1770 in 4 bits. Hence we are passing a (COND, OP) pair in the lowest
1771 8 bits of the first argument. */
1772 IRExpr** args
1773 = mkIRExprVec_4(
1774 binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
1775 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1776 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1777 IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
1779 IRExpr* call
1780 = mkIRExprCCall(
1781 Ity_I64,
1782 0/*regparm*/,
1783 "arm64g_calculate_condition", &arm64g_calculate_condition,
1784 args
1787 /* Exclude the requested condition, OP and NDEP from definedness
1788 checking. We're only interested in DEP1 and DEP2. */
1789 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1790 return call;
1794 /* Build IR to calculate some particular condition from stored
1795 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
1796 Ity_I64, suitable for narrowing. Although the return type is
1797 Ity_I64, the returned value is either 0 or 1.
1799 static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
1801 /* First arg is "(cond << 4) | condition". This requires that the
1802 ARM64_CC_OP_ values all fit in 4 bits. Hence we are passing a
1803 (COND, OP) pair in the lowest 8 bits of the first argument. */
1804 vassert(cond >= 0 && cond <= 15);
1805 return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
1809 /* Build IR to calculate just the carry flag from stored
1810 CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1811 Ity_I64. */
1812 static IRExpr* mk_arm64g_calculate_flag_c ( void )
1814 IRExpr** args
1815 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1816 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1817 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1818 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1819 IRExpr* call
1820 = mkIRExprCCall(
1821 Ity_I64,
1822 0/*regparm*/,
1823 "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
1824 args
1826 /* Exclude OP and NDEP from definedness checking. We're only
1827 interested in DEP1 and DEP2. */
1828 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1829 return call;
1833 //ZZ /* Build IR to calculate just the overflow flag from stored
1834 //ZZ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
1835 //ZZ Ity_I32. */
1836 //ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
1837 //ZZ {
1838 //ZZ IRExpr** args
1839 //ZZ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
1840 //ZZ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
1841 //ZZ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
1842 //ZZ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
1843 //ZZ IRExpr* call
1844 //ZZ = mkIRExprCCall(
1845 //ZZ Ity_I32,
1846 //ZZ 0/*regparm*/,
1847 //ZZ "armg_calculate_flag_v", &armg_calculate_flag_v,
1848 //ZZ args
1849 //ZZ );
1850 //ZZ /* Exclude OP and NDEP from definedness checking. We're only
1851 //ZZ interested in DEP1 and DEP2. */
1852 //ZZ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1853 //ZZ return call;
1854 //ZZ }
1857 /* Build IR to calculate N Z C V in bits 31:28 of the
1858 returned word. */
1859 static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
1861 IRExpr** args
1862 = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
1863 IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1864 IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1865 IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1866 IRExpr* call
1867 = mkIRExprCCall(
1868 Ity_I64,
1869 0/*regparm*/,
1870 "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
1871 args
1873 /* Exclude OP and NDEP from definedness checking. We're only
1874 interested in DEP1 and DEP2. */
1875 call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1876 return call;
1880 /* Build IR to set the flags thunk, in the most general case. */
1881 static
1882 void setFlags_D1_D2_ND ( UInt cc_op,
1883 IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
1885 vassert(typeOfIRTemp(irsb->tyenv, t_dep1) == Ity_I64);
1886 vassert(typeOfIRTemp(irsb->tyenv, t_dep2) == Ity_I64);
1887 vassert(typeOfIRTemp(irsb->tyenv, t_ndep) == Ity_I64);
1888 vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
1889 stmt( IRStmt_Put( OFFB_CC_OP, mkU64(cc_op) ));
1890 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
1891 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
1892 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
1895 /* Build IR to set the flags thunk after ADD or SUB. */
1896 static
1897 void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
1899 IRTemp argL64 = IRTemp_INVALID;
1900 IRTemp argR64 = IRTemp_INVALID;
1901 IRTemp z64 = newTemp(Ity_I64);
1902 if (is64) {
1903 argL64 = argL;
1904 argR64 = argR;
1905 } else {
1906 argL64 = newTemp(Ity_I64);
1907 argR64 = newTemp(Ity_I64);
1908 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1909 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1911 assign(z64, mkU64(0));
1912 UInt cc_op = ARM64G_CC_OP_NUMBER;
1913 /**/ if ( isSUB && is64) { cc_op = ARM64G_CC_OP_SUB64; }
1914 else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
1915 else if (!isSUB && is64) { cc_op = ARM64G_CC_OP_ADD64; }
1916 else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
1917 else { vassert(0); }
1918 setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
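/* Illustrative note: the thunk above only records (CC_OP, CC_DEP1,
   CC_DEP2, CC_NDEP); no flag bits are computed at translation time.
   A later query such as mk_arm64g_calculate_flag_c re-derives the
   flag from those four values when the helper call runs.  For
   ARM64G_CC_OP_SUB64, for instance, the carry flag is just the
   no-borrow condition, roughly (hypothetical stand-alone model, not
   the actual helper in guest_arm64_helpers.c): */
#if 0
static ULong model_flag_c_after_SUB64 ( ULong cc_dep1 /* argL */,
                                        ULong cc_dep2 /* argR */ )
{
   /* C after SUBS is set iff no borrow occurs: argL >=u argR. */
   return cc_dep1 >= cc_dep2 ? 1 : 0;
}
#endif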
1921 /* Build IR to set the flags thunk after ADC or SBC. */
1922 static
1923 void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
1924 IRTemp argL, IRTemp argR, IRTemp oldC )
1926 IRTemp argL64 = IRTemp_INVALID;
1927 IRTemp argR64 = IRTemp_INVALID;
1928 IRTemp oldC64 = IRTemp_INVALID;
1929 if (is64) {
1930 argL64 = argL;
1931 argR64 = argR;
1932 oldC64 = oldC;
1933 } else {
1934 argL64 = newTemp(Ity_I64);
1935 argR64 = newTemp(Ity_I64);
1936 oldC64 = newTemp(Ity_I64);
1937 assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
1938 assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
1939 assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
1941 UInt cc_op = ARM64G_CC_OP_NUMBER;
1942 /**/ if ( isSBC && is64) { cc_op = ARM64G_CC_OP_SBC64; }
1943 else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
1944 else if (!isSBC && is64) { cc_op = ARM64G_CC_OP_ADC64; }
1945 else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
1946 else { vassert(0); }
1947 setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
1950 /* Build IR to set the flags thunk after ADD or SUB, if the given
1951 condition evaluates to True at run time. If not, the flags are set
1952 to the specified NZCV value. */
1953 static
1954 void setFlags_ADD_SUB_conditionally (
1955 Bool is64, Bool isSUB,
1956 IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
1959 /* Generate IR as follows:
1960 CC_OP = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
1961 CC_DEP1 = ITE(cond, argL64, nzcv << 28)
1962 CC_DEP2 = ITE(cond, argR64, 0)
1963 CC_NDEP = 0
1966 IRTemp z64 = newTemp(Ity_I64);
1967 assign(z64, mkU64(0));
1969 /* Establish the operation and operands for the True case. */
1970 IRTemp t_dep1 = IRTemp_INVALID;
1971 IRTemp t_dep2 = IRTemp_INVALID;
1972 UInt t_op = ARM64G_CC_OP_NUMBER;
1973 /**/ if ( isSUB && is64) { t_op = ARM64G_CC_OP_SUB64; }
1974 else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
1975 else if (!isSUB && is64) { t_op = ARM64G_CC_OP_ADD64; }
1976 else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
1977 else { vassert(0); }
1978 /* */
1979 if (is64) {
1980 t_dep1 = argL;
1981 t_dep2 = argR;
1982 } else {
1983 t_dep1 = newTemp(Ity_I64);
1984 t_dep2 = newTemp(Ity_I64);
1985 assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
1986 assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
1989 /* Establish the operation and operands for the False case. */
1990 IRTemp f_dep1 = newTemp(Ity_I64);
1991 IRTemp f_dep2 = z64;
1992 UInt f_op = ARM64G_CC_OP_COPY;
1993 assign(f_dep1, mkU64(nzcv << 28));
1995 /* Final thunk values */
1996 IRTemp dep1 = newTemp(Ity_I64);
1997 IRTemp dep2 = newTemp(Ity_I64);
1998 IRTemp op = newTemp(Ity_I64);
2000 assign(op, IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
2001 assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
2002 assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));
2004 /* finally .. */
2005 stmt( IRStmt_Put( OFFB_CC_OP, mkexpr(op) ));
2006 stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
2007 stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
2008 stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) ));
2011 /* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
2012 static
2013 void setFlags_LOGIC ( Bool is64, IRTemp res )
2015 IRTemp res64 = IRTemp_INVALID;
2016 IRTemp z64 = newTemp(Ity_I64);
2017 UInt cc_op = ARM64G_CC_OP_NUMBER;
2018 if (is64) {
2019 res64 = res;
2020 cc_op = ARM64G_CC_OP_LOGIC64;
2021 } else {
2022 res64 = newTemp(Ity_I64);
2023 assign(res64, unop(Iop_32Uto64, mkexpr(res)));
2024 cc_op = ARM64G_CC_OP_LOGIC32;
2026 assign(z64, mkU64(0));
2027 setFlags_D1_D2_ND(cc_op, res64, z64, z64);
2030 /* Build IR to set the flags thunk to a given NZCV value. NZCV is
2031 located in bits 31:28 of the supplied value. */
2032 static
2033 void setFlags_COPY ( IRTemp nzcv_28x0 )
2035 IRTemp z64 = newTemp(Ity_I64);
2036 assign(z64, mkU64(0));
2037 setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
2041 //ZZ /* Minor variant of the above that sets NDEP to zero (if it
2042 //ZZ sets it at all) */
2043 //ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
2044 //ZZ IRTemp t_dep2,
2045 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2046 //ZZ {
2047 //ZZ IRTemp z32 = newTemp(Ity_I32);
2048 //ZZ assign( z32, mkU32(0) );
2049 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
2050 //ZZ }
2051 //ZZ
2052 //ZZ
2053 //ZZ /* Minor variant of the above that sets DEP2 to zero (if it
2054 //ZZ sets it at all) */
2055 //ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
2056 //ZZ IRTemp t_ndep,
2057 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2058 //ZZ {
2059 //ZZ IRTemp z32 = newTemp(Ity_I32);
2060 //ZZ assign( z32, mkU32(0) );
2061 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
2062 //ZZ }
2063 //ZZ
2064 //ZZ
2065 //ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
2066 //ZZ sets them at all) */
2067 //ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
2068 //ZZ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
2069 //ZZ {
2070 //ZZ IRTemp z32 = newTemp(Ity_I32);
2071 //ZZ assign( z32, mkU32(0) );
2072 //ZZ setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
2073 //ZZ }
2076 /*------------------------------------------------------------*/
2077 /*--- Misc math helpers ---*/
2078 /*------------------------------------------------------------*/
2080 /* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
2081 static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
2083 IRTemp maskT = newTemp(Ity_I64);
2084 IRTemp res = newTemp(Ity_I64);
2085 vassert(sh >= 1 && sh <= 63);
2086 assign(maskT, mkU64(mask));
2087 assign( res,
2088 binop(Iop_Or64,
2089 binop(Iop_Shr64,
2090 binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
2091 mkU8(sh)),
2092 binop(Iop_And64,
2093 binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
2094 mkexpr(maskT))
2097 return res;
2100 /* Generates byte swaps within 32-bit lanes. */
2101 static IRTemp math_UINTSWAP64 ( IRTemp src )
2103 IRTemp res;
2104 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2105 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2106 return res;
2109 /* Generates byte swaps within 16-bit lanes. */
2110 static IRTemp math_USHORTSWAP64 ( IRTemp src )
2112 IRTemp res;
2113 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2114 return res;
2117 /* Generates a 64-bit byte swap. */
2118 static IRTemp math_BYTESWAP64 ( IRTemp src )
2120 IRTemp res;
2121 res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
2122 res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
2123 res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
2124 return res;
2127 /* Generates a 64-bit bit swap. */
2128 static IRTemp math_BITSWAP64 ( IRTemp src )
2130 IRTemp res;
2131 res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
2132 res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
2133 res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
2134 return math_BYTESWAP64(res);
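/* Illustrative sketch (not used by the translator): math_BITSWAP64
   builds the classic mask-and-shift bit reversal.  Written as plain C,
   one statement per math_SWAPHELPER call above, it would be the
   following; the helper name is made up for the example. */
#if 0
static ULong ref_bitswap64 ( ULong x )
{
   x = ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1)  | ((x << 1)  & 0xAAAAAAAAAAAAAAAAULL);
   x = ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2)  | ((x << 2)  & 0xCCCCCCCCCCCCCCCCULL);
   x = ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4)  | ((x << 4)  & 0xF0F0F0F0F0F0F0F0ULL);
   /* and then a full byte reversal, as in math_BYTESWAP64 */
   x = ((x & 0xFF00FF00FF00FF00ULL) >> 8)  | ((x << 8)  & 0xFF00FF00FF00FF00ULL);
   x = ((x & 0xFFFF0000FFFF0000ULL) >> 16) | ((x << 16) & 0xFFFF0000FFFF0000ULL);
   x = ((x & 0xFFFFFFFF00000000ULL) >> 32) | ((x << 32) & 0xFFFFFFFF00000000ULL);
   return x;
}
#endif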
2137 /* Duplicates the bits at the bottom of the given word to fill the
2138 whole word. src :: Ity_I64 is assumed to have zeroes everywhere
2139 except for the bottom bits. */
2140 static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
2142 if (srcTy == Ity_I8) {
2143 IRTemp t16 = newTemp(Ity_I64);
2144 assign(t16, binop(Iop_Or64, mkexpr(src),
2145 binop(Iop_Shl64, mkexpr(src), mkU8(8))));
2146 IRTemp t32 = newTemp(Ity_I64);
2147 assign(t32, binop(Iop_Or64, mkexpr(t16),
2148 binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
2149 IRTemp t64 = newTemp(Ity_I64);
2150 assign(t64, binop(Iop_Or64, mkexpr(t32),
2151 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2152 return t64;
2154 if (srcTy == Ity_I16) {
2155 IRTemp t32 = newTemp(Ity_I64);
2156 assign(t32, binop(Iop_Or64, mkexpr(src),
2157 binop(Iop_Shl64, mkexpr(src), mkU8(16))));
2158 IRTemp t64 = newTemp(Ity_I64);
2159 assign(t64, binop(Iop_Or64, mkexpr(t32),
2160 binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
2161 return t64;
2163 if (srcTy == Ity_I32) {
2164 IRTemp t64 = newTemp(Ity_I64);
2165 assign(t64, binop(Iop_Or64, mkexpr(src),
2166 binop(Iop_Shl64, mkexpr(src), mkU8(32))));
2167 return t64;
2169 if (srcTy == Ity_I64) {
2170 return src;
2172 vassert(0);
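/* Illustrative sketch (not used by the translator): for the Ity_I8
   case, math_DUP_TO_64 is just three doubling steps; in plain C
   (hypothetical helper, for the example only): */
#if 0
static ULong ref_dup8_to_64 ( ULong b /* zero except in bits 7:0 */ )
{
   b |= b << 8;    /* pattern now fills 16 bits */
   b |= b << 16;   /* 32 bits */
   b |= b << 32;   /* 64 bits */
   return b;
}
#endif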
2176 /* Duplicates the src element exactly so as to fill a V128 value. */
2177 static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
2179 IRTemp res = newTempV128();
2180 if (srcTy == Ity_F64) {
2181 IRTemp i64 = newTemp(Ity_I64);
2182 assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
2183 assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
2184 return res;
2186 if (srcTy == Ity_F32) {
2187 IRTemp i64a = newTemp(Ity_I64);
2188 assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
2189 IRTemp i64b = newTemp(Ity_I64);
2190 assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
2191 mkexpr(i64a)));
2192 assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
2193 return res;
2195 if (srcTy == Ity_I64) {
2196 assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
2197 return res;
2199 if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
2200 IRTemp t1 = newTemp(Ity_I64);
2201 assign(t1, widenUto64(srcTy, mkexpr(src)));
2202 IRTemp t2 = math_DUP_TO_64(t1, srcTy);
2203 assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
2204 return res;
2206 vassert(0);
2210 /* |fullWidth| is a full V128 width result. Depending on bitQ,
2211 zero out the upper half. */
2212 static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
2214 if (bitQ == 1) return mkexpr(fullWidth);
2215 if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
2216 vassert(0);
2219 /* The same, but from an expression instead. */
2220 static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
2222 IRTemp fullWidthT = newTempV128();
2223 assign(fullWidthT, fullWidth);
2224 return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
2228 /*------------------------------------------------------------*/
2229 /*--- FP comparison helpers ---*/
2230 /*------------------------------------------------------------*/
2232 /* irRes :: Ity_I32 holds a floating point comparison result encoded
2233 as an IRCmpF64Result. Generate code to convert it to an
2234 ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
2235 Assign a new temp to hold that value, and return the temp. */
2236 static
2237 IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
2239 IRTemp ix = newTemp(Ity_I64);
2240 IRTemp termL = newTemp(Ity_I64);
2241 IRTemp termR = newTemp(Ity_I64);
2242 IRTemp nzcv = newTemp(Ity_I64);
2243 IRTemp irRes = newTemp(Ity_I64);
2245 /* This is where the fun starts. We have to convert 'irRes' from
2246 an IR-convention return result (IRCmpF64Result) to an
2247 ARM-encoded (N,Z,C,V) group. The final result is in the bottom
2248 4 bits of 'nzcv'. */
2249 /* Map compare result from IR to ARM(nzcv) */
2251 FP cmp result | IR | ARM(nzcv)
2252 --------------------------------
2253 UN 0x45 0011
2254 LT 0x01 1000
2255 GT 0x00 0010
2256 EQ 0x40 0110
2258 /* Now since you're probably wondering WTF ..
2260 ix fishes the useful bits out of the IR value, bits 6 and 0, and
2261 places them side by side, giving a number which is 0, 1, 2 or 3.
2263 termL is a sequence cooked up by GNU superopt. It converts ix
2264 into an almost correct NZCV value (incredibly), except
2265 for the case of UN, where it produces 0100 instead of the
2266 required 0011.
2268 termR is therefore a correction term, also computed from ix. It
2269 is 1 in the UN case and 0 for LT, GT and EQ. Hence, to get
2270 the final correct value, we subtract termR from termL.
2272 Don't take my word for it. There's a test program at the bottom
2273 of guest_arm_toIR.c, to try this out with.
2275 assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));
2277 assign(
2279 binop(Iop_Or64,
2280 binop(Iop_And64,
2281 binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
2282 mkU64(3)),
2283 binop(Iop_And64, mkexpr(irRes), mkU64(1))));
2285 assign(
2286 termL,
2287 binop(Iop_Add64,
2288 binop(Iop_Shr64,
2289 binop(Iop_Sub64,
2290 binop(Iop_Shl64,
2291 binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
2292 mkU8(62)),
2293 mkU64(1)),
2294 mkU8(61)),
2295 mkU64(1)));
2297 assign(
2298 termR,
2299 binop(Iop_And64,
2300 binop(Iop_And64,
2301 mkexpr(ix),
2302 binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
2303 mkU64(1)));
2305 assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
2306 return nzcv;
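/* Worked example (illustrative only): evaluating the expressions above
   on the four possible IRCmpF64Result values, with unsigned 64-bit
   arithmetic throughout, gives

      irRes   ix   termL   termR   nzcv = termL - termR
      0x00     0     2       0     0010   (GT)
      0x01     1     8       0     1000   (LT)
      0x40     2     6       0     0110   (EQ)
      0x45     3     4       1     0011   (UN)

   which matches the mapping table above.  A throwaway check, not part
   of the build, could be: */
#if 0
static void check_IRCmpF64Result_to_NZCV ( void )
{
   const ULong irResVals[4] = { 0x00, 0x01, 0x40, 0x45 };
   UInt i;
   for (i = 0; i < 4; i++) {
      ULong irRes = irResVals[i];
      ULong ix    = ((irRes >> 5) & 3) | (irRes & 1);
      ULong termL = ((((ix ^ 1) << 62) - 1) >> 61) + 1;
      ULong termR = ix & (ix >> 1) & 1;
      vex_printf("irRes=0x%llx nzcv=0x%llx\n", irRes, termL - termR);
   }
}
#endif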
2310 /*------------------------------------------------------------*/
2311 /*--- Data processing (immediate) ---*/
2312 /*------------------------------------------------------------*/
2314 /* Helper functions for supporting "DecodeBitMasks" */
2316 static ULong dbm_ROR ( Int width, ULong x, Int rot )
2318 vassert(width > 0 && width <= 64);
2319 vassert(rot >= 0 && rot < width);
2320 if (rot == 0) return x;
2321 ULong res = x >> rot;
2322 res |= (x << (width - rot));
2323 if (width < 64)
2324 res &= ((1ULL << width) - 1);
2325 return res;
2328 static ULong dbm_RepTo64( Int esize, ULong x )
2330 switch (esize) {
2331 case 64:
2332 return x;
2333 case 32:
2334 x &= 0xFFFFFFFF; x |= (x << 32);
2335 return x;
2336 case 16:
2337 x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
2338 return x;
2339 case 8:
2340 x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
2341 return x;
2342 case 4:
2343 x &= 0xF; x |= (x << 4); x |= (x << 8);
2344 x |= (x << 16); x |= (x << 32);
2345 return x;
2346 case 2:
2347 x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
2348 x |= (x << 16); x |= (x << 32);
2349 return x;
2350 default:
2351 break;
2353 vpanic("dbm_RepTo64");
2354 /*NOTREACHED*/
2355 return 0;
2358 static Int dbm_highestSetBit ( ULong x )
2360 Int i;
2361 for (i = 63; i >= 0; i--) {
2362 if (x & (1ULL << i))
2363 return i;
2365 vassert(x == 0);
2366 return -1;
2369 static
2370 Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask,
2371 ULong immN, ULong imms, ULong immr, Bool immediate,
2372 UInt M /*32 or 64*/)
2374 vassert(immN < (1ULL << 1));
2375 vassert(imms < (1ULL << 6));
2376 vassert(immr < (1ULL << 6));
2377 vassert(immediate == False || immediate == True);
2378 vassert(M == 32 || M == 64);
2380 Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
2381 if (len < 1) { /* printf("fail1\n"); */ return False; }
2382 vassert(len <= 6);
2383 vassert(M >= (1 << len));
2385 vassert(len >= 1 && len <= 6);
2386 ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
2387 (1UL << len) - 1;
2388 vassert(levels >= 1 && levels <= 63);
2390 if (immediate && ((imms & levels) == levels)) {
2391 /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
2392 return False;
2395 ULong S = imms & levels;
2396 ULong R = immr & levels;
2397 Int diff = S - R;
2398 diff &= 63;
2399 Int esize = 1 << len;
2400 vassert(2 <= esize && esize <= 64);
2402 /* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
2403 same below with d. S can be 63 in which case we have an out of
2404 range and hence undefined shift. */
2405 vassert(S <= 63);
2406 vassert(esize >= (S+1));
2407 ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
2408 //(1ULL << (S+1)) - 1;
2409 ((1ULL << S) - 1) + (1ULL << S);
2411 Int d = // diff<len-1:0>
2412 diff & ((1 << len)-1);
2413 vassert(esize >= (d+1));
2414 vassert(d >= 0 && d <= 63);
2416 ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
2417 //(1ULL << (d+1)) - 1;
2418 ((1ULL << d) - 1) + (1ULL << d);
2420 if (esize != 64) vassert(elem_s < (1ULL << esize));
2421 if (esize != 64) vassert(elem_d < (1ULL << esize));
2423 if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
2424 if (tmask) *tmask = dbm_RepTo64(esize, elem_d);
2426 return True;
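/* Worked example (illustrative only): the classic bit pattern
   0x5555555555555555 is encoded as N=0, immr=0, imms=0b111100.
   Tracing dbm_DecodeBitMasks on those inputs (immediate=True, M=64):

      len    = highestSetBit( 0 : (~0b111100 & 63) ) = highestSetBit(3) = 1
      esize  = 1 << len = 2,   levels = 1
      S      = imms & levels = 0,   R = immr & levels = 0
      elem_s = Ones(S+1) = 0b01
      wmask  = RepTo64(2, ROR(2, 0b01, 0)) = 0x5555555555555555

   so a 64-bit logical-immediate operand of 0x5555555555555555 decodes
   via exactly this path.  A quick check (hypothetical, not part of the
   build): */
#if 0
static void check_dbm_DecodeBitMasks_example ( void )
{
   ULong wmask = 0, tmask = 0;
   Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
                                0/*immN*/, 0x3C/*imms*/, 0/*immr*/,
                                True/*immediate*/, 64);
   vex_printf("ok=%d wmask=0x%llx\n", (Int)ok, wmask);
   /* expect: ok=1 wmask=0x5555555555555555 */
}
#endif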
2430 static
2431 Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
2432 UInt insn, Bool sigill_diag)
2434 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2436 /* insn[28:23]
2437 10000x PC-rel addressing
2438 10001x Add/subtract (immediate)
2439 100100 Logical (immediate)
2440 100101 Move Wide (immediate)
2441 100110 Bitfield
2442 100111 Extract
2445 /* ------------------ ADD/SUB{,S} imm12 ------------------ */
2446 if (INSN(28,24) == BITS5(1,0,0,0,1)) {
2447 Bool is64 = INSN(31,31) == 1;
2448 Bool isSub = INSN(30,30) == 1;
2449 Bool setCC = INSN(29,29) == 1;
2450 UInt sh = INSN(23,22);
2451 UInt uimm12 = INSN(21,10);
2452 UInt nn = INSN(9,5);
2453 UInt dd = INSN(4,0);
2454 const HChar* nm = isSub ? "sub" : "add";
2455 if (sh >= 2) {
2456 /* Invalid; fall through */
2457 } else {
2458 vassert(sh <= 1);
2459 uimm12 <<= (12 * sh);
2460 if (is64) {
2461 IRTemp argL = newTemp(Ity_I64);
2462 IRTemp argR = newTemp(Ity_I64);
2463 IRTemp res = newTemp(Ity_I64);
2464 assign(argL, getIReg64orSP(nn));
2465 assign(argR, mkU64(uimm12));
2466 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
2467 mkexpr(argL), mkexpr(argR)));
2468 if (setCC) {
2469 putIReg64orZR(dd, mkexpr(res));
2470 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
2471 DIP("%ss %s, %s, 0x%x\n",
2472 nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
2473 } else {
2474 putIReg64orSP(dd, mkexpr(res));
2475 DIP("%s %s, %s, 0x%x\n",
2476 nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
2478 } else {
2479 IRTemp argL = newTemp(Ity_I32);
2480 IRTemp argR = newTemp(Ity_I32);
2481 IRTemp res = newTemp(Ity_I32);
2482 assign(argL, getIReg32orSP(nn));
2483 assign(argR, mkU32(uimm12));
2484 assign(res, binop(isSub ? Iop_Sub32 : Iop_Add32,
2485 mkexpr(argL), mkexpr(argR)));
2486 if (setCC) {
2487 putIReg32orZR(dd, mkexpr(res));
2488 setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
2489 DIP("%ss %s, %s, 0x%x\n",
2490 nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
2491 } else {
2492 putIReg32orSP(dd, mkexpr(res));
2493 DIP("%s %s, %s, 0x%x\n",
2494 nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
2497 return True;
2501 /* -------------------- ADR/ADRP -------------------- */
2502 if (INSN(28,24) == BITS5(1,0,0,0,0)) {
2503 UInt bP = INSN(31,31);
2504 UInt immLo = INSN(30,29);
2505 UInt immHi = INSN(23,5);
2506 UInt rD = INSN(4,0);
2507 ULong uimm = (immHi << 2) | immLo;
2508 ULong simm = sx_to_64(uimm, 21);
2509 ULong val;
2510 if (bP) {
2511 val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
2512 } else {
2513 val = guest_PC_curr_instr + simm;
2515 putIReg64orZR(rD, mkU64(val));
2516 DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
2517 return True;
2520 /* -------------------- LOGIC(imm) -------------------- */
2521 if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
2522 /* 31 30 28 22 21 15 9 4
2523 sf op 100100 N immr imms Rn Rd
2524 op=00: AND Rd|SP, Rn, #imm
2525 op=01: ORR Rd|SP, Rn, #imm
2526 op=10: EOR Rd|SP, Rn, #imm
2527 op=11: ANDS Rd|ZR, Rn, #imm
2529 Bool is64 = INSN(31,31) == 1;
2530 UInt op = INSN(30,29);
2531 UInt N = INSN(22,22);
2532 UInt immR = INSN(21,16);
2533 UInt immS = INSN(15,10);
2534 UInt nn = INSN(9,5);
2535 UInt dd = INSN(4,0);
2536 ULong imm = 0;
2537 Bool ok;
2538 if (N == 1 && !is64)
2539 goto after_logic_imm; /* not allowed; fall through */
2540 ok = dbm_DecodeBitMasks(&imm, NULL,
2541 N, immS, immR, True, is64 ? 64 : 32);
2542 if (!ok)
2543 goto after_logic_imm;
2545 const HChar* names[4] = { "and", "orr", "eor", "ands" };
2546 const IROp ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
2547 const IROp ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };
2549 vassert(op < 4);
2550 if (is64) {
2551 IRExpr* argL = getIReg64orZR(nn);
2552 IRExpr* argR = mkU64(imm);
2553 IRTemp res = newTemp(Ity_I64);
2554 assign(res, binop(ops64[op], argL, argR));
2555 if (op < 3) {
2556 putIReg64orSP(dd, mkexpr(res));
2557 DIP("%s %s, %s, 0x%llx\n", names[op],
2558 nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
2559 } else {
2560 putIReg64orZR(dd, mkexpr(res));
2561 setFlags_LOGIC(True/*is64*/, res);
2562 DIP("%s %s, %s, 0x%llx\n", names[op],
2563 nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
2565 } else {
2566 IRExpr* argL = getIReg32orZR(nn);
2567 IRExpr* argR = mkU32((UInt)imm);
2568 IRTemp res = newTemp(Ity_I32);
2569 assign(res, binop(ops32[op], argL, argR));
2570 if (op < 3) {
2571 putIReg32orSP(dd, mkexpr(res));
2572 DIP("%s %s, %s, 0x%x\n", names[op],
2573 nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
2574 } else {
2575 putIReg32orZR(dd, mkexpr(res));
2576 setFlags_LOGIC(False/*!is64*/, res);
2577 DIP("%s %s, %s, 0x%x\n", names[op],
2578 nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
2581 return True;
2583 after_logic_imm:
2585 /* -------------------- MOV{Z,N,K} -------------------- */
2586 if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
2587 /* 31 30 28 22 20 4
2588 | | | | | |
2589 sf 10 100 101 hw imm16 Rd MOV(Z) Rd, (imm16 << (16*hw))
2590 sf 00 100 101 hw imm16 Rd MOV(N) Rd, ~(imm16 << (16*hw))
2591 sf 11 100 101 hw imm16 Rd MOV(K) Rd, (imm16 << (16*hw))
2593 Bool is64 = INSN(31,31) == 1;
2594 UInt subopc = INSN(30,29);
2595 UInt hw = INSN(22,21);
2596 UInt imm16 = INSN(20,5);
2597 UInt dd = INSN(4,0);
2598 if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
2599 /* invalid; fall through */
2600 } else {
2601 ULong imm64 = ((ULong)imm16) << (16 * hw);
2602 if (!is64)
2603 vassert(imm64 < 0x100000000ULL);
2604 switch (subopc) {
2605 case BITS2(1,0): // MOVZ
2606 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2607 DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2608 break;
2609 case BITS2(0,0): // MOVN
2610 imm64 = ~imm64;
2611 if (!is64)
2612 imm64 &= 0xFFFFFFFFULL;
2613 putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
2614 DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
2615 break;
2616 case BITS2(1,1): // MOVK
2617 /* This is more complex. We are inserting a slice into
2618 the destination register, so we need to have the old
2619 value of it. */
2620 if (is64) {
2621 IRTemp old = newTemp(Ity_I64);
2622 assign(old, getIReg64orZR(dd));
2623 ULong mask = 0xFFFFULL << (16 * hw);
2624 IRExpr* res
2625 = binop(Iop_Or64,
2626 binop(Iop_And64, mkexpr(old), mkU64(~mask)),
2627 mkU64(imm64));
2628 putIReg64orZR(dd, res);
2629 DIP("movk %s, 0x%x, lsl %u\n",
2630 nameIReg64orZR(dd), imm16, 16*hw);
2631 } else {
2632 IRTemp old = newTemp(Ity_I32);
2633 assign(old, getIReg32orZR(dd));
2634 vassert(hw <= 1);
2635 UInt mask = ((UInt)0xFFFF) << (16 * hw);
2636 IRExpr* res
2637 = binop(Iop_Or32,
2638 binop(Iop_And32, mkexpr(old), mkU32(~mask)),
2639 mkU32((UInt)imm64));
2640 putIReg32orZR(dd, res);
2641 DIP("movk %s, 0x%x, lsl %u\n",
2642 nameIReg32orZR(dd), imm16, 16*hw);
2644 break;
2645 default:
2646 vassert(0);
2648 return True;
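/* Note (illustrative): in scalar terms the three move-wide forms
   handled above compute

      MOVZ:  Rd =   imm16 << (16*hw)
      MOVN:  Rd = ~(imm16 << (16*hw))
      MOVK:  Rd = (Rd & ~(0xFFFF << (16*hw))) | (imm16 << (16*hw))

   with the 32-bit variants truncating the result to 32 bits, and hw
   restricted to 0..1 in that case. */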
2652 /* -------------------- {U,S,}BFM -------------------- */
2653 /* 30 28 22 21 15 9 4
2655 sf 10 100110 N immr imms nn dd
2656 UBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2657 UBFM Xd, Xn, #immr, #imms when sf=1, N=1
2659 sf 00 100110 N immr imms nn dd
2660 SBFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2661 SBFM Xd, Xn, #immr, #imms when sf=1, N=1
2663 sf 01 100110 N immr imms nn dd
2664 BFM Wd, Wn, #immr, #imms when sf=0, N=0, immr[5]=0, imms[5]=0
2665 BFM Xd, Xn, #immr, #imms when sf=1, N=1
2667 if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
2668 UInt sf = INSN(31,31);
2669 UInt opc = INSN(30,29);
2670 UInt N = INSN(22,22);
2671 UInt immR = INSN(21,16);
2672 UInt immS = INSN(15,10);
2673 UInt nn = INSN(9,5);
2674 UInt dd = INSN(4,0);
2675 Bool inZero = False;
2676 Bool extend = False;
2677 const HChar* nm = "???";
2678 /* skip invalid combinations */
2679 switch (opc) {
2680 case BITS2(0,0):
2681 inZero = True; extend = True; nm = "sbfm"; break;
2682 case BITS2(0,1):
2683 inZero = False; extend = False; nm = "bfm"; break;
2684 case BITS2(1,0):
2685 inZero = True; extend = False; nm = "ubfm"; break;
2686 case BITS2(1,1):
2687 goto after_bfm; /* invalid */
2688 default:
2689 vassert(0);
2691 if (sf == 1 && N != 1) goto after_bfm;
2692 if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
2693 || ((immS >> 5) & 1) != 0)) goto after_bfm;
2694 ULong wmask = 0, tmask = 0;
2695 Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
2696 N, immS, immR, False, sf == 1 ? 64 : 32);
2697 if (!ok) goto after_bfm; /* hmmm */
2699 Bool is64 = sf == 1;
2700 IRType ty = is64 ? Ity_I64 : Ity_I32;
2702 // Handle plain shifts explicitly. These are functionally identical to
2703 // the general case below, but iropt isn't clever enough to reduce those
2704 // sequences to plain shifts. So give it a hand.
2705 if (is64 && immS == 63 && immR >= 1 && immR <= 63) {
2706 if (opc == BITS2(0,0)) {
2707 // 64-bit signed shift right
2708 putIReg64orZR(dd, binop(Iop_Sar64, getIReg64orZR(nn), mkU8(immR)));
2709 DIP("asr %s, %s, #%u\n",
2710 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2711 return True;
2713 if (opc == BITS2(1,0)) {
2714 // 64-bit unsigned shift right
2715 putIReg64orZR(dd, binop(Iop_Shr64, getIReg64orZR(nn), mkU8(immR)));
2716 DIP("lsr %s, %s, #%u\n",
2717 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2718 return True;
2722 if (!is64 && immS == 31 && immR >= 1 && immR <= 31) {
2723 if (opc == BITS2(0,0)) {
2724 // 32-bit signed shift right
2725 putIReg32orZR(dd, binop(Iop_Sar32, getIReg32orZR(nn), mkU8(immR)));
2726 DIP("asr %s, %s, #%u\n",
2727 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2728 return True;
2730 if (opc == BITS2(1,0)) {
2731 // 32-bit unsigned shift right
2732 putIReg32orZR(dd, binop(Iop_Shr32, getIReg32orZR(nn), mkU8(immR)));
2733 DIP("lsr %s, %s, #%u\n",
2734 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR);
2735 return True;
2739 if (is64 && immS <= 62
2740 && immR == immS + 1 && opc == BITS2(1,0)) {
2741 // 64-bit shift left
2742 UInt shift = 64 - immR;
2743 vassert(shift >= 1 && shift <= 63);
2744 putIReg64orZR(dd, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(shift)));
2745 DIP("lsl %s, %s, #%u\n",
2746 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), shift);
2747 return True;
2749 if (!is64 && immS <= 30
2750 && immR == immS + 1 && opc == BITS2(1,0)) {
2751 // 32-bit shift left
2752 UInt shift = 32 - immR;
2753 vassert(shift >= 1 && shift <= 31);
2754 putIReg32orZR(dd, binop(Iop_Shl32, getIReg32orZR(nn), mkU8(shift)));
2755 DIP("lsl %s, %s, #%u\n",
2756 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), shift);
2757 return True;
2760 // Also special-case sxtb, sxth and sxtw.
2761 if (opc == BITS2(0,0) && immR == 0) {
2762 if (is64) {
2763 // The destination size is 64 bits.
2764 if (immS == 31) {
2765 putIReg64orZR(dd, unop(Iop_32Sto64, getIReg32orZR(nn)));
2766 DIP("sxtw %s, %s\n", nameIReg64orZR(dd), nameIReg32orZR(nn));
2767 return True;
2769 if (immS == 15) {
2770 putIReg64orZR(dd, unop(Iop_16Sto64,
2771 unop(Iop_64to16, getIReg64orZR(nn))));
2772 DIP("sxth %s, %s\n", nameIReg64orZR(dd), nameIReg32orZR(nn));
2773 return True;
2775 if (immS == 7) {
2776 putIReg64orZR(dd, unop(Iop_8Sto64,
2777 unop(Iop_64to8, getIReg64orZR(nn))));
2778 DIP("sxtb %s, %s\n", nameIReg64orZR(dd), nameIReg32orZR(nn));
2779 return True;
2781 } else {
2782 // The destination size is 32 bits.
2783 if (immS == 15) {
2784 putIReg32orZR(dd, unop(Iop_16Sto32,
2785 unop(Iop_64to16, getIReg64orZR(nn))));
2786 DIP("sxth %s, %s\n", nameIReg32orZR(dd), nameIReg32orZR(nn));
2787 return True;
2789 if (immS == 7) {
2790 putIReg32orZR(dd, unop(Iop_8Sto32,
2791 unop(Iop_64to8, getIReg64orZR(nn))));
2792 DIP("sxtb %s, %s\n", nameIReg32orZR(dd), nameIReg32orZR(nn));
2793 return True;
2798 // None of the special cases apply. We have to use the (slow) general
2799 // case.
2800 IRTemp dst = newTemp(ty);
2801 IRTemp src = newTemp(ty);
2802 IRTemp bot = newTemp(ty);
2803 IRTemp top = newTemp(ty);
2804 IRTemp res = newTemp(ty);
2805 assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
2806 assign(src, getIRegOrZR(is64, nn));
2807 /* perform bitfield move on low bits */
2808 assign(bot, binop(mkOR(ty),
2809 binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
2810 binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
2811 mkU(ty, wmask))));
2812 /* determine extension bits (sign, zero or dest register) */
2813 assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
2814 /* combine extension bits and result bits */
2815 assign(res, binop(mkOR(ty),
2816 binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
2817 binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
2818 putIRegOrZR(is64, dd, mkexpr(res));
2819 DIP("%s %s, %s, immR=%u, immS=%u\n",
2820 nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
2821 return True;
2823 after_bfm:
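/* Worked example (illustrative only): "ubfx x0, x1, #8, #4" assembles
   to UBFM with immr=8, imms=11.  dbm_DecodeBitMasks (immediate=False,
   M=64) then gives

      wmask = ROR64(Ones(12), 8) = 0xFF0000000000000F
      tmask = Ones(4)            = 0x000000000000000F

   and, since UBFM sets inZero and clears extend, the general case above
   reduces to

      bot = ROR64(src, 8) & wmask
      res = bot & tmask   =   (src >> 8) & 0xF

   i.e. bits 11:8 of Xn land in bits 3:0 of Xd, as expected. */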
2825 /* ---------------------- EXTR ---------------------- */
2826 /* 30 28 22 20 15 9 4
2827 1 00 100111 10 m imm6 n d EXTR Xd, Xn, Xm, #imm6
2828 0 00 100111 00 m imm6 n d EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
2830 if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
2831 Bool is64 = INSN(31,31) == 1;
2832 UInt mm = INSN(20,16);
2833 UInt imm6 = INSN(15,10);
2834 UInt nn = INSN(9,5);
2835 UInt dd = INSN(4,0);
2836 Bool valid = True;
2837 if (INSN(31,31) != INSN(22,22))
2838 valid = False;
2839 if (!is64 && imm6 >= 32)
2840 valid = False;
2841 if (!valid) goto after_extr;
2842 IRType ty = is64 ? Ity_I64 : Ity_I32;
2843 IRTemp srcHi = newTemp(ty);
2844 IRTemp srcLo = newTemp(ty);
2845 IRTemp res = newTemp(ty);
2846 assign(srcHi, getIRegOrZR(is64, nn));
2847 assign(srcLo, getIRegOrZR(is64, mm));
2848 if (imm6 == 0) {
2849 assign(res, mkexpr(srcLo));
2850 } else {
2851 UInt szBits = 8 * sizeofIRType(ty);
2852 vassert(imm6 > 0 && imm6 < szBits);
2853 assign(res, binop(mkOR(ty),
2854 binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
2855 binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
2857 putIRegOrZR(is64, dd, mkexpr(res));
2858 DIP("extr %s, %s, %s, #%u\n",
2859 nameIRegOrZR(is64,dd),
2860 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
2861 return True;
2863 after_extr:
2865 if (sigill_diag) {
2866 vex_printf("ARM64 front end: data_processing_immediate\n");
2868 return False;
2869 # undef INSN
2873 /*------------------------------------------------------------*/
2874 /*--- Data processing (register) instructions ---*/
2875 /*------------------------------------------------------------*/
2877 static const HChar* nameSH ( UInt sh ) {
2878 switch (sh) {
2879 case 0: return "lsl";
2880 case 1: return "lsr";
2881 case 2: return "asr";
2882 case 3: return "ror";
2883 default: vassert(0);
2887 /* Generate IR to get a register value, possibly shifted by an
2888 immediate. Returns either a 32- or 64-bit temporary holding the
2889 result. After the shift, the value can optionally be NOT-ed
2890 too.
2892 sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR. sh_amt may only be
2893 in the range 0 to (is64 ? 64 : 32)-1. For some instructions, ROR
2894 isn't allowed, but it's the job of the caller to check that.
2896 static IRTemp getShiftedIRegOrZR ( Bool is64,
2897 UInt sh_how, UInt sh_amt, UInt regNo,
2898 Bool invert )
2900 vassert(sh_how < 4);
2901 vassert(sh_amt < (is64 ? 64 : 32));
2902 IRType ty = is64 ? Ity_I64 : Ity_I32;
2903 IRTemp t0 = newTemp(ty);
2904 assign(t0, getIRegOrZR(is64, regNo));
2905 IRTemp t1 = newTemp(ty);
2906 switch (sh_how) {
2907 case BITS2(0,0):
2908 assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
2909 break;
2910 case BITS2(0,1):
2911 assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
2912 break;
2913 case BITS2(1,0):
2914 assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
2915 break;
2916 case BITS2(1,1):
2917 assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
2918 break;
2919 default:
2920 vassert(0);
2922 if (invert) {
2923 IRTemp t2 = newTemp(ty);
2924 assign(t2, unop(mkNOT(ty), mkexpr(t1)));
2925 return t2;
2926 } else {
2927 return t1;
2932 static
2933 Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
2934 UInt insn, Bool sigill_diag)
2936 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
2938 /* ------------------- ADD/SUB(reg) ------------------- */
2939 /* x==0 => 32 bit op x==1 => 64 bit op
2940 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)
2942 31 30 29 28 23 21 20 15 9 4
2943 | | | | | | | | | |
2944 x 0 0 01011 sh 0 Rm imm6 Rn Rd ADD Rd,Rn, sh(Rm,imm6)
2945 x 0 1 01011 sh 0 Rm imm6 Rn Rd ADDS Rd,Rn, sh(Rm,imm6)
2946 x 1 0 01011 sh 0 Rm imm6 Rn Rd SUB Rd,Rn, sh(Rm,imm6)
2947 x 1 1 01011 sh 0 Rm imm6 Rn Rd SUBS Rd,Rn, sh(Rm,imm6)
2949 if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
2950 UInt bX = INSN(31,31);
2951 UInt bOP = INSN(30,30); /* 0: ADD, 1: SUB */
2952 UInt bS = INSN(29, 29); /* set flags? */
2953 UInt sh = INSN(23,22);
2954 UInt rM = INSN(20,16);
2955 UInt imm6 = INSN(15,10);
2956 UInt rN = INSN(9,5);
2957 UInt rD = INSN(4,0);
2958 Bool isSUB = bOP == 1;
2959 Bool is64 = bX == 1;
2960 IRType ty = is64 ? Ity_I64 : Ity_I32;
2961 if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
2962 /* invalid; fall through */
2963 } else {
2964 IRTemp argL = newTemp(ty);
2965 assign(argL, getIRegOrZR(is64, rN));
2966 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
2967 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
2968 IRTemp res = newTemp(ty);
2969 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
2970 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
2971 if (bS) {
2972 setFlags_ADD_SUB(is64, isSUB, argL, argR);
2974 DIP("%s%s %s, %s, %s, %s #%u\n",
2975 bOP ? "sub" : "add", bS ? "s" : "",
2976 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
2977 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
2978 return True;
2982 /* ------------------- ADC/SBC(reg) ------------------- */
2983 /* x==0 => 32 bit op x==1 => 64 bit op
2985 31 30 29 28 23 21 20 15 9 4
2986 | | | | | | | | | |
2987 x 0 0 11010 00 0 Rm 000000 Rn Rd ADC Rd,Rn,Rm
2988 x 0 1 11010 00 0 Rm 000000 Rn Rd ADCS Rd,Rn,Rm
2989 x 1 0 11010 00 0 Rm 000000 Rn Rd SBC Rd,Rn,Rm
2990 x 1 1 11010 00 0 Rm 000000 Rn Rd SBCS Rd,Rn,Rm
2993 if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
2994 UInt bX = INSN(31,31);
2995 UInt bOP = INSN(30,30); /* 0: ADC, 1: SBC */
2996 UInt bS = INSN(29,29); /* set flags */
2997 UInt rM = INSN(20,16);
2998 UInt rN = INSN(9,5);
2999 UInt rD = INSN(4,0);
3001 Bool isSUB = bOP == 1;
3002 Bool is64 = bX == 1;
3003 IRType ty = is64 ? Ity_I64 : Ity_I32;
3005 IRTemp oldC = newTemp(ty);
3006 assign(oldC,
3007 is64 ? mk_arm64g_calculate_flag_c()
3008 : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );
3010 IRTemp argL = newTemp(ty);
3011 assign(argL, getIRegOrZR(is64, rN));
3012 IRTemp argR = newTemp(ty);
3013 assign(argR, getIRegOrZR(is64, rM));
3015 IROp op = isSUB ? mkSUB(ty) : mkADD(ty);
3016 IRTemp res = newTemp(ty);
3017 if (isSUB) {
3018 IRExpr* one = is64 ? mkU64(1) : mkU32(1);
3019 IROp xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
3020 assign(res,
3021 binop(op,
3022 binop(op, mkexpr(argL), mkexpr(argR)),
3023 binop(xorOp, mkexpr(oldC), one)));
3024 } else {
3025 assign(res,
3026 binop(op,
3027 binop(op, mkexpr(argL), mkexpr(argR)),
3028 mkexpr(oldC)));
3031 if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
3033 if (bS) {
3034 setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
3037 DIP("%s%s %s, %s, %s\n",
3038 bOP ? "sbc" : "adc", bS ? "s" : "",
3039 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
3040 nameIRegOrZR(is64, rM));
3041 return True;
3044 /* -------------------- LOGIC(reg) -------------------- */
3045 /* x==0 => 32 bit op x==1 => 64 bit op
3046 N==0 => inv? is no-op (no inversion)
3047 N==1 => inv? is NOT
3048 sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR
3050 31 30 28 23 21 20 15 9 4
3051 | | | | | | | | |
3052 x 00 01010 sh N Rm imm6 Rn Rd AND Rd,Rn, inv?(sh(Rm,imm6))
3053 x 01 01010 sh N Rm imm6 Rn Rd ORR Rd,Rn, inv?(sh(Rm,imm6))
3054 x 10 01010 sh N Rm imm6 Rn Rd EOR Rd,Rn, inv?(sh(Rm,imm6))
3055 x 11 01010 sh N Rm imm6 Rn Rd ANDS Rd,Rn, inv?(sh(Rm,imm6))
3056 With N=1, the names are: BIC ORN EON BICS
3058 if (INSN(28,24) == BITS5(0,1,0,1,0)) {
3059 UInt bX = INSN(31,31);
3060 UInt sh = INSN(23,22);
3061 UInt bN = INSN(21,21);
3062 UInt rM = INSN(20,16);
3063 UInt imm6 = INSN(15,10);
3064 UInt rN = INSN(9,5);
3065 UInt rD = INSN(4,0);
3066 Bool is64 = bX == 1;
3067 IRType ty = is64 ? Ity_I64 : Ity_I32;
3068 if (!is64 && imm6 > 31) {
3069 /* invalid; fall through */
3070 } else {
3071 IRTemp argL = newTemp(ty);
3072 assign(argL, getIRegOrZR(is64, rN));
3073 IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
3074 IROp op = Iop_INVALID;
3075 switch (INSN(30,29)) {
3076 case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
3077 case BITS2(0,1): op = mkOR(ty); break;
3078 case BITS2(1,0): op = mkXOR(ty); break;
3079 default: vassert(0);
3081 IRTemp res = newTemp(ty);
3082 assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
3083 if (INSN(30,29) == BITS2(1,1)) {
3084 setFlags_LOGIC(is64, res);
3086 putIRegOrZR(is64, rD, mkexpr(res));
3088 static const HChar* names_op[8]
3089 = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
3090 vassert(((bN << 2) | INSN(30,29)) < 8);
3091 const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
3092 /* Special-case the printing of "MOV" */
3093 if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
3094 DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
3095 nameIRegOrZR(is64, rM));
3096 } else {
3097 DIP("%s %s, %s, %s, %s #%u\n", nm_op,
3098 nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
3099 nameIRegOrZR(is64, rM), nameSH(sh), imm6);
3101 return True;
3105 /* -------------------- {U,S}MULH -------------------- */
3106 /* 31 23 22 20 15 9 4
3107 10011011 1 10 Rm 011111 Rn Rd UMULH Xd,Xn,Xm
3108 10011011 0 10 Rm 011111 Rn Rd SMULH Xd,Xn,Xm
3110 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
3111 && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
3112 Bool isU = INSN(23,23) == 1;
3113 UInt mm = INSN(20,16);
3114 UInt nn = INSN(9,5);
3115 UInt dd = INSN(4,0);
3116 putIReg64orZR(dd, unop(Iop_128HIto64,
3117 binop(isU ? Iop_MullU64 : Iop_MullS64,
3118 getIReg64orZR(nn), getIReg64orZR(mm))));
3119 DIP("%cmulh %s, %s, %s\n",
3120 isU ? 'u' : 's',
3121 nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
3122 return True;
3125 /* -------------------- M{ADD,SUB} -------------------- */
3126 /* 31 30 20 15 14 9 4
3127 sf 00 11011 000 m 0 a n r MADD Rd,Rn,Rm,Ra d = a+m*n
3128 sf 00 11011 000 m 1 a n r MSUB Rd,Rn,Rm,Ra d = a-m*n
3130 if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
3131 Bool is64 = INSN(31,31) == 1;
3132 UInt mm = INSN(20,16);
3133 Bool isAdd = INSN(15,15) == 0;
3134 UInt aa = INSN(14,10);
3135 UInt nn = INSN(9,5);
3136 UInt dd = INSN(4,0);
3137 if (is64) {
3138 putIReg64orZR(
3140 binop(isAdd ? Iop_Add64 : Iop_Sub64,
3141 getIReg64orZR(aa),
3142 binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
3143 } else {
3144 putIReg32orZR(
3146 binop(isAdd ? Iop_Add32 : Iop_Sub32,
3147 getIReg32orZR(aa),
3148 binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
3150 DIP("%s %s, %s, %s, %s\n",
3151 isAdd ? "madd" : "msub",
3152 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3153 nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
3154 return True;
3157 /* ---------------- CS{EL,INC,INV,NEG} ---------------- */
3158 /* 31 30 28 20 15 11 9 4
3159 sf 00 1101 0100 mm cond 00 nn dd CSEL Rd,Rn,Rm
3160 sf 00 1101 0100 mm cond 01 nn dd CSINC Rd,Rn,Rm
3161 sf 10 1101 0100 mm cond 00 nn dd CSINV Rd,Rn,Rm
3162 sf 10 1101 0100 mm cond 01 nn dd CSNEG Rd,Rn,Rm
3163 In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
3165 if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
3166 Bool is64 = INSN(31,31) == 1;
3167 UInt b30 = INSN(30,30);
3168 UInt mm = INSN(20,16);
3169 UInt cond = INSN(15,12);
3170 UInt b10 = INSN(10,10);
3171 UInt nn = INSN(9,5);
3172 UInt dd = INSN(4,0);
3173 UInt op = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
3174 IRType ty = is64 ? Ity_I64 : Ity_I32;
3175 IRExpr* argL = getIRegOrZR(is64, nn);
3176 IRExpr* argR = getIRegOrZR(is64, mm);
3177 switch (op) {
3178 case BITS2(0,0):
3179 break;
3180 case BITS2(0,1):
3181 argR = binop(mkADD(ty), argR, mkU(ty,1));
3182 break;
3183 case BITS2(1,0):
3184 argR = unop(mkNOT(ty), argR);
3185 break;
3186 case BITS2(1,1):
3187 argR = binop(mkSUB(ty), mkU(ty,0), argR);
3188 break;
3189 default:
3190 vassert(0);
3192 putIRegOrZR(
3193 is64, dd,
3194 IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
3195 argL, argR)
3197 const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
3198 DIP("%s %s, %s, %s, %s\n", op_nm[op],
3199 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
3200 nameIRegOrZR(is64, mm), nameCC(cond));
3201 return True;
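/* Note (illustrative): the usual aliases map onto this form, e.g.
   "cset x0, eq" is "csinc x0, xzr, xzr, ne" (so x0 = eq ? 1 : 0), and
   "cneg x0, x1, lt" is "csneg x0, x1, x1, ge". */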
3204 /* -------------- ADD/SUB(extended reg) -------------- */
3205 /* 28 20 15 12 9 4
3206 000 01011 00 1 m opt imm3 n d ADD Wd|SP, Wn|SP, Wm ext&lsld
3207 100 01011 00 1 m opt imm3 n d ADD Xd|SP, Xn|SP, Rm ext&lsld
3209 001 01011 00 1 m opt imm3 n d ADDS Wd, Wn|SP, Wm ext&lsld
3210 101 01011 00 1 m opt imm3 n d ADDS Xd, Xn|SP, Rm ext&lsld
3212 010 01011 00 1 m opt imm3 n d SUB Wd|SP, Wn|SP, Wm ext&lsld
3213 110 01011 00 1 m opt imm3 n d SUB Xd|SP, Xn|SP, Rm ext&lsld
3215 011 01011 00 1 m opt imm3 n d SUBS Wd, Wn|SP, Wm ext&lsld
3216 111 01011 00 1 m opt imm3 n d SUBS Xd, Xn|SP, Rm ext&lsld
3218 The 'm' operand is extended per opt, thusly:
3220 000 Xm & 0xFF UXTB
3221 001 Xm & 0xFFFF UXTH
3222 010 Xm & (2^32)-1 UXTW
3223 011 Xm UXTX
3225 100 Xm sx from bit 7 SXTB
3226 101 Xm sx from bit 15 SXTH
3227 110 Xm sx from bit 31 SXTW
3228 111 Xm SXTX
3230 In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
3231 operation on Xm. In the 32 bit case, UXTW, UXTX, SXTW and SXTX
3232 are the identity operation on Wm.
3234 After extension, the value is shifted left by imm3 bits, which
3235 may only be in the range 0 .. 4 inclusive.
3237 if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
3238 Bool is64 = INSN(31,31) == 1;
3239 Bool isSub = INSN(30,30) == 1;
3240 Bool setCC = INSN(29,29) == 1;
3241 UInt mm = INSN(20,16);
3242 UInt opt = INSN(15,13);
3243 UInt imm3 = INSN(12,10);
3244 UInt nn = INSN(9,5);
3245 UInt dd = INSN(4,0);
3246 const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
3247 "sxtb", "sxth", "sxtw", "sxtx" };
3248 /* Do almost the same thing in the 32- and 64-bit cases. */
3249 IRTemp xN = newTemp(Ity_I64);
3250 IRTemp xM = newTemp(Ity_I64);
3251 assign(xN, getIReg64orSP(nn));
3252 assign(xM, getIReg64orZR(mm));
3253 IRExpr* xMw = mkexpr(xM); /* "xM widened" */
3254 Int shSX = 0;
3255 /* widen Xm .. */
3256 switch (opt) {
3257 case BITS3(0,0,0): // UXTB
3258 xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
3259 case BITS3(0,0,1): // UXTH
3260 xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
3261 case BITS3(0,1,0): // UXTW -- noop for the 32bit case
3262 if (is64) {
3263 xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
3265 break;
3266 case BITS3(0,1,1): // UXTX -- always a noop
3267 break;
3268 case BITS3(1,0,0): // SXTB
3269 shSX = 56; goto sxTo64;
3270 case BITS3(1,0,1): // SXTH
3271 shSX = 48; goto sxTo64;
3272 case BITS3(1,1,0): // SXTW -- noop for the 32bit case
3273 if (is64) {
3274 shSX = 32; goto sxTo64;
3276 break;
3277 case BITS3(1,1,1): // SXTX -- always a noop
3278 break;
3279 sxTo64:
3280 vassert(shSX >= 32);
3281 xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
3282 mkU8(shSX));
3283 break;
3284 default:
3285 vassert(0);
3287 /* and now shift */
3288 IRTemp argL = xN;
3289 IRTemp argR = newTemp(Ity_I64);
3290 assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
3291 IRTemp res = newTemp(Ity_I64);
3292 assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
3293 mkexpr(argL), mkexpr(argR)));
3294 if (is64) {
3295 if (setCC) {
3296 putIReg64orZR(dd, mkexpr(res));
3297 setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
3298 } else {
3299 putIReg64orSP(dd, mkexpr(res));
3301 } else {
3302 if (setCC) {
3303 IRTemp argL32 = newTemp(Ity_I32);
3304 IRTemp argR32 = newTemp(Ity_I32);
3305 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
3306 assign(argL32, unop(Iop_64to32, mkexpr(argL)));
3307 assign(argR32, unop(Iop_64to32, mkexpr(argR)));
3308 setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
3309 } else {
3310 putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
3313 DIP("%s%s %s, %s, %s %s lsl %u\n",
3314 isSub ? "sub" : "add", setCC ? "s" : "",
3315 setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
3316 nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm),
3317 nameExt[opt], imm3);
3318 return True;
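/* Worked example (illustrative only): "add x0, x1, w2, uxtb #2" is the
   opt=000 (UXTB), imm3=2 case above and computes

      X0 = X1 + ((X2 & 0xFF) << 2)

   while "adds w0, w1, w2, sxth" (opt=101, imm3=0) computes the 32-bit
   sum W1 + SignExtend16(W2) and sets NZCV from the 32-bit result. */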
3321 /* ---------------- CCMP/CCMN(imm) ---------------- */
3322 /* Bizarrely, these appear in the "data processing register"
3323 category, even though they are operations against an
3324 immediate. */
3325 /* 31 29 20 15 11 9 3
3326 sf 1 111010010 imm5 cond 10 Rn 0 nzcv CCMP Rn, #imm5, #nzcv, cond
3327 sf 0 111010010 imm5 cond 10 Rn 0 nzcv CCMN Rn, #imm5, #nzcv, cond
3329 Operation is:
3330 (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
3331 (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
3333 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3334 && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
3335 Bool is64 = INSN(31,31) == 1;
3336 Bool isSUB = INSN(30,30) == 1;
3337 UInt imm5 = INSN(20,16);
3338 UInt cond = INSN(15,12);
3339 UInt nn = INSN(9,5);
3340 UInt nzcv = INSN(3,0);
3342 IRTemp condT = newTemp(Ity_I1);
3343 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3345 IRType ty = is64 ? Ity_I64 : Ity_I32;
3346 IRTemp argL = newTemp(ty);
3347 IRTemp argR = newTemp(ty);
3349 if (is64) {
3350 assign(argL, getIReg64orZR(nn));
3351 assign(argR, mkU64(imm5));
3352 } else {
3353 assign(argL, getIReg32orZR(nn));
3354 assign(argR, mkU32(imm5));
3356 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3358 DIP("ccm%c %s, #%u, #%u, %s\n",
3359 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3360 imm5, nzcv, nameCC(cond));
3361 return True;
3364 /* ---------------- CCMP/CCMN(reg) ---------------- */
3365 /* 31 29 20 15 11 9 3
3366 sf 1 111010010 Rm cond 00 Rn 0 nzcv CCMP Rn, Rm, #nzcv, cond
3367 sf 0 111010010 Rm cond 00 Rn 0 nzcv CCMN Rn, Rm, #nzcv, cond
3368 Operation is:
3369 (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
3370 (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
3372 if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
3373 && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
3374 Bool is64 = INSN(31,31) == 1;
3375 Bool isSUB = INSN(30,30) == 1;
3376 UInt mm = INSN(20,16);
3377 UInt cond = INSN(15,12);
3378 UInt nn = INSN(9,5);
3379 UInt nzcv = INSN(3,0);
3381 IRTemp condT = newTemp(Ity_I1);
3382 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
3384 IRType ty = is64 ? Ity_I64 : Ity_I32;
3385 IRTemp argL = newTemp(ty);
3386 IRTemp argR = newTemp(ty);
3388 if (is64) {
3389 assign(argL, getIReg64orZR(nn));
3390 assign(argR, getIReg64orZR(mm));
3391 } else {
3392 assign(argL, getIReg32orZR(nn));
3393 assign(argR, getIReg32orZR(mm));
3395 setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);
3397 DIP("ccm%c %s, %s, #%u, %s\n",
3398 isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
3399 nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
3400 return True;
3404 /* -------------- REV/REV16/REV32/RBIT -------------- */
3405 /* 31 30 28 20 15 11 9 4
3407 1 10 11010110 00000 0000 11 n d (1) REV Xd, Xn
3408 0 10 11010110 00000 0000 10 n d (2) REV Wd, Wn
3410 1 10 11010110 00000 0000 00 n d (3) RBIT Xd, Xn
3411 0 10 11010110 00000 0000 00 n d (4) RBIT Wd, Wn
3413 1 10 11010110 00000 0000 01 n d (5) REV16 Xd, Xn
3414 0 10 11010110 00000 0000 01 n d (6) REV16 Wd, Wn
3416 1 10 11010110 00000 0000 10 n d (7) REV32 Xd, Xn
3418 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3419 && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
3420 UInt b31 = INSN(31,31);
3421 UInt opc = INSN(11,10);
3423 UInt ix = 0;
3424 /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1;
3425 else if (b31 == 0 && opc == BITS2(1,0)) ix = 2;
3426 else if (b31 == 1 && opc == BITS2(0,0)) ix = 3;
3427 else if (b31 == 0 && opc == BITS2(0,0)) ix = 4;
3428 else if (b31 == 1 && opc == BITS2(0,1)) ix = 5;
3429 else if (b31 == 0 && opc == BITS2(0,1)) ix = 6;
3430 else if (b31 == 1 && opc == BITS2(1,0)) ix = 7;
3431 if (ix >= 1 && ix <= 7) {
3432 Bool is64 = ix == 1 || ix == 3 || ix == 5 || ix == 7;
3433 UInt nn = INSN(9,5);
3434 UInt dd = INSN(4,0);
3435 IRTemp src = newTemp(Ity_I64);
3436 IRTemp dst = IRTemp_INVALID;
3437 IRTemp (*math)(IRTemp) = NULL;
3438 switch (ix) {
3439 case 1: case 2: math = math_BYTESWAP64; break;
3440 case 3: case 4: math = math_BITSWAP64; break;
3441 case 5: case 6: math = math_USHORTSWAP64; break;
3442 case 7: math = math_UINTSWAP64; break;
3443 default: vassert(0);
3445 const HChar* names[7]
3446 = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
3447 const HChar* nm = names[ix-1];
3448 vassert(math);
3449 if (ix == 6) {
3450 /* This has to be special cased, since the logic below doesn't
3451 handle it correctly. */
3452 assign(src, getIReg64orZR(nn));
3453 dst = math(src);
3454 putIReg64orZR(dd,
3455 unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
3456 } else if (is64) {
3457 assign(src, getIReg64orZR(nn));
3458 dst = math(src);
3459 putIReg64orZR(dd, mkexpr(dst));
3460 } else {
3461 assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
3462 dst = math(src);
3463 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3465 DIP("%s %s, %s\n", nm,
3466 nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
3467 return True;
3469 /* else fall through */
3472 /* -------------------- CLZ/CLS -------------------- */
3473 /* 30 28 24 20 15 9 4
3474 sf 10 1101 0110 00000 00010 0 n d CLZ Rd, Rn
3475 sf 10 1101 0110 00000 00010 1 n d CLS Rd, Rn
3477 if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
3478 && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
3479 Bool is64 = INSN(31,31) == 1;
3480 Bool isCLS = INSN(10,10) == 1;
3481 UInt nn = INSN(9,5);
3482 UInt dd = INSN(4,0);
3483 IRTemp src = newTemp(Ity_I64);
3484 IRTemp srcZ = newTemp(Ity_I64);
3485 IRTemp dst = newTemp(Ity_I64);
3486 /* Get the argument, widened out to 64 bit */
3487 if (is64) {
3488 assign(src, getIReg64orZR(nn));
3489 } else {
3490 assign(src, binop(Iop_Shl64,
3491 unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
3493 /* If this is CLS, mash the arg around accordingly */
3494 if (isCLS) {
3495 IRExpr* one = mkU8(1);
3496 assign(srcZ,
3497 binop(Iop_Xor64,
3498 binop(Iop_Shl64, mkexpr(src), one),
3499 binop(Iop_Shl64, binop(Iop_Shr64, mkexpr(src), one), one)));
3500 } else {
3501 assign(srcZ, mkexpr(src));
3503 /* And compute CLZ. */
3504 if (is64) {
3505 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3506 mkU64(isCLS ? 63 : 64),
3507 unop(Iop_Clz64, mkexpr(srcZ))));
3508 putIReg64orZR(dd, mkexpr(dst));
3509 } else {
3510 assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
3511 mkU64(isCLS ? 31 : 32),
3512 unop(Iop_Clz64, mkexpr(srcZ))));
3513 putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
3515 DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
3516 nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
3517 return True;
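/* Note (illustrative): for CLS, the XOR network above produces a value
   whose bit i (for i >= 1) is src[i] ^ src[i-1].  Counting the leading
   zeroes of that value therefore counts how many bits below the sign
   bit are equal to it, which is exactly CLS; the all-bits-equal case
   (srcZ == 0) is handled separately by the ITE, yielding 63 (or 31 for
   the 32-bit form). */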
3520 /* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */
3521 /* 30 28 20 15 11 9 4
3522 sf 00 1101 0110 m 0010 00 n d LSLV Rd,Rn,Rm
3523 sf 00 1101 0110 m 0010 01 n d LSRV Rd,Rn,Rm
3524 sf 00 1101 0110 m 0010 10 n d ASRV Rd,Rn,Rm
3525 sf 00 1101 0110 m 0010 11 n d RORV Rd,Rn,Rm
3527 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3528 && INSN(15,12) == BITS4(0,0,1,0)) {
3529 Bool is64 = INSN(31,31) == 1;
3530 UInt mm = INSN(20,16);
3531 UInt op = INSN(11,10);
3532 UInt nn = INSN(9,5);
3533 UInt dd = INSN(4,0);
3534 IRType ty = is64 ? Ity_I64 : Ity_I32;
3535 IRTemp srcL = newTemp(ty);
3536 IRTemp srcR = newTemp(Ity_I64);
3537 IRTemp res = newTemp(ty);
3538 IROp iop = Iop_INVALID;
3539 assign(srcL, getIRegOrZR(is64, nn));
3540 assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
3541 mkU64(is64 ? 63 : 31)));
3542 if (op < 3) {
3543 // LSLV, LSRV, ASRV
3544 switch (op) {
3545 case BITS2(0,0): iop = mkSHL(ty); break;
3546 case BITS2(0,1): iop = mkSHR(ty); break;
3547 case BITS2(1,0): iop = mkSAR(ty); break;
3548 default: vassert(0);
3550 assign(res, binop(iop, mkexpr(srcL),
3551 unop(Iop_64to8, mkexpr(srcR))));
3552 } else {
3553 // RORV
3554 IROp opSHL = mkSHL(ty);
3555 IROp opSHR = mkSHR(ty);
3556 IROp opOR = mkOR(ty);
3557 IRExpr* width = mkU64(is64 ? 64: 32);
3558 assign(
3559 res,
3560 IRExpr_ITE(
3561 binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
3562 mkexpr(srcL),
3563 binop(opOR,
3564 binop(opSHL,
3565 mkexpr(srcL),
3566 unop(Iop_64to8, binop(Iop_Sub64, width,
3567 mkexpr(srcR)))),
3568 binop(opSHR,
3569 mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
3572 putIRegOrZR(is64, dd, mkexpr(res));
3573 vassert(op < 4);
3574 const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
3575 DIP("%s %s, %s, %s\n",
3576 names[op], nameIRegOrZR(is64,dd),
3577 nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
3578 return True;
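/* Note (illustrative): in scalar terms the RORV case above is

      amt = Xm & (is64 ? 63 : 31);
      res = (amt == 0) ? Xn : (Xn >> amt) | (Xn << (width - amt));

   the explicit amt == 0 test in the ITE sidesteps the shift-by-width
   case (width - 0), rather than relying on its behaviour. */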
3581 /* -------------------- SDIV/UDIV -------------------- */
3582 /* 30 28 20 15 10 9 4
3583 sf 00 1101 0110 m 00001 1 n d SDIV Rd,Rn,Rm
3584 sf 00 1101 0110 m 00001 0 n d UDIV Rd,Rn,Rm
3586 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3587 && INSN(15,11) == BITS5(0,0,0,0,1)) {
3588 Bool is64 = INSN(31,31) == 1;
3589 UInt mm = INSN(20,16);
3590 Bool isS = INSN(10,10) == 1;
3591 UInt nn = INSN(9,5);
3592 UInt dd = INSN(4,0);
3593 if (isS) {
3594 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
3595 getIRegOrZR(is64, nn),
3596 getIRegOrZR(is64, mm)));
3597 } else {
3598 putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
3599 getIRegOrZR(is64, nn),
3600 getIRegOrZR(is64, mm)));
3602 DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
3603 nameIRegOrZR(is64, dd),
3604 nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
3605 return True;
3608 /* ------------------ {S,U}M{ADD,SUB}L ------------------ */
3609 /* 31 23 20 15 14 9 4
3610 1001 1011 101 m 0 a n d UMADDL Xd,Wn,Wm,Xa
3611 1001 1011 001 m 0 a n d SMADDL Xd,Wn,Wm,Xa
3612 1001 1011 101 m 1 a n d UMSUBL Xd,Wn,Wm,Xa
3613 1001 1011 001 m 1 a n d SMSUBL Xd,Wn,Wm,Xa
3614 with operation
3615 Xd = Xa +/- (Wn *u/s Wm)
3617 if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
3618 Bool isU = INSN(23,23) == 1;
3619 UInt mm = INSN(20,16);
3620 Bool isAdd = INSN(15,15) == 0;
3621 UInt aa = INSN(14,10);
3622 UInt nn = INSN(9,5);
3623 UInt dd = INSN(4,0);
3624 IRTemp wN = newTemp(Ity_I32);
3625 IRTemp wM = newTemp(Ity_I32);
3626 IRTemp xA = newTemp(Ity_I64);
3627 IRTemp muld = newTemp(Ity_I64);
3628 IRTemp res = newTemp(Ity_I64);
3629 assign(wN, getIReg32orZR(nn));
3630 assign(wM, getIReg32orZR(mm));
3631 assign(xA, getIReg64orZR(aa));
3632 assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
3633 mkexpr(wN), mkexpr(wM)));
3634 assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
3635 mkexpr(xA), mkexpr(muld)));
3636 putIReg64orZR(dd, mkexpr(res));
3637 DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
3638 nameIReg64orZR(dd), nameIReg32orZR(nn),
3639 nameIReg32orZR(mm), nameIReg64orZR(aa));
3640 return True;
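/* Note (illustrative): "umull x0, w1, w2" and "smull x0, w1, w2" are
   aliases of UMADDL/SMADDL with Xa = XZR, so they reach this case with
   aa == 31 and the addend reading as zero. */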
3643 /* -------------------- CRC32/CRC32C -------------------- */
3644 /* 31 30 20 15 11 9 4
3645 sf 00 1101 0110 m 0100 sz n d CRC32<sz> Wd, Wn, Wm|Xm
3646 sf 00 1101 0110 m 0101 sz n d CRC32C<sz> Wd, Wn, Wm|Xm
3648 if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
3649 && INSN(15,13) == BITS3(0,1,0)) {
3650 UInt bitSF = INSN(31,31);
3651 UInt mm = INSN(20,16);
3652 UInt bitC = INSN(12,12);
3653 UInt sz = INSN(11,10);
3654 UInt nn = INSN(9,5);
3655 UInt dd = INSN(4,0);
3656 vassert(sz <= 3);
3657 if ((bitSF == 0 && sz <= BITS2(1,0))
3658 || (bitSF == 1 && sz == BITS2(1,1))) {
3659 UInt ix = (bitC == 1 ? 4 : 0) | sz;
3660 void* helpers[8]
3661 = { &arm64g_calc_crc32b, &arm64g_calc_crc32h,
3662 &arm64g_calc_crc32w, &arm64g_calc_crc32x,
3663 &arm64g_calc_crc32cb, &arm64g_calc_crc32ch,
3664 &arm64g_calc_crc32cw, &arm64g_calc_crc32cx };
3665 const HChar* hNames[8]
3666 = { "arm64g_calc_crc32b", "arm64g_calc_crc32h",
3667 "arm64g_calc_crc32w", "arm64g_calc_crc32x",
3668 "arm64g_calc_crc32cb", "arm64g_calc_crc32ch",
3669 "arm64g_calc_crc32cw", "arm64g_calc_crc32cx" };
3670 const HChar* iNames[8]
3671 = { "crc32b", "crc32h", "crc32w", "crc32x",
3672 "crc32cb", "crc32ch", "crc32cw", "crc32cx" };
3674 IRTemp srcN = newTemp(Ity_I64);
3675 assign(srcN, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
3677 IRTemp srcM = newTemp(Ity_I64);
3678 IRExpr* at64 = getIReg64orZR(mm);
3679 switch (sz) {
3680 case BITS2(0,0):
3681 assign(srcM, binop(Iop_And64, at64, mkU64(0xFF))); break;
3682 case BITS2(0,1):
3683 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFF))); break;
3684 case BITS2(1,0):
3685 assign(srcM, binop(Iop_And64, at64, mkU64(0xFFFFFFFF))); break;
3686 case BITS2(1,1):
3687 assign(srcM, at64); break;
3688 default:
3689 vassert(0);
3692 vassert(ix <= 7);
3694 putIReg64orZR(
3695 dd,
3696 unop(Iop_32Uto64,
3697 unop(Iop_64to32,
3698 mkIRExprCCall(Ity_I64, 0/*regparm*/,
3699 hNames[ix], helpers[ix],
3700 mkIRExprVec_2(mkexpr(srcN),
3701 mkexpr(srcM))))));
3703 DIP("%s %s, %s, %s\n", iNames[ix],
3704 nameIReg32orZR(dd),
3705 nameIReg32orZR(nn), nameIRegOrZR(bitSF == 1, mm));
3706 return True;
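/* Illustrative sketch (hypothetical helper, under #if 0): how the two
   arguments handed to the CRC32 helpers above are formed.  The accumulator
   is always the low 32 bits of Rn; Rm is masked down to the 8/16/32/64
   data bits selected by sz, exactly as the IR above does. */
#if 0
static void example_crc32_args ( ULong rn, ULong rm, UInt sz,
                                 /*OUT*/ULong* srcN, /*OUT*/ULong* srcM )
{
   static const ULong masks[4]
      = { 0xFFULL, 0xFFFFULL, 0xFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
   *srcN = rn & 0xFFFFFFFFULL;
   *srcM = rm & masks[sz & 3];
}
#endif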
3708 /* fall through */
3711 if (sigill_diag) {
3712 vex_printf("ARM64 front end: data_processing_register\n");
3714 return False;
3715 # undef INSN
3719 /*------------------------------------------------------------*/
3720 /*--- Math helpers for vector interleave/deinterleave ---*/
3721 /*------------------------------------------------------------*/
3723 #define EX(_tmp) \
3724 mkexpr(_tmp)
3725 #define SL(_hi128,_lo128,_nbytes) \
3726 ( (_nbytes) == 0 \
3727 ? (_lo128) \
3728 : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
3729 #define ROR(_v128,_nbytes) \
3730 SL((_v128),(_v128),(_nbytes))
3731 #define ROL(_v128,_nbytes) \
3732 SL((_v128),(_v128),16-(_nbytes))
3733 #define SHR(_v128,_nbytes) \
3734 binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
3735 #define SHL(_v128,_nbytes) \
3736 binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
3737 #define ILO64x2(_argL,_argR) \
3738 binop(Iop_InterleaveLO64x2,(_argL),(_argR))
3739 #define IHI64x2(_argL,_argR) \
3740 binop(Iop_InterleaveHI64x2,(_argL),(_argR))
3741 #define ILO32x4(_argL,_argR) \
3742 binop(Iop_InterleaveLO32x4,(_argL),(_argR))
3743 #define IHI32x4(_argL,_argR) \
3744 binop(Iop_InterleaveHI32x4,(_argL),(_argR))
3745 #define ILO16x8(_argL,_argR) \
3746 binop(Iop_InterleaveLO16x8,(_argL),(_argR))
3747 #define IHI16x8(_argL,_argR) \
3748 binop(Iop_InterleaveHI16x8,(_argL),(_argR))
3749 #define ILO8x16(_argL,_argR) \
3750 binop(Iop_InterleaveLO8x16,(_argL),(_argR))
3751 #define IHI8x16(_argL,_argR) \
3752 binop(Iop_InterleaveHI8x16,(_argL),(_argR))
3753 #define CEV32x4(_argL,_argR) \
3754 binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
3755 #define COD32x4(_argL,_argR) \
3756 binop(Iop_CatOddLanes32x4,(_argL),(_argR))
3757 #define COD16x8(_argL,_argR) \
3758 binop(Iop_CatOddLanes16x8,(_argL),(_argR))
3759 #define COD8x16(_argL,_argR) \
3760 binop(Iop_CatOddLanes8x16,(_argL),(_argR))
3761 #define CEV8x16(_argL,_argR) \
3762 binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
3763 #define AND(_arg1,_arg2) \
3764 binop(Iop_AndV128,(_arg1),(_arg2))
3765 #define OR2(_arg1,_arg2) \
3766 binop(Iop_OrV128,(_arg1),(_arg2))
3767 #define OR3(_arg1,_arg2,_arg3) \
3768 binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
3769 #define OR4(_arg1,_arg2,_arg3,_arg4) \
3770 binop(Iop_OrV128, \
3771 binop(Iop_OrV128,(_arg1),(_arg2)), \
3772 binop(Iop_OrV128,(_arg3),(_arg4)))
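/* Illustrative sketch (hypothetical helper, under #if 0): with the macros
   above, a byte-granularity left rotate of a V128 temp is just a slice of
   the value against itself; ROL(EX(v),3) expands to
   triop(Iop_SliceV128, mkexpr(v), mkexpr(v), mkU8(13)). */
#if 0
static IRExpr* example_rol_3_bytes ( IRTemp v )
{
   return ROL(EX(v), 3);
}
#endif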
3775 /* Do interleaving for 1 128 bit vector, for ST1 insns. */
3776 static
3777 void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
3778 UInt laneSzBlg2, IRTemp u0 )
3780 assign(*i0, mkexpr(u0));
3784 /* Do interleaving for 2 128 bit vectors, for ST2 insns. */
3785 static
3786 void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
3787 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
3789 /* This is pretty easy, since we have primitives directly to
3790 hand. */
3791 if (laneSzBlg2 == 3) {
3792 // 64x2
3793 // u1 == B1 B0, u0 == A1 A0
3794 // i1 == B1 A1, i0 == B0 A0
3795 assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
3796 assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
3797 return;
3799 if (laneSzBlg2 == 2) {
3800 // 32x4
3801 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
3802 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
3803 assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
3804 assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
3805 return;
3807 if (laneSzBlg2 == 1) {
3808 // 16x8
3809 // u1 == B{7..0}, u0 == A{7..0}
3810 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
3811 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
3812 assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
3813 assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
3814 return;
3816 if (laneSzBlg2 == 0) {
3817 // 8x16
3818 // u1 == B{f..0}, u0 == A{f..0}
3819 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
3820 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
3821 assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
3822 assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
3823 return;
3825 /*NOTREACHED*/
3826 vassert(0);
3830 /* Do interleaving for 3 128 bit vectors, for ST3 insns. */
3831 static
3832 void math_INTERLEAVE3_128(
3833 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
3834 UInt laneSzBlg2,
3835 IRTemp u0, IRTemp u1, IRTemp u2 )
3837 if (laneSzBlg2 == 3) {
3838 // 64x2
3839 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
3840 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
3841 assign(*i2, IHI64x2( EX(u2), EX(u1) ));
3842 assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
3843 assign(*i0, ILO64x2( EX(u1), EX(u0) ));
3844 return;
3847 if (laneSzBlg2 == 2) {
3848 // 32x4
3849 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
3850 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
3851 // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
3852 IRTemp p0 = newTempV128();
3853 IRTemp p1 = newTempV128();
3854 IRTemp p2 = newTempV128();
3855 IRTemp c1100 = newTempV128();
3856 IRTemp c0011 = newTempV128();
3857 IRTemp c0110 = newTempV128();
3858 assign(c1100, mkV128(0xFF00));
3859 assign(c0011, mkV128(0x00FF));
3860 assign(c0110, mkV128(0x0FF0));
3861 // First interleave them at 64x2 granularity,
3862 // generating partial ("p") values.
3863 math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
3864 // And more shuffling around for the final answer
3865 assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
3866 AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
3867 assign(*i1, OR3( SHL(EX(p2),12),
3868 AND(EX(p1),EX(c0110)),
3869 SHR(EX(p0),12) ));
3870 assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
3871 AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
3872 return;
3875 if (laneSzBlg2 == 1) {
3876 // 16x8
3877 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
3878 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
3879 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
3881 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
3882 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
3883 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
3885 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
3886 // i1 == A5 C4 B4 A4 C3 B3 A3 C2
3887 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
3888 IRTemp p0 = newTempV128();
3889 IRTemp p1 = newTempV128();
3890 IRTemp p2 = newTempV128();
3891 IRTemp c1000 = newTempV128();
3892 IRTemp c0100 = newTempV128();
3893 IRTemp c0010 = newTempV128();
3894 IRTemp c0001 = newTempV128();
3895 assign(c1000, mkV128(0xF000));
3896 assign(c0100, mkV128(0x0F00));
3897 assign(c0010, mkV128(0x00F0));
3898 assign(c0001, mkV128(0x000F));
3899 // First interleave them at 32x4 granularity,
3900 // generating partial ("p") values.
3901 math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
3902 // And more shuffling around for the final answer
3903 assign(*i2,
3904 OR4( AND( IHI16x8( EX(p2), ROL(EX(p2),4) ), EX(c1000) ),
3905 AND( IHI16x8( ROL(EX(p2),6), EX(p2) ), EX(c0100) ),
3906 AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
3907 AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
3909 assign(*i1,
3910 OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
3911 AND( IHI16x8( EX(p1), ROL(EX(p1),4) ), EX(c0100) ),
3912 AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
3913 AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
3915 assign(*i0,
3916 OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
3917 AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
3918 AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
3919 AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
3921 return;
3924 if (laneSzBlg2 == 0) {
3925 // 8x16. It doesn't seem worth the hassle of first doing a
3926 // 16x8 interleave, so just generate all 24 partial results
3927 // directly :-(
3928 // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
3929 // i2 == Cf Bf Af Ce .. Bb Ab Ca
3930 // i1 == Ba Aa C9 B9 .. A6 C5 B5
3931 // i0 == A5 C4 B4 A4 .. C0 B0 A0
3933 IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
3934 IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
3935 IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
3936 IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
3937 IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
3938 IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
3939 IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
3940 IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
3941 IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();
3943 // eg XXXX(qqq, CC, 0xF, BB, 0xA) sets qqq to be a vector
3944 // of the form 14 bytes junk : CC[0xF] : BB[0xA]
3946 # define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
3947 IRTemp t_##_tempName = newTempV128(); \
3948 assign(t_##_tempName, \
3949 ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
3950 ROR(EX(_srcVec2),(_srcShift2)) ) )
3952 // Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
3953 IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;
3955 // The slicing and reassembly are done as interleavedly as possible,
3956 // so as to minimise the demand for registers in the back end, which
3957 // was observed to be a problem in testing.
3959 XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
3960 XXXX(AfCe, AA, 0xf, CC, 0xe);
3961 assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));
3963 XXXX(BeAe, BB, 0xe, AA, 0xe);
3964 XXXX(CdBd, CC, 0xd, BB, 0xd);
3965 assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
3966 assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));
3968 XXXX(AdCc, AA, 0xd, CC, 0xc);
3969 XXXX(BcAc, BB, 0xc, AA, 0xc);
3970 assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));
3972 XXXX(CbBb, CC, 0xb, BB, 0xb);
3973 XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0]
3974 assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
3975 assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
3976 assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));
3978 XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
3979 XXXX(C9B9, CC, 0x9, BB, 0x9);
3980 assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));
3982 XXXX(A9C8, AA, 0x9, CC, 0x8);
3983 XXXX(B8A8, BB, 0x8, AA, 0x8);
3984 assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
3985 assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));
3987 XXXX(C7B7, CC, 0x7, BB, 0x7);
3988 XXXX(A7C6, AA, 0x7, CC, 0x6);
3989 assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));
3991 XXXX(B6A6, BB, 0x6, AA, 0x6);
3992 XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
3993 assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
3994 assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
3995 assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));
3997 XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
3998 XXXX(B4A4, BB, 0x4, AA, 0x4);
3999 assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));
4001 XXXX(C3B3, CC, 0x3, BB, 0x3);
4002 XXXX(A3C2, AA, 0x3, CC, 0x2);
4003 assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
4004 assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));
4006 XXXX(B2A2, BB, 0x2, AA, 0x2);
4007 XXXX(C1B1, CC, 0x1, BB, 0x1);
4008 assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));
4010 XXXX(A1C0, AA, 0x1, CC, 0x0);
4011 XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
4012 assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
4013 assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
4014 assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));
4016 # undef XXXX
4017 return;
4020 /*NOTREACHED*/
4021 vassert(0);
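/* Reference model (plain C, hypothetical, under #if 0): the memory image
   that ST3 with 32-bit lanes produces is the 3-way interleave of the three
   source vectors; the register-level shuffles above compute exactly the
   128-bit chunks of that image. */
#if 0
static void example_st3_ref_32x4 ( /*OUT*/UInt* mem /* 12 words */,
                                   const UInt* a, const UInt* b,
                                   const UInt* c /* 4 words each */ )
{
   UInt k;
   for (k = 0; k < 4; k++) {
      mem[3*k + 0] = a[k];
      mem[3*k + 1] = b[k];
      mem[3*k + 2] = c[k];
   }
}
#endif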
4025 /* Do interleaving for 4 128 bit vectors, for ST4 insns. */
4026 static
4027 void math_INTERLEAVE4_128(
4028 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
4029 UInt laneSzBlg2,
4030 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
4032 if (laneSzBlg2 == 3) {
4033 // 64x2
4034 assign(*i0, ILO64x2(EX(u1), EX(u0)));
4035 assign(*i1, ILO64x2(EX(u3), EX(u2)));
4036 assign(*i2, IHI64x2(EX(u1), EX(u0)));
4037 assign(*i3, IHI64x2(EX(u3), EX(u2)));
4038 return;
4040 if (laneSzBlg2 == 2) {
4041 // 32x4
4042 // First, interleave at the 64-bit lane size.
4043 IRTemp p0 = newTempV128();
4044 IRTemp p1 = newTempV128();
4045 IRTemp p2 = newTempV128();
4046 IRTemp p3 = newTempV128();
4047 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
4048 // And interleave (cat) at the 32 bit size.
4049 assign(*i0, CEV32x4(EX(p1), EX(p0)));
4050 assign(*i1, COD32x4(EX(p1), EX(p0)));
4051 assign(*i2, CEV32x4(EX(p3), EX(p2)));
4052 assign(*i3, COD32x4(EX(p3), EX(p2)));
4053 return;
4055 if (laneSzBlg2 == 1) {
4056 // 16x8
4057 // First, interleave at the 32-bit lane size.
4058 IRTemp p0 = newTempV128();
4059 IRTemp p1 = newTempV128();
4060 IRTemp p2 = newTempV128();
4061 IRTemp p3 = newTempV128();
4062 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
4063 // And rearrange within each vector, to get the right 16 bit lanes.
4064 assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
4065 assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
4066 assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
4067 assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
4068 return;
4070 if (laneSzBlg2 == 0) {
4071 // 8x16
4072 // First, interleave at the 16-bit lane size.
4073 IRTemp p0 = newTempV128();
4074 IRTemp p1 = newTempV128();
4075 IRTemp p2 = newTempV128();
4076 IRTemp p3 = newTempV128();
4077 math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
4078 // And rearrange within each vector, to get the right 8 bit lanes.
4079 assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
4080 assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
4081 assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
4082 assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
4083 return;
4085 /*NOTREACHED*/
4086 vassert(0);
4090 /* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
4091 static
4092 void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
4093 UInt laneSzBlg2, IRTemp i0 )
4095 assign(*u0, mkexpr(i0));
4099 /* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
4100 static
4101 void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
4102 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
4104 /* This is pretty easy, since we have primitives directly to
4105 hand. */
4106 if (laneSzBlg2 == 3) {
4107 // 64x2
4108 // i1 == B1 A1, i0 == B0 A0
4109 // u1 == B1 B0, u0 == A1 A0
4110 assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
4111 assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
4112 return;
4114 if (laneSzBlg2 == 2) {
4115 // 32x4
4116 // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
4117 // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0,
4118 assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
4119 assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
4120 return;
4122 if (laneSzBlg2 == 1) {
4123 // 16x8
4124 // i0 == B3 A3 B2 A2 B1 A1 B0 A0
4125 // i1 == B7 A7 B6 A6 B5 A5 B4 A4
4126 // u1 == B{7..0}, u0 == A{7..0}
4127 assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
4128 assign(*u1, binop(Iop_CatOddLanes16x8, mkexpr(i1), mkexpr(i0)));
4129 return;
4131 if (laneSzBlg2 == 0) {
4132 // 8x16
4133 // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
4134 // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
4135 // u1 == B{f..0}, u0 == A{f..0}
4136 assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
4137 assign(*u1, binop(Iop_CatOddLanes8x16, mkexpr(i1), mkexpr(i0)));
4138 return;
4140 /*NOTREACHED*/
4141 vassert(0);
4145 /* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
4146 static
4147 void math_DEINTERLEAVE3_128(
4148 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4149 UInt laneSzBlg2,
4150 IRTemp i0, IRTemp i1, IRTemp i2 )
4152 if (laneSzBlg2 == 3) {
4153 // 64x2
4154 // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
4155 // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
4156 assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1) ));
4157 assign(*u1, ILO64x2( EX(i2), ROL(EX(i0),8) ));
4158 assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0) ));
4159 return;
4162 if (laneSzBlg2 == 2) {
4163 // 32x4
4164 // i2 == C3 B3 A3 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
4165 // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
4166 // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
4167 IRTemp t_a1c0b0a0 = newTempV128();
4168 IRTemp t_a2c1b1a1 = newTempV128();
4169 IRTemp t_a3c2b2a2 = newTempV128();
4170 IRTemp t_a0c3b3a3 = newTempV128();
4171 IRTemp p0 = newTempV128();
4172 IRTemp p1 = newTempV128();
4173 IRTemp p2 = newTempV128();
4174 // Compute some intermediate values.
4175 assign(t_a1c0b0a0, EX(i0));
4176 assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
4177 assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
4178 assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
4179 // First deinterleave into lane-pairs
4180 assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
4181 assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
4182 IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
4183 assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
4184 // Then deinterleave at 64x2 granularity.
4185 math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
4186 return;
4189 if (laneSzBlg2 == 1) {
4190 // 16x8
4191 // u2 == C7 C6 C5 C4 C3 C2 C1 C0
4192 // u1 == B7 B6 B5 B4 B3 B2 B1 B0
4193 // u0 == A7 A6 A5 A4 A3 A2 A1 A0
4195 // i2 == C7 B7 A7 C6 B6 A6 C5 B5
4196 // i1 == A5 C4 B4 A4 C3 B3 A3 C2
4197 // i0 == B2 A2 C1 B1 A1 C0 B0 A0
4199 // p2 == C7 C6 B7 B6 A7 A6 C5 C4
4200 // p1 == B5 B4 A5 A4 C3 C2 B3 B2
4201 // p0 == A3 A2 C1 C0 B1 B0 A1 A0
4203 IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
4204 s0 = s1 = s2 = s3
4205 = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
4206 newTempsV128_4(&s0, &s1, &s2, &s3);
4207 newTempsV128_4(&t0, &t1, &t2, &t3);
4208 newTempsV128_4(&p0, &p1, &p2, &c00111111);
4210 // s0 == b2a2 c1b1a1 c0b0a0
4211 // s1 == b4a4 c3b3a3 c2b2a2
4212 // s2 == b6a6 c5b5a5 c4b4a4
4213 // s3 == b0a0 c7b7a7 c6b6a6
4214 assign(s0, EX(i0));
4215 assign(s1, SL(EX(i1),EX(i0),6*2));
4216 assign(s2, SL(EX(i2),EX(i1),4*2));
4217 assign(s3, SL(EX(i0),EX(i2),2*2));
4219 // t0 == 0 0 c1c0 b1b0 a1a0
4220 // t1 == 0 0 c3c2 b3b2 a3a2
4221 // t2 == 0 0 c5c4 b5b4 a5a4
4222 // t3 == 0 0 c7c6 b7b6 a7a6
4223 assign(c00111111, mkV128(0x0FFF));
4224 assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
4225 assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
4226 assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
4227 assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));
4229 assign(p0, OR2(EX(t0), SHL(EX(t1),6*2)));
4230 assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
4231 assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));
4233 // Then deinterleave at 32x4 granularity.
4234 math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
4235 return;
4238 if (laneSzBlg2 == 0) {
4239 // 8x16. This is the same scheme as for 16x8, with twice the
4240 // number of intermediate values.
4242 // u2 == C{f..0}
4243 // u1 == B{f..0}
4244 // u0 == A{f..0}
4246 // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
4247 // i1 == BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
4248 // i0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4250 // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
4251 // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
4252 // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
4254 IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
4255 t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
4256 s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
4257 = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
4258 = IRTemp_INVALID;
4259 newTempsV128_4(&s0, &s1, &s2, &s3);
4260 newTempsV128_4(&s4, &s5, &s6, &s7);
4261 newTempsV128_4(&t0, &t1, &t2, &t3);
4262 newTempsV128_4(&t4, &t5, &t6, &t7);
4263 newTempsV128_4(&p0, &p1, &p2, &cMASK);
4265 // s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
4266 // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
4267 // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
4268 // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
4269 // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
4270 // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
4271 // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
4272 // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
4273 assign(s0, SL(EX(i1),EX(i0), 0));
4274 assign(s1, SL(EX(i1),EX(i0), 6));
4275 assign(s2, SL(EX(i1),EX(i0),12));
4276 assign(s3, SL(EX(i2),EX(i1), 2));
4277 assign(s4, SL(EX(i2),EX(i1), 8));
4278 assign(s5, SL(EX(i2),EX(i1),14));
4279 assign(s6, SL(EX(i0),EX(i2), 4));
4280 assign(s7, SL(EX(i0),EX(i2),10));
4282 // t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
4283 // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
4284 // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
4285 // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
4286 // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
4287 // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
4288 // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
4289 // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
4290 assign(cMASK, mkV128(0x003F));
4291 assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
4292 assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
4293 assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
4294 assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
4295 assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
4296 assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
4297 assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
4298 assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));
4300 assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
4301 assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
4302 SHL(EX(t3),2), SHR(EX(t2),4) ));
4303 assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));
4305 // Then deinterleave at 16x8 granularity.
4306 math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
4307 return;
4310 /*NOTREACHED*/
4311 vassert(0);
4315 /* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
4316 static
4317 void math_DEINTERLEAVE4_128(
4318 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4319 UInt laneSzBlg2,
4320 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4322 if (laneSzBlg2 == 3) {
4323 // 64x2
4324 assign(*u0, ILO64x2(EX(i2), EX(i0)));
4325 assign(*u1, IHI64x2(EX(i2), EX(i0)));
4326 assign(*u2, ILO64x2(EX(i3), EX(i1)));
4327 assign(*u3, IHI64x2(EX(i3), EX(i1)));
4328 return;
4330 if (laneSzBlg2 == 2) {
4331 // 32x4
4332 IRTemp p0 = newTempV128();
4333 IRTemp p2 = newTempV128();
4334 IRTemp p1 = newTempV128();
4335 IRTemp p3 = newTempV128();
4336 assign(p0, ILO32x4(EX(i1), EX(i0)));
4337 assign(p1, IHI32x4(EX(i1), EX(i0)));
4338 assign(p2, ILO32x4(EX(i3), EX(i2)));
4339 assign(p3, IHI32x4(EX(i3), EX(i2)));
4340 // And now do what we did for the 64-bit case.
4341 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
4342 return;
4344 if (laneSzBlg2 == 1) {
4345 // 16x8
4346 // Deinterleave into 32-bit chunks, then do as the 32-bit case.
4347 IRTemp p0 = newTempV128();
4348 IRTemp p1 = newTempV128();
4349 IRTemp p2 = newTempV128();
4350 IRTemp p3 = newTempV128();
4351 assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
4352 assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
4353 assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
4354 assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
4355 // From here on is like the 32 bit case.
4356 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
4357 return;
4359 if (laneSzBlg2 == 0) {
4360 // 8x16
4361 // Deinterleave into 16-bit chunks, then do as the 16-bit case.
4362 IRTemp p0 = newTempV128();
4363 IRTemp p1 = newTempV128();
4364 IRTemp p2 = newTempV128();
4365 IRTemp p3 = newTempV128();
4366 assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
4367 ILO8x16(EX(i0),ROL(EX(i0),4)) ));
4368 assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
4369 ILO8x16(EX(i1),ROL(EX(i1),4)) ));
4370 assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
4371 ILO8x16(EX(i2),ROL(EX(i2),4)) ));
4372 assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
4373 ILO8x16(EX(i3),ROL(EX(i3),4)) ));
4374 // From here on is like the 16 bit case.
4375 math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
4376 return;
4378 /*NOTREACHED*/
4379 vassert(0);
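/* Reference model (plain C, hypothetical, under #if 0): LD4 with 32-bit
   lanes reads a 4-way interleaved memory image and splits it back into
   four vectors, which is the net effect of the shuffles above. */
#if 0
static void example_ld4_ref_32x4 ( /*OUT*/UInt* a, /*OUT*/UInt* b,
                                   /*OUT*/UInt* c, /*OUT*/UInt* d,
                                   const UInt* mem /* 16 words */ )
{
   UInt k;
   for (k = 0; k < 4; k++) {
      a[k] = mem[4*k + 0];
      b[k] = mem[4*k + 1];
      c[k] = mem[4*k + 2];
      d[k] = mem[4*k + 3];
   }
}
#endif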
4383 /* Wrappers that use the full-width (de)interleavers to do half-width
4384 (de)interleaving. The scheme is to clone each input lane in the
4385 lower half of each incoming value, do a full width (de)interleave
4386 at the next lane size up, and remove every other lane of the
4387 result. The returned values may have any old junk in the upper
4388 64 bits -- the caller must ignore that. */
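/* Illustrative sketch (hypothetical helper, under #if 0): the "double,
   operate at the next size up, then halve" round trip for 16-bit lanes.
   Doubling duplicates each lane of the low 64 bits; halving keeps the
   even-numbered lanes again, so halve(double(v)) leaves the low 64 bits of
   v unchanged and fills the upper 64 bits with a copy (junk, per the
   comment above). */
#if 0
static IRExpr* example_double_then_halve_16x8 ( IRTemp v )
{
   IRTemp doubled = newTempV128();
   assign(doubled, binop(Iop_InterleaveLO16x8, mkexpr(v), mkexpr(v)));
   return binop(Iop_CatEvenLanes16x8, mkexpr(doubled), mkexpr(doubled));
}
#endif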
4390 /* Helper function -- get doubling and narrowing operations. */
4391 static
4392 void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
4393 /*OUT*/IROp* halver,
4394 UInt laneSzBlg2 )
4396 switch (laneSzBlg2) {
4397 case 2:
4398 *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
4399 break;
4400 case 1:
4401 *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
4402 break;
4403 case 0:
4404 *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
4405 break;
4406 default:
4407 vassert(0);
4411 /* Do interleaving for 1 64 bit vector, for ST1 insns. */
4412 static
4413 void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
4414 UInt laneSzBlg2, IRTemp u0 )
4416 assign(*i0, mkexpr(u0));
4420 /* Do interleaving for 2 64 bit vectors, for ST2 insns. */
4421 static
4422 void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
4423 UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
4425 if (laneSzBlg2 == 3) {
4426 // 1x64, degenerate case
4427 assign(*i0, EX(u0));
4428 assign(*i1, EX(u1));
4429 return;
4432 vassert(laneSzBlg2 <= 2);
4433 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4434 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4436 IRTemp du0 = newTempV128();
4437 IRTemp du1 = newTempV128();
4438 assign(du0, binop(doubler, EX(u0), EX(u0)));
4439 assign(du1, binop(doubler, EX(u1), EX(u1)));
4440 IRTemp di0 = newTempV128();
4441 IRTemp di1 = newTempV128();
4442 math_INTERLEAVE2_128(&di0, &di1, laneSzBlg2 + 1, du0, du1);
4443 assign(*i0, binop(halver, EX(di0), EX(di0)));
4444 assign(*i1, binop(halver, EX(di1), EX(di1)));
4448 /* Do interleaving for 3 64 bit vectors, for ST3 insns. */
4449 static
4450 void math_INTERLEAVE3_64(
4451 /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
4452 UInt laneSzBlg2,
4453 IRTemp u0, IRTemp u1, IRTemp u2 )
4455 if (laneSzBlg2 == 3) {
4456 // 1x64, degenerate case
4457 assign(*i0, EX(u0));
4458 assign(*i1, EX(u1));
4459 assign(*i2, EX(u2));
4460 return;
4463 vassert(laneSzBlg2 <= 2);
4464 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4465 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4467 IRTemp du0 = newTempV128();
4468 IRTemp du1 = newTempV128();
4469 IRTemp du2 = newTempV128();
4470 assign(du0, binop(doubler, EX(u0), EX(u0)));
4471 assign(du1, binop(doubler, EX(u1), EX(u1)));
4472 assign(du2, binop(doubler, EX(u2), EX(u2)));
4473 IRTemp di0 = newTempV128();
4474 IRTemp di1 = newTempV128();
4475 IRTemp di2 = newTempV128();
4476 math_INTERLEAVE3_128(&di0, &di1, &di2, laneSzBlg2 + 1, du0, du1, du2);
4477 assign(*i0, binop(halver, EX(di0), EX(di0)));
4478 assign(*i1, binop(halver, EX(di1), EX(di1)));
4479 assign(*i2, binop(halver, EX(di2), EX(di2)));
4483 /* Do interleaving for 4 64 bit vectors, for ST4 insns. */
4484 static
4485 void math_INTERLEAVE4_64(
4486 /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
4487 UInt laneSzBlg2,
4488 IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
4490 if (laneSzBlg2 == 3) {
4491 // 1x64, degenerate case
4492 assign(*i0, EX(u0));
4493 assign(*i1, EX(u1));
4494 assign(*i2, EX(u2));
4495 assign(*i3, EX(u3));
4496 return;
4499 vassert(laneSzBlg2 <= 2);
4500 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4501 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4503 IRTemp du0 = newTempV128();
4504 IRTemp du1 = newTempV128();
4505 IRTemp du2 = newTempV128();
4506 IRTemp du3 = newTempV128();
4507 assign(du0, binop(doubler, EX(u0), EX(u0)));
4508 assign(du1, binop(doubler, EX(u1), EX(u1)));
4509 assign(du2, binop(doubler, EX(u2), EX(u2)));
4510 assign(du3, binop(doubler, EX(u3), EX(u3)));
4511 IRTemp di0 = newTempV128();
4512 IRTemp di1 = newTempV128();
4513 IRTemp di2 = newTempV128();
4514 IRTemp di3 = newTempV128();
4515 math_INTERLEAVE4_128(&di0, &di1, &di2, &di3,
4516 laneSzBlg2 + 1, du0, du1, du2, du3);
4517 assign(*i0, binop(halver, EX(di0), EX(di0)));
4518 assign(*i1, binop(halver, EX(di1), EX(di1)));
4519 assign(*i2, binop(halver, EX(di2), EX(di2)));
4520 assign(*i3, binop(halver, EX(di3), EX(di3)));
4524 /* Do deinterleaving for 1 64 bit vector, for LD1 insns. */
4525 static
4526 void math_DEINTERLEAVE1_64( /*OUTx1*/ IRTemp* u0,
4527 UInt laneSzBlg2, IRTemp i0 )
4529 assign(*u0, mkexpr(i0));
4533 /* Do deinterleaving for 2 64 bit vectors, for LD2 insns. */
4534 static
4535 void math_DEINTERLEAVE2_64( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
4536 UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
4538 if (laneSzBlg2 == 3) {
4539 // 1x64, degenerate case
4540 assign(*u0, EX(i0));
4541 assign(*u1, EX(i1));
4542 return;
4545 vassert(laneSzBlg2 <= 2);
4546 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4547 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4549 IRTemp di0 = newTempV128();
4550 IRTemp di1 = newTempV128();
4551 assign(di0, binop(doubler, EX(i0), EX(i0)));
4552 assign(di1, binop(doubler, EX(i1), EX(i1)));
4554 IRTemp du0 = newTempV128();
4555 IRTemp du1 = newTempV128();
4556 math_DEINTERLEAVE2_128(&du0, &du1, laneSzBlg2 + 1, di0, di1);
4557 assign(*u0, binop(halver, EX(du0), EX(du0)));
4558 assign(*u1, binop(halver, EX(du1), EX(du1)));
4562 /* Do deinterleaving for 3 64 bit vectors, for LD3 insns. */
4563 static
4564 void math_DEINTERLEAVE3_64(
4565 /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
4566 UInt laneSzBlg2,
4567 IRTemp i0, IRTemp i1, IRTemp i2 )
4569 if (laneSzBlg2 == 3) {
4570 // 1x64, degenerate case
4571 assign(*u0, EX(i0));
4572 assign(*u1, EX(i1));
4573 assign(*u2, EX(i2));
4574 return;
4577 vassert(laneSzBlg2 <= 2);
4578 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4579 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4581 IRTemp di0 = newTempV128();
4582 IRTemp di1 = newTempV128();
4583 IRTemp di2 = newTempV128();
4584 assign(di0, binop(doubler, EX(i0), EX(i0)));
4585 assign(di1, binop(doubler, EX(i1), EX(i1)));
4586 assign(di2, binop(doubler, EX(i2), EX(i2)));
4587 IRTemp du0 = newTempV128();
4588 IRTemp du1 = newTempV128();
4589 IRTemp du2 = newTempV128();
4590 math_DEINTERLEAVE3_128(&du0, &du1, &du2, laneSzBlg2 + 1, di0, di1, di2);
4591 assign(*u0, binop(halver, EX(du0), EX(du0)));
4592 assign(*u1, binop(halver, EX(du1), EX(du1)));
4593 assign(*u2, binop(halver, EX(du2), EX(du2)));
4597 /* Do deinterleaving for 4 64 bit vectors, for LD4 insns. */
4598 static
4599 void math_DEINTERLEAVE4_64(
4600 /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
4601 UInt laneSzBlg2,
4602 IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
4604 if (laneSzBlg2 == 3) {
4605 // 1x64, degenerate case
4606 assign(*u0, EX(i0));
4607 assign(*u1, EX(i1));
4608 assign(*u2, EX(i2));
4609 assign(*u3, EX(i3));
4610 return;
4613 vassert(laneSzBlg2 <= 2);
4614 IROp doubler = Iop_INVALID, halver = Iop_INVALID;
4615 math_get_doubler_and_halver(&doubler, &halver, laneSzBlg2);
4617 IRTemp di0 = newTempV128();
4618 IRTemp di1 = newTempV128();
4619 IRTemp di2 = newTempV128();
4620 IRTemp di3 = newTempV128();
4621 assign(di0, binop(doubler, EX(i0), EX(i0)));
4622 assign(di1, binop(doubler, EX(i1), EX(i1)));
4623 assign(di2, binop(doubler, EX(i2), EX(i2)));
4624 assign(di3, binop(doubler, EX(i3), EX(i3)));
4625 IRTemp du0 = newTempV128();
4626 IRTemp du1 = newTempV128();
4627 IRTemp du2 = newTempV128();
4628 IRTemp du3 = newTempV128();
4629 math_DEINTERLEAVE4_128(&du0, &du1, &du2, &du3,
4630 laneSzBlg2 + 1, di0, di1, di2, di3);
4631 assign(*u0, binop(halver, EX(du0), EX(du0)));
4632 assign(*u1, binop(halver, EX(du1), EX(du1)));
4633 assign(*u2, binop(halver, EX(du2), EX(du2)));
4634 assign(*u3, binop(halver, EX(du3), EX(du3)));
4638 #undef EX
4639 #undef SL
4640 #undef ROR
4641 #undef ROL
4642 #undef SHR
4643 #undef SHL
4644 #undef ILO64x2
4645 #undef IHI64x2
4646 #undef ILO32x4
4647 #undef IHI32x4
4648 #undef ILO16x8
4649 #undef IHI16x8
4650 #undef ILO8x16
4651 #undef IHI8x16
4652 #undef CEV32x4
4653 #undef COD32x4
4654 #undef COD16x8
4655 #undef COD8x16
4656 #undef CEV8x16
4657 #undef AND
4658 #undef OR2
4659 #undef OR3
4660 #undef OR4
4663 /*------------------------------------------------------------*/
4664 /*--- Load and Store instructions ---*/
4665 /*------------------------------------------------------------*/
4667 /* Generate the EA for a "reg + reg" style amode. This is done from
4668 parts of the insn, but for sanity checking's sake it takes the whole
4669 insn. This appears to depend on insn[15:12], with opt=insn[15:13]
4670 and S=insn[12]:
4672 The possible forms, along with their opt:S values, are:
4673 011:0 Xn|SP + Xm
4674 111:0 Xn|SP + Xm
4675 011:1 Xn|SP + Xm * transfer_szB
4676 111:1 Xn|SP + Xm * transfer_szB
4677 010:0 Xn|SP + 32Uto64(Wm)
4678 010:1 Xn|SP + 32Uto64(Wm) * transfer_szB
4679 110:0 Xn|SP + 32Sto64(Wm)
4680 110:1 Xn|SP + 32Sto64(Wm) * transfer_szB
4682 Rm is insn[20:16]. Rn is insn[9:5]. Rt is insn[4:0]. Log2 of
4683 the transfer size is insn[23,31,30]. For integer loads/stores,
4684 insn[23] is zero, hence szLg2 can be at most 3 in such cases.
4686 If the decoding fails, it returns IRTemp_INVALID.
4688 isInt is True iff this decoding is for transfers to/from integer
4689 registers. If False it is for transfers to/from vector registers.
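/* Worked example (plain C sketch, hypothetical helper, under #if 0): for
   opt:S == 110:1 with a 4-byte transfer (szLg2 == 2), the amode computed
   below is  Xn|SP + (sign-extended Wm) * 4. */
#if 0
static ULong example_ea_sext_scaled ( ULong xn_or_sp, UInt wm, UInt szLg2 )
{
   Long wm_sx = ((Long)(((ULong)wm) << 32)) >> 32;   /* 32Sto64(Wm) */
   return xn_or_sp + ((ULong)wm_sx << szLg2);
}
#endif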
4691 static IRTemp gen_indexed_EA ( /*OUT*/HChar* buf, UInt insn, Bool isInt )
4693 UInt optS = SLICE_UInt(insn, 15, 12);
4694 UInt mm = SLICE_UInt(insn, 20, 16);
4695 UInt nn = SLICE_UInt(insn, 9, 5);
4696 UInt szLg2 = (isInt ? 0 : (SLICE_UInt(insn, 23, 23) << 2))
4697 | SLICE_UInt(insn, 31, 30); // Log2 of the size
4699 buf[0] = 0;
4701 /* Sanity checks, that this really is a load/store insn. */
4702 if (SLICE_UInt(insn, 11, 10) != BITS2(1,0))
4703 goto fail;
4705 if (isInt
4706 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,1,1)/*LDR*/
4707 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,0,0,1)/*STR*/
4708 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,0,1)/*LDRSbhw Xt*/
4709 && SLICE_UInt(insn, 29, 21) != BITS9(1,1,1,0,0,0,1,1,1))/*LDRSbhw Wt*/
4710 goto fail;
4712 if (!isInt
4713 && SLICE_UInt(insn, 29, 24) != BITS6(1,1,1,1,0,0)) /*LDR/STR*/
4714 goto fail;
4716 /* Throw out non-verified but possibly valid cases. */
4717 switch (szLg2) {
4718 case BITS3(0,0,0): break; // 8 bit, valid for both int and vec
4719 case BITS3(0,0,1): break; // 16 bit, valid for both int and vec
4720 case BITS3(0,1,0): break; // 32 bit, valid for both int and vec
4721 case BITS3(0,1,1): break; // 64 bit, valid for both int and vec
4722 case BITS3(1,0,0): // can only ever be valid for the vector case
4723 if (isInt) goto fail; else break;
4724 case BITS3(1,0,1): // these sizes are never valid
4725 case BITS3(1,1,0):
4726 case BITS3(1,1,1): goto fail;
4728 default: vassert(0);
4731 IRExpr* rhs = NULL;
4732 switch (optS) {
4733 case BITS4(1,1,1,0): goto fail; //ATC
4734 case BITS4(0,1,1,0):
4735 rhs = getIReg64orZR(mm);
4736 vex_sprintf(buf, "[%s, %s]",
4737 nameIReg64orZR(nn), nameIReg64orZR(mm));
4738 break;
4739 case BITS4(1,1,1,1): goto fail; //ATC
4740 case BITS4(0,1,1,1):
4741 rhs = binop(Iop_Shl64, getIReg64orZR(mm), mkU8(szLg2));
4742 vex_sprintf(buf, "[%s, %s lsl %u]",
4743 nameIReg64orZR(nn), nameIReg64orZR(mm), szLg2);
4744 break;
4745 case BITS4(0,1,0,0):
4746 rhs = unop(Iop_32Uto64, getIReg32orZR(mm));
4747 vex_sprintf(buf, "[%s, %s uxtx]",
4748 nameIReg64orZR(nn), nameIReg32orZR(mm));
4749 break;
4750 case BITS4(0,1,0,1):
4751 rhs = binop(Iop_Shl64,
4752 unop(Iop_32Uto64, getIReg32orZR(mm)), mkU8(szLg2));
4753 vex_sprintf(buf, "[%s, %s uxtx, lsl %u]",
4754 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4755 break;
4756 case BITS4(1,1,0,0):
4757 rhs = unop(Iop_32Sto64, getIReg32orZR(mm));
4758 vex_sprintf(buf, "[%s, %s sxtx]",
4759 nameIReg64orZR(nn), nameIReg32orZR(mm));
4760 break;
4761 case BITS4(1,1,0,1):
4762 rhs = binop(Iop_Shl64,
4763 unop(Iop_32Sto64, getIReg32orZR(mm)), mkU8(szLg2));
4764 vex_sprintf(buf, "[%s, %s sxtx, lsl %u]",
4765 nameIReg64orZR(nn), nameIReg32orZR(mm), szLg2);
4766 break;
4767 default:
4768 /* The rest appear to be genuinely invalid */
4769 goto fail;
4772 vassert(rhs);
4773 IRTemp res = newTemp(Ity_I64);
4774 assign(res, binop(Iop_Add64, getIReg64orSP(nn), rhs));
4775 return res;
4777 fail:
4778 if (0 /*really, sigill_diag, but that causes too much plumbing*/) {
4779 vex_printf("gen_indexed_EA: unhandled case optS == 0x%x\n", optS);
4781 return IRTemp_INVALID;
4785 /* Generate an 8/16/32/64 bit integer store to ADDR for the lowest
4786 bits of DATAE :: Ity_I64. */
4787 static void gen_narrowing_store ( UInt szB, IRTemp addr, IRExpr* dataE )
4789 IRExpr* addrE = mkexpr(addr);
4790 switch (szB) {
4791 case 8:
4792 storeLE(addrE, dataE);
4793 break;
4794 case 4:
4795 storeLE(addrE, unop(Iop_64to32, dataE));
4796 break;
4797 case 2:
4798 storeLE(addrE, unop(Iop_64to16, dataE));
4799 break;
4800 case 1:
4801 storeLE(addrE, unop(Iop_64to8, dataE));
4802 break;
4803 default:
4804 vassert(0);
4809 /* Generate an 8/16/32/64 bit unsigned widening load from ADDR,
4810 placing the result in an Ity_I64 temporary. */
4811 static IRTemp gen_zwidening_load ( UInt szB, IRTemp addr )
4813 IRTemp res = newTemp(Ity_I64);
4814 IRExpr* addrE = mkexpr(addr);
4815 switch (szB) {
4816 case 8:
4817 assign(res, loadLE(Ity_I64,addrE));
4818 break;
4819 case 4:
4820 assign(res, unop(Iop_32Uto64, loadLE(Ity_I32,addrE)));
4821 break;
4822 case 2:
4823 assign(res, unop(Iop_16Uto64, loadLE(Ity_I16,addrE)));
4824 break;
4825 case 1:
4826 assign(res, unop(Iop_8Uto64, loadLE(Ity_I8,addrE)));
4827 break;
4828 default:
4829 vassert(0);
4831 return res;
4835 /* Generate a SIGBUS followed by a restart of the current instruction if
4836 `effective_addr` is not `align`-aligned. This is required behaviour for atomic
4837 instructions. This assumes that guest_PC_curr_instr is set correctly!
4839 This is hardwired to generate SIGBUS because so far the only supported arm64
4840 (arm64-linux) does that. Should we need to later extend it to generate some
4841 other signal, use the same scheme as with gen_SIGNAL_if_not_XX_aligned in
4842 guest_amd64_toIR.c. */
4843 static
4844 void gen_SIGBUS_if_not_XX_aligned ( IRTemp effective_addr, ULong align )
4846 if (align == 1) {
4847 return;
4849 vassert(align == 16 || align == 8 || align == 4 || align == 2);
4850 stmt(
4851 IRStmt_Exit(
4852 binop(Iop_CmpNE64,
4853 binop(Iop_And64,mkexpr(effective_addr),mkU64(align-1)),
4854 mkU64(0)),
4855 Ijk_SigBUS,
4856 IRConst_U64(guest_PC_curr_instr),
4857 OFFB_PC
4863 /* Generate a "standard 7" name, from bitQ and size. But also
4864 allow ".1d" since that's occasionally useful. */
4865 static
4866 const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
4868 vassert(bitQ <= 1 && size <= 3);
4869 const HChar* nms[8]
4870 = { "8b", "4h", "2s", "1d", "16b", "8h", "4s", "2d" };
4871 UInt ix = (bitQ << 2) | size;
4872 vassert(ix < 8);
4873 return nms[ix];
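/* Usage sketch (under #if 0): bitQ selects 64- vs 128-bit, size selects the
   lane width, e.g. nameArr_Q_SZ(1,2) gives "4s" and nameArr_Q_SZ(0,3)
   gives "1d". */
#if 0
static void example_nameArr_Q_SZ ( void )
{
   vex_printf("%s %s\n", nameArr_Q_SZ(1, 2),    /* "4s" */
                         nameArr_Q_SZ(0, 3));   /* "1d" */
}
#endif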
4877 static
4878 Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4879 const VexAbiInfo* abiinfo, Bool sigill_diag)
4881 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
4883 /* ------------ LDR,STR (immediate, uimm12) ----------- */
4884 /* uimm12 is scaled by the transfer size
4886 31 29 26 21 9 4
4887 | | | | | |
4888 11 111 00100 imm12 nn tt STR Xt, [Xn|SP, #imm12 * 8]
4889 11 111 00101 imm12 nn tt LDR Xt, [Xn|SP, #imm12 * 8]
4891 10 111 00100 imm12 nn tt STR Wt, [Xn|SP, #imm12 * 4]
4892 10 111 00101 imm12 nn tt LDR Wt, [Xn|SP, #imm12 * 4]
4894 01 111 00100 imm12 nn tt STRH Wt, [Xn|SP, #imm12 * 2]
4895 01 111 00101 imm12 nn tt LDRH Wt, [Xn|SP, #imm12 * 2]
4897 00 111 00100 imm12 nn tt STRB Wt, [Xn|SP, #imm12 * 1]
4898 00 111 00101 imm12 nn tt LDRB Wt, [Xn|SP, #imm12 * 1]
4900 if (INSN(29,23) == BITS7(1,1,1,0,0,1,0)) {
4901 UInt szLg2 = INSN(31,30);
4902 UInt szB = 1 << szLg2;
4903 Bool isLD = INSN(22,22) == 1;
4904 UInt offs = INSN(21,10) * szB;
4905 UInt nn = INSN(9,5);
4906 UInt tt = INSN(4,0);
4907 IRTemp ta = newTemp(Ity_I64);
4908 assign(ta, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offs)));
4909 if (nn == 31) { /* FIXME generate stack alignment check */ }
4910 vassert(szLg2 < 4);
4911 if (isLD) {
4912 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, ta)));
4913 } else {
4914 gen_narrowing_store(szB, ta, getIReg64orZR(tt));
4916 const HChar* ld_name[4] = { "ldrb", "ldrh", "ldr", "ldr" };
4917 const HChar* st_name[4] = { "strb", "strh", "str", "str" };
4918 DIP("%s %s, [%s, #%u]\n",
4919 (isLD ? ld_name : st_name)[szLg2], nameIRegOrZR(szB == 8, tt),
4920 nameIReg64orSP(nn), offs);
4921 return True;
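/* Worked example (hypothetical encoding, under #if 0): for the 64-bit form
   (size == 11, so szLg2 == 3), imm12 == 2 scales to a byte offset of
   2 * 8 == 16, i.e. "ldr Xt, [Xn|SP, #16]". */
#if 0
static UInt example_uimm12_byte_offset ( UInt imm12, UInt szLg2 )
{
   return imm12 << szLg2;   /* imm12 scaled by the transfer size */
}
#endif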
4924 /* ------------ LDUR,STUR (immediate, simm9) ----------- */
4926 31 29 26 20 11 9 4
4927 | | | | | | |
4928 (at-Rn-then-Rn=EA) | | |
4929 sz 111 00000 0 imm9 01 Rn Rt STR Rt, [Xn|SP], #simm9
4930 sz 111 00001 0 imm9 01 Rn Rt LDR Rt, [Xn|SP], #simm9
4932 (at-EA-then-Rn=EA)
4933 sz 111 00000 0 imm9 11 Rn Rt STR Rt, [Xn|SP, #simm9]!
4934 sz 111 00001 0 imm9 11 Rn Rt LDR Rt, [Xn|SP, #simm9]!
4936 (at-EA)
4937 sz 111 00000 0 imm9 00 Rn Rt STR Rt, [Xn|SP, #simm9]
4938 sz 111 00001 0 imm9 00 Rn Rt LDR Rt, [Xn|SP, #simm9]
4940 simm9 is unscaled.
4942 The case 'wback && Rn == Rt && Rt != 31' is disallowed. In the
4943 load case this is because would create two competing values for
4944 Rt. In the store case the reason is unclear, but the spec
4945 disallows it anyway.
4947 Stores are narrowing, loads are unsigned widening. sz encodes
4948 the transfer size in the normal way: 00=1, 01=2, 10=4, 11=8.
4950 if ((INSN(29,21) & BITS9(1,1,1, 1,1,1,1,0, 1))
4951 == BITS9(1,1,1, 0,0,0,0,0, 0)) {
4952 UInt szLg2 = INSN(31,30);
4953 UInt szB = 1 << szLg2;
4954 Bool isLoad = INSN(22,22) == 1;
4955 UInt imm9 = INSN(20,12);
4956 UInt nn = INSN(9,5);
4957 UInt tt = INSN(4,0);
4958 Bool wBack = INSN(10,10) == 1;
4959 UInt how = INSN(11,10);
4960 if (how == BITS2(1,0) || (wBack && nn == tt && tt != 31)) {
4961 /* undecodable; fall through */
4962 } else {
4963 if (nn == 31) { /* FIXME generate stack alignment check */ }
4965 // Compute the transfer address TA and the writeback address WA.
4966 IRTemp tRN = newTemp(Ity_I64);
4967 assign(tRN, getIReg64orSP(nn));
4968 IRTemp tEA = newTemp(Ity_I64);
4969 Long simm9 = (Long)sx_to_64(imm9, 9);
4970 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
4972 IRTemp tTA = newTemp(Ity_I64);
4973 IRTemp tWA = newTemp(Ity_I64);
4974 switch (how) {
4975 case BITS2(0,1):
4976 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
4977 case BITS2(1,1):
4978 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
4979 case BITS2(0,0):
4980 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
4981 default:
4982 vassert(0); /* NOTREACHED */
4985 /* Normally rN would be updated after the transfer. However, in
4986 the special cases typified by
4987 str x30, [sp,#-16]!
4988 str w1, [sp,#-32]!
4989 it is necessary to update SP before the transfer, (1)
4990 because Memcheck will otherwise complain about a write
4991 below the stack pointer, and (2) because the segfault
4992 stack extension mechanism will otherwise extend the stack
4993 only down to SP before the instruction, which might not be
4994 far enough, if the -16/-32 bit takes the actual access
4995 address to the next page.
4997 Bool earlyWBack
4998 = wBack && simm9 < 0
4999 && (szB == 8 || szB == 4 || szB == 2 || szB == 1)
5000 && how == BITS2(1,1) && nn == 31 && !isLoad;
5002 if (wBack && earlyWBack)
5003 putIReg64orSP(nn, mkexpr(tEA));
5005 if (isLoad) {
5006 putIReg64orZR(tt, mkexpr(gen_zwidening_load(szB, tTA)));
5007 } else {
5008 gen_narrowing_store(szB, tTA, getIReg64orZR(tt));
5011 if (wBack && !earlyWBack)
5012 putIReg64orSP(nn, mkexpr(tEA));
5014 const HChar* ld_name[4] = { "ldurb", "ldurh", "ldur", "ldur" };
5015 const HChar* st_name[4] = { "sturb", "sturh", "stur", "stur" };
5016 const HChar* fmt_str = NULL;
5017 switch (how) {
5018 case BITS2(0,1):
5019 fmt_str = "%s %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5020 break;
5021 case BITS2(1,1):
5022 fmt_str = "%s %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5023 break;
5024 case BITS2(0,0):
5025 fmt_str = "%s %s, [%s, #%lld] (at-Rn)\n";
5026 break;
5027 default:
5028 vassert(0);
5030 DIP(fmt_str, (isLoad ? ld_name : st_name)[szLg2],
5031 nameIRegOrZR(szB == 8, tt),
5032 nameIReg64orSP(nn), simm9);
5033 return True;
5037 /* -------- LDP,STP (immediate, simm7) (INT REGS) -------- */
5038 /* L==1 => mm==LD
5039 L==0 => mm==ST
5040 x==0 => 32 bit transfers, and zero extended loads
5041 x==1 => 64 bit transfers
5042 simm7 is scaled by the (single-register) transfer size
5044 (at-Rn-then-Rn=EA)
5045 x0 101 0001 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP], #imm
5047 (at-EA-then-Rn=EA)
5048 x0 101 0011 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]!
5050 (at-EA)
5051 x0 101 0010 L imm7 Rt2 Rn Rt1 mmP Rt1,Rt2, [Xn|SP, #imm]
5053 UInt insn_30_23 = INSN(30,23);
5054 if (insn_30_23 == BITS8(0,1,0,1,0,0,0,1)
5055 || insn_30_23 == BITS8(0,1,0,1,0,0,1,1)
5056 || insn_30_23 == BITS8(0,1,0,1,0,0,1,0)) {
5057 UInt bL = INSN(22,22);
5058 UInt bX = INSN(31,31);
5059 UInt bWBack = INSN(23,23);
5060 UInt rT1 = INSN(4,0);
5061 UInt rN = INSN(9,5);
5062 UInt rT2 = INSN(14,10);
5063 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5064 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
5065 || (bL && rT1 == rT2)) {
5066 /* undecodable; fall through */
5067 } else {
5068 if (rN == 31) { /* FIXME generate stack alignment check */ }
5070 // Compute the transfer address TA and the writeback address WA.
5071 IRTemp tRN = newTemp(Ity_I64);
5072 assign(tRN, getIReg64orSP(rN));
5073 IRTemp tEA = newTemp(Ity_I64);
5074 simm7 = (bX ? 8 : 4) * simm7;
5075 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5077 IRTemp tTA = newTemp(Ity_I64);
5078 IRTemp tWA = newTemp(Ity_I64);
5079 switch (INSN(24,23)) {
5080 case BITS2(0,1):
5081 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5082 case BITS2(1,1):
5083 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5084 case BITS2(1,0):
5085 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5086 default:
5087 vassert(0); /* NOTREACHED */
5090 /* Normally rN would be updated after the transfer. However, in
5091 the special case typified by
5092 stp x29, x30, [sp,#-112]!
5093 it is necessary to update SP before the transfer, (1)
5094 because Memcheck will otherwise complain about a write
5095 below the stack pointer, and (2) because the segfault
5096 stack extension mechanism will otherwise extend the stack
5097 only down to SP before the instruction, which might not be
5098 far enough, if the -112 bit takes the actual access
5099 address to the next page.
5101 Bool earlyWBack
5102 = bWBack && simm7 < 0
5103 && INSN(24,23) == BITS2(1,1) && rN == 31 && bL == 0;
5105 if (bWBack && earlyWBack)
5106 putIReg64orSP(rN, mkexpr(tEA));
5108 /**/ if (bL == 1 && bX == 1) {
5109 // 64 bit load
5110 putIReg64orZR(rT1, loadLE(Ity_I64,
5111 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
5112 putIReg64orZR(rT2, loadLE(Ity_I64,
5113 binop(Iop_Add64,mkexpr(tTA),mkU64(8))));
5114 } else if (bL == 1 && bX == 0) {
5115 // 32 bit load
5116 putIReg32orZR(rT1, loadLE(Ity_I32,
5117 binop(Iop_Add64,mkexpr(tTA),mkU64(0))));
5118 putIReg32orZR(rT2, loadLE(Ity_I32,
5119 binop(Iop_Add64,mkexpr(tTA),mkU64(4))));
5120 } else if (bL == 0 && bX == 1) {
5121 // 64 bit store
5122 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
5123 getIReg64orZR(rT1));
5124 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(8)),
5125 getIReg64orZR(rT2));
5126 } else {
5127 vassert(bL == 0 && bX == 0);
5128 // 32 bit store
5129 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(0)),
5130 getIReg32orZR(rT1));
5131 storeLE(binop(Iop_Add64,mkexpr(tTA),mkU64(4)),
5132 getIReg32orZR(rT2));
5135 if (bWBack && !earlyWBack)
5136 putIReg64orSP(rN, mkexpr(tEA));
5138 const HChar* fmt_str = NULL;
5139 switch (INSN(24,23)) {
5140 case BITS2(0,1):
5141 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5142 break;
5143 case BITS2(1,1):
5144 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5145 break;
5146 case BITS2(1,0):
5147 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5148 break;
5149 default:
5150 vassert(0);
5152 DIP(fmt_str, bL == 0 ? "st" : "ld",
5153 nameIRegOrZR(bX == 1, rT1),
5154 nameIRegOrZR(bX == 1, rT2),
5155 nameIReg64orSP(rN), simm7);
5156 return True;
5160 /* -------- LDPSW (immediate, simm7) (INT REGS) -------- */
5161 /* Does 32 bit transfers which are sign extended to 64 bits.
5162 simm7 is scaled by the (single-register) transfer size
5164 (at-Rn-then-Rn=EA)
5165 01 101 0001 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP], #imm
5167 (at-EA-then-Rn=EA)
5168 01 101 0011 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]!
5170 (at-EA)
5171 01 101 0010 1 imm7 Rt2 Rn Rt1 LDPSW Rt1,Rt2, [Xn|SP, #imm]
5173 UInt insn_31_22 = INSN(31,22);
5174 if (insn_31_22 == BITS10(0,1,1,0,1,0,0,0,1,1)
5175 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,1,1)
5176 || insn_31_22 == BITS10(0,1,1,0,1,0,0,1,0,1)) {
5177 UInt bWBack = INSN(23,23);
5178 UInt rT1 = INSN(4,0);
5179 UInt rN = INSN(9,5);
5180 UInt rT2 = INSN(14,10);
5181 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5182 if ((bWBack && (rT1 == rN || rT2 == rN) && rN != 31)
5183 || (rT1 == rT2)) {
5184 /* undecodable; fall through */
5185 } else {
5186 if (rN == 31) { /* FIXME generate stack alignment check */ }
5188 // Compute the transfer address TA and the writeback address WA.
5189 IRTemp tRN = newTemp(Ity_I64);
5190 assign(tRN, getIReg64orSP(rN));
5191 IRTemp tEA = newTemp(Ity_I64);
5192 simm7 = 4 * simm7;
5193 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5195 IRTemp tTA = newTemp(Ity_I64);
5196 IRTemp tWA = newTemp(Ity_I64);
5197 switch (INSN(24,23)) {
5198 case BITS2(0,1):
5199 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5200 case BITS2(1,1):
5201 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5202 case BITS2(1,0):
5203 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5204 default:
5205 vassert(0); /* NOTREACHED */
5208 // 32 bit load, sign extended to 64 bits
5209 putIReg64orZR(rT1, unop(Iop_32Sto64,
5210 loadLE(Ity_I32, binop(Iop_Add64,
5211 mkexpr(tTA),
5212 mkU64(0)))));
5213 putIReg64orZR(rT2, unop(Iop_32Sto64,
5214 loadLE(Ity_I32, binop(Iop_Add64,
5215 mkexpr(tTA),
5216 mkU64(4)))));
5217 if (bWBack)
5218 putIReg64orSP(rN, mkexpr(tEA));
5220 const HChar* fmt_str = NULL;
5221 switch (INSN(24,23)) {
5222 case BITS2(0,1):
5223 fmt_str = "ldpsw %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5224 break;
5225 case BITS2(1,1):
5226 fmt_str = "ldpsw %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5227 break;
5228 case BITS2(1,0):
5229 fmt_str = "ldpsw %s, %s, [%s, #%lld] (at-Rn)\n";
5230 break;
5231 default:
5232 vassert(0);
5234 DIP(fmt_str, nameIReg64orZR(rT1),
5235 nameIReg64orZR(rT2),
5236 nameIReg64orSP(rN), simm7);
5237 return True;
5241 /* ---------------- LDR (literal, int reg) ---------------- */
5242 /* 31 29 23 4
5243 00 011 000 imm19 Rt LDR Wt, [PC + sxTo64(imm19 << 2)]
5244 01 011 000 imm19 Rt LDR Xt, [PC + sxTo64(imm19 << 2)]
5245 10 011 000 imm19 Rt LDRSW Xt, [PC + sxTo64(imm19 << 2)]
5246 11 011 000 imm19 Rt prefetch [PC + sxTo64(imm19 << 2)]
5247 Just handles the first two cases for now.
5249 if (INSN(29,24) == BITS6(0,1,1,0,0,0) && INSN(31,31) == 0) {
5250 UInt imm19 = INSN(23,5);
5251 UInt rT = INSN(4,0);
5252 UInt bX = INSN(30,30);
5253 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5254 if (bX) {
5255 putIReg64orZR(rT, loadLE(Ity_I64, mkU64(ea)));
5256 } else {
5257 putIReg32orZR(rT, loadLE(Ity_I32, mkU64(ea)));
5259 DIP("ldr %s, 0x%llx (literal)\n", nameIRegOrZR(bX == 1, rT), ea);
5260 return True;
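/* Worked example (plain C sketch, hypothetical helper, under #if 0): the
   literal address is the instruction address plus imm19*4, sign-extended
   from 21 bits, matching the sx_to_64(imm19 << 2, 21) computation above. */
#if 0
static ULong example_ldr_literal_ea ( ULong pc, UInt imm19 )
{
   ULong w   = ((ULong)imm19 << 2) << 43;
   Long  off = ((Long)w) >> 43;            /* sign-extend from 21 bits */
   return pc + (ULong)off;
}
#endif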
5263 /* -------------- {LD,ST}R (integer register) --------------- */
5264 /* 31 29 20 15 12 11 9 4
5265 | | | | | | | |
5266 11 111000011 Rm option S 10 Rn Rt LDR Xt, [Xn|SP, R<m>{ext/sh}]
5267 10 111000011 Rm option S 10 Rn Rt LDR Wt, [Xn|SP, R<m>{ext/sh}]
5268 01 111000011 Rm option S 10 Rn Rt LDRH Wt, [Xn|SP, R<m>{ext/sh}]
5269 00 111000011 Rm option S 10 Rn Rt LDRB Wt, [Xn|SP, R<m>{ext/sh}]
5271 11 111000001 Rm option S 10 Rn Rt STR Xt, [Xn|SP, R<m>{ext/sh}]
5272 10 111000001 Rm option S 10 Rn Rt STR Wt, [Xn|SP, R<m>{ext/sh}]
5273 01 111000001 Rm option S 10 Rn Rt STRH Wt, [Xn|SP, R<m>{ext/sh}]
5274 00 111000001 Rm option S 10 Rn Rt STRB Wt, [Xn|SP, R<m>{ext/sh}]
5276 if (INSN(29,23) == BITS7(1,1,1,0,0,0,0)
5277 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5278 HChar dis_buf[64];
5279 UInt szLg2 = INSN(31,30);
5280 Bool isLD = INSN(22,22) == 1;
5281 UInt tt = INSN(4,0);
5282 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5283 if (ea != IRTemp_INVALID) {
5284 switch (szLg2) {
5285 case 3: /* 64 bit */
5286 if (isLD) {
5287 putIReg64orZR(tt, loadLE(Ity_I64, mkexpr(ea)));
5288 DIP("ldr %s, %s\n", nameIReg64orZR(tt), dis_buf);
5289 } else {
5290 storeLE(mkexpr(ea), getIReg64orZR(tt));
5291 DIP("str %s, %s\n", nameIReg64orZR(tt), dis_buf);
5293 break;
5294 case 2: /* 32 bit */
5295 if (isLD) {
5296 putIReg32orZR(tt, loadLE(Ity_I32, mkexpr(ea)));
5297 DIP("ldr %s, %s\n", nameIReg32orZR(tt), dis_buf);
5298 } else {
5299 storeLE(mkexpr(ea), getIReg32orZR(tt));
5300 DIP("str %s, %s\n", nameIReg32orZR(tt), dis_buf);
5302 break;
5303 case 1: /* 16 bit */
5304 if (isLD) {
5305 putIReg64orZR(tt, unop(Iop_16Uto64,
5306 loadLE(Ity_I16, mkexpr(ea))));
5307 DIP("ldruh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5308 } else {
5309 storeLE(mkexpr(ea), unop(Iop_64to16, getIReg64orZR(tt)));
5310 DIP("strh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5312 break;
5313 case 0: /* 8 bit */
5314 if (isLD) {
5315 putIReg64orZR(tt, unop(Iop_8Uto64,
5316 loadLE(Ity_I8, mkexpr(ea))));
5317 DIP("ldrub %s, %s\n", nameIReg32orZR(tt), dis_buf);
5318 } else {
5319 storeLE(mkexpr(ea), unop(Iop_64to8, getIReg64orZR(tt)));
5320 DIP("strb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5322 break;
5323 default:
5324 vassert(0);
5326 return True;
5330 /* -------------- LDRS{B,H,W} (uimm12) -------------- */
5331 /* 31 29 26 23 21 9 4
5332 10 111 001 10 imm12 n t LDRSW Xt, [Xn|SP, #pimm12 * 4]
5333 01 111 001 1x imm12 n t LDRSH Rt, [Xn|SP, #pimm12 * 2]
5334 00 111 001 1x imm12 n t LDRSB Rt, [Xn|SP, #pimm12 * 1]
5335 where
5336 Rt is Wt when x==1, Xt when x==0
5338 if (INSN(29,23) == BITS7(1,1,1,0,0,1,1)) {
5339 /* Further checks on bits 31:30 and 22 */
5340 Bool valid = False;
5341 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5342 case BITS3(1,0,0):
5343 case BITS3(0,1,0): case BITS3(0,1,1):
5344 case BITS3(0,0,0): case BITS3(0,0,1):
5345 valid = True;
5346 break;
5348 if (valid) {
5349 UInt szLg2 = INSN(31,30);
5350 UInt bitX = INSN(22,22);
5351 UInt imm12 = INSN(21,10);
5352 UInt nn = INSN(9,5);
5353 UInt tt = INSN(4,0);
5354 UInt szB = 1 << szLg2;
5355 IRExpr* ea = binop(Iop_Add64,
5356 getIReg64orSP(nn), mkU64(imm12 * szB));
5357 switch (szB) {
5358 case 4:
5359 vassert(bitX == 0);
5360 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, ea)));
5361 DIP("ldrsw %s, [%s, #%u]\n", nameIReg64orZR(tt),
5362 nameIReg64orSP(nn), imm12 * szB);
5363 break;
5364 case 2:
5365 if (bitX == 1) {
5366 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, ea)));
5367 } else {
5368 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, ea)));
5370 DIP("ldrsh %s, [%s, #%u]\n",
5371 nameIRegOrZR(bitX == 0, tt),
5372 nameIReg64orSP(nn), imm12 * szB);
5373 break;
5374 case 1:
5375 if (bitX == 1) {
5376 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, ea)));
5377 } else {
5378 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, ea)));
5380 DIP("ldrsb %s, [%s, #%u]\n",
5381 nameIRegOrZR(bitX == 0, tt),
5382 nameIReg64orSP(nn), imm12 * szB);
5383 break;
5384 default:
5385 vassert(0);
5387 return True;
5389 /* else fall through */
5392 /* -------------- LDRS{B,H,W} (simm9, upd) -------------- */
5393 /* (at-Rn-then-Rn=EA)
5394 31 29 23 21 20 11 9 4
5395 00 111 000 1x 0 imm9 01 n t LDRSB Rt, [Xn|SP], #simm9
5396 01 111 000 1x 0 imm9 01 n t LDRSH Rt, [Xn|SP], #simm9
5397 10 111 000 10 0 imm9 01 n t LDRSW Xt, [Xn|SP], #simm9
5399 (at-EA-then-Rn=EA)
5400 00 111 000 1x 0 imm9 11 n t LDRSB Rt, [Xn|SP, #simm9]!
5401 01 111 000 1x 0 imm9 11 n t LDRSH Rt, [Xn|SP, #simm9]!
5402 10 111 000 10 0 imm9 11 n t LDRSW Xt, [Xn|SP, #simm9]!
5403 where
5404 Rt is Wt when x==1, Xt when x==0
5405 transfer-at-Rn when [11]==0, at EA when [11]==1
5407 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5408 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5409 /* Further checks on bits 31:30 and 22 */
5410 Bool valid = False;
5411 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5412 case BITS3(1,0,0): // LDRSW Xt
5413 case BITS3(0,1,0): case BITS3(0,1,1): // LDRSH Xt, Wt
5414 case BITS3(0,0,0): case BITS3(0,0,1): // LDRSB Xt, Wt
5415 valid = True;
5416 break;
5418 if (valid) {
5419 UInt szLg2 = INSN(31,30);
5420 UInt imm9 = INSN(20,12);
5421 Bool atRN = INSN(11,11) == 0;
5422 UInt nn = INSN(9,5);
5423 UInt tt = INSN(4,0);
5424 IRTemp tRN = newTemp(Ity_I64);
5425 IRTemp tEA = newTemp(Ity_I64);
5426 IRTemp tTA = IRTemp_INVALID;
5427 ULong simm9 = sx_to_64(imm9, 9);
5428 Bool is64 = INSN(22,22) == 0;
5429 assign(tRN, getIReg64orSP(nn));
5430 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5431 tTA = atRN ? tRN : tEA;
5432 HChar ch = '?';
5433 /* There are 5 cases:
5434 byte load, SX to 64
5435 byte load, SX to 32, ZX to 64
5436 halfword load, SX to 64
5437 halfword load, SX to 32, ZX to 64
5438 word load, SX to 64
5439 The ifs below handle them in the listed order.
5441 if (szLg2 == 0) {
5442 ch = 'b';
5443 if (is64) {
5444 putIReg64orZR(tt, unop(Iop_8Sto64,
5445 loadLE(Ity_I8, mkexpr(tTA))));
5446 } else {
5447 putIReg32orZR(tt, unop(Iop_8Sto32,
5448 loadLE(Ity_I8, mkexpr(tTA))));
5451 else if (szLg2 == 1) {
5452 ch = 'h';
5453 if (is64) {
5454 putIReg64orZR(tt, unop(Iop_16Sto64,
5455 loadLE(Ity_I16, mkexpr(tTA))));
5456 } else {
5457 putIReg32orZR(tt, unop(Iop_16Sto32,
5458 loadLE(Ity_I16, mkexpr(tTA))));
5461 else if (szLg2 == 2 && is64) {
5462 ch = 'w';
5463 putIReg64orZR(tt, unop(Iop_32Sto64,
5464 loadLE(Ity_I32, mkexpr(tTA))));
5466 else {
5467 vassert(0);
5469 putIReg64orSP(nn, mkexpr(tEA));
5470 DIP(atRN ? "ldrs%c %s, [%s], #%lld\n" : "ldrs%c %s, [%s, #%lld]!\n",
5471 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
5472 return True;
5474 /* else fall through */
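   /* Worked example (illustrative only): for the post-index form
         ldrsb x3, [x1], #-5
      atRN is True, so the byte is loaded from the old value of x1 and
      sign-extended to 64 bits into x3, and only afterwards is x1 updated
      to x1 - 5.  The pre-index form [x1, #-5]! instead loads from x1 - 5
      and writes that same address back to x1.
   */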
5477 /* -------------- LDRS{B,H,W} (simm9, noUpd) -------------- */
5478 /* 31 29 23 21 20 11 9 4
5479 00 111 000 1x 0 imm9 00 n t LDURSB Rt, [Xn|SP, #simm9]
5480 01 111 000 1x 0 imm9 00 n t LDURSH Rt, [Xn|SP, #simm9]
5481 10 111 000 10 0 imm9 00 n t LDURSW Xt, [Xn|SP, #simm9]
5482 where
5483 Rt is Wt when x==1, Xt when x==0
5485 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5486 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5487 /* Further checks on bits 31:30 and 22 */
5488 Bool valid = False;
5489 switch ((INSN(31,30) << 1) | INSN(22,22)) {
5490 case BITS3(1,0,0): // LDURSW Xt
5491 case BITS3(0,1,0): case BITS3(0,1,1): // LDURSH Xt, Wt
5492 case BITS3(0,0,0): case BITS3(0,0,1): // LDURSB Xt, Wt
5493 valid = True;
5494 break;
5496 if (valid) {
5497 UInt szLg2 = INSN(31,30);
5498 UInt imm9 = INSN(20,12);
5499 UInt nn = INSN(9,5);
5500 UInt tt = INSN(4,0);
5501 IRTemp tRN = newTemp(Ity_I64);
5502 IRTemp tEA = newTemp(Ity_I64);
5503 ULong simm9 = sx_to_64(imm9, 9);
5504 Bool is64 = INSN(22,22) == 0;
5505 assign(tRN, getIReg64orSP(nn));
5506 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5507 HChar ch = '?';
5508 /* There are 5 cases:
5509 byte load, SX to 64
5510 byte load, SX to 32, ZX to 64
5511 halfword load, SX to 64
5512 halfword load, SX to 32, ZX to 64
5513 word load, SX to 64
5514 The ifs below handle them in the listed order.
5516 if (szLg2 == 0) {
5517 ch = 'b';
5518 if (is64) {
5519 putIReg64orZR(tt, unop(Iop_8Sto64,
5520 loadLE(Ity_I8, mkexpr(tEA))));
5521 } else {
5522 putIReg32orZR(tt, unop(Iop_8Sto32,
5523 loadLE(Ity_I8, mkexpr(tEA))));
5526 else if (szLg2 == 1) {
5527 ch = 'h';
5528 if (is64) {
5529 putIReg64orZR(tt, unop(Iop_16Sto64,
5530 loadLE(Ity_I16, mkexpr(tEA))));
5531 } else {
5532 putIReg32orZR(tt, unop(Iop_16Sto32,
5533 loadLE(Ity_I16, mkexpr(tEA))));
5536 else if (szLg2 == 2 && is64) {
5537 ch = 'w';
5538 putIReg64orZR(tt, unop(Iop_32Sto64,
5539 loadLE(Ity_I32, mkexpr(tEA))));
5541 else {
5542 vassert(0);
5544 DIP("ldurs%c %s, [%s, #%lld]\n",
5545 ch, nameIRegOrZR(is64, tt), nameIReg64orSP(nn), (Long)simm9);
5546 return True;
5548 /* else fall through */
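   /* sx_to_64 example (illustrative only): an imm9 field of 0x1F0
      (binary 1 1111 0000) has its top bit set, so sx_to_64(0x1F0, 9)
      yields 0xFFFFFFFFFFFFFFF0, i.e. an effective offset of -16 from
      Xn|SP.
   */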
5551 /* -------- LDP,STP (immediate, simm7) (FP&VEC) -------- */
5552 /* L==1 => mm==LD
5553 L==0 => mm==ST
5554 sz==00 => 32 bit (S) transfers
5555 sz==01 => 64 bit (D) transfers
5556 sz==10 => 128 bit (Q) transfers
5557 sz==11 isn't allowed
5558 simm7 is scaled by the (single-register) transfer size
5560 31 29 26 22 21 14 9 4
5562 sz 101 1000 L imm7 t2 n t1 mmNP SDQt1, SDQt2, [Xn|SP, #imm]
5563 (at-EA, with nontemporal hint)
5565 sz 101 1001 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP], #imm
5566 (at-Rn-then-Rn=EA)
5568 sz 101 1010 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]
5569 (at-EA)
5571 sz 101 1011 L imm7 t2 n t1 mmP SDQt1, SDQt2, [Xn|SP, #imm]!
5572 (at-EA-then-Rn=EA)
5574 if (INSN(29,25) == BITS5(1,0,1,1,0)) {
5575 UInt szSlg2 = INSN(31,30); // log2 of the xfer size in 32-bit units
5576 Bool isLD = INSN(22,22) == 1;
5577 Bool wBack = INSN(23,23) == 1;
5578 Long simm7 = (Long)sx_to_64(INSN(21,15), 7);
5579 UInt tt2 = INSN(14,10);
5580 UInt nn = INSN(9,5);
5581 UInt tt1 = INSN(4,0);
5582 if (szSlg2 == BITS2(1,1) || (isLD && tt1 == tt2)) {
5583 /* undecodable; fall through */
5584 } else {
5585 if (nn == 31) { /* FIXME generate stack alignment check */ }
5587 // Compute the transfer address TA and the writeback address WA.
5588 UInt szB = 4 << szSlg2; /* szB is the per-register size */
5589 IRTemp tRN = newTemp(Ity_I64);
5590 assign(tRN, getIReg64orSP(nn));
5591 IRTemp tEA = newTemp(Ity_I64);
5592 simm7 = szB * simm7;
5593 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm7)));
5595 IRTemp tTA = newTemp(Ity_I64);
5596 IRTemp tWA = newTemp(Ity_I64);
5597 switch (INSN(24,23)) {
5598 case BITS2(0,1):
5599 assign(tTA, mkexpr(tRN)); assign(tWA, mkexpr(tEA)); break;
5600 case BITS2(1,1):
5601 assign(tTA, mkexpr(tEA)); assign(tWA, mkexpr(tEA)); break;
5602 case BITS2(1,0):
5603 case BITS2(0,0):
5604 assign(tTA, mkexpr(tEA)); /* tWA is unused */ break;
5605 default:
5606 vassert(0); /* NOTREACHED */
5609 IRType ty = Ity_INVALID;
5610 switch (szB) {
5611 case 4: ty = Ity_F32; break;
5612 case 8: ty = Ity_F64; break;
5613 case 16: ty = Ity_V128; break;
5614 default: vassert(0);
5617 /* Normally rN would be updated after the transfer. However, in
5618 the special cases typified by
5619 stp q0, q1, [sp,#-512]!
5620 stp d0, d1, [sp,#-512]!
5621 stp s0, s1, [sp,#-512]!
5622 it is necessary to update SP before the transfer, (1)
5623 because Memcheck will otherwise complain about a write
5624 below the stack pointer, and (2) because the segfault
5625 stack extension mechanism will otherwise extend the stack
5626 only down to SP before the instruction, which might not be
5627 far enough, if the -512 offset takes the actual access
5628 address to the next page.
5630 Bool earlyWBack
5631 = wBack && simm7 < 0
5632 && INSN(24,23) == BITS2(1,1) && nn == 31 && !isLD;
5634 if (wBack && earlyWBack)
5635 putIReg64orSP(nn, mkexpr(tEA));
5637 if (isLD) {
5638 if (szB < 16) {
5639 putQReg128(tt1, mkV128(0x0000));
5641 putQRegLO(tt1,
5642 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(0))));
5643 if (szB < 16) {
5644 putQReg128(tt2, mkV128(0x0000));
5646 putQRegLO(tt2,
5647 loadLE(ty, binop(Iop_Add64, mkexpr(tTA), mkU64(szB))));
5648 } else {
5649 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(0)),
5650 getQRegLO(tt1, ty));
5651 storeLE(binop(Iop_Add64, mkexpr(tTA), mkU64(szB)),
5652 getQRegLO(tt2, ty));
5655 if (wBack && !earlyWBack)
5656 putIReg64orSP(nn, mkexpr(tEA));
5658 const HChar* fmt_str = NULL;
5659 switch (INSN(24,23)) {
5660 case BITS2(0,1):
5661 fmt_str = "%sp %s, %s, [%s], #%lld (at-Rn-then-Rn=EA)\n";
5662 break;
5663 case BITS2(1,1):
5664 fmt_str = "%sp %s, %s, [%s, #%lld]! (at-EA-then-Rn=EA)\n";
5665 break;
5666 case BITS2(1,0):
5667 fmt_str = "%sp %s, %s, [%s, #%lld] (at-Rn)\n";
5668 break;
5669 case BITS2(0,0):
5670 fmt_str = "%snp %s, %s, [%s, #%lld] (at-Rn)\n";
5671 break;
5672 default:
5673 vassert(0);
5675 DIP(fmt_str, isLD ? "ld" : "st",
5676 nameQRegLO(tt1, ty), nameQRegLO(tt2, ty),
5677 nameIReg64orSP(nn), simm7);
5678 return True;
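   /* Worked example (illustrative only): for
         stp q0, q1, [sp, #-512]!
      szB is 16, the raw imm7 field encodes -32, and after scaling simm7
      becomes -512.  Because this is a pre-indexed store to SP with a
      negative offset, earlyWBack is True and SP is moved down to SP-512
      before the two 128-bit stores are issued, for the Memcheck and
      stack-extension reasons described above.
   */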
5682 /* -------------- {LD,ST}R (vector register) --------------- */
5683 /* 31 29 23 20 15 12 11 9 4
5684 | | | | | | | | |
5685 00 111100 011 Rm option S 10 Rn Rt LDR Bt, [Xn|SP, R<m>{ext/sh}]
5686 01 111100 011 Rm option S 10 Rn Rt LDR Ht, [Xn|SP, R<m>{ext/sh}]
5687 10 111100 011 Rm option S 10 Rn Rt LDR St, [Xn|SP, R<m>{ext/sh}]
5688 11 111100 011 Rm option S 10 Rn Rt LDR Dt, [Xn|SP, R<m>{ext/sh}]
5689 00 111100 111 Rm option S 10 Rn Rt LDR Qt, [Xn|SP, R<m>{ext/sh}]
5691 00 111100 001 Rm option S 10 Rn Rt STR Bt, [Xn|SP, R<m>{ext/sh}]
5692 01 111100 001 Rm option S 10 Rn Rt STR Ht, [Xn|SP, R<m>{ext/sh}]
5693 10 111100 001 Rm option S 10 Rn Rt STR St, [Xn|SP, R<m>{ext/sh}]
5694 11 111100 001 Rm option S 10 Rn Rt STR Dt, [Xn|SP, R<m>{ext/sh}]
5695 00 111100 101 Rm option S 10 Rn Rt STR Qt, [Xn|SP, R<m>{ext/sh}]
5697 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5698 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5699 HChar dis_buf[64];
5700 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5701 Bool isLD = INSN(22,22) == 1;
5702 UInt tt = INSN(4,0);
5703 if (szLg2 > 4) goto after_LDR_STR_vector_register;
5704 IRTemp ea = gen_indexed_EA(dis_buf, insn, False/*to/from vec regs*/);
5705 if (ea == IRTemp_INVALID) goto after_LDR_STR_vector_register;
5706 switch (szLg2) {
5707 case 0: /* 8 bit */
5708 if (isLD) {
5709 putQReg128(tt, mkV128(0x0000));
5710 putQRegLO(tt, loadLE(Ity_I8, mkexpr(ea)));
5711 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5712 } else {
5713 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I8));
5714 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I8), dis_buf);
5716 break;
5717 case 1:
5718 if (isLD) {
5719 putQReg128(tt, mkV128(0x0000));
5720 putQRegLO(tt, loadLE(Ity_I16, mkexpr(ea)));
5721 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5722 } else {
5723 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I16));
5724 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I16), dis_buf);
5726 break;
5727 case 2: /* 32 bit */
5728 if (isLD) {
5729 putQReg128(tt, mkV128(0x0000));
5730 putQRegLO(tt, loadLE(Ity_I32, mkexpr(ea)));
5731 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5732 } else {
5733 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I32));
5734 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I32), dis_buf);
5736 break;
5737 case 3: /* 64 bit */
5738 if (isLD) {
5739 putQReg128(tt, mkV128(0x0000));
5740 putQRegLO(tt, loadLE(Ity_I64, mkexpr(ea)));
5741 DIP("ldr %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5742 } else {
5743 storeLE(mkexpr(ea), getQRegLO(tt, Ity_I64));
5744 DIP("str %s, %s\n", nameQRegLO(tt, Ity_I64), dis_buf);
5746 break;
5747 case 4:
5748 if (isLD) {
5749 putQReg128(tt, loadLE(Ity_V128, mkexpr(ea)));
5750 DIP("ldr %s, %s\n", nameQReg128(tt), dis_buf);
5751 } else {
5752 storeLE(mkexpr(ea), getQReg128(tt));
5753 DIP("str %s, %s\n", nameQReg128(tt), dis_buf);
5755 break;
5756 default:
5757 vassert(0);
5759 return True;
5761 after_LDR_STR_vector_register:
5763 /* ---------- LDRS{B,H,W} (integer register, SX) ---------- */
5764 /* 31 29 22 20 15 12 11 9 4
5765 | | | | | | | | |
5766 10 1110001 01 Rm opt S 10 Rn Rt LDRSW Xt, [Xn|SP, R<m>{ext/sh}]
5768 01 1110001 01 Rm opt S 10 Rn Rt LDRSH Xt, [Xn|SP, R<m>{ext/sh}]
5769 01 1110001 11 Rm opt S 10 Rn Rt LDRSH Wt, [Xn|SP, R<m>{ext/sh}]
5771 00 1110001 01 Rm opt S 10 Rn Rt LDRSB Xt, [Xn|SP, R<m>{ext/sh}]
5772 00 1110001 11 Rm opt S 10 Rn Rt LDRSB Wt, [Xn|SP, R<m>{ext/sh}]
5774 if (INSN(29,23) == BITS7(1,1,1,0,0,0,1)
5775 && INSN(21,21) == 1 && INSN(11,10) == BITS2(1,0)) {
5776 HChar dis_buf[64];
5777 UInt szLg2 = INSN(31,30);
5778 Bool sxTo64 = INSN(22,22) == 0; // else sx to 32 and zx to 64
5779 UInt tt = INSN(4,0);
5780 if (szLg2 == 3) goto after_LDRS_integer_register;
5781 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
5782 if (ea == IRTemp_INVALID) goto after_LDRS_integer_register;
5783 /* Enumerate the 5 variants explicitly. */
5784 if (szLg2 == 2/*32 bit*/ && sxTo64) {
5785 putIReg64orZR(tt, unop(Iop_32Sto64, loadLE(Ity_I32, mkexpr(ea))));
5786 DIP("ldrsw %s, %s\n", nameIReg64orZR(tt), dis_buf);
5787 return True;
5789 else
5790 if (szLg2 == 1/*16 bit*/) {
5791 if (sxTo64) {
5792 putIReg64orZR(tt, unop(Iop_16Sto64, loadLE(Ity_I16, mkexpr(ea))));
5793 DIP("ldrsh %s, %s\n", nameIReg64orZR(tt), dis_buf);
5794 } else {
5795 putIReg32orZR(tt, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(ea))));
5796 DIP("ldrsh %s, %s\n", nameIReg32orZR(tt), dis_buf);
5798 return True;
5800 else
5801 if (szLg2 == 0/*8 bit*/) {
5802 if (sxTo64) {
5803 putIReg64orZR(tt, unop(Iop_8Sto64, loadLE(Ity_I8, mkexpr(ea))));
5804 DIP("ldrsb %s, %s\n", nameIReg64orZR(tt), dis_buf);
5805 } else {
5806 putIReg32orZR(tt, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(ea))));
5807 DIP("ldrsb %s, %s\n", nameIReg32orZR(tt), dis_buf);
5809 return True;
5811 /* else it's an invalid combination */
5813 after_LDRS_integer_register:
5815 /* -------- LDR/STR (immediate, SIMD&FP, unsigned offset) -------- */
5816 /* This is the Unsigned offset variant only. The Post-Index and
5817 Pre-Index variants are below.
5819 31 29 23 21 9 4
5820 00 111 101 01 imm12 n t LDR Bt, [Xn|SP + imm12 * 1]
5821 01 111 101 01 imm12 n t LDR Ht, [Xn|SP + imm12 * 2]
5822 10 111 101 01 imm12 n t LDR St, [Xn|SP + imm12 * 4]
5823 11 111 101 01 imm12 n t LDR Dt, [Xn|SP + imm12 * 8]
5824 00 111 101 11 imm12 n t LDR Qt, [Xn|SP + imm12 * 16]
5826 00 111 101 00 imm12 n t STR Bt, [Xn|SP + imm12 * 1]
5827 01 111 101 00 imm12 n t STR Ht, [Xn|SP + imm12 * 2]
5828 10 111 101 00 imm12 n t STR St, [Xn|SP + imm12 * 4]
5829 11 111 101 00 imm12 n t STR Dt, [Xn|SP + imm12 * 8]
5830 00 111 101 10 imm12 n t STR Qt, [Xn|SP + imm12 * 16]
5832 if (INSN(29,24) == BITS6(1,1,1,1,0,1)
5833 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4) {
5834 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5835 Bool isLD = INSN(22,22) == 1;
5836 UInt pimm12 = INSN(21,10) << szLg2;
5837 UInt nn = INSN(9,5);
5838 UInt tt = INSN(4,0);
5839 IRTemp tEA = newTemp(Ity_I64);
5840 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5841 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(pimm12)));
5842 if (isLD) {
5843 if (szLg2 < 4) {
5844 putQReg128(tt, mkV128(0x0000));
5846 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5847 } else {
5848 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5850 DIP("%s %s, [%s, #%u]\n",
5851 isLD ? "ldr" : "str",
5852 nameQRegLO(tt, ty), nameIReg64orSP(nn), pimm12);
5853 return True;
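   /* Scaling example (illustrative only): for
         ldr q0, [x1, #32]
      szLg2 is 4, so the imm12 field holds 2 and pimm12 == 2 << 4 == 32.
      The load is done at full Q-register width and therefore writes all
      128 bits of q0, so no explicit zeroing of the upper half is needed.
   */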
5856 /* -------- LDR/STR (immediate, SIMD&FP, pre/post index) -------- */
5857 /* These are the Post-Index and Pre-Index variants.
5859 31 29 23 20 11 9 4
5860 (at-Rn-then-Rn=EA)
5861 00 111 100 01 0 imm9 01 n t LDR Bt, [Xn|SP], #simm
5862 01 111 100 01 0 imm9 01 n t LDR Ht, [Xn|SP], #simm
5863 10 111 100 01 0 imm9 01 n t LDR St, [Xn|SP], #simm
5864 11 111 100 01 0 imm9 01 n t LDR Dt, [Xn|SP], #simm
5865 00 111 100 11 0 imm9 01 n t LDR Qt, [Xn|SP], #simm
5867 (at-EA-then-Rn=EA)
5868 00 111 100 01 0 imm9 11 n t LDR Bt, [Xn|SP, #simm]!
5869 01 111 100 01 0 imm9 11 n t LDR Ht, [Xn|SP, #simm]!
5870 10 111 100 01 0 imm9 11 n t LDR St, [Xn|SP, #simm]!
5871 11 111 100 01 0 imm9 11 n t LDR Dt, [Xn|SP, #simm]!
5872 00 111 100 11 0 imm9 11 n t LDR Qt, [Xn|SP, #simm]!
5874 Stores are the same except with bit 22 set to 0.
5876 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5877 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5878 && INSN(21,21) == 0 && INSN(10,10) == 1) {
5879 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5880 Bool isLD = INSN(22,22) == 1;
5881 UInt imm9 = INSN(20,12);
5882 Bool atRN = INSN(11,11) == 0;
5883 UInt nn = INSN(9,5);
5884 UInt tt = INSN(4,0);
5885 IRTemp tRN = newTemp(Ity_I64);
5886 IRTemp tEA = newTemp(Ity_I64);
5887 IRTemp tTA = IRTemp_INVALID;
5888 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5889 ULong simm9 = sx_to_64(imm9, 9);
5890 assign(tRN, getIReg64orSP(nn));
5891 assign(tEA, binop(Iop_Add64, mkexpr(tRN), mkU64(simm9)));
5892 tTA = atRN ? tRN : tEA;
5894 /* Do early writeback for the cases typified by
5895 str d8, [sp, #-32]!
5896 str d10, [sp, #-128]!
5897 str q1, [sp, #-32]!
5898 for the same reasons as described in a similar comment in the
5899 "LDP,STP (immediate, simm7) (FP&VEC)" case just above.
5901 Bool earlyWBack
5902 = !atRN && !isLD && (ty == Ity_F64 || ty == Ity_V128)
5903 && nn == 31 && ((Long)simm9) < 0;
5905 if (earlyWBack)
5906 putIReg64orSP(nn, mkexpr(tEA));
5908 if (isLD) {
5909 if (szLg2 < 4) {
5910 putQReg128(tt, mkV128(0x0000));
5912 putQRegLO(tt, loadLE(ty, mkexpr(tTA)));
5913 } else {
5914 storeLE(mkexpr(tTA), getQRegLO(tt, ty));
5917 if (!earlyWBack)
5918 putIReg64orSP(nn, mkexpr(tEA));
5920 DIP(atRN ? "%s %s, [%s], #%lld\n" : "%s %s, [%s, #%lld]!\n",
5921 isLD ? "ldr" : "str",
5922 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5923 return True;
5926 /* -------- LDUR/STUR (unscaled offset, SIMD&FP) -------- */
5927 /* 31 29 23 20 11 9 4
5928 00 111 100 01 0 imm9 00 n t LDR Bt, [Xn|SP, #simm]
5929 01 111 100 01 0 imm9 00 n t LDR Ht, [Xn|SP, #simm]
5930 10 111 100 01 0 imm9 00 n t LDR St, [Xn|SP, #simm]
5931 11 111 100 01 0 imm9 00 n t LDR Dt, [Xn|SP, #simm]
5932 00 111 100 11 0 imm9 00 n t LDR Qt, [Xn|SP, #simm]
5934 00 111 100 00 0 imm9 00 n t STR Bt, [Xn|SP, #simm]
5935 01 111 100 00 0 imm9 00 n t STR Ht, [Xn|SP, #simm]
5936 10 111 100 00 0 imm9 00 n t STR St, [Xn|SP, #simm]
5937 11 111 100 00 0 imm9 00 n t STR Dt, [Xn|SP, #simm]
5938 00 111 100 10 0 imm9 00 n t STR Qt, [Xn|SP, #simm]
5940 if (INSN(29,24) == BITS6(1,1,1,1,0,0)
5941 && ((INSN(23,23) << 2) | INSN(31,30)) <= 4
5942 && INSN(21,21) == 0 && INSN(11,10) == BITS2(0,0)) {
5943 UInt szLg2 = (INSN(23,23) << 2) | INSN(31,30);
5944 Bool isLD = INSN(22,22) == 1;
5945 UInt imm9 = INSN(20,12);
5946 UInt nn = INSN(9,5);
5947 UInt tt = INSN(4,0);
5948 ULong simm9 = sx_to_64(imm9, 9);
5949 IRTemp tEA = newTemp(Ity_I64);
5950 IRType ty = preferredVectorSubTypeFromSize(1 << szLg2);
5951 assign(tEA, binop(Iop_Add64, getIReg64orSP(nn), mkU64(simm9)));
5952 if (isLD) {
5953 if (szLg2 < 4) {
5954 putQReg128(tt, mkV128(0x0000));
5956 putQRegLO(tt, loadLE(ty, mkexpr(tEA)));
5957 } else {
5958 storeLE(mkexpr(tEA), getQRegLO(tt, ty));
5960 DIP("%s %s, [%s, #%lld]\n",
5961 isLD ? "ldur" : "stur",
5962 nameQRegLO(tt, ty), nameIReg64orSP(nn), (Long)simm9);
5963 return True;
5966 /* ---------------- LDR (literal, SIMD&FP) ---------------- */
5967 /* 31 29 23 4
5968 00 011 100 imm19 t LDR St, [PC + sxTo64(imm19 << 2)]
5969 01 011 100 imm19 t LDR Dt, [PC + sxTo64(imm19 << 2)]
5970 10 011 100 imm19 t LDR Qt, [PC + sxTo64(imm19 << 2)]
5972 if (INSN(29,24) == BITS6(0,1,1,1,0,0) && INSN(31,30) < BITS2(1,1)) {
5973 UInt szB = 4 << INSN(31,30);
5974 UInt imm19 = INSN(23,5);
5975 UInt tt = INSN(4,0);
5976 ULong ea = guest_PC_curr_instr + sx_to_64(imm19 << 2, 21);
5977 IRType ty = preferredVectorSubTypeFromSize(szB);
5978 putQReg128(tt, mkV128(0x0000));
5979 putQRegLO(tt, loadLE(ty, mkU64(ea)));
5980 DIP("ldr %s, 0x%llx (literal)\n", nameQRegLO(tt, ty), ea);
5981 return True;
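   /* PC-relative example (illustrative only): an imm19 field of 0x7FFFF
      (all ones) gives imm19 << 2 == 0x1FFFFC, and sign extension from
      21 bits turns that into -4, so the literal is fetched from the
      word immediately preceding this instruction.
   */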
5984 /* ------ LD1/ST1 (multiple 1-elem structs to/from 1 reg ------ */
5985 /* ------ LD2/ST2 (multiple 2-elem structs to/from 2 regs ------ */
5986 /* ------ LD3/ST3 (multiple 3-elem structs to/from 3 regs ------ */
5987 /* ------ LD4/ST4 (multiple 4-elem structs to/from 4 regs ------ */
5988 /* 31 29 26 22 21 20 15 11 9 4
5990 0q 001 1000 L 0 00000 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP]
5991 0q 001 1001 L 0 m 0000 sz n t xx4 {Vt..t+3.T}, [Xn|SP], step
5993 0q 001 1000 L 0 00000 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP]
5994 0q 001 1001 L 0 m 0100 sz n t xx3 {Vt..t+2.T}, [Xn|SP], step
5996 0q 001 1000 L 0 00000 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP]
5997 0q 001 1001 L 0 m 1000 sz n t xx2 {Vt..t+1.T}, [Xn|SP], step
5999 0q 001 1000 L 0 00000 0111 sz n t xx1 {Vt.T}, [Xn|SP]
6000 0q 001 1001 L 0 m 0111 sz n t xx1 {Vt.T}, [Xn|SP], step
6002 T = defined by Q and sz in the normal way
6003 step = if m == 11111 then transfer-size else Xm
6004 xx = case L of 1 -> LD ; 0 -> ST
6006 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6007 && INSN(21,21) == 0) {
6008 Bool bitQ = INSN(30,30);
6009 Bool isPX = INSN(23,23) == 1;
6010 Bool isLD = INSN(22,22) == 1;
6011 UInt mm = INSN(20,16);
6012 UInt opc = INSN(15,12);
6013 UInt sz = INSN(11,10);
6014 UInt nn = INSN(9,5);
6015 UInt tt = INSN(4,0);
6016 Bool isQ = bitQ == 1;
6017 Bool is1d = sz == BITS2(1,1) && !isQ;
6018 UInt nRegs = 0;
6019 switch (opc) {
6020 case BITS4(0,0,0,0): nRegs = 4; break;
6021 case BITS4(0,1,0,0): nRegs = 3; break;
6022 case BITS4(1,0,0,0): nRegs = 2; break;
6023 case BITS4(0,1,1,1): nRegs = 1; break;
6024 default: break;
6027 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6028 If we see it, set nRegs to 0 so as to cause the next conditional
6029 to fail. */
6030 if (!isPX && mm != 0)
6031 nRegs = 0;
6033 if (nRegs == 1 /* .1d is allowed */
6034 || (nRegs >= 2 && nRegs <= 4 && !is1d) /* .1d is not allowed */) {
6036 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
6038 /* Generate the transfer address (TA) and if necessary the
6039 writeback address (WB) */
6040 IRTemp tTA = newTemp(Ity_I64);
6041 assign(tTA, getIReg64orSP(nn));
6042 if (nn == 31) { /* FIXME generate stack alignment check */ }
6043 IRTemp tWB = IRTemp_INVALID;
6044 if (isPX) {
6045 tWB = newTemp(Ity_I64);
6046 assign(tWB, binop(Iop_Add64,
6047 mkexpr(tTA),
6048 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6049 : getIReg64orZR(mm)));
6052 /* -- BEGIN generate the transfers -- */
6054 IRTemp u0, u1, u2, u3, i0, i1, i2, i3;
6055 u0 = u1 = u2 = u3 = i0 = i1 = i2 = i3 = IRTemp_INVALID;
6056 switch (nRegs) {
6057 case 4: u3 = newTempV128(); i3 = newTempV128(); /* fallthru */
6058 case 3: u2 = newTempV128(); i2 = newTempV128(); /* fallthru */
6059 case 2: u1 = newTempV128(); i1 = newTempV128(); /* fallthru */
6060 case 1: u0 = newTempV128(); i0 = newTempV128(); break;
6061 default: vassert(0);
6064 /* -- Multiple 128 or 64 bit stores -- */
6065 if (!isLD) {
6066 switch (nRegs) {
6067 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
6068 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
6069 case 2: assign(u1, getQReg128((tt+1) % 32)); /* fallthru */
6070 case 1: assign(u0, getQReg128((tt+0) % 32)); break;
6071 default: vassert(0);
6073 switch (nRegs) {
6074 case 4: (isQ ? math_INTERLEAVE4_128 : math_INTERLEAVE4_64)
6075 (&i0, &i1, &i2, &i3, sz, u0, u1, u2, u3);
6076 break;
6077 case 3: (isQ ? math_INTERLEAVE3_128 : math_INTERLEAVE3_64)
6078 (&i0, &i1, &i2, sz, u0, u1, u2);
6079 break;
6080 case 2: (isQ ? math_INTERLEAVE2_128 : math_INTERLEAVE2_64)
6081 (&i0, &i1, sz, u0, u1);
6082 break;
6083 case 1: (isQ ? math_INTERLEAVE1_128 : math_INTERLEAVE1_64)
6084 (&i0, sz, u0);
6085 break;
6086 default: vassert(0);
6088 # define MAYBE_NARROW_TO_64(_expr) \
6089 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6090 UInt step = isQ ? 16 : 8;
6091 switch (nRegs) {
6092 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
6093 MAYBE_NARROW_TO_64(mkexpr(i3)) );
6094 /* fallthru */
6095 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
6096 MAYBE_NARROW_TO_64(mkexpr(i2)) );
6097 /* fallthru */
6098 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
6099 MAYBE_NARROW_TO_64(mkexpr(i1)) );
6100 /* fallthru */
6101 case 1: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
6102 MAYBE_NARROW_TO_64(mkexpr(i0)) );
6103 break;
6104 default: vassert(0);
6106 # undef MAYBE_NARROW_TO_64
6109 /* -- Multiple 128 or 64 bit loads -- */
6110 else /* isLD */ {
6111 UInt step = isQ ? 16 : 8;
6112 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
6113 # define MAYBE_WIDEN_FROM_64(_expr) \
6114 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6115 switch (nRegs) {
6116 case 4:
6117 assign(i3, MAYBE_WIDEN_FROM_64(
6118 loadLE(loadTy,
6119 binop(Iop_Add64, mkexpr(tTA),
6120 mkU64(3 * step)))));
6121 /* fallthru */
6122 case 3:
6123 assign(i2, MAYBE_WIDEN_FROM_64(
6124 loadLE(loadTy,
6125 binop(Iop_Add64, mkexpr(tTA),
6126 mkU64(2 * step)))));
6127 /* fallthru */
6128 case 2:
6129 assign(i1, MAYBE_WIDEN_FROM_64(
6130 loadLE(loadTy,
6131 binop(Iop_Add64, mkexpr(tTA),
6132 mkU64(1 * step)))));
6133 /* fallthru */
6134 case 1:
6135 assign(i0, MAYBE_WIDEN_FROM_64(
6136 loadLE(loadTy,
6137 binop(Iop_Add64, mkexpr(tTA),
6138 mkU64(0 * step)))));
6139 break;
6140 default:
6141 vassert(0);
6143 # undef MAYBE_WIDEN_FROM_64
6144 switch (nRegs) {
6145 case 4: (isQ ? math_DEINTERLEAVE4_128 : math_DEINTERLEAVE4_64)
6146 (&u0, &u1, &u2, &u3, sz, i0,i1,i2,i3);
6147 break;
6148 case 3: (isQ ? math_DEINTERLEAVE3_128 : math_DEINTERLEAVE3_64)
6149 (&u0, &u1, &u2, sz, i0, i1, i2);
6150 break;
6151 case 2: (isQ ? math_DEINTERLEAVE2_128 : math_DEINTERLEAVE2_64)
6152 (&u0, &u1, sz, i0, i1);
6153 break;
6154 case 1: (isQ ? math_DEINTERLEAVE1_128 : math_DEINTERLEAVE1_64)
6155 (&u0, sz, i0);
6156 break;
6157 default: vassert(0);
6159 switch (nRegs) {
6160 case 4: putQReg128( (tt+3) % 32,
6161 math_MAYBE_ZERO_HI64(bitQ, u3));
6162 /* fallthru */
6163 case 3: putQReg128( (tt+2) % 32,
6164 math_MAYBE_ZERO_HI64(bitQ, u2));
6165 /* fallthru */
6166 case 2: putQReg128( (tt+1) % 32,
6167 math_MAYBE_ZERO_HI64(bitQ, u1));
6168 /* fallthru */
6169 case 1: putQReg128( (tt+0) % 32,
6170 math_MAYBE_ZERO_HI64(bitQ, u0));
6171 break;
6172 default: vassert(0);
6176 /* -- END generate the transfers -- */
6178 /* Do the writeback, if necessary */
6179 if (isPX) {
6180 putIReg64orSP(nn, mkexpr(tWB));
6183 HChar pxStr[20];
6184 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6185 if (isPX) {
6186 if (mm == BITS5(1,1,1,1,1))
6187 vex_sprintf(pxStr, ", #%u", xferSzB);
6188 else
6189 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6191 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6192 DIP("%s%u {v%u.%s .. v%u.%s}, [%s]%s\n",
6193 isLD ? "ld" : "st", nRegs,
6194 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6195 pxStr);
6197 if (nRegs >= 3) {
6198 dres->hint = Dis_HintVerbose;
6200 return True;
6202 /* else fall through */
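   /* Semantics sketch (illustrative only): for
         st2 {v0.4s, v1.4s}, [x0]
      nRegs == 2 and isQ is True, so the two source registers are first
      interleaved lane-wise (v0.s[0], v1.s[0], v0.s[1], v1.s[1], ...) by
      math_INTERLEAVE2_128, and the two resulting 128-bit values are then
      stored as a single contiguous 32-byte block at [x0].
   */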
6205 /* ------ LD1/ST1 (multiple 1-elem structs to/from 2 regs ------ */
6206 /* ------ LD1/ST1 (multiple 1-elem structs to/from 3 regs ------ */
6207 /* ------ LD1/ST1 (multiple 1-elem structs to/from 4 regs ------ */
6208 /* 31 29 26 22 21 20 15 11 9 4
6210 0q 001 1000 L 0 00000 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP]
6211 0q 001 1001 L 0 m 0010 sz n t xx1 {Vt..t+3.T}, [Xn|SP], step
6213 0q 001 1000 L 0 00000 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP]
6214 0q 001 1001 L 0 m 0110 sz n t xx1 {Vt..t+2.T}, [Xn|SP], step
6216 0q 001 1000 L 0 00000 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP]
6217 0q 001 1001 L 0 m 1010 sz n t xx1 {Vt..t+1.T}, [Xn|SP], step
6219 T = defined by Q and sz in the normal way
6220 step = if m == 11111 then transfer-size else Xm
6221 xx = case L of 1 -> LD ; 0 -> ST
6223 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,0)
6224 && INSN(21,21) == 0) {
6225 Bool bitQ = INSN(30,30);
6226 Bool isPX = INSN(23,23) == 1;
6227 Bool isLD = INSN(22,22) == 1;
6228 UInt mm = INSN(20,16);
6229 UInt opc = INSN(15,12);
6230 UInt sz = INSN(11,10);
6231 UInt nn = INSN(9,5);
6232 UInt tt = INSN(4,0);
6233 Bool isQ = bitQ == 1;
6234 UInt nRegs = 0;
6235 switch (opc) {
6236 case BITS4(0,0,1,0): nRegs = 4; break;
6237 case BITS4(0,1,1,0): nRegs = 3; break;
6238 case BITS4(1,0,1,0): nRegs = 2; break;
6239 default: break;
6242 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed.
6243 If we see it, set nRegs to 0 so as to cause the next conditional
6244 to fail. */
6245 if (!isPX && mm != 0)
6246 nRegs = 0;
6248 if (nRegs >= 2 && nRegs <= 4) {
6250 UInt xferSzB = (isQ ? 16 : 8) * nRegs;
6252 /* Generate the transfer address (TA) and if necessary the
6253 writeback address (WB) */
6254 IRTemp tTA = newTemp(Ity_I64);
6255 assign(tTA, getIReg64orSP(nn));
6256 if (nn == 31) { /* FIXME generate stack alignment check */ }
6257 IRTemp tWB = IRTemp_INVALID;
6258 if (isPX) {
6259 tWB = newTemp(Ity_I64);
6260 assign(tWB, binop(Iop_Add64,
6261 mkexpr(tTA),
6262 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6263 : getIReg64orZR(mm)));
6266 /* -- BEGIN generate the transfers -- */
6268 IRTemp u0, u1, u2, u3;
6269 u0 = u1 = u2 = u3 = IRTemp_INVALID;
6270 switch (nRegs) {
6271 case 4: u3 = newTempV128(); /* fallthru */
6272 case 3: u2 = newTempV128(); /* fallthru */
6273 case 2: u1 = newTempV128();
6274 u0 = newTempV128(); break;
6275 default: vassert(0);
6278 /* -- Multiple 128 or 64 bit stores -- */
6279 if (!isLD) {
6280 switch (nRegs) {
6281 case 4: assign(u3, getQReg128((tt+3) % 32)); /* fallthru */
6282 case 3: assign(u2, getQReg128((tt+2) % 32)); /* fallthru */
6283 case 2: assign(u1, getQReg128((tt+1) % 32));
6284 assign(u0, getQReg128((tt+0) % 32)); break;
6285 default: vassert(0);
6287 # define MAYBE_NARROW_TO_64(_expr) \
6288 (isQ ? (_expr) : unop(Iop_V128to64,(_expr)))
6289 UInt step = isQ ? 16 : 8;
6290 switch (nRegs) {
6291 case 4: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(3*step)),
6292 MAYBE_NARROW_TO_64(mkexpr(u3)) );
6293 /* fallthru */
6294 case 3: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(2*step)),
6295 MAYBE_NARROW_TO_64(mkexpr(u2)) );
6296 /* fallthru */
6297 case 2: storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(1*step)),
6298 MAYBE_NARROW_TO_64(mkexpr(u1)) );
6299 storeLE( binop(Iop_Add64, mkexpr(tTA), mkU64(0*step)),
6300 MAYBE_NARROW_TO_64(mkexpr(u0)) );
6301 break;
6302 default: vassert(0);
6304 # undef MAYBE_NARROW_TO_64
6307 /* -- Multiple 128 or 64 bit loads -- */
6308 else /* isLD */ {
6309 UInt step = isQ ? 16 : 8;
6310 IRType loadTy = isQ ? Ity_V128 : Ity_I64;
6311 # define MAYBE_WIDEN_FROM_64(_expr) \
6312 (isQ ? (_expr) : unop(Iop_64UtoV128,(_expr)))
6313 switch (nRegs) {
6314 case 4:
6315 assign(u3, MAYBE_WIDEN_FROM_64(
6316 loadLE(loadTy,
6317 binop(Iop_Add64, mkexpr(tTA),
6318 mkU64(3 * step)))));
6319 /* fallthru */
6320 case 3:
6321 assign(u2, MAYBE_WIDEN_FROM_64(
6322 loadLE(loadTy,
6323 binop(Iop_Add64, mkexpr(tTA),
6324 mkU64(2 * step)))));
6325 /* fallthru */
6326 case 2:
6327 assign(u1, MAYBE_WIDEN_FROM_64(
6328 loadLE(loadTy,
6329 binop(Iop_Add64, mkexpr(tTA),
6330 mkU64(1 * step)))));
6331 assign(u0, MAYBE_WIDEN_FROM_64(
6332 loadLE(loadTy,
6333 binop(Iop_Add64, mkexpr(tTA),
6334 mkU64(0 * step)))));
6335 break;
6336 default:
6337 vassert(0);
6339 # undef MAYBE_WIDEN_FROM_64
6340 switch (nRegs) {
6341 case 4: putQReg128( (tt+3) % 32,
6342 math_MAYBE_ZERO_HI64(bitQ, u3));
6343 /* fallthru */
6344 case 3: putQReg128( (tt+2) % 32,
6345 math_MAYBE_ZERO_HI64(bitQ, u2));
6346 /* fallthru */
6347 case 2: putQReg128( (tt+1) % 32,
6348 math_MAYBE_ZERO_HI64(bitQ, u1));
6349 putQReg128( (tt+0) % 32,
6350 math_MAYBE_ZERO_HI64(bitQ, u0));
6351 break;
6352 default: vassert(0);
6356 /* -- END generate the transfers -- */
6358 /* Do the writeback, if necessary */
6359 if (isPX) {
6360 putIReg64orSP(nn, mkexpr(tWB));
6363 HChar pxStr[20];
6364 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6365 if (isPX) {
6366 if (mm == BITS5(1,1,1,1,1))
6367 vex_sprintf(pxStr, ", #%u", xferSzB);
6368 else
6369 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6371 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6372 DIP("%s1 {v%u.%s .. v%u.%s}, [%s]%s\n",
6373 isLD ? "ld" : "st",
6374 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6375 pxStr);
6377 return True;
6379 /* else fall through */
6382 /* ---------- LD1R (single structure, replicate) ---------- */
6383 /* ---------- LD2R (single structure, replicate) ---------- */
6384 /* ---------- LD3R (single structure, replicate) ---------- */
6385 /* ---------- LD4R (single structure, replicate) ---------- */
6386 /* 31 29 22 20 15 11 9 4
6387 0q 001 1010 10 00000 110 0 sz n t LD1R {Vt.T}, [Xn|SP]
6388 0q 001 1011 10 m 110 0 sz n t LD1R {Vt.T}, [Xn|SP], step
6390 0q 001 1010 11 00000 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP]
6391 0q 001 1011 11 m 110 0 sz n t LD2R {Vt..t+1.T}, [Xn|SP], step
6393 0q 001 1010 10 00000 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP]
6394 0q 001 1011 10 m 111 0 sz n t LD3R {Vt..t+2.T}, [Xn|SP], step
6396 0q 001 1010 11 00000 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP]
6397 0q 001 1011 11 m 111 0 sz n t LD4R {Vt..t+3.T}, [Xn|SP], step
6399 step = if m == 11111 then transfer-size else Xm
6401 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)
6402 && INSN(22,22) == 1 && INSN(15,14) == BITS2(1,1)
6403 && INSN(12,12) == 0) {
6404 UInt bitQ = INSN(30,30);
6405 Bool isPX = INSN(23,23) == 1;
6406 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6407 UInt mm = INSN(20,16);
6408 UInt sz = INSN(11,10);
6409 UInt nn = INSN(9,5);
6410 UInt tt = INSN(4,0);
6412 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6413 if (isPX || mm == 0) {
6415 IRType ty = integerIRTypeOfSize(1 << sz);
6417 UInt laneSzB = 1 << sz;
6418 UInt xferSzB = laneSzB * nRegs;
6420 /* Generate the transfer address (TA) and if necessary the
6421 writeback address (WB) */
6422 IRTemp tTA = newTemp(Ity_I64);
6423 assign(tTA, getIReg64orSP(nn));
6424 if (nn == 31) { /* FIXME generate stack alignment check */ }
6425 IRTemp tWB = IRTemp_INVALID;
6426 if (isPX) {
6427 tWB = newTemp(Ity_I64);
6428 assign(tWB, binop(Iop_Add64,
6429 mkexpr(tTA),
6430 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6431 : getIReg64orZR(mm)));
6434 /* Do the writeback, if necessary */
6435 if (isPX) {
6436 putIReg64orSP(nn, mkexpr(tWB));
6439 IRTemp e0, e1, e2, e3, v0, v1, v2, v3;
6440 e0 = e1 = e2 = e3 = v0 = v1 = v2 = v3 = IRTemp_INVALID;
6441 switch (nRegs) {
6442 case 4:
6443 e3 = newTemp(ty);
6444 assign(e3, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6445 mkU64(3 * laneSzB))));
6446 v3 = math_DUP_TO_V128(e3, ty);
6447 putQReg128((tt+3) % 32, math_MAYBE_ZERO_HI64(bitQ, v3));
6448 /* fallthrough */
6449 case 3:
6450 e2 = newTemp(ty);
6451 assign(e2, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6452 mkU64(2 * laneSzB))));
6453 v2 = math_DUP_TO_V128(e2, ty);
6454 putQReg128((tt+2) % 32, math_MAYBE_ZERO_HI64(bitQ, v2));
6455 /* fallthrough */
6456 case 2:
6457 e1 = newTemp(ty);
6458 assign(e1, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6459 mkU64(1 * laneSzB))));
6460 v1 = math_DUP_TO_V128(e1, ty);
6461 putQReg128((tt+1) % 32, math_MAYBE_ZERO_HI64(bitQ, v1));
6462 /* fallthrough */
6463 case 1:
6464 e0 = newTemp(ty);
6465 assign(e0, loadLE(ty, binop(Iop_Add64, mkexpr(tTA),
6466 mkU64(0 * laneSzB))));
6467 v0 = math_DUP_TO_V128(e0, ty);
6468 putQReg128((tt+0) % 32, math_MAYBE_ZERO_HI64(bitQ, v0));
6469 break;
6470 default:
6471 vassert(0);
6474 HChar pxStr[20];
6475 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6476 if (isPX) {
6477 if (mm == BITS5(1,1,1,1,1))
6478 vex_sprintf(pxStr, ", #%u", xferSzB);
6479 else
6480 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6482 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6483 DIP("ld%ur {v%u.%s .. v%u.%s}, [%s]%s\n",
6484 nRegs,
6485 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr, nameIReg64orSP(nn),
6486 pxStr);
6488 return True;
6490 /* else fall through */
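   /* Replication example (illustrative only): for
         ld1r {v0.4s}, [x0]
      a single 32-bit element is loaded from [x0], duplicated across all
      four lanes by math_DUP_TO_V128, and written to the whole of v0.
      With the 64-bit form (e.g. v0.2s) the upper half of the Q register
      is cleared by math_MAYBE_ZERO_HI64.
   */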
6493 /* ------ LD1/ST1 (single structure, to/from one lane) ------ */
6494 /* ------ LD2/ST2 (single structure, to/from one lane) ------ */
6495 /* ------ LD3/ST3 (single structure, to/from one lane) ------ */
6496 /* ------ LD4/ST4 (single structure, to/from one lane) ------ */
6497 /* 31 29 22 21 20 15 11 9 4
6498 0q 001 1010 L 0 00000 xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP]
6499 0q 001 1011 L 0 m xx0 S sz n t op1 {Vt.T}[ix], [Xn|SP], step
6501 0q 001 1010 L 1 00000 xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP]
6502 0q 001 1011 L 1 m xx0 S sz n t op2 {Vt..t+1.T}[ix], [Xn|SP], step
6504 0q 001 1010 L 0 00000 xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP]
6505 0q 001 1011 L 0 m xx1 S sz n t op3 {Vt..t+2.T}[ix], [Xn|SP], step
6507 0q 001 1010 L 1 00000 xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP]
6508 0q 001 1011 L 1 m xx1 S sz n t op4 {Vt..t+3.T}[ix], [Xn|SP], step
6510 step = if m == 11111 then transfer-size else Xm
6511 op = case L of 1 -> LD ; 0 -> ST
6513 laneszB,ix = case xx:q:S:sz of 00:b:b:bb -> 1, bbbb
6514 01:b:b:b0 -> 2, bbb
6515 10:b:b:00 -> 4, bb
6516 10:b:0:01 -> 8, b
6518 if (INSN(31,31) == 0 && INSN(29,24) == BITS6(0,0,1,1,0,1)) {
6519 UInt bitQ = INSN(30,30);
6520 Bool isPX = INSN(23,23) == 1;
6521 Bool isLD = INSN(22,22) == 1;
6522 UInt nRegs = ((INSN(13,13) << 1) | INSN(21,21)) + 1;
6523 UInt mm = INSN(20,16);
6524 UInt xx = INSN(15,14);
6525 UInt bitS = INSN(12,12);
6526 UInt sz = INSN(11,10);
6527 UInt nn = INSN(9,5);
6528 UInt tt = INSN(4,0);
6530 Bool valid = True;
6532 /* The combination insn[23] == 0 && insn[20:16] != 0 is not allowed. */
6533 if (!isPX && mm != 0)
6534 valid = False;
6536 UInt laneSzB = 0; /* invalid */
6537 UInt ix = 16; /* invalid */
6539 UInt xx_q_S_sz = (xx << 4) | (bitQ << 3) | (bitS << 2) | sz;
6540 switch (xx_q_S_sz) {
6541 case 0x00: case 0x01: case 0x02: case 0x03:
6542 case 0x04: case 0x05: case 0x06: case 0x07:
6543 case 0x08: case 0x09: case 0x0A: case 0x0B:
6544 case 0x0C: case 0x0D: case 0x0E: case 0x0F:
6545 laneSzB = 1; ix = xx_q_S_sz & 0xF;
6546 break;
6547 case 0x10: case 0x12: case 0x14: case 0x16:
6548 case 0x18: case 0x1A: case 0x1C: case 0x1E:
6549 laneSzB = 2; ix = (xx_q_S_sz >> 1) & 7;
6550 break;
6551 case 0x20: case 0x24: case 0x28: case 0x2C:
6552 laneSzB = 4; ix = (xx_q_S_sz >> 2) & 3;
6553 break;
6554 case 0x21: case 0x29:
6555 laneSzB = 8; ix = (xx_q_S_sz >> 3) & 1;
6556 break;
6557 default:
6558 break;
6561 if (valid && laneSzB != 0) {
6563 IRType ty = integerIRTypeOfSize(laneSzB);
6564 UInt xferSzB = laneSzB * nRegs;
6566 /* Generate the transfer address (TA) and if necessary the
6567 writeback address (WB) */
6568 IRTemp tTA = newTemp(Ity_I64);
6569 assign(tTA, getIReg64orSP(nn));
6570 if (nn == 31) { /* FIXME generate stack alignment check */ }
6571 IRTemp tWB = IRTemp_INVALID;
6572 if (isPX) {
6573 tWB = newTemp(Ity_I64);
6574 assign(tWB, binop(Iop_Add64,
6575 mkexpr(tTA),
6576 mm == BITS5(1,1,1,1,1) ? mkU64(xferSzB)
6577 : getIReg64orZR(mm)));
6580 /* Do the writeback, if necessary */
6581 if (isPX) {
6582 putIReg64orSP(nn, mkexpr(tWB));
6585 switch (nRegs) {
6586 case 4: {
6587 IRExpr* addr
6588 = binop(Iop_Add64, mkexpr(tTA), mkU64(3 * laneSzB));
6589 if (isLD) {
6590 putQRegLane((tt+3) % 32, ix, loadLE(ty, addr));
6591 } else {
6592 storeLE(addr, getQRegLane((tt+3) % 32, ix, ty));
6595 /* fallthrough */
6596 case 3: {
6597 IRExpr* addr
6598 = binop(Iop_Add64, mkexpr(tTA), mkU64(2 * laneSzB));
6599 if (isLD) {
6600 putQRegLane((tt+2) % 32, ix, loadLE(ty, addr));
6601 } else {
6602 storeLE(addr, getQRegLane((tt+2) % 32, ix, ty));
6605 /* fallthrough */
6606 case 2: {
6607 IRExpr* addr
6608 = binop(Iop_Add64, mkexpr(tTA), mkU64(1 * laneSzB));
6609 if (isLD) {
6610 putQRegLane((tt+1) % 32, ix, loadLE(ty, addr));
6611 } else {
6612 storeLE(addr, getQRegLane((tt+1) % 32, ix, ty));
6615 /* fallthrough */
6616 case 1: {
6617 IRExpr* addr
6618 = binop(Iop_Add64, mkexpr(tTA), mkU64(0 * laneSzB));
6619 if (isLD) {
6620 putQRegLane((tt+0) % 32, ix, loadLE(ty, addr));
6621 } else {
6622 storeLE(addr, getQRegLane((tt+0) % 32, ix, ty));
6624 break;
6626 default:
6627 vassert(0);
6630 HChar pxStr[20];
6631 pxStr[0] = pxStr[sizeof(pxStr)-1] = 0;
6632 if (isPX) {
6633 if (mm == BITS5(1,1,1,1,1))
6634 vex_sprintf(pxStr, ", #%u", xferSzB);
6635 else
6636 vex_sprintf(pxStr, ", %s", nameIReg64orZR(mm));
6638 const HChar* arr = nameArr_Q_SZ(bitQ, sz);
6639 DIP("%s%u {v%u.%s .. v%u.%s}[%u], [%s]%s\n",
6640 isLD ? "ld" : "st", nRegs,
6641 (tt+0) % 32, arr, (tt+nRegs-1) % 32, arr,
6642 ix, nameIReg64orSP(nn), pxStr);
6644 return True;
6646 /* else fall through */
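   /* Lane-index example (illustrative only): for
         st1 {v7.b}[10], [x0]
      xx == 00 selects byte-sized lanes, and the index 10 is recovered
      from the concatenation q:S:sz == 1:0:10, i.e. ix == 0b1010.  The
      generated IR is then just a one-byte store of Q7's lane 10 to [x0].
   */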
6649 /* ------------------ LD{,A}X{R,RH,RB} ------------------ */
6650 /* ------------------ ST{,L}X{R,RH,RB} ------------------ */
6651 /* 31 29 23 20 14 9 4
6652 sz 001000 010 11111 0 11111 n t LDX{R,RH,RB} Rt, [Xn|SP]
6653 sz 001000 010 11111 1 11111 n t LDAX{R,RH,RB} Rt, [Xn|SP]
6654 sz 001000 000 s 0 11111 n t STX{R,RH,RB} Ws, Rt, [Xn|SP]
6655 sz 001000 000 s 1 11111 n t STLX{R,RH,RB} Ws, Rt, [Xn|SP]
6657 /* For the "standard" implementation we pass through the LL and SC to
6658 the host. For the "fallback" implementation, for details see
6659 https://bugs.kde.org/show_bug.cgi?id=344524 and
6660 https://bugs.kde.org/show_bug.cgi?id=369459,
6661 but in short:
6663 LoadLinked(addr)
6664 gs.LLsize = load_size // 1, 2, 4 or 8
6665 gs.LLaddr = addr
6666 gs.LLdata = zeroExtend(*addr)
6668 StoreCond(addr, data)
6669 tmp_LLsize = gs.LLsize
6670 gs.LLsize = 0 // "no transaction"
6671 if tmp_LLsize != store_size -> fail
6672 if addr != gs.LLaddr -> fail
6673 if zeroExtend(*addr) != gs.LLdata -> fail
6674 cas_ok = CAS(store_size, addr, gs.LLdata -> data)
6675 if !cas_ok -> fail
6676 succeed
6678 When thread scheduled
6679 gs.LLsize = 0 // "no transaction"
6680 (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
6681 has to do this bit)
6683 if (INSN(29,24) == BITS6(0,0,1,0,0,0)
6684 && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
6685 && INSN(14,10) == BITS5(1,1,1,1,1)) {
6686 UInt szBlg2 = INSN(31,30);
6687 Bool isLD = INSN(22,22) == 1;
6688 Bool isAcqOrRel = INSN(15,15) == 1;
6689 UInt ss = INSN(20,16);
6690 UInt nn = INSN(9,5);
6691 UInt tt = INSN(4,0);
6693 vassert(szBlg2 < 4);
6694 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
6695 IRType ty = integerIRTypeOfSize(szB);
6696 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
6698 IRTemp ea = newTemp(Ity_I64);
6699 assign(ea, getIReg64orSP(nn));
6700 gen_SIGBUS_if_not_XX_aligned(ea, szB);
6702 if (isLD && ss == BITS5(1,1,1,1,1)) {
6703 IRTemp res = newTemp(ty);
6704 if (abiinfo->guest__use_fallback_LLSC) {
6705 // Do the load first so we don't update any guest state
6706 // if it faults.
6707 IRTemp loaded_data64 = newTemp(Ity_I64);
6708 assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
6709 stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) ));
6710 stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) ));
6711 stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6712 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
6713 putIReg64orZR(tt, mkexpr(loaded_data64));
6714 } else {
6715 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
6716 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
6718 if (isAcqOrRel) {
6719 stmt(IRStmt_MBE(Imbe_Fence));
6721 DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
6722 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6723 abiinfo->guest__use_fallback_LLSC
6724 ? "(fallback implementation)" : "");
6725 return True;
6727 if (!isLD) {
6728 if (isAcqOrRel) {
6729 stmt(IRStmt_MBE(Imbe_Fence));
6731 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
6732 if (abiinfo->guest__use_fallback_LLSC) {
6733 // This is really ugly, since we don't have any way to do
6734 // proper if-then-else. First, set up as if the SC failed,
6735 // and jump forwards if it really has failed.
6737 // Continuation address
6738 IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
6740 // "the SC failed". Any non-zero value means failure.
6741 putIReg64orZR(ss, mkU64(1));
6743 IRTemp tmp_LLsize = newTemp(Ity_I64);
6744 assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
6745 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
6747 // Fail if no or wrong-size transaction
6748 vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
6749 stmt( IRStmt_Exit(
6750 binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
6751 Ijk_Boring, nia, OFFB_PC
6753 // Fail if the address doesn't match the LL address
6754 stmt( IRStmt_Exit(
6755 binop(Iop_CmpNE64, mkexpr(ea),
6756 IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
6757 Ijk_Boring, nia, OFFB_PC
6759 // Fail if the data doesn't match the LL data
6760 IRTemp llsc_data64 = newTemp(Ity_I64);
6761 assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
6762 stmt( IRStmt_Exit(
6763 binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
6764 mkexpr(llsc_data64)),
6765 Ijk_Boring, nia, OFFB_PC
6767 // Try to CAS the new value in.
6768 IRTemp old = newTemp(ty);
6769 IRTemp expd = newTemp(ty);
6770 assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
6771 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6772 Iend_LE, mkexpr(ea),
6773 /*expdHi*/NULL, mkexpr(expd),
6774 /*dataHi*/NULL, data
6775 )));
6776 // Fail if the CAS failed (viz, old != expd)
6777 stmt( IRStmt_Exit(
6778 binop(Iop_CmpNE64,
6779 widenUto64(ty, mkexpr(old)),
6780 widenUto64(ty, mkexpr(expd))),
6781 Ijk_Boring, nia, OFFB_PC
6783 // Otherwise we succeeded (!)
6784 putIReg64orZR(ss, mkU64(0));
6785 } else {
6786 IRTemp res = newTemp(Ity_I1);
6787 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
6788 /* IR semantics: res is 1 if store succeeds, 0 if it fails.
6789 Need to set rS to 1 on failure, 0 on success. */
6790 putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
6791 mkU64(1)));
6793 DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "l" : "", suffix[szBlg2],
6794 nameIRegOrZR(False, ss),
6795 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
6796 abiinfo->guest__use_fallback_LLSC
6797 ? "(fallback implementation)" : "");
6798 return True;
6800 /* else fall through */
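   /* Usage sketch (illustrative only): a typical guest lock-acquire loop
         1: ldaxr x1, [x0]
            cbnz  x1, 1b
            stlxr w2, x3, [x0]
            cbnz  w2, 1b
      works under the fallback scheme because the LDAXR stashes the
      address, size and loaded data in the guest state, and the STLXR
      only succeeds (writes 0 to w2, via the CAS above) if all three
      still match.  Clearing gs.LLsize on a thread switch forces the SC
      to fail and the loop to retry, which is a legal LL/SC outcome.
   */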
6803 /* -------------------- LD{,A}XP -------------------- */
6804 /* -------------------- ST{,L}XP -------------------- */
6805 /* 31 30 29 23 20 15 14 9 4
6806 1 sz 001000 011 11111 0 t2 n t1 LDXP Rt1, Rt2, [Xn|SP]
6807 1 sz 001000 011 11111 1 t2 n t1 LDAXP Rt1, Rt2, [Xn|SP]
6808 1 sz 001000 001 s 0 t2 n t1 STXP Ws, Rt1, Rt2, [Xn|SP]
6809 1 sz 001000 001 s 1 t2 n t1 STLXP Ws, Rt1, Rt2, [Xn|SP]
6811 /* See just above, "LD{,A}X{R,RH,RB} / ST{,L}X{R,RH,RB}", for detailed
6812 comments about this implementation. Note the 'sz' field here is only 1
6813 bit; above, it is 2 bits, and has a different encoding.
6815 if (INSN(31,31) == 1
6816 && INSN(29,24) == BITS6(0,0,1,0,0,0)
6817 && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,1)) {
6818 Bool elemIs64 = INSN(30,30) == 1;
6819 Bool isLD = INSN(22,22) == 1;
6820 Bool isAcqOrRel = INSN(15,15) == 1;
6821 UInt ss = INSN(20,16);
6822 UInt tt2 = INSN(14,10);
6823 UInt nn = INSN(9,5);
6824 UInt tt1 = INSN(4,0);
6826 UInt elemSzB = elemIs64 ? 8 : 4;
6827 UInt fullSzB = 2 * elemSzB;
6828 IRType elemTy = integerIRTypeOfSize(elemSzB);
6829 IRType fullTy = integerIRTypeOfSize(fullSzB);
6831 IRTemp ea = newTemp(Ity_I64);
6832 assign(ea, getIReg64orSP(nn));
6833 gen_SIGBUS_if_not_XX_aligned(ea, fullSzB);
6835 if (isLD && ss == BITS5(1,1,1,1,1)) {
6836 if (abiinfo->guest__use_fallback_LLSC) {
6837 // Fallback implementation of LL.
6838 // Do the load first so we don't update any guest state if it
6839 // faults. Assumes little-endian guest.
6840 if (fullTy == Ity_I64) {
6841 vassert(elemSzB == 4);
6842 IRTemp loaded_data64 = newTemp(Ity_I64);
6843 assign(loaded_data64, loadLE(fullTy, mkexpr(ea)));
6844 stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) ));
6845 stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) ));
6846 stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6847 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(8) ));
6848 putIReg64orZR(tt1, unop(Iop_32Uto64,
6849 unop(Iop_64to32,
6850 mkexpr(loaded_data64))));
6851 putIReg64orZR(tt2, unop(Iop_32Uto64,
6852 unop(Iop_64HIto32,
6853 mkexpr(loaded_data64))));
6854 } else {
6855 vassert(elemSzB == 8 && fullTy == Ity_I128);
6856 IRTemp loaded_data128 = newTemp(Ity_I128);
6857 // Hack: do the load as V128 rather than I128 so as to avoid
6858 // having to implement I128 loads in the arm64 back end.
6859 assign(loaded_data128, unop(Iop_ReinterpV128asI128,
6860 loadLE(Ity_V128, mkexpr(ea))));
6861 IRTemp loaded_data_lo64 = newTemp(Ity_I64);
6862 IRTemp loaded_data_hi64 = newTemp(Ity_I64);
6863 assign(loaded_data_lo64, unop(Iop_128to64,
6864 mkexpr(loaded_data128)));
6865 assign(loaded_data_hi64, unop(Iop_128HIto64,
6866 mkexpr(loaded_data128)));
6867 stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64,
6868 mkexpr(loaded_data_lo64) ));
6869 stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64,
6870 mkexpr(loaded_data_hi64) ));
6871 stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6872 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(16) ));
6873 putIReg64orZR(tt1, mkexpr(loaded_data_lo64));
6874 putIReg64orZR(tt2, mkexpr(loaded_data_hi64));
6876 } else {
6877 // Non-fallback implementation of LL.
6878 IRTemp res = newTemp(fullTy); // I64 or I128
6879 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
6880 // Assuming a little-endian guest here. Rt1 goes at the lower
6881 // address, so it must live in the least significant half of `res`.
6882 IROp opGetLO = fullTy == Ity_I128 ? Iop_128to64 : Iop_64to32;
6883 IROp opGetHI = fullTy == Ity_I128 ? Iop_128HIto64 : Iop_64HIto32;
6884 putIReg64orZR(tt1, widenUto64(elemTy, unop(opGetLO, mkexpr(res))));
6885 putIReg64orZR(tt2, widenUto64(elemTy, unop(opGetHI, mkexpr(res))));
6887 if (isAcqOrRel) {
6888 stmt(IRStmt_MBE(Imbe_Fence));
6890 DIP("ld%sxp %s, %s, [%s] %s\n",
6891 isAcqOrRel ? (isLD ? "a" : "l") : "",
6892 nameIRegOrZR(elemSzB == 8, tt1),
6893 nameIRegOrZR(elemSzB == 8, tt2),
6894 nameIReg64orSP(nn),
6895 abiinfo->guest__use_fallback_LLSC
6896 ? "(fallback implementation)" : "");
6897 return True;
6899 if (!isLD) {
6900 if (isAcqOrRel) {
6901 stmt(IRStmt_MBE(Imbe_Fence));
6903 if (abiinfo->guest__use_fallback_LLSC) {
6904 // Fallback implementation of SC.
6905 // This is really ugly, since we don't have any way to do
6906 // proper if-then-else. First, set up as if the SC failed,
6907 // and jump forwards if it really has failed.
6909 // Continuation address
6910 IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
6912 // "the SC failed". Any non-zero value means failure.
6913 putIReg64orZR(ss, mkU64(1));
6915 IRTemp tmp_LLsize = newTemp(Ity_I64);
6916 assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
6917 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
6919 // Fail if no or wrong-size transaction
6920 vassert((fullSzB == 8 && fullTy == Ity_I64)
6921 || (fullSzB == 16 && fullTy == Ity_I128));
6922 stmt( IRStmt_Exit(
6923 binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(fullSzB)),
6924 Ijk_Boring, nia, OFFB_PC
6926 // Fail if the address doesn't match the LL address
6927 stmt( IRStmt_Exit(
6928 binop(Iop_CmpNE64, mkexpr(ea),
6929 IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
6930 Ijk_Boring, nia, OFFB_PC
6932 // The data to be stored.
6933 IRTemp store_data = newTemp(fullTy);
6934 if (fullTy == Ity_I64) {
6935 assign(store_data,
6936 binop(Iop_32HLto64,
6937 narrowFrom64(Ity_I32, getIReg64orZR(tt2)),
6938 narrowFrom64(Ity_I32, getIReg64orZR(tt1))));
6939 } else {
6940 assign(store_data,
6941 binop(Iop_64HLto128,
6942 getIReg64orZR(tt2), getIReg64orZR(tt1)));
6945 if (fullTy == Ity_I64) {
6946 // 64 bit (2x32 bit) path
6947 // Fail if the data in memory doesn't match the data stashed by
6948 // the LL.
6949 IRTemp llsc_data_lo64 = newTemp(Ity_I64);
6950 assign(llsc_data_lo64,
6951 IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
6952 stmt( IRStmt_Exit(
6953 binop(Iop_CmpNE64, loadLE(Ity_I64, mkexpr(ea)),
6954 mkexpr(llsc_data_lo64)),
6955 Ijk_Boring, nia, OFFB_PC
6957 // Try to CAS the new value in.
6958 IRTemp old = newTemp(Ity_I64);
6959 IRTemp expd = newTemp(Ity_I64);
6960 assign(expd, mkexpr(llsc_data_lo64));
6961 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6962 Iend_LE, mkexpr(ea),
6963 /*expdHi*/NULL, mkexpr(expd),
6964 /*dataHi*/NULL, mkexpr(store_data)
6965 )));
6966 // Fail if the CAS failed (viz, old != expd)
6967 stmt( IRStmt_Exit(
6968 binop(Iop_CmpNE64, mkexpr(old), mkexpr(expd)),
6969 Ijk_Boring, nia, OFFB_PC
6971 } else {
6972 // 128 bit (2x64 bit) path
6973 // Fail if the data in memory doesn't match the data stashed by
6974 // the LL.
6975 IRTemp llsc_data_lo64 = newTemp(Ity_I64);
6976 assign(llsc_data_lo64,
6977 IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
6978 IRTemp llsc_data_hi64 = newTemp(Ity_I64);
6979 assign(llsc_data_hi64,
6980 IRExpr_Get(OFFB_LLSC_DATA_HI64, Ity_I64));
6981 IRTemp data_at_ea = newTemp(Ity_I128);
6982 assign(data_at_ea,
6983 unop(Iop_ReinterpV128asI128,
6984 loadLE(Ity_V128, mkexpr(ea))));
6985 stmt( IRStmt_Exit(
6986 binop(Iop_CmpNE64,
6987 unop(Iop_128to64, mkexpr(data_at_ea)),
6988 mkexpr(llsc_data_lo64)),
6989 Ijk_Boring, nia, OFFB_PC
6991 stmt( IRStmt_Exit(
6992 binop(Iop_CmpNE64,
6993 unop(Iop_128HIto64, mkexpr(data_at_ea)),
6994 mkexpr(llsc_data_hi64)),
6995 Ijk_Boring, nia, OFFB_PC
6997 // Try to CAS the new value in.
6998 IRTemp old_lo64 = newTemp(Ity_I64);
6999 IRTemp old_hi64 = newTemp(Ity_I64);
7000 IRTemp expd_lo64 = newTemp(Ity_I64);
7001 IRTemp expd_hi64 = newTemp(Ity_I64);
7002 IRTemp store_data_lo64 = newTemp(Ity_I64);
7003 IRTemp store_data_hi64 = newTemp(Ity_I64);
7004 assign(expd_lo64, mkexpr(llsc_data_lo64));
7005 assign(expd_hi64, mkexpr(llsc_data_hi64));
7006 assign(store_data_lo64, unop(Iop_128to64, mkexpr(store_data)));
7007 assign(store_data_hi64, unop(Iop_128HIto64, mkexpr(store_data)));
7008 stmt( IRStmt_CAS(mkIRCAS(old_hi64, old_lo64,
7009 Iend_LE, mkexpr(ea),
7010 mkexpr(expd_hi64), mkexpr(expd_lo64),
7011 mkexpr(store_data_hi64),
7012 mkexpr(store_data_lo64)
7013 )));
7014 // Fail if the CAS failed (viz, old != expd)
7015 stmt( IRStmt_Exit(
7016 binop(Iop_CmpNE64, mkexpr(old_lo64), mkexpr(expd_lo64)),
7017 Ijk_Boring, nia, OFFB_PC
7019 stmt( IRStmt_Exit(
7020 binop(Iop_CmpNE64, mkexpr(old_hi64), mkexpr(expd_hi64)),
7021 Ijk_Boring, nia, OFFB_PC
7024 // Otherwise we succeeded (!)
7025 putIReg64orZR(ss, mkU64(0));
7026 } else {
7027 // Non-fallback implementation of SC.
7028 IRTemp res = newTemp(Ity_I1);
7029 IRExpr* dataLO = narrowFrom64(elemTy, getIReg64orZR(tt1));
7030 IRExpr* dataHI = narrowFrom64(elemTy, getIReg64orZR(tt2));
7031 IROp opMerge = fullTy == Ity_I128 ? Iop_64HLto128 : Iop_32HLto64;
7032 IRExpr* data = binop(opMerge, dataHI, dataLO);
7033 // Assuming a little-endian guest here. Rt1 goes at the lower
7034 // address, so it must live in the least significant half of `data`.
7035 stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
7036 /* IR semantics: res is 1 if store succeeds, 0 if it fails.
7037 Need to set rS to 1 on failure, 0 on success. */
7038 putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
7039 mkU64(1)));
7041 DIP("st%sxp %s, %s, %s, [%s] %s\n",
7042 isAcqOrRel ? (isLD ? "a" : "l") : "",
7043 nameIRegOrZR(False, ss),
7044 nameIRegOrZR(elemSzB == 8, tt1),
7045 nameIRegOrZR(elemSzB == 8, tt2),
7046 nameIReg64orSP(nn),
7047 abiinfo->guest__use_fallback_LLSC
7048 ? "(fallback implementation)" : "");
7049 return True;
7051 /* else fall through */
7054 /* ------------------ LDA{R,RH,RB} ------------------ */
7055 /* ------------------ STL{R,RH,RB} ------------------ */
7056 /* 31 29 23 20 14 9 4
7057 sz 001000 110 11111 1 11111 n t LDAR<sz> Rt, [Xn|SP]
7058 sz 001000 100 11111 1 11111 n t STLR<sz> Rt, [Xn|SP]
7060 if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
7061 && INSN(21,10) == BITS12(0,1,1,1,1,1,1,1,1,1,1,1)) {
7062 UInt szBlg2 = INSN(31,30);
7063 Bool isLD = INSN(22,22) == 1;
7064 UInt nn = INSN(9,5);
7065 UInt tt = INSN(4,0);
7067 vassert(szBlg2 < 4);
7068 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
7069 IRType ty = integerIRTypeOfSize(szB);
7070 const HChar* suffix[4] = { "rb", "rh", "r", "r" };
7072 IRTemp ea = newTemp(Ity_I64);
7073 assign(ea, getIReg64orSP(nn));
7074 gen_SIGBUS_if_not_XX_aligned(ea, szB);
7076 if (isLD) {
7077 IRTemp res = newTemp(ty);
7078 assign(res, loadLE(ty, mkexpr(ea)));
7079 putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
7080 stmt(IRStmt_MBE(Imbe_Fence));
7081 DIP("lda%s %s, [%s]\n", suffix[szBlg2],
7082 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
7083 } else {
7084 stmt(IRStmt_MBE(Imbe_Fence));
7085 IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
7086 storeLE(mkexpr(ea), data);
7087 DIP("stl%s %s, [%s]\n", suffix[szBlg2],
7088 nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
7090 return True;
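   /* Ordering sketch (illustrative only): LDAR is modelled as a plain
      load followed by Imbe_Fence, and STLR as Imbe_Fence followed by a
      plain store.  That is a conservative over-approximation of the
      architectural acquire/release ordering, but it is safe.
   */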
7093 /* The PRFM cases that follow possibly allow Rt values (the
7094 prefetch operation) that are not allowed by the documentation.
7095 This should be looked into. */
7096 /* ------------------ PRFM (immediate) ------------------ */
7097 /* 31 21 9 4
7098 11 111 00110 imm12 n t PRFM prfop=Rt, [Xn|SP, #pimm]
7100 if (INSN(31,22) == BITS10(1,1,1,1,1,0,0,1,1,0)) {
7101 UInt imm12 = INSN(21,10);
7102 UInt nn = INSN(9,5);
7103 UInt tt = INSN(4,0);
7104 /* Generating any IR here is pointless, except for documentation
7105 purposes, as it will get optimised away later. */
7106 IRTemp ea = newTemp(Ity_I64);
7107 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(imm12 * 8)));
7108 DIP("prfm prfop=%u, [%s, #%u]\n", tt, nameIReg64orSP(nn), imm12 * 8);
7109 return True;
7112 /* ------------------ PRFM (register) ------------------ */
7113 /* 31 29 22 20 15 12 11 9 4
7114 11 1110001 01 Rm opt S 10 Rn Rt PRFM prfop=Rt, [Xn|SP, R<m>{ext/sh}]
7116 if (INSN(31,21) == BITS11(1,1,1,1,1,0,0,0,1,0,1)
7117 && INSN(11,10) == BITS2(1,0)) {
7118 HChar dis_buf[64];
7119 UInt tt = INSN(4,0);
7120 IRTemp ea = gen_indexed_EA(dis_buf, insn, True/*to/from int regs*/);
7121 if (ea != IRTemp_INVALID) {
7122 /* No actual code to generate. */
7123 DIP("prfm prfop=%u, %s\n", tt, dis_buf);
7124 return True;
7128 /* ------------------ PRFM (unscaled offset) ------------------ */
7129 /* 31 29 22 20 11 9 4
7130 11 1110001 00 imm9 00 Rn Rt PRFM pfrop=Rt, [Xn|SP, #simm]
7132 if (INSN(31,21) == BITS11(1,1, 1,1,1,0,0,0,1, 0,0)
7133 && INSN(11,10) == BITS2(0,0)) {
7134 ULong imm9 = INSN(20,12);
7135 UInt nn = INSN(9,5);
7136 UInt tt = INSN(4,0);
7137 ULong offset = sx_to_64(imm9, 9);
7138 IRTemp ea = newTemp(Ity_I64);
7139 assign(ea, binop(Iop_Add64, getIReg64orSP(nn), mkU64(offset)));
7140 /* No actual code to generate. */
7141 DIP("prfum prfop=%u, [%s, #0x%llx]\n", tt, nameIReg64orSP(nn), offset);
7142 return True;
7145 /* ---------------- ARMv8.1-LSE: Atomic Memory Operations ---------------- */
7146 /* 31 29 23 22 21 20 15 11 9 4
7147 sz 111000 A R 1 s 0000 00 n t LDADD{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7148 sz 111000 A R 1 s 0001 00 n t LDCLR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7149 sz 111000 A R 1 s 0010 00 n t LDEOR{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7150 sz 111000 A R 1 s 0011 00 n t LDSET{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7151 sz 111000 A R 1 s 0100 00 n t LDSMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7152 sz 111000 A R 1 s 0101 00 n t LDSMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7153 sz 111000 A R 1 s 0110 00 n t LDUMAX{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7154 sz 111000 A R 1 s 0111 00 n t LDUMIN{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7155 sz 111000 A R 1 s 1000 00 n t SWP{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7157 if (INSN(29,24) == BITS6(1,1,1,0,0,0)
7158 && INSN(21,21) == 1
7159 && (INSN(15,12) <= BITS4(1,0,0,0))
7160 && INSN(11,10) == BITS2(0,0)) {
7161 UInt szBlg2 = INSN(31,30);
7162 Bool isAcq = INSN(23,23) == 1;
7163 Bool isRel = INSN(22,22) == 1;
7164 UInt ss = INSN(20,16);
7165 UInt opc = INSN(15,12);
7166 UInt nn = INSN(9,5);
7167 UInt tt = INSN(4,0);
7169 const HChar* nm = NULL;
7170 const HChar* suffix[4] = { "b", "h", "", "" };
7172 vassert(szBlg2 < 4);
7173 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 bytes*/
7174 IRType ty = integerIRTypeOfSize(szB);
7175 Bool is64 = szB == 8;
7176 Bool isSigned = (opc == 4) || (opc == 5) /*smax || smin*/;
7178 // IR used to emulate these atomic memory ops:
7179 // 1) barrier
7180 // 2) load
7181 // 3) widen operands and do arithmetic/logic op
7182 // 4) cas to see if target memory updated
7183 // 5) barrier
7184 // 6) repeat from 1) if cas says target memory not updated
7185 // 7) update register
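        // Illustrative sketch (added for exposition; the instruction and
        // register numbers are hypothetical): for "ldadd w1, w0, [x2]" the
        // scheme above comes out roughly as
        //    orig = LDle:I32(x2)                            // step 2
        //    res  = Add64(widenU(orig), widenU(w1))         // step 3
        //    old  = CASle(x2, expd=orig, data=narrow(res))  // step 4
        //    if (old != orig) goto <this instruction>       // step 6
        //    w0   = widenU(old)                             // step 7
        // with the fences of steps 1 and 5 emitted only for the
        // acquire/release variants, as the code below shows.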
7187 IRTemp ea = newTemp(Ity_I64);
7188 assign(ea, getIReg64orSP(nn));
7189 gen_SIGBUS_if_not_XX_aligned(ea, szB);
7191 // Insert barrier before loading for acquire and acquire-release variants:
7192 // A and AL.
7193 if (isAcq && (tt != 31))
7194 stmt(IRStmt_MBE(Imbe_Fence));
7196 // Load LHS from memory, RHS from register.
7197 IRTemp orig = newTemp(ty);
7198 assign(orig, loadLE(ty, mkexpr(ea)));
7199 IRExpr *lhs = mkexpr(orig);
7200 IRExpr *rhs = narrowFrom64(ty, getIReg64orZR(ss));
7201 IRExpr *res = NULL;
7203 lhs = isSigned ? widenSto64(ty, lhs) : widenUto64(ty, lhs);
7204 rhs = isSigned ? widenSto64(ty, rhs) : widenUto64(ty, rhs);
7206 // Perform the operation.
7207 switch (opc) {
7208 case 0:
7209 nm = "ldadd";
7210 res = binop(Iop_Add64, lhs, rhs);
7211 break;
7212 case 1:
7213 nm = "ldclr";
7214 res = binop(Iop_And64, lhs, unop(mkNOT(Ity_I64), rhs));
7215 break;
7216 case 2:
7217 nm = "ldeor";
7218 res = binop(Iop_Xor64, lhs, rhs);
7219 break;
7220 case 3:
7221 nm = "ldset";
7222 res = binop(Iop_Or64, lhs, rhs);
7223 break;
7224 case 4:
7225 nm = "ldsmax";
7226 res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), rhs, lhs);
7227 break;
7228 case 5:
7229 nm = "ldsmin";
7230 res = IRExpr_ITE(binop(Iop_CmpLT64S, lhs, rhs), lhs, rhs);
7231 break;
7232 case 6:
7233 nm = "ldumax";
7234 res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), rhs, lhs);
7235 break;
7236 case 7:
7237 nm = "ldumin";
7238 res = IRExpr_ITE(binop(Iop_CmpLT64U, lhs, rhs), lhs, rhs);
7239 break;
7240 case 8:
7241 nm = "swp";
7242 res = rhs;
7243 break;
7244 default:
7245 vassert(0);
7246 break;
7249 // Store the result back if LHS remains unchanged in memory.
7250 IRTemp old = newTemp(ty);
7251 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
7252 Iend_LE, mkexpr(ea),
7253 /*expdHi*/NULL, mkexpr(orig),
7254 /*dataHi*/NULL, narrowFrom64(ty, res))) );
7256 // Insert barrier after storing for release and acquire-release variants:
7257 // L and AL.
7258 if (isRel)
7259 stmt(IRStmt_MBE(Imbe_Fence));
7261 // Retry if the CAS failed (i.e. when old != orig).
7262 IRConst* nia = IRConst_U64(guest_PC_curr_instr);
7263 stmt( IRStmt_Exit(
7264 binop(Iop_CasCmpNE64,
7265 widenUto64(ty, mkexpr(old)),
7266 widenUto64(ty, mkexpr(orig))),
7267 Ijk_Boring, nia, OFFB_PC ));
7268 // Otherwise we succeeded.
7269 putIReg64orZR(tt, widenUto64(ty, mkexpr(old)));
7271 DIP("%s%s%s%s %s, %s, [%s]\n", nm, isAcq ? "a" : "", isRel ? "l" : "",
7272 suffix[szBlg2], nameIRegOrZR(is64, ss), nameIRegOrZR(is64, tt),
7273 nameIReg64orSP(nn));
7274 return True;
7277 /* ------------------ ARMv8.1-LSE: Compare-and-Swap ------------------ */
7278 /* 31 29 22 21 20 15 14 9 4
7279 sz 0010001 A 1 s R 11111 n t CAS{,A}{,L}<sz> <Rs>, <Rt>, [<Xn|SP>]
7281 if (INSN(29,23) == BITS7(0,0,1,0,0,0,1)
7282 && INSN(21,21) == 1
7283 && INSN(14,10) == BITS5(1,1,1,1,1)) {
7284 UInt szBlg2 = INSN(31,30);
7285 Bool isAcq = INSN(22,22) == 1;
7286 Bool isRel = INSN(15,15) == 1;
7287 UInt ss = INSN(20,16);
7288 UInt nn = INSN(9,5);
7289 UInt tt = INSN(4,0);
7291 const HChar* suffix[4] = { "b", "h", "", "" };
7293 UInt szB = 1 << szBlg2; /* 1, 2, 4 or 8 */
7294 IRType ty = integerIRTypeOfSize(szB);
7295 Bool is64 = szB == 8;
7297 IRTemp ea = newTemp(Ity_I64);
7298 assign(ea, getIReg64orSP(nn));
7299 gen_SIGBUS_if_not_XX_aligned(ea, szB);
7301 IRExpr *exp = narrowFrom64(ty, getIReg64orZR(ss));
7302 IRExpr *new = narrowFrom64(ty, getIReg64orZR(tt));
7304 if (isAcq)
7305 stmt(IRStmt_MBE(Imbe_Fence));
7307 // Store the result back if LHS remains unchanged in memory.
7308 IRTemp old = newTemp(ty);
7309 stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
7310 Iend_LE, mkexpr(ea),
7311 /*expdHi*/NULL, exp,
7312 /*dataHi*/NULL, new)) );
7314 if (isRel)
7315 stmt(IRStmt_MBE(Imbe_Fence));
7317 putIReg64orZR(ss, widenUto64(ty, mkexpr(old)));
7318 DIP("cas%s%s%s %s, %s, [%s]\n",
7319 isAcq ? "a" : "", isRel ? "l" : "", suffix[szBlg2],
7320 nameIRegOrZR(is64, ss), nameIRegOrZR(is64, tt), nameIReg64orSP(nn));
7321 return True;
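        // In plain C terms the sequence above performs, atomically
        // (a sketch only, ignoring widths and the optional fences):
        //    old = *ea; if (old == Rs) *ea = Rt; Rs = old;
        // so Rs always ends up holding the value that was in memory,
        // whether or not the swap took place.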
7324 /* ---------------- ARMv8.1-LSE: Compare-and-Swap Pair --------------- */
7325 /* 31 30 29 22 21 20 15 14 9 4
7326 0 sz 0010000 A 1 s R 11111 n t CASP{,A}{,L} <Rs>, <Rt>, [<Xn|SP>]
7328 if (INSN(31,31) == 0
7329 && INSN(29,23) == BITS7(0,0,1,0,0,0,0)
7330 && INSN(21,21) == 1
7331 && INSN(14,10) == BITS5(1,1,1,1,1)) {
7332 UInt is64 = INSN(30,30);
7333 Bool isAcq = INSN(22,22) == 1;
7334 Bool isRel = INSN(15,15) == 1;
7335 UInt ss = INSN(20,16);
7336 UInt nn = INSN(9,5);
7337 UInt tt = INSN(4,0);
7339 if ((ss & 0x1) || (tt & 0x1)) {
7340 /* undefined; fall through */
7341 } else {
7342 IRTemp ea = newTemp(Ity_I64);
7343 assign(ea, getIReg64orSP(nn));
7344 gen_SIGBUS_if_not_XX_aligned(ea, is64 ? 16 : 8);
7346 IRExpr *expLo = getIRegOrZR(is64, ss);
7347 IRExpr *expHi = getIRegOrZR(is64, ss + 1);
7348 IRExpr *newLo = getIRegOrZR(is64, tt);
7349 IRExpr *newHi = getIRegOrZR(is64, tt + 1);
7350 IRTemp oldLo = newTemp(is64 ? Ity_I64 : Ity_I32);
7351 IRTemp oldHi = newTemp(is64 ? Ity_I64 : Ity_I32);
7353 if (isAcq)
7354 stmt(IRStmt_MBE(Imbe_Fence));
7356 stmt( IRStmt_CAS(mkIRCAS(oldHi, oldLo,
7357 Iend_LE, mkexpr(ea),
7358 expHi, expLo,
7359 newHi, newLo)) );
7361 if (isRel)
7362 stmt(IRStmt_MBE(Imbe_Fence));
7364 putIRegOrZR(is64, ss, mkexpr(oldLo));
7365 putIRegOrZR(is64, ss+1, mkexpr(oldHi));
7366 DIP("casp%s%s %s, %s, %s, %s, [%s]\n",
7367 isAcq ? "a" : "", isRel ? "l" : "",
7368 nameIRegOrZR(is64, ss), nameIRegOrZR(is64, ss+1),
7369 nameIRegOrZR(is64, tt), nameIRegOrZR(is64, tt+1),
7370 nameIReg64orSP(nn));
7371 return True;
7375 if (sigill_diag) {
7376 vex_printf("ARM64 front end: load_store\n");
7379 return False;
7380 # undef INSN
7384 /*------------------------------------------------------------*/
7385 /*--- Control flow and misc instructions ---*/
7386 /*------------------------------------------------------------*/
7388 static
7389 Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
7390 const VexArchInfo* archinfo,
7391 const VexAbiInfo* abiinfo, Bool sigill_diag)
7393 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
7395 /* ---------------------- B cond ----------------------- */
7396 /* 31 24 4 3
7397 0101010 0 imm19 0 cond */
7398 if (INSN(31,24) == BITS8(0,1,0,1,0,1,0,0) && INSN(4,4) == 0) {
7399 UInt cond = INSN(3,0);
7400 ULong uimm64 = INSN(23,5) << 2;
7401 Long simm64 = (Long)sx_to_64(uimm64, 21);
7402 vassert(dres->whatNext == Dis_Continue);
7403 vassert(dres->len == 4);
7404 vassert(dres->jk_StopHere == Ijk_INVALID);
7405 stmt( IRStmt_Exit(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
7406 Ijk_Boring,
7407 IRConst_U64(guest_PC_curr_instr + simm64),
7408 OFFB_PC) );
7409 putPC(mkU64(guest_PC_curr_instr + 4));
7410 dres->whatNext = Dis_StopHere;
7411 dres->jk_StopHere = Ijk_Boring;
7412 DIP("b.%s 0x%llx\n", nameCC(cond), guest_PC_curr_instr + simm64);
7413 return True;
7416 /* -------------------- B{L} uncond -------------------- */
7417 if (INSN(30,26) == BITS5(0,0,1,0,1)) {
7418 /* 000101 imm26 B (PC + sxTo64(imm26 << 2))
7419       100101 imm26  BL (PC + sxTo64(imm26 << 2))
7421 UInt bLink = INSN(31,31);
7422 ULong uimm64 = INSN(25,0) << 2;
7423 Long simm64 = (Long)sx_to_64(uimm64, 28);
7424 if (bLink) {
7425 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
7427 putPC(mkU64(guest_PC_curr_instr + simm64));
7428 dres->whatNext = Dis_StopHere;
7429 dres->jk_StopHere = Ijk_Call;
7430 DIP("b%s 0x%llx\n", bLink == 1 ? "l" : "",
7431 guest_PC_curr_instr + simm64);
7432 return True;
7435 /* --------------------- B{L} reg --------------------- */
7436 /* 31 24 22 20 15 9 4
7437 1101011 00 10 11111 000000 nn 00000 RET Rn
7438 1101011 00 01 11111 000000 nn 00000 CALL Rn
7439 1101011 00 00 11111 000000 nn 00000 JMP Rn
7441 if (INSN(31,23) == BITS9(1,1,0,1,0,1,1,0,0)
7442 && INSN(20,16) == BITS5(1,1,1,1,1)
7443 && INSN(15,10) == BITS6(0,0,0,0,0,0)
7444 && INSN(4,0) == BITS5(0,0,0,0,0)) {
7445 UInt branch_type = INSN(22,21);
7446 UInt nn = INSN(9,5);
7447 if (branch_type == BITS2(1,0) /* RET */) {
7448 putPC(getIReg64orZR(nn));
7449 dres->whatNext = Dis_StopHere;
7450 dres->jk_StopHere = Ijk_Ret;
7451 DIP("ret %s\n", nameIReg64orZR(nn));
7452 return True;
7454 if (branch_type == BITS2(0,1) /* CALL */) {
7455 IRTemp dst = newTemp(Ity_I64);
7456 assign(dst, getIReg64orZR(nn));
7457 putIReg64orSP(30, mkU64(guest_PC_curr_instr + 4));
7458 putPC(mkexpr(dst));
7459 dres->whatNext = Dis_StopHere;
7460 dres->jk_StopHere = Ijk_Call;
7461 DIP("blr %s\n", nameIReg64orZR(nn));
7462 return True;
7464 if (branch_type == BITS2(0,0) /* JMP */) {
7465 putPC(getIReg64orZR(nn));
7466 dres->whatNext = Dis_StopHere;
7467 dres->jk_StopHere = Ijk_Boring;
7468 DIP("jmp %s\n", nameIReg64orZR(nn));
7469 return True;
7473 /* -------------------- CB{N}Z -------------------- */
7474 /* sf 011 010 1 imm19 Rt CBNZ Xt|Wt, (PC + sxTo64(imm19 << 2))
7475 sf 011 010 0 imm19 Rt CBZ Xt|Wt, (PC + sxTo64(imm19 << 2))
7477 if (INSN(30,25) == BITS6(0,1,1,0,1,0)) {
7478 Bool is64 = INSN(31,31) == 1;
7479 Bool bIfZ = INSN(24,24) == 0;
7480 ULong uimm64 = INSN(23,5) << 2;
7481 UInt rT = INSN(4,0);
7482 Long simm64 = (Long)sx_to_64(uimm64, 21);
7483 IRExpr* cond = NULL;
7484 if (is64) {
7485 cond = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
7486 getIReg64orZR(rT), mkU64(0));
7487 } else {
7488 cond = binop(bIfZ ? Iop_CmpEQ32 : Iop_CmpNE32,
7489 getIReg32orZR(rT), mkU32(0));
7491 stmt( IRStmt_Exit(cond,
7492 Ijk_Boring,
7493 IRConst_U64(guest_PC_curr_instr + simm64),
7494 OFFB_PC) );
7495 putPC(mkU64(guest_PC_curr_instr + 4));
7496 dres->whatNext = Dis_StopHere;
7497 dres->jk_StopHere = Ijk_Boring;
7498 DIP("cb%sz %s, 0x%llx\n",
7499 bIfZ ? "" : "n", nameIRegOrZR(is64, rT),
7500 guest_PC_curr_instr + simm64);
7501 return True;
7504 /* -------------------- TB{N}Z -------------------- */
7505 /* 31 30 24 23 18 5 4
7506 b5 011 011 1 b40 imm14 t TBNZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
7507 b5 011 011 0 b40 imm14 t TBZ Xt, #(b5:b40), (PC + sxTo64(imm14 << 2))
7509 if (INSN(30,25) == BITS6(0,1,1,0,1,1)) {
7510 UInt b5 = INSN(31,31);
7511 Bool bIfZ = INSN(24,24) == 0;
7512 UInt b40 = INSN(23,19);
7513 UInt imm14 = INSN(18,5);
7514 UInt tt = INSN(4,0);
7515 UInt bitNo = (b5 << 5) | b40;
7516 ULong uimm64 = imm14 << 2;
7517 Long simm64 = sx_to_64(uimm64, 16);
7518 IRExpr* cond
7519 = binop(bIfZ ? Iop_CmpEQ64 : Iop_CmpNE64,
7520 binop(Iop_And64,
7521 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(bitNo)),
7522 mkU64(1)),
7523 mkU64(0));
7524 stmt( IRStmt_Exit(cond,
7525 Ijk_Boring,
7526 IRConst_U64(guest_PC_curr_instr + simm64),
7527 OFFB_PC) );
7528 putPC(mkU64(guest_PC_curr_instr + 4));
7529 dres->whatNext = Dis_StopHere;
7530 dres->jk_StopHere = Ijk_Boring;
7531 DIP("tb%sz %s, #%u, 0x%llx\n",
7532 bIfZ ? "" : "n", nameIReg64orZR(tt), bitNo,
7533 guest_PC_curr_instr + simm64);
7534 return True;
7537 /* -------------------- SVC -------------------- */
7538 /* 11010100 000 imm16 000 01
7539 Don't bother with anything except the imm16==0 case.
7541 if (INSN(31,0) == 0xD4000001) {
7542 putPC(mkU64(guest_PC_curr_instr + 4));
7543 dres->whatNext = Dis_StopHere;
7544 dres->jk_StopHere = Ijk_Sys_syscall;
7545 DIP("svc #0\n");
7546 return True;
7549 /* ------------------ M{SR,RS} ------------------ */
7550 /* ---- Case for MIDR_EL1 (RO) ----
7551 Read the Main ID register from host.
7552 0xD53800 000 Rt MRS rT, midr_el1
7554 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380000 /*MRS*/) {
7555 UInt tt = INSN(4,0);
7556 IRTemp val = newTemp(Ity_I64);
7557 IRExpr** args = mkIRExprVec_0();
7558 IRDirty* d = unsafeIRDirty_1_N (
7559 val,
7560 0/*regparms*/,
7561 "arm64g_dirtyhelper_MRS_MIDR_EL1",
7562 &arm64g_dirtyhelper_MRS_MIDR_EL1,
7563 args
7565 /* execute the dirty call, dumping the result in val. */
7566 stmt( IRStmt_Dirty(d) );
7567 putIReg64orZR(tt, mkexpr(val));
7568 DIP("mrs %s, midr_el1\n", nameIReg64orZR(tt));
7569 return True;
7571 /* ---- Case for MPIDR_EL1 (RO) ----
7572       Instead of returning a fake register, we use the same
7573 value as does the kernel emulation.
7574 0xD53800 101 Rt MRS rT, mpidr_el1
7576 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53800A0 /*MRS*/) {
7577 UInt tt = INSN(4,0);
7578 putIReg64orZR(tt, mkU64((1UL<<31)));
7579 DIP("mrs %s, mpidr_el1 (FAKED)\n", nameIReg64orZR(tt));
7580 return True;
7582    /* ---- Case for REVIDR_EL1 (RO) ----
7583       Instead of emulating the register, we just return the same
7584       value as does the kernel emulation.
7585       0xD53800 110 Rt  MRS rT, revidr_el1
7587 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53800C0 /*MRS*/) {
7588 UInt tt = INSN(4,0);
7589 putIReg32orZR(tt, mkU32(0x0));
7590       DIP("mrs %s, revidr_el1 (FAKED)\n", nameIReg32orZR(tt));
7591 return True;
7593 /* ---- Case for ID_AA64PFR0_EL1 (RO) ----
7594       Instead of returning a fake register, we use the same
7595       value as does the kernel emulation. We downgrade half-precision
7596       floating-point support to plain floating-point support and
7597       set all other fields to zero.
7598 0xD53804 000 Rt MRS rT, id_aa64pfr0_el1
7600 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380400 /*MRS*/) {
7601 UInt tt = INSN(4,0);
7602 IRTemp val = newTemp(Ity_I64);
7603 IRExpr** args = mkIRExprVec_0();
7604 IRDirty* d = unsafeIRDirty_1_N (
7605 val,
7606 0/*regparms*/,
7607 "arm64g_dirtyhelper_MRS_ID_AA64PFR0_EL1",
7608 &arm64g_dirtyhelper_MRS_ID_AA64PFR0_EL1,
7609 args
7611 /* execute the dirty call, dumping the result in val. */
7612 stmt( IRStmt_Dirty(d) );
7614 putIReg64orZR(tt, mkexpr(val));
7615 return True;
7617 /* ---- Case for ID_AA64PFR1_EL1 (RO) ----
7618       We just return 0x0 here, as the emulation environment does not
7619       support any of the features this register describes.
7620 0xD53804 001 Rt MRS rT, id_aa64pfr1_el1
7622 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380420 /*MRS*/) {
7623 UInt tt = INSN(4,0);
7624 putIReg64orZR(tt, mkU64(0x0));
7625       DIP("mrs %s, id_aa64pfr1_el1 (FAKED)\n", nameIReg64orZR(tt));
7626 return True;
7628 /* ---- Case for ID_AA64ZFR0_EL1 (RO) ----
7629       We just return 0x0 here, as the emulation environment does not
7630       support any of the features this register describes.
7631 0xD53804 010 Rt MRS rT, id_aa64zfr0_el1
7633 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380440 /*MRS*/) {
7634 UInt tt = INSN(4,0);
7635 putIReg64orZR(tt, mkU64(0x0));
7636 DIP("mrs %s, id_aa64zfr0_el1 (FAKED)\n", nameIReg64orZR(tt));
7637 return True;
7639 /* ---- Case for ID_AA64DFR0_EL1 (RO) ----
7640 Just return the value indicating the implementation of the
7641 ARMv8 debug architecture without any extensions.
7642 0xD53805 000 Rt MRS rT, id_aa64dfr0_el1
7644 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380500 /*MRS*/) {
7645 UInt tt = INSN(4,0);
7646 putIReg64orZR(tt, mkU64(0x6));
7647 DIP("mrs %s, id_aa64dfr0_el1 (FAKED)\n", nameIReg64orZR(tt));
7648 return True;
7650 /* ---- Case for ID_AA64DFR1_EL1 (RO) ----
7651       We just return 0x0 here, as the emulation environment does not
7652       support any of the features this register describes.
7653 0xD53805 001 Rt MRS rT, id_aa64dfr1_el1
7655 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380520 /*MRS*/) {
7656 UInt tt = INSN(4,0);
7657 putIReg64orZR(tt, mkU64(0x0));
7658 DIP("mrs %s, id_aa64dfr1_el1 (FAKED)\n", nameIReg64orZR(tt));
7659 return True;
7661 /* ---- Case for ID_AA64AFR0_EL1 (RO) ----
7662       We just return 0x0 here, as the emulation environment does not
7663       support any of the features this register describes.
7664 0xD53805 100 Rt MRS rT, id_aa64afr0_el1
7666 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380580 /*MRS*/) {
7667 UInt tt = INSN(4,0);
7668 putIReg64orZR(tt, mkU64(0x0));
7669 DIP("mrs %s, id_aa64afr0_el1 (FAKED)\n", nameIReg64orZR(tt));
7670 return True;
7672 /* ---- Case for ID_AA64AFR1_EL1 (RO) ----
7673       We just return 0x0 here, as the emulation environment does not
7674       support any of the features this register describes.
7675 0xD53805 101 Rt MRS rT, id_aa64afr1_el1
7677 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53805A0 /*MRS*/) {
7678 UInt tt = INSN(4,0);
7679 putIReg64orZR(tt, mkU64(0x0));
7680 DIP("mrs %s, id_aa64afr1_el1 (FAKED)\n", nameIReg64orZR(tt));
7681 return True;
7683 /* ---- Case for ID_AA64ISAR0_EL1 (RO) ----
7684 We only take care of SHA2, SHA1 and AES bits, as all the other
7685       features are not part of the emulation environment.
7686       We downgrade SHA2 from 0x2 to 0x1 as we don't support the extra instructions.
7687 0xD53806 000 Rt MRS rT, id_aa64isar0_el1
7689 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380600 /*MRS*/) {
7690 UInt tt = INSN(4,0);
7691 IRTemp val = newTemp(Ity_I64);
7692 IRExpr** args = mkIRExprVec_0();
7693 IRDirty* d = unsafeIRDirty_1_N (
7694 val,
7695 0/*regparms*/,
7696 "arm64g_dirtyhelper_MRS_ID_AA64ISAR0_EL1",
7697 &arm64g_dirtyhelper_MRS_ID_AA64ISAR0_EL1,
7698 args
7700 /* execute the dirty call, dumping the result in val. */
7701 stmt( IRStmt_Dirty(d) );
7702 putIReg64orZR(tt, mkexpr(val));
7703 DIP("mrs %s, id_aa64isar0_el1 (FAKED)\n", nameIReg64orZR(tt));
7704 return True;
7706 /* ---- Case for ID_AA64ISAR1_EL1 (RO) ----
7707       The value is obtained via a dirty helper, in the same way
7708       as for ID_AA64ISAR0_EL1 above.
7709 0xD53806 001 Rt MRS rT, id_aa64isar1_el1
7711 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380620 /*MRS*/) {
7712 UInt tt = INSN(4,0);
7713 IRTemp val = newTemp(Ity_I64);
7714 IRExpr** args = mkIRExprVec_0();
7715 IRDirty* d = unsafeIRDirty_1_N (
7716 val,
7717 0/*regparms*/,
7718 "arm64g_dirtyhelper_MRS_ID_AA64ISAR1_EL1",
7719 &arm64g_dirtyhelper_MRS_ID_AA64ISAR1_EL1,
7720 args
7722 /* execute the dirty call, dumping the result in val. */
7723 stmt( IRStmt_Dirty(d) );
7724 putIReg64orZR(tt, mkexpr(val));
7725 DIP("mrs %s, id_aa64isar1_el1 (FAKED)\n", nameIReg64orZR(tt));
7726 return True;
7728 /* ---- Case for ID_AA64MMFR0_EL1 (RO) ----
7729       Instead of returning a fake register, we use the same
7730 value as does the kernel emulation.
7731 0xD53807 000 Rt MRS rT, id_aa64mmfr0_el1
7733 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380700 /*MRS*/) {
7734 UInt tt = INSN(4,0);
7735 IRTemp val = newTemp(Ity_I64);
7736 IRExpr** args = mkIRExprVec_0();
7737 IRDirty* d = unsafeIRDirty_1_N (
7738 val,
7739 0/*regparms*/,
7740 "arm64g_dirtyhelper_MRS_ID_AA64MMFR0_EL1",
7741 &arm64g_dirtyhelper_MRS_ID_AA64MMFR0_EL1,
7742 args
7744 /* execute the dirty call, dumping the result in val. */
7745 stmt( IRStmt_Dirty(d) );
7746 putIReg64orZR(tt, mkexpr(val));
7747 DIP("mrs %s, id_aa64mmfr0_el1 (FAKED)\n", nameIReg64orZR(tt));
7748 return True;
7750 /* ---- Case for ID_AA64MMFR1_EL1 (RO) ----
7751       Instead of returning a fake register, we use the same
7752 value as does the kernel emulation. Set VHE and HAFDBS
7753 to not implemented.
7754 0xD53807 001 Rt MRS rT, id_aa64mmfr1_el1
7756 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380720 /*MRS*/) {
7757 UInt tt = INSN(4,0);
7758 IRTemp val = newTemp(Ity_I64);
7759 IRExpr** args = mkIRExprVec_0();
7760 IRDirty* d = unsafeIRDirty_1_N (
7761 val,
7762 0/*regparms*/,
7763 "arm64g_dirtyhelper_MRS_ID_AA64MMFR1_EL1",
7764 &arm64g_dirtyhelper_MRS_ID_AA64MMFR1_EL1,
7765 args
7767 /* execute the dirty call, dumping the result in val. */
7768 stmt( IRStmt_Dirty(d) );
7769 putIReg64orZR(tt, mkexpr(val));
7770 DIP("mrs %s, id_aa64mmfr1_el1 (FAKED)\n", nameIReg64orZR(tt));
7771 return True;
7773 /* ---- Case for ID_AA64MMFR2_EL1 (RO) ----
7774       Return a faked value reporting the ARMv8.2 and ARMv8.3 features as not implemented.
7775 0xD53807 010 Rt MRS rT, id_aa64mmfr2_el1
7777 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD5380740 /*MRS*/) {
7778 UInt tt = INSN(4,0);
7779 putIReg64orZR(tt, mkU64(0x0));
7780 DIP("mrs %s, id_aa64mmfr2_el1 (FAKED)\n", nameIReg64orZR(tt));
7781 return True;
7783 /* ---- Cases for TPIDR_EL0 ----
7784 0xD51BD0 010 Rt MSR tpidr_el0, rT
7785 0xD53BD0 010 Rt MRS rT, tpidr_el0
7787 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51BD040 /*MSR*/
7788 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53BD040 /*MRS*/) {
7789 Bool toSys = INSN(21,21) == 0;
7790 UInt tt = INSN(4,0);
7791 if (toSys) {
7792 stmt( IRStmt_Put( OFFB_TPIDR_EL0, getIReg64orZR(tt)) );
7793 DIP("msr tpidr_el0, %s\n", nameIReg64orZR(tt));
7794 } else {
7795 putIReg64orZR(tt, IRExpr_Get( OFFB_TPIDR_EL0, Ity_I64 ));
7796 DIP("mrs %s, tpidr_el0\n", nameIReg64orZR(tt));
7798 return True;
7800 /* ---- Cases for FPCR ----
7801 0xD51B44 000 Rt MSR fpcr, rT
7802       0xD53B44 000 Rt  MRS rT, fpcr
7804 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4400 /*MSR*/
7805 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4400 /*MRS*/) {
7806 Bool toSys = INSN(21,21) == 0;
7807 UInt tt = INSN(4,0);
7808 if (toSys) {
7809 stmt( IRStmt_Put( OFFB_FPCR, getIReg32orZR(tt)) );
7810 DIP("msr fpcr, %s\n", nameIReg64orZR(tt));
7811 } else {
7812 putIReg32orZR(tt, IRExpr_Get(OFFB_FPCR, Ity_I32));
7813 DIP("mrs %s, fpcr\n", nameIReg64orZR(tt));
7815 return True;
7817 /* ---- Cases for FPSR ----
7818 0xD51B44 001 Rt MSR fpsr, rT
7819       0xD53B44 001 Rt  MRS rT, fpsr
7820 The only part of this we model is FPSR.QC. All other bits
7821 are ignored when writing to it and RAZ when reading from it.
7823 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4420 /*MSR*/
7824 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4420 /*MRS*/) {
7825 Bool toSys = INSN(21,21) == 0;
7826 UInt tt = INSN(4,0);
7827 if (toSys) {
7828 /* Just deal with FPSR.QC. Make up a V128 value which is
7829 zero if Xt[27] is zero and any other value if Xt[27] is
7830 nonzero. */
7831 IRTemp qc64 = newTemp(Ity_I64);
7832 assign(qc64, binop(Iop_And64,
7833 binop(Iop_Shr64, getIReg64orZR(tt), mkU8(27)),
7834 mkU64(1)));
7835 IRExpr* qcV128 = binop(Iop_64HLtoV128, mkexpr(qc64), mkexpr(qc64));
7836 stmt( IRStmt_Put( OFFB_QCFLAG, qcV128 ) );
7837 DIP("msr fpsr, %s\n", nameIReg64orZR(tt));
7838 } else {
7839 /* Generate a value which is all zeroes except for bit 27,
7840 which must be zero if QCFLAG is all zeroes and one otherwise. */
7841 IRTemp qcV128 = newTempV128();
7842 assign(qcV128, IRExpr_Get( OFFB_QCFLAG, Ity_V128 ));
7843 IRTemp qc64 = newTemp(Ity_I64);
7844 assign(qc64, binop(Iop_Or64, unop(Iop_V128HIto64, mkexpr(qcV128)),
7845 unop(Iop_V128to64, mkexpr(qcV128))));
7846 IRExpr* res = binop(Iop_Shl64,
7847 unop(Iop_1Uto64,
7848 binop(Iop_CmpNE64, mkexpr(qc64), mkU64(0))),
7849 mkU8(27));
7850 putIReg64orZR(tt, res);
7851 DIP("mrs %s, fpsr\n", nameIReg64orZR(tt));
7853 return True;
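        /* Worked example (illustrative values): "msr fpsr, x0" with
           x0 = 0x08000000 (only bit 27, FPSR.QC, set) makes QCFLAG the
           nonzero vector 64HLtoV128(1,1); any nonzero value would do.
           A later "mrs x1, fpsr" then yields exactly 0x08000000, since
           the only state tracked is whether QCFLAG is zero or not. */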
7855 /* ---- Cases for NZCV ----
7856 D51B42 000 Rt MSR nzcv, rT
7857 D53B42 000 Rt MRS rT, nzcv
7858 The only parts of NZCV that actually exist are bits 31:28, which
7859 are the N Z C and V bits themselves. Hence the flags thunk provides
7860 all the state we need.
7862 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD51B4200 /*MSR*/
7863 || (INSN(31,0) & 0xFFFFFFE0) == 0xD53B4200 /*MRS*/) {
7864 Bool toSys = INSN(21,21) == 0;
7865 UInt tt = INSN(4,0);
7866 if (toSys) {
7867 IRTemp t = newTemp(Ity_I64);
7868 assign(t, binop(Iop_And64, getIReg64orZR(tt), mkU64(0xF0000000ULL)));
7869 setFlags_COPY(t);
7870          DIP("msr nzcv, %s\n", nameIReg64orZR(tt));
7871 } else {
7872 IRTemp res = newTemp(Ity_I64);
7873 assign(res, mk_arm64g_calculate_flags_nzcv());
7874 putIReg32orZR(tt, unop(Iop_64to32, mkexpr(res)));
7875 DIP("mrs %s, nzcv\n", nameIReg64orZR(tt));
7877 return True;
7879 /* ---- Cases for DCZID_EL0 ----
7880 This is the data cache zero ID register. It controls whether
7881 DC ZVA is supported and if so the block size used. Support reads of it
7882 only by passing through to the host.
7883 D5 3B 00 111 Rt MRS rT, dczid_el0
7885 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B00E0) {
7886 UInt tt = INSN(4,0);
7887 IRTemp val = newTemp(Ity_I64);
7888 IRExpr** args = mkIRExprVec_0();
7889 IRDirty* d = unsafeIRDirty_1_N (
7890 val,
7891 0/*regparms*/,
7892 "arm64g_dirtyhelper_MRS_DCZID_EL0",
7893 &arm64g_dirtyhelper_MRS_DCZID_EL0,
7894 args
7896 /* execute the dirty call, dumping the result in val. */
7897 stmt( IRStmt_Dirty(d) );
7898 putIReg64orZR(tt, mkexpr(val));
7899 DIP("mrs %s, dczid_el0 (FAKED)\n", nameIReg64orZR(tt));
7900 return True;
7902 /* ---- Cases for CTR_EL0 ----
7903 We just handle reads, and make up a value from the D and I line
7904 sizes in the VexArchInfo we are given, and patch in the following
7905 fields that the Foundation model gives ("natively"):
7906 CWG = 0b0100, ERG = 0b0100, L1Ip = 0b11
7907       D5 3B 00 001 Rt  MRS rT, ctr_el0
7909 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53B0020) {
7910 UInt tt = INSN(4,0);
7911 /* Need to generate a value from dMinLine_lg2_szB and
7912          iMinLine_lg2_szB. The value in the register is in 32-bit
7913 units, so need to subtract 2 from the values in the
7914 VexArchInfo. We can assume that the values here are valid --
7915 disInstr_ARM64 checks them -- so there's no need to deal with
7916 out-of-range cases. */
7917 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
7918 && archinfo->arm64_dMinLine_lg2_szB <= 17
7919 && archinfo->arm64_iMinLine_lg2_szB >= 2
7920 && archinfo->arm64_iMinLine_lg2_szB <= 17);
7921 UInt val
7922 = 0x8440c000 | ((0xF & (archinfo->arm64_dMinLine_lg2_szB - 2)) << 16)
7923 | ((0xF & (archinfo->arm64_iMinLine_lg2_szB - 2)) << 0);
7924 putIReg64orZR(tt, mkU64(val));
7925 DIP("mrs %s, ctr_el0\n", nameIReg64orZR(tt));
7926 return True;
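        /* Worked example (illustrative line sizes): with 64-byte D- and
           I-cache lines, dMinLine_lg2_szB == iMinLine_lg2_szB == 6, both
           fields become 6 - 2 == 4 (units of 4-byte words), and the value
           returned is 0x8440c000 | (4 << 16) | 4 == 0x8444c004. */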
7928 /* ---- Cases for CNTVCT_EL0 ----
7929 This is a timestamp counter of some sort. Support reads of it only
7930 by passing through to the host.
7931 D5 3B E0 010 Rt MRS Xt, cntvct_el0
7933 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE040) {
7934 UInt tt = INSN(4,0);
7935 IRTemp val = newTemp(Ity_I64);
7936 IRExpr** args = mkIRExprVec_0();
7937 IRDirty* d = unsafeIRDirty_1_N (
7938 val,
7939 0/*regparms*/,
7940 "arm64g_dirtyhelper_MRS_CNTVCT_EL0",
7941 &arm64g_dirtyhelper_MRS_CNTVCT_EL0,
7942 args
7944 /* execute the dirty call, dumping the result in val. */
7945 stmt( IRStmt_Dirty(d) );
7946 putIReg64orZR(tt, mkexpr(val));
7947 DIP("mrs %s, cntvct_el0\n", nameIReg64orZR(tt));
7948 return True;
7950 /* ---- Cases for CNTFRQ_EL0 ----
7951 This is always RO at EL0, so it's safe to pass through to the host.
7952 D5 3B E0 000 Rt MRS Xt, cntfrq_el0
7954 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD53BE000) {
7955 UInt tt = INSN(4,0);
7956 IRTemp val = newTemp(Ity_I64);
7957 IRExpr** args = mkIRExprVec_0();
7958 IRDirty* d = unsafeIRDirty_1_N (
7959 val,
7960 0/*regparms*/,
7961 "arm64g_dirtyhelper_MRS_CNTFRQ_EL0",
7962 &arm64g_dirtyhelper_MRS_CNTFRQ_EL0,
7963 args
7965 /* execute the dirty call, dumping the result in val. */
7966 stmt( IRStmt_Dirty(d) );
7967 putIReg64orZR(tt, mkexpr(val));
7968 DIP("mrs %s, cntfrq_el0\n", nameIReg64orZR(tt));
7969 return True;
7972 /* ------------------ IC_IVAU ------------------ */
7973 /* D5 0B 75 001 Rt ic ivau, rT
7975 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7520) {
7976 /* We will always be provided with a valid iMinLine value. */
7977 vassert(archinfo->arm64_iMinLine_lg2_szB >= 2
7978 && archinfo->arm64_iMinLine_lg2_szB <= 17);
7979 /* Round the requested address, in rT, down to the start of the
7980 containing block. */
7981 UInt tt = INSN(4,0);
7982 ULong lineszB = 1ULL << archinfo->arm64_iMinLine_lg2_szB;
7983 IRTemp addr = newTemp(Ity_I64);
7984 assign( addr, binop( Iop_And64,
7985 getIReg64orZR(tt),
7986 mkU64(~(lineszB - 1))) );
7987 /* Set the invalidation range, request exit-and-invalidate, with
7988 continuation at the next instruction. */
7989 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
7990 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
7991 /* be paranoid ... */
7992 stmt( IRStmt_MBE(Imbe_Fence) );
7993 putPC(mkU64( guest_PC_curr_instr + 4 ));
7994 dres->whatNext = Dis_StopHere;
7995 dres->jk_StopHere = Ijk_InvalICache;
7996 DIP("ic ivau, %s\n", nameIReg64orZR(tt));
7997 return True;
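        /* Example (illustrative): with iMinLine_lg2_szB == 6 the line
           size is 64 bytes, so rT == 0x40001039 gives CMSTART == 0x40001000
           and CMLEN == 64 for the invalidation request above. */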
8000 /* ------------------ DC_ZVA ------------------ */
8001 /* D5 0B 74 001 Rt dc zva, rT
8003 if ((INSN(31,0) & 0xFFFFFFE0) == 0xD50B7420) {
8004 /* Round the requested address, in rT, down to the start of the
8005 containing block. */
8006 UInt tt = INSN(4,0);
8007 ULong clearszB = 1UL << (archinfo->arm64_cache_block_size + 2);
8008 IRTemp addr = newTemp(Ity_I64);
8009 assign( addr, binop( Iop_And64,
8010 getIReg64orZR(tt),
8011 mkU64(~(clearszB - 1))) );
8012 for (ULong o = 0; o < clearszB; o += 8) {
8013 storeLE(binop(Iop_Add64,mkexpr(addr),mkU64(o)), mkU64(0));
8015 DIP("dc zva, %s\n", nameIReg64orZR(tt));
8016 return True;
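        /* Example (illustrative): arm64_cache_block_size == 4 gives
           clearszB == 1 << 6 == 64, so the loop above emits eight 64-bit
           zero stores covering the aligned 64-byte block containing the
           address in rT. */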
8019 /* ------------------ DC_CVAU ------------------ */
8020 /* D5 0B 7A 001 Rt dc cvac, rT
8021 D5 0B 7B 001 Rt dc cvau, rT
8022 D5 0B 7C 001 Rt dc cvap, rT
8023 D5 0B 7D 001 Rt dc cvadp, rT
8024 D5 0B 7E 001 Rt dc civac, rT
8026 if ( (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7A20
8027 || (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7B20
8028 || (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7C20
8029 || (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7D20
8030 || (INSN(31,0) & 0xFFFFFFE0) == 0xD50B7E20) {
8031 /* Exactly the same scheme as for IC IVAU, except we observe the
8032 dMinLine size, and request an Ijk_FlushDCache instead of
8033 Ijk_InvalICache. */
8034 /* We will always be provided with a valid dMinLine value. */
8035 vassert(archinfo->arm64_dMinLine_lg2_szB >= 2
8036 && archinfo->arm64_dMinLine_lg2_szB <= 17);
8037 /* Round the requested address, in rT, down to the start of the
8038 containing block. */
8039 UInt tt = INSN(4,0);
8040 ULong lineszB = 1ULL << archinfo->arm64_dMinLine_lg2_szB;
8041 IRTemp addr = newTemp(Ity_I64);
8042 assign( addr, binop( Iop_And64,
8043 getIReg64orZR(tt),
8044 mkU64(~(lineszB - 1))) );
8045 /* Set the flush range, request exit-and-flush, with
8046 continuation at the next instruction. */
8047 stmt(IRStmt_Put(OFFB_CMSTART, mkexpr(addr)));
8048 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(lineszB)));
8049 /* be paranoid ... */
8050 stmt( IRStmt_MBE(Imbe_Fence) );
8051 putPC(mkU64( guest_PC_curr_instr + 4 ));
8052 dres->whatNext = Dis_StopHere;
8053 dres->jk_StopHere = Ijk_FlushDCache;
8054 DIP("dc cvau, %s\n", nameIReg64orZR(tt));
8055 return True;
8058 /* ------------------ ISB, DMB, DSB ------------------ */
8059 /* 31 21 11 7 6 4
8060 11010 10100 0 00 011 0011 CRm 1 01 11111 DMB opt
8061 11010 10100 0 00 011 0011 CRm 1 00 11111 DSB opt
8062 11010 10100 0 00 011 0011 CRm 1 10 11111 ISB opt
8064 if (INSN(31,22) == BITS10(1,1,0,1,0,1,0,1,0,0)
8065 && INSN(21,12) == BITS10(0,0,0,0,1,1,0,0,1,1)
8066 && INSN(7,7) == 1
8067 && INSN(6,5) <= BITS2(1,0) && INSN(4,0) == BITS5(1,1,1,1,1)) {
8068 UInt opc = INSN(6,5);
8069 UInt CRm = INSN(11,8);
8070 vassert(opc <= 2 && CRm <= 15);
8071 stmt(IRStmt_MBE(Imbe_Fence));
8072 const HChar* opNames[3]
8073 = { "dsb", "dmb", "isb" };
8074 const HChar* howNames[16]
8075 = { "#0", "oshld", "oshst", "osh", "#4", "nshld", "nshst", "nsh",
8076 "#8", "ishld", "ishst", "ish", "#12", "ld", "st", "sy" };
8077 DIP("%s %s\n", opNames[opc], howNames[CRm]);
8078 return True;
8081 /* -------------------- NOP -------------------- */
8082 if (INSN(31,0) == 0xD503201F) {
8083 DIP("nop\n");
8084 return True;
8087 /* -------------------- BRK -------------------- */
8088 /* 31 23 20 4
8089 1101 0100 001 imm16 00000 BRK #imm16
8091 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,0)
8092 && INSN(23,21) == BITS3(0,0,1) && INSN(4,0) == BITS5(0,0,0,0,0)) {
8093 UInt imm16 = INSN(20,5);
8094 /* Request SIGTRAP and then restart of this insn. */
8095 putPC(mkU64(guest_PC_curr_instr + 0));
8096 dres->whatNext = Dis_StopHere;
8097 dres->jk_StopHere = Ijk_SigTRAP;
8098 DIP("brk #%u\n", imm16);
8099 return True;
8102 /* ------------------- YIELD ------------------- */
8103 /* 31 23 15 7
8104 1101 0101 0000 0011 0010 0000 0011 1111
8106 if (INSN(31,0) == 0xD503203F) {
8107 /* Request yield followed by continuation at the next insn. */
8108 putPC(mkU64(guest_PC_curr_instr + 4));
8109 dres->whatNext = Dis_StopHere;
8110 dres->jk_StopHere = Ijk_Yield;
8111 DIP("yield\n");
8112 return True;
8115 /* -------------------- HINT ------------------- */
8116 /* 31 23 15 11 4 3
8117 1101 0101 0000 0011 0010 imm7 1 1111
8118 Catch otherwise unhandled HINT instructions - any
8119 like YIELD which are explicitly handled should go
8120 above this case.
8122 if (INSN(31,24) == BITS8(1,1,0,1,0,1,0,1)
8123 && INSN(23,16) == BITS8(0,0,0,0,0,0,1,1)
8124 && INSN(15,12) == BITS4(0,0,1,0)
8125 && INSN(4,0) == BITS5(1,1,1,1,1)) {
8126 UInt imm7 = INSN(11,5);
8127 DIP("hint #%u\n", imm7);
8128 return True;
8131 /* ------------------- CLREX ------------------ */
8132 /* 31 23 15 11 7
8133 1101 0101 0000 0011 0011 m 0101 1111 CLREX CRm
8134 CRm is apparently ignored.
8136 if ((INSN(31,0) & 0xFFFFF0FF) == 0xD503305F) {
8137 UInt mm = INSN(11,8);
8138 /* AFAICS, this simply cancels a (all?) reservations made by a
8139 (any?) preceding LDREX(es). Arrange to hand it through to
8140 the back end. */
8141 if (abiinfo->guest__use_fallback_LLSC) {
8142 stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
8143 } else {
8144 stmt( IRStmt_MBE(Imbe_CancelReservation) );
8146 DIP("clrex #%u\n", mm);
8147 return True;
8150 if (sigill_diag) {
8151 vex_printf("ARM64 front end: branch_etc\n");
8153 return False;
8154 # undef INSN
8158 /*------------------------------------------------------------*/
8159 /*--- SIMD and FP instructions: helper functions ---*/
8160 /*------------------------------------------------------------*/
8162 /* Some constructors for interleave/deinterleave expressions. */
8164 static IRExpr* mk_CatEvenLanes64x2 ( IRTemp a10, IRTemp b10 ) {
8165 // returns a0 b0
8166 return binop(Iop_InterleaveLO64x2, mkexpr(a10), mkexpr(b10));
8169 static IRExpr* mk_CatOddLanes64x2 ( IRTemp a10, IRTemp b10 ) {
8170 // returns a1 b1
8171 return binop(Iop_InterleaveHI64x2, mkexpr(a10), mkexpr(b10));
8174 static IRExpr* mk_CatEvenLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
8175 // returns a2 a0 b2 b0
8176 return binop(Iop_CatEvenLanes32x4, mkexpr(a3210), mkexpr(b3210));
8179 static IRExpr* mk_CatOddLanes32x4 ( IRTemp a3210, IRTemp b3210 ) {
8180 // returns a3 a1 b3 b1
8181 return binop(Iop_CatOddLanes32x4, mkexpr(a3210), mkexpr(b3210));
8184 static IRExpr* mk_InterleaveLO32x4 ( IRTemp a3210, IRTemp b3210 ) {
8185 // returns a1 b1 a0 b0
8186 return binop(Iop_InterleaveLO32x4, mkexpr(a3210), mkexpr(b3210));
8189 static IRExpr* mk_InterleaveHI32x4 ( IRTemp a3210, IRTemp b3210 ) {
8190 // returns a3 b3 a2 b2
8191 return binop(Iop_InterleaveHI32x4, mkexpr(a3210), mkexpr(b3210));
8194 static IRExpr* mk_CatEvenLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
8195 // returns a6 a4 a2 a0 b6 b4 b2 b0
8196 return binop(Iop_CatEvenLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
8199 static IRExpr* mk_CatOddLanes16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
8200 // returns a7 a5 a3 a1 b7 b5 b3 b1
8201 return binop(Iop_CatOddLanes16x8, mkexpr(a76543210), mkexpr(b76543210));
8204 static IRExpr* mk_InterleaveLO16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
8205 // returns a3 b3 a2 b2 a1 b1 a0 b0
8206 return binop(Iop_InterleaveLO16x8, mkexpr(a76543210), mkexpr(b76543210));
8209 static IRExpr* mk_InterleaveHI16x8 ( IRTemp a76543210, IRTemp b76543210 ) {
8210 // returns a7 b7 a6 b6 a5 b5 a4 b4
8211 return binop(Iop_InterleaveHI16x8, mkexpr(a76543210), mkexpr(b76543210));
8214 static IRExpr* mk_CatEvenLanes8x16 ( IRTemp aFEDCBA9876543210,
8215 IRTemp bFEDCBA9876543210 ) {
8216 // returns aE aC aA a8 a6 a4 a2 a0 bE bC bA b8 b6 b4 b2 b0
8217 return binop(Iop_CatEvenLanes8x16, mkexpr(aFEDCBA9876543210),
8218 mkexpr(bFEDCBA9876543210));
8221 static IRExpr* mk_CatOddLanes8x16 ( IRTemp aFEDCBA9876543210,
8222 IRTemp bFEDCBA9876543210 ) {
8223 // returns aF aD aB a9 a7 a5 a3 a1 bF bD bB b9 b7 b5 b3 b1
8224 return binop(Iop_CatOddLanes8x16, mkexpr(aFEDCBA9876543210),
8225 mkexpr(bFEDCBA9876543210));
8228 static IRExpr* mk_InterleaveLO8x16 ( IRTemp aFEDCBA9876543210,
8229 IRTemp bFEDCBA9876543210 ) {
8230 // returns a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0
8231 return binop(Iop_InterleaveLO8x16, mkexpr(aFEDCBA9876543210),
8232 mkexpr(bFEDCBA9876543210));
8235 static IRExpr* mk_InterleaveHI8x16 ( IRTemp aFEDCBA9876543210,
8236 IRTemp bFEDCBA9876543210 ) {
8237 // returns aF bF aE bE aD bD aC bC aB bB aA bA a9 b9 a8 b8
8238 return binop(Iop_InterleaveHI8x16, mkexpr(aFEDCBA9876543210),
8239 mkexpr(bFEDCBA9876543210));
8242 /* Generate N copies of |bit| in the bottom of a ULong. */
8243 static ULong Replicate ( ULong bit, Int N )
8245 vassert(bit <= 1 && N >= 1 && N < 64);
8246 if (bit == 0) {
8247 return 0;
8248 } else {
8249 /* Careful. This won't work for N == 64. */
8250 return (1ULL << N) - 1;
8254 static ULong Replicate32x2 ( ULong bits32 )
8256 vassert(0 == (bits32 & ~0xFFFFFFFFULL));
8257 return (bits32 << 32) | bits32;
8260 static ULong Replicate16x4 ( ULong bits16 )
8262 vassert(0 == (bits16 & ~0xFFFFULL));
8263 return Replicate32x2((bits16 << 16) | bits16);
8266 static ULong Replicate8x8 ( ULong bits8 )
8268 vassert(0 == (bits8 & ~0xFFULL));
8269 return Replicate16x4((bits8 << 8) | bits8);
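/* Example: Replicate8x8(0x5A) builds 0x5A5A, then 0x5A5A5A5A, then
   0x5A5A5A5A5A5A5A5A via the two helpers above. */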
8272 /* Expand the VFPExpandImm-style encoding in the bottom 8 bits of
8273 |imm8| to either a 32-bit value if N is 32 or a 64 bit value if N
8274 is 64. In the former case, the upper 32 bits of the returned value
8275 are guaranteed to be zero. */
8276 static ULong VFPExpandImm ( ULong imm8, Int N )
8278 vassert(imm8 <= 0xFF);
8279 vassert(N == 32 || N == 64);
8280 Int E = ((N == 32) ? 8 : 11) - 2; // The spec incorrectly omits the -2.
8281 Int F = N - E - 1;
8282 ULong imm8_6 = (imm8 >> 6) & 1;
8283 /* sign: 1 bit */
8284 /* exp: E bits */
8285 /* frac: F bits */
8286 ULong sign = (imm8 >> 7) & 1;
8287 ULong exp = ((imm8_6 ^ 1) << (E-1)) | Replicate(imm8_6, E-1);
8288 ULong frac = ((imm8 & 63) << (F-6)) | Replicate(0, F-6);
8289 vassert(sign < (1ULL << 1));
8290 vassert(exp < (1ULL << E));
8291 vassert(frac < (1ULL << F));
8292 vassert(1 + E + F == N);
8293 ULong res = (sign << (E+F)) | (exp << F) | frac;
8294 return res;
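/* Worked example: VFPExpandImm(0x70, 64).  Here E == 9, F == 54,
   sign == 0 and imm8<6> == 1, so exp == 0x0FF (a zero followed by
   eight ones) and frac == 0x30 << 48.  The result,
      (0x0FF << 54) | (0x30 << 48) == 0x3FF0000000000000,
   is the IEEE754 double encoding of 1.0, as expected for FMOV #1.0. */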
8297 /* Expand an AdvSIMDExpandImm-style encoding into a 64-bit value.
8298 This might fail, as indicated by the returned Bool. Page 2530 of
8299 the manual. */
8300 static Bool AdvSIMDExpandImm ( /*OUT*/ULong* res,
8301 UInt op, UInt cmode, UInt imm8 )
8303 vassert(op <= 1);
8304 vassert(cmode <= 15);
8305 vassert(imm8 <= 255);
8307 *res = 0; /* will overwrite iff returning True */
8309 ULong imm64 = 0;
8310 Bool testimm8 = False;
8312 switch (cmode >> 1) {
8313 case 0:
8314 testimm8 = False; imm64 = Replicate32x2(imm8); break;
8315 case 1:
8316 testimm8 = True; imm64 = Replicate32x2(imm8 << 8UL); break;
8317 case 2:
8318          // Coverity is right, but we want the overflow: Replicate32x2 expects
8319          // (and asserts) that the top 32 bits are zero, so if we got rid of
8320          // the overflow we would need to add a mask. So just let it overflow.
8321 // coverity[OVERFLOW_BEFORE_WIDEN:FALSE]
8322 testimm8 = True; imm64 = Replicate32x2(imm8 << 16UL); break;
8323 case 3:
8324 // coverity[OVERFLOW_BEFORE_WIDEN:FALSE]
8325 testimm8 = True; imm64 = Replicate32x2(imm8 << 24UL); break;
8326 case 4:
8327 testimm8 = False; imm64 = Replicate16x4(imm8); break;
8328 case 5:
8329 testimm8 = True; imm64 = Replicate16x4(imm8 << 8); break;
8330 case 6:
8331 testimm8 = True;
8332 if ((cmode & 1) == 0)
8333 imm64 = Replicate32x2((imm8 << 8) | 0xFF);
8334 else
8335 imm64 = Replicate32x2((imm8 << 16) | 0xFFFF);
8336 break;
8337 case 7:
8338 testimm8 = False;
8339 if ((cmode & 1) == 0 && op == 0)
8340 imm64 = Replicate8x8(imm8);
8341 if ((cmode & 1) == 0 && op == 1) {
8342 imm64 = 0; imm64 |= (imm8 & 0x80) ? 0xFF : 0x00;
8343 imm64 <<= 8; imm64 |= (imm8 & 0x40) ? 0xFF : 0x00;
8344 imm64 <<= 8; imm64 |= (imm8 & 0x20) ? 0xFF : 0x00;
8345 imm64 <<= 8; imm64 |= (imm8 & 0x10) ? 0xFF : 0x00;
8346 imm64 <<= 8; imm64 |= (imm8 & 0x08) ? 0xFF : 0x00;
8347 imm64 <<= 8; imm64 |= (imm8 & 0x04) ? 0xFF : 0x00;
8348 imm64 <<= 8; imm64 |= (imm8 & 0x02) ? 0xFF : 0x00;
8349 imm64 <<= 8; imm64 |= (imm8 & 0x01) ? 0xFF : 0x00;
8351 if ((cmode & 1) == 1 && op == 0) {
8352 ULong imm8_7 = (imm8 >> 7) & 1;
8353 ULong imm8_6 = (imm8 >> 6) & 1;
8354 ULong imm8_50 = imm8 & 63;
8355 ULong imm32 = (imm8_7 << (1 + 5 + 6 + 19))
8356 | ((imm8_6 ^ 1) << (5 + 6 + 19))
8357 | (Replicate(imm8_6, 5) << (6 + 19))
8358 | (imm8_50 << 19);
8359 imm64 = Replicate32x2(imm32);
8361 if ((cmode & 1) == 1 && op == 1) {
8362 // imm64 = imm8<7>:NOT(imm8<6>)
8363 // :Replicate(imm8<6>,8):imm8<5:0>:Zeros(48);
8364 ULong imm8_7 = (imm8 >> 7) & 1;
8365 ULong imm8_6 = (imm8 >> 6) & 1;
8366 ULong imm8_50 = imm8 & 63;
8367 imm64 = (imm8_7 << 63) | ((imm8_6 ^ 1) << 62)
8368 | (Replicate(imm8_6, 8) << 54)
8369 | (imm8_50 << 48);
8371 break;
8372 default:
8373 vassert(0);
8376 if (testimm8 && imm8 == 0)
8377 return False;
8379 *res = imm64;
8380 return True;
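/* Worked examples (illustrative): op=0, cmode=0b0000, imm8=0x5A gives
   Replicate32x2(0x5A) == 0x0000005A0000005A; op=0, cmode=0b1110,
   imm8=0xAB gives Replicate8x8(0xAB) == 0xABABABABABABABAB.  With
   cmode=0b0010 and imm8=0 the function returns False (testimm8 is
   True and imm8 is zero), so the caller must reject the encoding. */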
8383 /* Help a bit for decoding laneage for vector operations that can be
8384 of the form 4x32, 2x64 or 2x32-and-zero-upper-half, as encoded by Q
8385 and SZ bits, typically for vector floating point. */
8386 static Bool getLaneInfo_Q_SZ ( /*OUT*/IRType* tyI, /*OUT*/IRType* tyF,
8387 /*OUT*/UInt* nLanes, /*OUT*/Bool* zeroUpper,
8388 /*OUT*/const HChar** arrSpec,
8389 Bool bitQ, Bool bitSZ )
8391 vassert(bitQ == True || bitQ == False);
8392 vassert(bitSZ == True || bitSZ == False);
8393 if (bitQ && bitSZ) { // 2x64
8394 if (tyI) *tyI = Ity_I64;
8395 if (tyF) *tyF = Ity_F64;
8396 if (nLanes) *nLanes = 2;
8397 if (zeroUpper) *zeroUpper = False;
8398 if (arrSpec) *arrSpec = "2d";
8399 return True;
8401 if (bitQ && !bitSZ) { // 4x32
8402 if (tyI) *tyI = Ity_I32;
8403 if (tyF) *tyF = Ity_F32;
8404 if (nLanes) *nLanes = 4;
8405 if (zeroUpper) *zeroUpper = False;
8406 if (arrSpec) *arrSpec = "4s";
8407 return True;
8409 if (!bitQ && !bitSZ) { // 2x32
8410 if (tyI) *tyI = Ity_I32;
8411 if (tyF) *tyF = Ity_F32;
8412 if (nLanes) *nLanes = 2;
8413 if (zeroUpper) *zeroUpper = True;
8414 if (arrSpec) *arrSpec = "2s";
8415 return True;
8417 // Else impliedly 1x64, which isn't allowed.
8418 return False;
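/* Summary of the cases above, for quick reference:
      Q=1 SZ=1  ->  2 x I64/F64, "2d"
      Q=1 SZ=0  ->  4 x I32/F32, "4s"
      Q=0 SZ=0  ->  2 x I32/F32, "2s", upper half zeroed
      Q=0 SZ=1  ->  rejected (would be 1x64). */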
8421 /* Helper for decoding laneage for shift-style vector operations
8422 that involve an immediate shift amount. */
8423 static Bool getLaneInfo_IMMH_IMMB ( /*OUT*/UInt* shift, /*OUT*/UInt* szBlg2,
8424 UInt immh, UInt immb )
8426 vassert(immh < (1<<4));
8427 vassert(immb < (1<<3));
8428 UInt immhb = (immh << 3) | immb;
8429 if (immh & 8) {
8430 if (shift) *shift = 128 - immhb;
8431 if (szBlg2) *szBlg2 = 3;
8432 return True;
8434 if (immh & 4) {
8435 if (shift) *shift = 64 - immhb;
8436 if (szBlg2) *szBlg2 = 2;
8437 return True;
8439 if (immh & 2) {
8440 if (shift) *shift = 32 - immhb;
8441 if (szBlg2) *szBlg2 = 1;
8442 return True;
8444 if (immh & 1) {
8445 if (shift) *shift = 16 - immhb;
8446 if (szBlg2) *szBlg2 = 0;
8447 return True;
8449 return False;
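/* Worked example (illustrative): immh=0b0101, immb=0b010 gives
   immhb == 42; the highest set bit of immh is bit 2, so the lanes are
   32 bits wide (szBlg2 == 2) and the shift amount is 64 - 42 == 22. */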
8452 /* Generate IR to fold all lanes of the V128 value in 'src' as
8453 characterised by the operator 'op', and return the result in the
8454 bottom bits of a V128, with all other bits set to zero. */
8455 static IRTemp math_FOLDV ( IRTemp src, IROp op )
8457 /* The basic idea is to use repeated applications of Iop_CatEven*
8458 and Iop_CatOdd* operators to 'src' so as to clone each lane into
8459 a complete vector. Then fold all those vectors with 'op' and
8460 zero out all but the least significant lane. */
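   /* For instance, in the 32x4 case below, src = [a3 a2 a1 a0] is
      expanded to x3333 .. x0000 (each lane cloned across a whole
      vector), those are folded pairwise with 'op', and ZeroHI96ofV128
      leaves [0 0 0 op(op(a3,a2),op(a1,a0))] as the result. */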
8461 switch (op) {
8462 case Iop_Min8Sx16: case Iop_Min8Ux16:
8463 case Iop_Max8Sx16: case Iop_Max8Ux16: case Iop_Add8x16: {
8464 /* NB: temp naming here is misleading -- the naming is for 8
8465 lanes of 16 bit, whereas what is being operated on is 16
8466 lanes of 8 bits. */
8467 IRTemp x76543210 = src;
8468 IRTemp x76547654 = newTempV128();
8469 IRTemp x32103210 = newTempV128();
8470 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
8471 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
8472 IRTemp x76767676 = newTempV128();
8473 IRTemp x54545454 = newTempV128();
8474 IRTemp x32323232 = newTempV128();
8475 IRTemp x10101010 = newTempV128();
8476 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
8477 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
8478 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
8479 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
8480 IRTemp x77777777 = newTempV128();
8481 IRTemp x66666666 = newTempV128();
8482 IRTemp x55555555 = newTempV128();
8483 IRTemp x44444444 = newTempV128();
8484 IRTemp x33333333 = newTempV128();
8485 IRTemp x22222222 = newTempV128();
8486 IRTemp x11111111 = newTempV128();
8487 IRTemp x00000000 = newTempV128();
8488 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
8489 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
8490 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
8491 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
8492 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
8493 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
8494 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
8495 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
8496 /* Naming not misleading after here. */
8497 IRTemp xAllF = newTempV128();
8498 IRTemp xAllE = newTempV128();
8499 IRTemp xAllD = newTempV128();
8500 IRTemp xAllC = newTempV128();
8501 IRTemp xAllB = newTempV128();
8502 IRTemp xAllA = newTempV128();
8503 IRTemp xAll9 = newTempV128();
8504 IRTemp xAll8 = newTempV128();
8505 IRTemp xAll7 = newTempV128();
8506 IRTemp xAll6 = newTempV128();
8507 IRTemp xAll5 = newTempV128();
8508 IRTemp xAll4 = newTempV128();
8509 IRTemp xAll3 = newTempV128();
8510 IRTemp xAll2 = newTempV128();
8511 IRTemp xAll1 = newTempV128();
8512 IRTemp xAll0 = newTempV128();
8513 assign(xAllF, mk_CatOddLanes8x16 (x77777777, x77777777));
8514 assign(xAllE, mk_CatEvenLanes8x16(x77777777, x77777777));
8515 assign(xAllD, mk_CatOddLanes8x16 (x66666666, x66666666));
8516 assign(xAllC, mk_CatEvenLanes8x16(x66666666, x66666666));
8517 assign(xAllB, mk_CatOddLanes8x16 (x55555555, x55555555));
8518 assign(xAllA, mk_CatEvenLanes8x16(x55555555, x55555555));
8519 assign(xAll9, mk_CatOddLanes8x16 (x44444444, x44444444));
8520 assign(xAll8, mk_CatEvenLanes8x16(x44444444, x44444444));
8521 assign(xAll7, mk_CatOddLanes8x16 (x33333333, x33333333));
8522 assign(xAll6, mk_CatEvenLanes8x16(x33333333, x33333333));
8523 assign(xAll5, mk_CatOddLanes8x16 (x22222222, x22222222));
8524 assign(xAll4, mk_CatEvenLanes8x16(x22222222, x22222222));
8525 assign(xAll3, mk_CatOddLanes8x16 (x11111111, x11111111));
8526 assign(xAll2, mk_CatEvenLanes8x16(x11111111, x11111111));
8527 assign(xAll1, mk_CatOddLanes8x16 (x00000000, x00000000));
8528 assign(xAll0, mk_CatEvenLanes8x16(x00000000, x00000000));
8529 IRTemp maxFE = newTempV128();
8530 IRTemp maxDC = newTempV128();
8531 IRTemp maxBA = newTempV128();
8532 IRTemp max98 = newTempV128();
8533 IRTemp max76 = newTempV128();
8534 IRTemp max54 = newTempV128();
8535 IRTemp max32 = newTempV128();
8536 IRTemp max10 = newTempV128();
8537 assign(maxFE, binop(op, mkexpr(xAllF), mkexpr(xAllE)));
8538 assign(maxDC, binop(op, mkexpr(xAllD), mkexpr(xAllC)));
8539 assign(maxBA, binop(op, mkexpr(xAllB), mkexpr(xAllA)));
8540 assign(max98, binop(op, mkexpr(xAll9), mkexpr(xAll8)));
8541 assign(max76, binop(op, mkexpr(xAll7), mkexpr(xAll6)));
8542 assign(max54, binop(op, mkexpr(xAll5), mkexpr(xAll4)));
8543 assign(max32, binop(op, mkexpr(xAll3), mkexpr(xAll2)));
8544 assign(max10, binop(op, mkexpr(xAll1), mkexpr(xAll0)));
8545 IRTemp maxFEDC = newTempV128();
8546 IRTemp maxBA98 = newTempV128();
8547 IRTemp max7654 = newTempV128();
8548 IRTemp max3210 = newTempV128();
8549 assign(maxFEDC, binop(op, mkexpr(maxFE), mkexpr(maxDC)));
8550 assign(maxBA98, binop(op, mkexpr(maxBA), mkexpr(max98)));
8551 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
8552 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
8553 IRTemp maxFEDCBA98 = newTempV128();
8554 IRTemp max76543210 = newTempV128();
8555 assign(maxFEDCBA98, binop(op, mkexpr(maxFEDC), mkexpr(maxBA98)));
8556 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
8557 IRTemp maxAllLanes = newTempV128();
8558 assign(maxAllLanes, binop(op, mkexpr(maxFEDCBA98),
8559 mkexpr(max76543210)));
8560 IRTemp res = newTempV128();
8561 assign(res, unop(Iop_ZeroHI120ofV128, mkexpr(maxAllLanes)));
8562 return res;
8564 case Iop_Min16Sx8: case Iop_Min16Ux8:
8565 case Iop_Max16Sx8: case Iop_Max16Ux8: case Iop_Add16x8: {
8566 IRTemp x76543210 = src;
8567 IRTemp x76547654 = newTempV128();
8568 IRTemp x32103210 = newTempV128();
8569 assign(x76547654, mk_CatOddLanes64x2 (x76543210, x76543210));
8570 assign(x32103210, mk_CatEvenLanes64x2(x76543210, x76543210));
8571 IRTemp x76767676 = newTempV128();
8572 IRTemp x54545454 = newTempV128();
8573 IRTemp x32323232 = newTempV128();
8574 IRTemp x10101010 = newTempV128();
8575 assign(x76767676, mk_CatOddLanes32x4 (x76547654, x76547654));
8576 assign(x54545454, mk_CatEvenLanes32x4(x76547654, x76547654));
8577 assign(x32323232, mk_CatOddLanes32x4 (x32103210, x32103210));
8578 assign(x10101010, mk_CatEvenLanes32x4(x32103210, x32103210));
8579 IRTemp x77777777 = newTempV128();
8580 IRTemp x66666666 = newTempV128();
8581 IRTemp x55555555 = newTempV128();
8582 IRTemp x44444444 = newTempV128();
8583 IRTemp x33333333 = newTempV128();
8584 IRTemp x22222222 = newTempV128();
8585 IRTemp x11111111 = newTempV128();
8586 IRTemp x00000000 = newTempV128();
8587 assign(x77777777, mk_CatOddLanes16x8 (x76767676, x76767676));
8588 assign(x66666666, mk_CatEvenLanes16x8(x76767676, x76767676));
8589 assign(x55555555, mk_CatOddLanes16x8 (x54545454, x54545454));
8590 assign(x44444444, mk_CatEvenLanes16x8(x54545454, x54545454));
8591 assign(x33333333, mk_CatOddLanes16x8 (x32323232, x32323232));
8592 assign(x22222222, mk_CatEvenLanes16x8(x32323232, x32323232));
8593 assign(x11111111, mk_CatOddLanes16x8 (x10101010, x10101010));
8594 assign(x00000000, mk_CatEvenLanes16x8(x10101010, x10101010));
8595 IRTemp max76 = newTempV128();
8596 IRTemp max54 = newTempV128();
8597 IRTemp max32 = newTempV128();
8598 IRTemp max10 = newTempV128();
8599 assign(max76, binop(op, mkexpr(x77777777), mkexpr(x66666666)));
8600 assign(max54, binop(op, mkexpr(x55555555), mkexpr(x44444444)));
8601 assign(max32, binop(op, mkexpr(x33333333), mkexpr(x22222222)));
8602 assign(max10, binop(op, mkexpr(x11111111), mkexpr(x00000000)));
8603 IRTemp max7654 = newTempV128();
8604 IRTemp max3210 = newTempV128();
8605 assign(max7654, binop(op, mkexpr(max76), mkexpr(max54)));
8606 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
8607 IRTemp max76543210 = newTempV128();
8608 assign(max76543210, binop(op, mkexpr(max7654), mkexpr(max3210)));
8609 IRTemp res = newTempV128();
8610 assign(res, unop(Iop_ZeroHI112ofV128, mkexpr(max76543210)));
8611 return res;
8613 case Iop_Max32Fx4: case Iop_Min32Fx4:
8614 case Iop_Min32Sx4: case Iop_Min32Ux4:
8615 case Iop_Max32Sx4: case Iop_Max32Ux4: case Iop_Add32x4: {
8616 IRTemp x3210 = src;
8617 IRTemp x3232 = newTempV128();
8618 IRTemp x1010 = newTempV128();
8619 assign(x3232, mk_CatOddLanes64x2 (x3210, x3210));
8620 assign(x1010, mk_CatEvenLanes64x2(x3210, x3210));
8621 IRTemp x3333 = newTempV128();
8622 IRTemp x2222 = newTempV128();
8623 IRTemp x1111 = newTempV128();
8624 IRTemp x0000 = newTempV128();
8625 assign(x3333, mk_CatOddLanes32x4 (x3232, x3232));
8626 assign(x2222, mk_CatEvenLanes32x4(x3232, x3232));
8627 assign(x1111, mk_CatOddLanes32x4 (x1010, x1010));
8628 assign(x0000, mk_CatEvenLanes32x4(x1010, x1010));
8629 IRTemp max32 = newTempV128();
8630 IRTemp max10 = newTempV128();
8631 assign(max32, binop(op, mkexpr(x3333), mkexpr(x2222)));
8632 assign(max10, binop(op, mkexpr(x1111), mkexpr(x0000)));
8633 IRTemp max3210 = newTempV128();
8634 assign(max3210, binop(op, mkexpr(max32), mkexpr(max10)));
8635 IRTemp res = newTempV128();
8636 assign(res, unop(Iop_ZeroHI96ofV128, mkexpr(max3210)));
8637 return res;
8639 case Iop_Add64x2: {
8640 IRTemp x10 = src;
8641 IRTemp x00 = newTempV128();
8642 IRTemp x11 = newTempV128();
8643 assign(x11, binop(Iop_InterleaveHI64x2, mkexpr(x10), mkexpr(x10)));
8644 assign(x00, binop(Iop_InterleaveLO64x2, mkexpr(x10), mkexpr(x10)));
8645 IRTemp max10 = newTempV128();
8646 assign(max10, binop(op, mkexpr(x11), mkexpr(x00)));
8647 IRTemp res = newTempV128();
8648 assign(res, unop(Iop_ZeroHI64ofV128, mkexpr(max10)));
8649 return res;
8651 default:
8652 vassert(0);
8657 /* Generate IR for TBL and TBX. This deals with the 128 bit case
8658 only. */
8659 static IRTemp math_TBL_TBX ( IRTemp tab[4], UInt len, IRTemp src,
8660 IRTemp oor_values )
8662 vassert(len <= 3);
8664 /* Generate some useful constants as concisely as possible. */
8665 IRTemp half15 = newTemp(Ity_I64);
8666 assign(half15, mkU64(0x0F0F0F0F0F0F0F0FULL));
8667 IRTemp half16 = newTemp(Ity_I64);
8668 assign(half16, mkU64(0x1010101010101010ULL));
8670 /* A zero vector */
8671 IRTemp allZero = newTempV128();
8672 assign(allZero, mkV128(0x0000));
8673 /* A vector containing 15 in each 8-bit lane */
8674 IRTemp all15 = newTempV128();
8675 assign(all15, binop(Iop_64HLtoV128, mkexpr(half15), mkexpr(half15)));
8676 /* A vector containing 16 in each 8-bit lane */
8677 IRTemp all16 = newTempV128();
8678 assign(all16, binop(Iop_64HLtoV128, mkexpr(half16), mkexpr(half16)));
8679 /* A vector containing 32 in each 8-bit lane */
8680 IRTemp all32 = newTempV128();
8681 assign(all32, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all16)));
8682 /* A vector containing 48 in each 8-bit lane */
8683 IRTemp all48 = newTempV128();
8684 assign(all48, binop(Iop_Add8x16, mkexpr(all16), mkexpr(all32)));
8685 /* A vector containing 64 in each 8-bit lane */
8686 IRTemp all64 = newTempV128();
8687 assign(all64, binop(Iop_Add8x16, mkexpr(all32), mkexpr(all32)));
8689 /* Group the 16/32/48/64 vectors so as to be indexable. */
8690 IRTemp allXX[4] = { all16, all32, all48, all64 };
8692 /* Compute the result for each table vector, with zeroes in places
8693 where the index values are out of range, and OR them into the
8694 running vector. */
8695 IRTemp running_result = newTempV128();
8696 assign(running_result, mkV128(0));
8698 UInt tabent;
8699 for (tabent = 0; tabent <= len; tabent++) {
8700 vassert(tabent < 4);
8701 IRTemp bias = newTempV128();
8702 assign(bias,
8703 mkexpr(tabent == 0 ? allZero : allXX[tabent-1]));
8704 IRTemp biased_indices = newTempV128();
8705 assign(biased_indices,
8706 binop(Iop_Sub8x16, mkexpr(src), mkexpr(bias)));
8707 IRTemp valid_mask = newTempV128();
8708 assign(valid_mask,
8709 binop(Iop_CmpGT8Ux16, mkexpr(all16), mkexpr(biased_indices)));
8710 IRTemp safe_biased_indices = newTempV128();
8711 assign(safe_biased_indices,
8712 binop(Iop_AndV128, mkexpr(biased_indices), mkexpr(all15)));
8713 IRTemp results_or_junk = newTempV128();
8714 assign(results_or_junk,
8715 binop(Iop_Perm8x16, mkexpr(tab[tabent]),
8716 mkexpr(safe_biased_indices)));
8717 IRTemp results_or_zero = newTempV128();
8718 assign(results_or_zero,
8719 binop(Iop_AndV128, mkexpr(results_or_junk), mkexpr(valid_mask)));
8720 /* And OR that into the running result. */
8721 IRTemp tmp = newTempV128();
8722 assign(tmp, binop(Iop_OrV128, mkexpr(results_or_zero),
8723 mkexpr(running_result)));
8724 running_result = tmp;
8727 /* So now running_result holds the overall result where the indices
8728 are in range, and zero in out-of-range lanes. Now we need to
8729 compute an overall validity mask and use this to copy in the
8730 lanes in the oor_values for out of range indices. This is
8731 unnecessary for TBL but will get folded out by iropt, so we lean
8732 on that and generate the same code for TBL and TBX here. */
8733 IRTemp overall_valid_mask = newTempV128();
8734 assign(overall_valid_mask,
8735 binop(Iop_CmpGT8Ux16, mkexpr(allXX[len]), mkexpr(src)));
8736 IRTemp result = newTempV128();
8737 assign(result,
8738 binop(Iop_OrV128,
8739 mkexpr(running_result),
8740 binop(Iop_AndV128,
8741 mkexpr(oor_values),
8742 unop(Iop_NotV128, mkexpr(overall_valid_mask)))));
8743 return result;
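/* Illustrative sketch only (not used by the translator): the vector
   computation above is equivalent to this per-byte loop, written
   against a hypothetical byte-array view of the operands, where
   |table| is the concatenation of the (len+1) table registers:

      // dst, src, oor are UChar[16]; table is UChar[16 * (len+1)]
      for (UInt i = 0; i < 16; i++) {
         UInt idx = src[i];
         dst[i] = idx < 16u * (len+1) ? table[idx]  // index in range
                                      : oor[i];     // TBL: 0, TBX: old dst
      }

   Each round of Perm8x16/AndV128/OrV128 above handles one 16-byte
   table chunk of that loop. */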
8747 /* Let |argL| and |argR| be V128 values, and let |opI64x2toV128| be
8748 an op which takes two I64s and produces a V128. That is, a widening
8749 operator. Generate IR which applies |opI64x2toV128| to either the
8750 lower (if |is2| is False) or upper (if |is2| is True) halves of
8751 |argL| and |argR|, and return the value in a new IRTemp.
8753 static
8754 IRTemp math_BINARY_WIDENING_V128 ( Bool is2, IROp opI64x2toV128,
8755 IRExpr* argL, IRExpr* argR )
8757 IRTemp res = newTempV128();
8758 IROp slice = is2 ? Iop_V128HIto64 : Iop_V128to64;
8759 assign(res, binop(opI64x2toV128, unop(slice, argL),
8760 unop(slice, argR)));
8761 return res;
8765 /* Generate signed/unsigned absolute difference vector IR. */
8766 static
8767 IRTemp math_ABD ( Bool isU, UInt size, IRExpr* argLE, IRExpr* argRE )
8769 vassert(size <= 3);
8770 IRTemp argL = newTempV128();
8771 IRTemp argR = newTempV128();
8772 IRTemp msk = newTempV128();
8773 IRTemp res = newTempV128();
8774 assign(argL, argLE);
8775 assign(argR, argRE);
8776 assign(msk, binop(isU ? mkVecCMPGTU(size) : mkVecCMPGTS(size),
8777 mkexpr(argL), mkexpr(argR)));
8778 assign(res,
8779 binop(Iop_OrV128,
8780 binop(Iop_AndV128,
8781 binop(mkVecSUB(size), mkexpr(argL), mkexpr(argR)),
8782 mkexpr(msk)),
8783 binop(Iop_AndV128,
8784 binop(mkVecSUB(size), mkexpr(argR), mkexpr(argL)),
8785 unop(Iop_NotV128, mkexpr(msk)))));
8786 return res;
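/* For reference, a scalar model of one lane of the computation above
   (illustrative only; T stands for the signed or unsigned lane type
   selected by |isU| and |size|):

      T abd ( T a, T b ) { return a > b ? a - b : b - a; }

   The compare/AND/OR network simply selects whichever of (a-b) and
   (b-a) is the non-negative difference, so no widening is needed. */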
8790 /* Generate IR that takes a V128 and sign- or zero-widens
8791 either the lower or upper set of lanes to twice-as-wide,
8792 resulting in a new V128 value. */
8793 static
8794 IRTemp math_WIDEN_LO_OR_HI_LANES ( Bool zWiden, Bool fromUpperHalf,
8795 UInt sizeNarrow, IRExpr* srcE )
8797 IRTemp src = newTempV128();
8798 IRTemp res = newTempV128();
8799 assign(src, srcE);
8800 switch (sizeNarrow) {
8801 case X10:
8802 assign(res,
8803 binop(zWiden ? Iop_ShrN64x2 : Iop_SarN64x2,
8804 binop(fromUpperHalf ? Iop_InterleaveHI32x4
8805 : Iop_InterleaveLO32x4,
8806 mkexpr(src),
8807 mkexpr(src)),
8808 mkU8(32)));
8809 break;
8810 case X01:
8811 assign(res,
8812 binop(zWiden ? Iop_ShrN32x4 : Iop_SarN32x4,
8813 binop(fromUpperHalf ? Iop_InterleaveHI16x8
8814 : Iop_InterleaveLO16x8,
8815 mkexpr(src),
8816 mkexpr(src)),
8817 mkU8(16)));
8818 break;
8819 case X00:
8820 assign(res,
8821 binop(zWiden ? Iop_ShrN16x8 : Iop_SarN16x8,
8822 binop(fromUpperHalf ? Iop_InterleaveHI8x16
8823 : Iop_InterleaveLO8x16,
8824 mkexpr(src),
8825 mkexpr(src)),
8826 mkU8(8)));
8827 break;
8828 default:
8829 vassert(0);
8831 return res;
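/* Worked example (illustrative only): for sizeNarrow == X10,
   zWiden == False, fromUpperHalf == False, a source [s3 s2 s1 s0]
   (32-bit lanes) is first interleaved with itself, giving
   [s1 s1 s0 s0]; viewed as 64-bit lanes that is [ s1:s1 , s0:s0 ],
   and the arithmetic right shift by 32 turns it into
   [ sx(s1) , sx(s0) ], i.e. the two lower lanes sign-extended to
   64 bits. */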
8835 /* Generate IR that takes a V128 and sign- or zero-widens
8836 either the even or odd lanes to twice-as-wide,
8837 resulting in a new V128 value. */
8838 static
8839 IRTemp math_WIDEN_EVEN_OR_ODD_LANES ( Bool zWiden, Bool fromOdd,
8840 UInt sizeNarrow, IRExpr* srcE )
8842 IRTemp src = newTempV128();
8843 IRTemp res = newTempV128();
8844 IROp opSAR = mkVecSARN(sizeNarrow+1);
8845 IROp opSHR = mkVecSHRN(sizeNarrow+1);
8846 IROp opSHL = mkVecSHLN(sizeNarrow+1);
8847 IROp opSxR = zWiden ? opSHR : opSAR;
8848 UInt amt = 0;
8849 switch (sizeNarrow) {
8850 case X10: amt = 32; break;
8851 case X01: amt = 16; break;
8852 case X00: amt = 8; break;
8853 default: vassert(0);
8855 assign(src, srcE);
8856 if (fromOdd) {
8857 assign(res, binop(opSxR, mkexpr(src), mkU8(amt)));
8858 } else {
8859 assign(res, binop(opSxR, binop(opSHL, mkexpr(src), mkU8(amt)),
8860 mkU8(amt)));
8862 return res;
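/* Worked example (illustrative only): with sizeNarrow == X01, a
   32-bit wide lane holding 0xAAAABBBB contains the odd narrow lane
   0xAAAA and the even narrow lane 0xBBBB.  fromOdd shifts right by
   16, extending 0xAAAA; !fromOdd first shifts left by 16 (giving
   0xBBBB0000) and then right by 16, extending 0xBBBB instead.  The
   extension is zero- or sign- according to |zWiden|. */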
8866 /* Generate IR that takes two V128s and narrows (takes lower half)
8867 of each lane, producing a single V128 value. */
8868 static
8869 IRTemp math_NARROW_LANES ( IRTemp argHi, IRTemp argLo, UInt sizeNarrow )
8871 IRTemp res = newTempV128();
8872 assign(res, binop(mkVecCATEVENLANES(sizeNarrow),
8873 mkexpr(argHi), mkexpr(argLo)));
8874 return res;
8878 /* Return a temp which holds the vector dup of the lane of width
8879 (1 << size) obtained from src[laneNo]. */
8880 static
8881 IRTemp math_DUP_VEC_ELEM ( IRExpr* src, UInt size, UInt laneNo )
8883 vassert(size <= 3);
8884 /* Normalise |laneNo| so it is of the form
8885 x000 for D, xx00 for S, xxx0 for H, and xxxx for B.
8886 This puts the bits we want to inspect at constant offsets
8887 regardless of the value of |size|.
8889 UInt ix = laneNo << size;
8890 vassert(ix <= 15);
8891 IROp ops[4] = { Iop_INVALID, Iop_INVALID, Iop_INVALID, Iop_INVALID };
8892 switch (size) {
8893 case 0: /* B */
8894 ops[0] = (ix & 1) ? Iop_CatOddLanes8x16 : Iop_CatEvenLanes8x16;
8895 /* fallthrough */
8896 case 1: /* H */
8897 ops[1] = (ix & 2) ? Iop_CatOddLanes16x8 : Iop_CatEvenLanes16x8;
8898 /* fallthrough */
8899 case 2: /* S */
8900 ops[2] = (ix & 4) ? Iop_CatOddLanes32x4 : Iop_CatEvenLanes32x4;
8901 /* fallthrough */
8902 case 3: /* D */
8903 ops[3] = (ix & 8) ? Iop_InterleaveHI64x2 : Iop_InterleaveLO64x2;
8904 break;
8905 default:
8906 vassert(0);
8908 IRTemp res = newTempV128();
8909 assign(res, src);
8910 Int i;
8911 for (i = 3; i >= 0; i--) {
8912 if (ops[i] == Iop_INVALID)
8913 break;
8914 IRTemp tmp = newTempV128();
8915 assign(tmp, binop(ops[i], mkexpr(res), mkexpr(res)));
8916 res = tmp;
8918 return res;
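/* Worked example (illustrative only): size == 0, laneNo == 5, hence
   ix == 5 (binary 0101).  The loop applies, in order:
      Iop_InterleaveLO64x2  (ix bit 3 == 0: keep the low 64-bit half)
      Iop_CatOddLanes32x4   (ix bit 2 == 1: keep the odd 32-bit lanes)
      Iop_CatEvenLanes16x8  (ix bit 1 == 0: keep the even 16-bit lanes)
      Iop_CatOddLanes8x16   (ix bit 0 == 1: keep the odd 8-bit lanes)
   Each step keeps only the half that contains byte 5 and doubles the
   number of copies, so afterwards all 16 bytes hold src[5]. */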
8922 /* Let |srcV| be a V128 value, and let |imm5| be a lane-and-size
8923 selector encoded as shown below. Return a new V128 holding the
8924 selected lane from |srcV| dup'd out to V128, and also return the
8925 lane number, log2 of the lane size in bytes, and width-character via
8926 *laneNo, *laneSzLg2 and *laneCh respectively. It may be that imm5
8927 is an invalid selector, in which case return
8928 IRTemp_INVALID, 0, 0 and '?' respectively.
8930 imm5 = xxxx1 signifies .b[xxxx]
8931 = xxx10 .h[xxx]
8932 = xx100 .s[xx]
8933 = x1000 .d[x]
8934 otherwise invalid
8936 static
8937 IRTemp handle_DUP_VEC_ELEM ( /*OUT*/UInt* laneNo,
8938 /*OUT*/UInt* laneSzLg2, /*OUT*/HChar* laneCh,
8939 IRExpr* srcV, UInt imm5 )
8941 *laneNo = 0;
8942 *laneSzLg2 = 0;
8943 *laneCh = '?';
8945 if (imm5 & 1) {
8946 *laneNo = (imm5 >> 1) & 15;
8947 *laneSzLg2 = 0;
8948 *laneCh = 'b';
8950 else if (imm5 & 2) {
8951 *laneNo = (imm5 >> 2) & 7;
8952 *laneSzLg2 = 1;
8953 *laneCh = 'h';
8955 else if (imm5 & 4) {
8956 *laneNo = (imm5 >> 3) & 3;
8957 *laneSzLg2 = 2;
8958 *laneCh = 's';
8960 else if (imm5 & 8) {
8961 *laneNo = (imm5 >> 4) & 1;
8962 *laneSzLg2 = 3;
8963 *laneCh = 'd';
8965 else {
8966 /* invalid */
8967 return IRTemp_INVALID;
8970 return math_DUP_VEC_ELEM(srcV, *laneSzLg2, *laneNo);
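/* Decoding examples (illustrative only):
      imm5 = 0b00111  ->  .b[3]   (bit 0 set, lane = imm5 >> 1)
      imm5 = 0b01110  ->  .h[3]   (bit 1 set, lane = imm5 >> 2)
      imm5 = 0b10100  ->  .s[2]   (bit 2 set, lane = imm5 >> 3)
      imm5 = 0b11000  ->  .d[1]   (bit 3 set, lane = imm5 >> 4)
      imm5 = 0b00000  ->  invalid */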
8974 /* Clone |imm| to every lane of a V128, with lane size log2 of |size|. */
8975 static
8976 IRTemp math_VEC_DUP_IMM ( UInt size, ULong imm )
8978 IRType ty = Ity_INVALID;
8979 IRTemp rcS = IRTemp_INVALID;
8980 switch (size) {
8981 case X01:
8982 vassert(imm <= 0xFFFFULL);
8983 ty = Ity_I16;
8984 rcS = newTemp(ty); assign(rcS, mkU16( (UShort)imm ));
8985 break;
8986 case X10:
8987 vassert(imm <= 0xFFFFFFFFULL);
8988 ty = Ity_I32;
8989 rcS = newTemp(ty); assign(rcS, mkU32( (UInt)imm ));
8990 break;
8991 case X11:
8992 ty = Ity_I64;
8993 rcS = newTemp(ty); assign(rcS, mkU64(imm)); break;
8994 default:
8995 vassert(0);
8997 IRTemp rcV = math_DUP_TO_V128(rcS, ty);
8998 return rcV;
9002 /* Let |new64| be a V128 in which only the lower 64 bits are interesting,
9003 and the upper can contain any value -- it is ignored. If |is2| is False,
9004 generate IR to put |new64| in the lower half of vector reg |dd| and zero
9005 the upper half. If |is2| is True, generate IR to put |new64| in the upper
9006 half of vector reg |dd| and leave the lower half unchanged. This
9007 simulates the behaviour of the "foo/foo2" instructions in which the
9008 destination is half the width of sources, for example addhn/addhn2.
9010 static
9011 void putLO64andZUorPutHI64 ( Bool is2, UInt dd, IRTemp new64 )
9013 if (is2) {
9014 /* Get the old contents of Vdd, zero the upper half, and replace
9015 it with the lower 64 bits of |new64|. */
9016 IRTemp t_zero_oldLO = newTempV128();
9017 assign(t_zero_oldLO, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
9018 IRTemp t_newHI_zero = newTempV128();
9019 assign(t_newHI_zero, binop(Iop_InterleaveLO64x2, mkexpr(new64),
9020 mkV128(0x0000)));
9021 IRTemp res = newTempV128();
9022 assign(res, binop(Iop_OrV128, mkexpr(t_zero_oldLO),
9023 mkexpr(t_newHI_zero)));
9024 putQReg128(dd, mkexpr(res));
9025 } else {
9026 /* This is simple. */
9027 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(new64)));
9032 /* Compute vector SQABS at lane size |size| for |srcE|, returning
9033 the q result in |*qabs| and the normal result in |*nabs|. */
9034 static
9035 void math_SQABS ( /*OUT*/IRTemp* qabs, /*OUT*/IRTemp* nabs,
9036 IRExpr* srcE, UInt size )
9038 IRTemp src, mask, maskn, nsub, qsub;
9039 src = mask = maskn = nsub = qsub = IRTemp_INVALID;
9040 newTempsV128_7(&src, &mask, &maskn, &nsub, &qsub, nabs, qabs);
9041 assign(src, srcE);
9042 assign(mask, binop(mkVecCMPGTS(size), mkV128(0x0000), mkexpr(src)));
9043 assign(maskn, unop(Iop_NotV128, mkexpr(mask)));
9044 assign(nsub, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
9045 assign(qsub, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
9046 assign(*nabs, binop(Iop_OrV128,
9047 binop(Iop_AndV128, mkexpr(nsub), mkexpr(mask)),
9048 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
9049 assign(*qabs, binop(Iop_OrV128,
9050 binop(Iop_AndV128, mkexpr(qsub), mkexpr(mask)),
9051 binop(Iop_AndV128, mkexpr(src), mkexpr(maskn))));
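/* Note (illustrative): qabs and nabs differ only for the most negative
   lane value.  E.g. with 32-bit lanes, SQABS(0x80000000) saturates to
   0x7FFFFFFF via the saturating subtract (qabs), while the plain
   negation wraps back to 0x80000000 (nabs); the caller compares the
   two to decide whether QC must be set. */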
9055 /* Compute vector SQNEG at lane size |size| for |srcE|, returning
9056 the q result in |*qneg| and the normal result in |*nneg|. */
9057 static
9058 void math_SQNEG ( /*OUT*/IRTemp* qneg, /*OUT*/IRTemp* nneg,
9059 IRExpr* srcE, UInt size )
9061 IRTemp src = IRTemp_INVALID;
9062 newTempsV128_3(&src, nneg, qneg);
9063 assign(src, srcE);
9064 assign(*nneg, binop(mkVecSUB(size), mkV128(0x0000), mkexpr(src)));
9065 assign(*qneg, binop(mkVecQSUBS(size), mkV128(0x0000), mkexpr(src)));
9069 /* Zero all except the least significant lane of |srcE|, where |size|
9070 indicates the lane size in the usual way. */
9071 static IRTemp math_ZERO_ALL_EXCEPT_LOWEST_LANE ( UInt size, IRExpr* srcE )
9073 vassert(size < 4);
9074 IRTemp t = newTempV128();
9075 assign(t, unop(mkVecZEROHIxxOFV128(size), srcE));
9076 return t;
9080 /* Generate IR to compute vector widening MULL from either the lower
9081 (is2==False) or upper (is2==True) halves of vecN and vecM. The
9082 widening multiplies are unsigned when isU==True and signed when
9083 isU==False. |size| is the narrow lane size indication. Optionally,
9084 the product may be added to or subtracted from vecD, at the wide lane
9085 size. This happens when |mas| is 'a' (add) or 's' (sub). When |mas|
9086 is 'm' (only multiply) then the accumulate part does not happen, and
9087 |vecD| is expected to == IRTemp_INVALID.
9089 Only size==0 (h_b_b), size==1 (s_h_h) and size==2 (d_s_s) variants
9090 are allowed. The result is a new IRTemp, which is
9091 returned in *res. */
9092 static
9093 void math_MULL_ACC ( /*OUT*/IRTemp* res,
9094 Bool is2, Bool isU, UInt size, HChar mas,
9095 IRTemp vecN, IRTemp vecM, IRTemp vecD )
9097 vassert(res && *res == IRTemp_INVALID);
9098 vassert(size <= 2);
9099 vassert(mas == 'm' || mas == 'a' || mas == 's');
9100 if (mas == 'm') vassert(vecD == IRTemp_INVALID);
9101 IROp mulOp = isU ? mkVecMULLU(size) : mkVecMULLS(size);
9102 IROp accOp = (mas == 'a') ? mkVecADD(size+1)
9103 : (mas == 's' ? mkVecSUB(size+1)
9104 : Iop_INVALID);
9105 IRTemp mul = math_BINARY_WIDENING_V128(is2, mulOp,
9106 mkexpr(vecN), mkexpr(vecM));
9107 *res = newTempV128();
9108 assign(*res, mas == 'm' ? mkexpr(mul)
9109 : binop(accOp, mkexpr(vecD), mkexpr(mul)));
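/* Scalar sketch of one wide lane (illustrative only, W being the wide
   lane type):

      W lane = (W)nLane * (W)mLane;          // widening multiply
      if (mas == 'a') lane = dLane + lane;   // accumulate-add
      if (mas == 's') lane = dLane - lane;   // accumulate-sub

   with the (W) conversions being zero-extensions when |isU| holds and
   sign-extensions otherwise. */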
9113 /* Same as math_MULL_ACC, except the multiply is signed widening,
9114 the multiplied value is then doubled, before being added to or
9115 subtracted from the accumulated value. And everything is
9116 saturated. In all cases, saturation residuals are returned
9117 via (sat1q, sat1n), and in the accumulate cases,
9118 via (sat2q, sat2n) too. All results are returned in new temporaries.
9119 In the no-accumulate case, *sat2q and *sat2n are never instantiated,
9120 so the caller can tell this has happened. */
9121 static
9122 void math_SQDMULL_ACC ( /*OUT*/IRTemp* res,
9123 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
9124 /*OUT*/IRTemp* sat2q, /*OUT*/IRTemp* sat2n,
9125 Bool is2, UInt size, HChar mas,
9126 IRTemp vecN, IRTemp vecM, IRTemp vecD )
9128 vassert(size <= 2);
9129 vassert(mas == 'm' || mas == 'a' || mas == 's');
9130 /* Compute
9131 sat1q = vecN.D[is2] *sq vecM.D[is2] *q 2
9132 sat1n = vecN.D[is2] *s vecM.D[is2] * 2
9133 IOW take either the low or high halves of vecN and vecM, signed widen,
9134 multiply, double that, and signedly saturate. Also compute the same
9135 but without saturation.
9137 vassert(sat2q && *sat2q == IRTemp_INVALID);
9138 vassert(sat2n && *sat2n == IRTemp_INVALID);
9139 newTempsV128_3(sat1q, sat1n, res);
9140 IRTemp tq = math_BINARY_WIDENING_V128(is2, mkVecQDMULLS(size),
9141 mkexpr(vecN), mkexpr(vecM));
9142 IRTemp tn = math_BINARY_WIDENING_V128(is2, mkVecMULLS(size),
9143 mkexpr(vecN), mkexpr(vecM));
9144 assign(*sat1q, mkexpr(tq));
9145 assign(*sat1n, binop(mkVecADD(size+1), mkexpr(tn), mkexpr(tn)));
9147 /* If there is no accumulation, the final result is sat1q,
9148 and there's no assignment to sat2q or sat2n. */
9149 if (mas == 'm') {
9150 assign(*res, mkexpr(*sat1q));
9151 return;
9154 /* Compute
9155 sat2q = vecD +sq/-sq sat1q
9156 sat2n = vecD +/- sat1n
9157 result = sat2q
9159 newTempsV128_2(sat2q, sat2n);
9160 assign(*sat2q, binop(mas == 'a' ? mkVecQADDS(size+1) : mkVecQSUBS(size+1),
9161 mkexpr(vecD), mkexpr(*sat1q)));
9162 assign(*sat2n, binop(mas == 'a' ? mkVecADD(size+1) : mkVecSUB(size+1),
9163 mkexpr(vecD), mkexpr(*sat1n)));
9164 assign(*res, mkexpr(*sat2q));
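/* Note (illustrative): the only inputs for which the widening multiply
   itself saturates are nLane == mLane == most negative value, since
   2 * (-2^(w-1))^2 == 2^(2w-1) does not fit in a 2w-bit signed lane.
   sat1q then holds the saturated value 2^(2w-1) - 1 while sat1n holds
   the wrapped one; that difference is what the QC update sees. */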
9168 /* Generate IR for widening signed vector multiplies. The operands
9169 have their lane width signedly widened, and they are then multiplied
9170 at the wider width, returning results in two new IRTemps. */
9171 static
9172 void math_MULLS ( /*OUT*/IRTemp* resHI, /*OUT*/IRTemp* resLO,
9173 UInt sizeNarrow, IRTemp argL, IRTemp argR )
9175 vassert(sizeNarrow <= 2);
9176 newTempsV128_2(resHI, resLO);
9177 IRTemp argLhi = newTemp(Ity_I64);
9178 IRTemp argLlo = newTemp(Ity_I64);
9179 IRTemp argRhi = newTemp(Ity_I64);
9180 IRTemp argRlo = newTemp(Ity_I64);
9181 assign(argLhi, unop(Iop_V128HIto64, mkexpr(argL)));
9182 assign(argLlo, unop(Iop_V128to64, mkexpr(argL)));
9183 assign(argRhi, unop(Iop_V128HIto64, mkexpr(argR)));
9184 assign(argRlo, unop(Iop_V128to64, mkexpr(argR)));
9185 IROp opMulls = mkVecMULLS(sizeNarrow);
9186 assign(*resHI, binop(opMulls, mkexpr(argLhi), mkexpr(argRhi)));
9187 assign(*resLO, binop(opMulls, mkexpr(argLlo), mkexpr(argRlo)));
9191 /* Generate IR for SQDMULH and SQRDMULH: signedly wideningly multiply,
9192 double that, possibly add a rounding constant (R variants), and take
9193 the high half. */
9194 static
9195 void math_SQDMULH ( /*OUT*/IRTemp* res,
9196 /*OUT*/IRTemp* sat1q, /*OUT*/IRTemp* sat1n,
9197 Bool isR, UInt size, IRTemp vN, IRTemp vM )
9199 vassert(size == X01 || size == X10); /* s or h only */
9201 newTempsV128_3(res, sat1q, sat1n);
9203 IRTemp mullsHI = IRTemp_INVALID, mullsLO = IRTemp_INVALID;
9204 math_MULLS(&mullsHI, &mullsLO, size, vN, vM);
9206 IRTemp addWide = mkVecADD(size+1);
9208 if (isR) {
9209 assign(*sat1q, binop(mkVecQRDMULHIS(size), mkexpr(vN), mkexpr(vM)));
9211 Int rcShift = size == X01 ? 15 : 31;
9212 IRTemp roundConst = math_VEC_DUP_IMM(size+1, 1ULL << rcShift);
9213 assign(*sat1n,
9214 binop(mkVecCATODDLANES(size),
9215 binop(addWide,
9216 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
9217 mkexpr(roundConst)),
9218 binop(addWide,
9219 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO)),
9220 mkexpr(roundConst))));
9221 } else {
9222 assign(*sat1q, binop(mkVecQDMULHIS(size), mkexpr(vN), mkexpr(vM)));
9224 assign(*sat1n,
9225 binop(mkVecCATODDLANES(size),
9226 binop(addWide, mkexpr(mullsHI), mkexpr(mullsHI)),
9227 binop(addWide, mkexpr(mullsLO), mkexpr(mullsLO))));
9230 assign(*res, mkexpr(*sat1q));
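/* Scalar model of one lane (illustrative only, w = narrow lane width
   in bits, products computed at width 2w):

      sqdmulh(a, b)  = sat( (2*a*b)                 >> w )
      sqrdmulh(a, b) = sat( (2*a*b + (1 << (w-1)))  >> w )

   The CatOddLanes step above is what extracts the high halves of the
   doubled (and, for the R variant, rounded) products for sat1n. */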
9233 /* Generate IR for SQRDMLAH and SQRDMLSH: signedly wideningly multiply,
9234 double, add a rounding constant, take the high half and accumulate. */
9235 static
9236 void math_SQRDMLAH ( /*OUT*/IRTemp* res, /*OUT*/IRTemp* res_nosat, Bool isAdd,
9237 UInt size, IRTemp vD, IRTemp vN, IRTemp vM )
9239 vassert(size == X01 || size == X10); /* s or h only */
9241 /* SQRDMLAH = SQADD(A, SQRDMULH(B, C)) */
9243 IRTemp mul, mul_nosat, dummy;
9244 mul = mul_nosat = dummy = IRTemp_INVALID;
9245 math_SQDMULH(&mul, &dummy, &mul_nosat, True/*R*/, size, vN, vM);
9247 IROp op = isAdd ? mkVecADD(size) : mkVecSUB(size);
9248 IROp qop = isAdd ? mkVecQADDS(size) : mkVecQSUBS(size);
9249 newTempsV128_2(res, res_nosat);
9250 assign(*res, binop(qop, mkexpr(vD), mkexpr(mul)));
9251 assign(*res_nosat, binop(op, mkexpr(vD), mkexpr(mul_nosat)));
9255 /* Generate IR for SQSHL, UQSHL, SQSHLU by imm. Put the result in
9256 a new temp in *res, and the Q difference pair in new temps in
9257 *qDiff1 and *qDiff2 respectively. |nm| denotes which of the
9258 three operations it is. */
9259 static
9260 void math_QSHL_IMM ( /*OUT*/IRTemp* res,
9261 /*OUT*/IRTemp* qDiff1, /*OUT*/IRTemp* qDiff2,
9262 IRTemp src, UInt size, UInt shift, const HChar* nm )
9264 vassert(size <= 3);
9265 UInt laneBits = 8 << size;
9266 vassert(shift < laneBits);
9267 newTempsV128_3(res, qDiff1, qDiff2);
9268 IRTemp z128 = newTempV128();
9269 assign(z128, mkV128(0x0000));
9271 /* UQSHL */
9272 if (vex_streq(nm, "uqshl")) {
9273 IROp qop = mkVecQSHLNSATUU(size);
9274 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
9275 if (shift == 0) {
9276 /* No shift means no saturation. */
9277 assign(*qDiff1, mkexpr(z128));
9278 assign(*qDiff2, mkexpr(z128));
9279 } else {
9280 /* Saturation has occurred if any of the shifted-out bits are
9281 nonzero. We get the shifted-out bits by right-shifting the
9282 original value. */
9283 UInt rshift = laneBits - shift;
9284 vassert(rshift >= 1 && rshift < laneBits);
9285 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
9286 assign(*qDiff2, mkexpr(z128));
9288 return;
9291 /* SQSHL */
9292 if (vex_streq(nm, "sqshl")) {
9293 IROp qop = mkVecQSHLNSATSS(size);
9294 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
9295 if (shift == 0) {
9296 /* No shift means no saturation. */
9297 assign(*qDiff1, mkexpr(z128));
9298 assign(*qDiff2, mkexpr(z128));
9299 } else {
9300 /* Saturation has occurred if any of the shifted-out bits are
9301 different from the top bit of the original value. */
9302 UInt rshift = laneBits - 1 - shift;
9303 vassert(rshift < laneBits-1);
9304 /* qDiff1 is the shifted out bits, and the top bit of the original
9305 value, preceded by zeroes. */
9306 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
9307 /* qDiff2 is the top bit of the original value, cloned the
9308 correct number of times. */
9309 assign(*qDiff2, binop(mkVecSHRN(size),
9310 binop(mkVecSARN(size), mkexpr(src),
9311 mkU8(laneBits-1)),
9312 mkU8(rshift)));
9313 /* This also succeeds in comparing the top bit of the original
9314 value to itself, which is a bit stupid, but not wrong. */
9316 return;
9319 /* SQSHLU */
9320 if (vex_streq(nm, "sqshlu")) {
9321 IROp qop = mkVecQSHLNSATSU(size);
9322 assign(*res, binop(qop, mkexpr(src), mkU8(shift)));
9323 if (shift == 0) {
9324 /* If there's no shift, saturation depends on the top bit
9325 of the source. */
9326 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(laneBits-1)));
9327 assign(*qDiff2, mkexpr(z128));
9328 } else {
9329 /* Saturation has occurred if any of the shifted-out bits are
9330 nonzero. We get the shifted-out bits by right-shifting the
9331 original value. */
9332 UInt rshift = laneBits - shift;
9333 vassert(rshift >= 1 && rshift < laneBits);
9334 assign(*qDiff1, binop(mkVecSHRN(size), mkexpr(src), mkU8(rshift)));
9335 assign(*qDiff2, mkexpr(z128));
9337 return;
9340 vassert(0);
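/* Saturation-detection sketch (illustrative only), for the UQSHL case
   with 8-bit lanes and shift == 3: a plain left shift of 0x2B gives
   0x58 and loses the bits 0b001.  qDiff1 recovers exactly those bits
   as 0x2B >> 5 == 0x01, which is nonzero, so UQSHL saturates (to 0xFF)
   and the later comparison of qDiff1 against qDiff2 (zero) flags it. */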
9344 /* Generate IR to do SRHADD and URHADD. */
9345 static
9346 IRTemp math_RHADD ( UInt size, Bool isU, IRTemp aa, IRTemp bb )
9348 /* Generate this:
9349 (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1)
9351 vassert(size <= 3);
9352 IROp opSHR = isU ? mkVecSHRN(size) : mkVecSARN(size);
9353 IROp opADD = mkVecADD(size);
9354 /* The only tricky bit is to generate the correct vector 1 constant. */
9355 const ULong ones64[4]
9356 = { 0x0101010101010101ULL, 0x0001000100010001ULL,
9357 0x0000000100000001ULL, 0x0000000000000001ULL };
9358 IRTemp imm64 = newTemp(Ity_I64);
9359 assign(imm64, mkU64(ones64[size]));
9360 IRTemp vecOne = newTempV128();
9361 assign(vecOne, binop(Iop_64HLtoV128, mkexpr(imm64), mkexpr(imm64)));
9362 IRTemp scaOne = newTemp(Ity_I8);
9363 assign(scaOne, mkU8(1));
9364 IRTemp res = newTempV128();
9365 assign(res,
9366 binop(opADD,
9367 binop(opSHR, mkexpr(aa), mkexpr(scaOne)),
9368 binop(opADD,
9369 binop(opSHR, mkexpr(bb), mkexpr(scaOne)),
9370 binop(opSHR,
9371 binop(opADD,
9372 binop(opADD,
9373 binop(Iop_AndV128, mkexpr(aa),
9374 mkexpr(vecOne)),
9375 binop(Iop_AndV128, mkexpr(bb),
9376 mkexpr(vecOne))
9378 mkexpr(vecOne)
9380 mkexpr(scaOne)
9385 return res;
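/* Correctness sketch (illustrative, unsigned case): the identity used
   above is
      (a + b + 1) >> 1
         == (a >> 1) + (b >> 1) + (((a & 1) + (b & 1) + 1) >> 1)
   which avoids the overflow that a direct a + b + 1 could cause.
   E.g. a == 255, b == 254: 127 + 127 + ((1 + 0 + 1) >> 1) == 255. */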
9389 /* Generate IR to do {U,S}ADDLP */
9390 static
9391 IRTemp math_ADDLP ( UInt sizeNarrow, Bool isU, IRTemp src )
9393 IRTemp res = newTempV128();
9394 assign(res,
9395 binop(mkVecADD(sizeNarrow+1),
9396 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
9397 isU, True/*fromOdd*/, sizeNarrow, mkexpr(src))),
9398 mkexpr(math_WIDEN_EVEN_OR_ODD_LANES(
9399 isU, False/*!fromOdd*/, sizeNarrow, mkexpr(src)))));
9400 return res;
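/* Lanewise sketch (illustrative only): with narrow lanes n[2i+1] and
   n[2i] packed into wide lane i, the expression above computes
      res[i] = widen(n[2i+1]) + widen(n[2i])
   i.e. the ADDLP pairwise add-long, widen() being zero- or
   sign-extension according to |isU|. */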
9404 /* QCFLAG tracks the SIMD sticky saturation status. Update the status
9405 thusly: if, after application of |opZHI| to both |qres| and |nres|,
9406 they have the same value, leave QCFLAG unchanged. Otherwise, set it
9407 (implicitly) to 1. |opZHI| may only be one of the Iop_ZeroHIxxofV128
9408 operators, or Iop_INVALID, in which case |qres| and |nres| are used
9409 unmodified. The presence of |opZHI| means this function can be used to
9410 generate QCFLAG update code for both scalar and vector SIMD operations.
9412 static
9413 void updateQCFLAGwithDifferenceZHI ( IRTemp qres, IRTemp nres, IROp opZHI )
9415 IRTemp diff = newTempV128();
9416 IRTemp oldQCFLAG = newTempV128();
9417 IRTemp newQCFLAG = newTempV128();
9418 if (opZHI == Iop_INVALID) {
9419 assign(diff, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres)));
9420 } else {
9421 vassert(opZHI == Iop_ZeroHI64ofV128
9422 || opZHI == Iop_ZeroHI96ofV128 || opZHI == Iop_ZeroHI112ofV128);
9423 assign(diff, unop(opZHI, binop(Iop_XorV128, mkexpr(qres), mkexpr(nres))));
9425 assign(oldQCFLAG, IRExpr_Get(OFFB_QCFLAG, Ity_V128));
9426 assign(newQCFLAG, binop(Iop_OrV128, mkexpr(oldQCFLAG), mkexpr(diff)));
9427 stmt(IRStmt_Put(OFFB_QCFLAG, mkexpr(newQCFLAG)));
9431 /* A variant of updateQCFLAGwithDifferenceZHI in which |qres| and |nres|
9432 are used unmodified, hence suitable for QCFLAG updates for whole-vector
9433 operations. */
9434 static
9435 void updateQCFLAGwithDifference ( IRTemp qres, IRTemp nres )
9437 updateQCFLAGwithDifferenceZHI(qres, nres, Iop_INVALID);
9441 /* Generate IR to rearrange two vector values in a way which is useful
9442 for doing S/D/H add-pair etc operations. There are 5 cases:
9444 2d: [m1 m0] [n1 n0] --> [m1 n1] [m0 n0]
9446 4s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [m3 m1 n3 n1] [m2 m0 n2 n0]
9448 8h: [m7 m6 m5 m4 m3 m2 m1 m0] [n7 n6 n5 n4 n3 n2 n1 n0] -->
9449 [m7 m5 n7 n5 m3 m1 n3 n1] [m6 m4 n6 n4 m2 m0 n2 n0]
9451 2s: [m3 m2 m1 m0] [n3 n2 n1 n0] --> [0 0 m1 n1] [0 0 m0 n0]
9453 4h: [m7 m6 m5 m4 m3 m2 m1 m0] [n7 n6 n5 n4 n3 n2 n1 n0] -->
9454 [ 0 0 0 0 m3 m1 n3 n1] [ 0 0 0 0 m2 m0 n2 n0]
9456 static
9457 void math_REARRANGE_FOR_FLOATING_PAIRWISE (
9458 /*OUT*/IRTemp* rearrL, /*OUT*/IRTemp* rearrR,
9459 IRTemp vecM, IRTemp vecN, ARM64VecESize sz, UInt bitQ
9462 vassert(rearrL && *rearrL == IRTemp_INVALID);
9463 vassert(rearrR && *rearrR == IRTemp_INVALID);
9464 *rearrL = newTempV128();
9465 *rearrR = newTempV128();
9467 switch (sz) {
9468 case ARM64VSizeD:
9469 // 2d case
9470 vassert(bitQ == 1);
9471 assign(*rearrL, binop(Iop_InterleaveHI64x2, mkexpr(vecM), mkexpr(vecN)));
9472 assign(*rearrR, binop(Iop_InterleaveLO64x2, mkexpr(vecM), mkexpr(vecN)));
9473 break;
9475 case ARM64VSizeS:
9476 if (bitQ == 1) {
9477 // 4s case
9478 assign(*rearrL, binop(Iop_CatOddLanes32x4, mkexpr(vecM), mkexpr(vecN)));
9479 assign(*rearrR, binop(Iop_CatEvenLanes32x4, mkexpr(vecM), mkexpr(vecN)));
9480 } else {
9481 // 2s case
9482 IRTemp m1n1m0n0 = newTempV128();
9483 IRTemp m0n0m1n1 = newTempV128();
9484 assign(m1n1m0n0, binop(Iop_InterleaveLO32x4,
9485 mkexpr(vecM), mkexpr(vecN)));
9486 assign(m0n0m1n1, triop(Iop_SliceV128,
9487 mkexpr(m1n1m0n0), mkexpr(m1n1m0n0), mkU8(8)));
9488 assign(*rearrL, unop(Iop_ZeroHI64ofV128, mkexpr(m1n1m0n0)));
9489 assign(*rearrR, unop(Iop_ZeroHI64ofV128, mkexpr(m0n0m1n1)));
9491 break;
9493 case ARM64VSizeH:
9494 if (bitQ == 1) {
9495 // 8h case
9496 assign(*rearrL, binop(Iop_CatOddLanes16x8, mkexpr(vecM), mkexpr(vecN)));
9497 assign(*rearrR, binop(Iop_CatEvenLanes16x8, mkexpr(vecM), mkexpr(vecN)));
9498 } else {
9499 // 4h case
9500 IRTemp m3m1n3n1 = newTempV128();
9501 IRTemp m2m0n2n0 = newTempV128();
9502 assign(m3m1n3n1, binop(Iop_CatOddLanes16x8, mkexpr(vecM), mkexpr(vecN)));
9503 assign(m2m0n2n0, binop(Iop_CatEvenLanes16x8, mkexpr(vecM), mkexpr(vecN)));
9504 assign(*rearrL, unop(Iop_ZeroHI64ofV128,
9505 binop(Iop_CatEvenLanes32x4, mkexpr(m3m1n3n1),
9506 mkexpr(m3m1n3n1))));
9507 assign(*rearrR, unop(Iop_ZeroHI64ofV128,
9508 binop(Iop_CatEvenLanes32x4, mkexpr(m2m0n2n0),
9509 mkexpr(m2m0n2n0))));
9511 break;
9513 default: vpanic("math_REARRANGE_FOR_FLOATING_PAIRWISE");
9518 /* Returns 2.0 ^ (-n) for n in 1 .. 64 */
9519 static Double two_to_the_minus ( Int n )
9521 if (n == 1) return 0.5;
9522 vassert(n >= 2 && n <= 64);
9523 Int half = n / 2;
9524 return two_to_the_minus(half) * two_to_the_minus(n - half);
9528 /* Returns 2.0 ^ n for n in 1 .. 64 */
9529 static Double two_to_the_plus ( Int n )
9531 if (n == 1) return 2.0;
9532 vassert(n >= 2 && n <= 64);
9533 Int half = n / 2;
9534 return two_to_the_plus(half) * two_to_the_plus(n - half);
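/* Evaluation sketch (illustrative): two_to_the_minus(5) recurses as
   two_to_the_minus(2) * two_to_the_minus(3)
      == (0.5 * 0.5) * (0.5 * (0.5 * 0.5))
      == 0.25 * 0.125 == 0.03125 == 2^-5.
   Every intermediate value is an exact power of two, so the repeated
   multiplications introduce no rounding error. */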
9538 /*------------------------------------------------------------*/
9539 /*--- SIMD and FP instructions ---*/
9540 /*------------------------------------------------------------*/
9542 static
9543 Bool dis_AdvSIMD_EXT(/*MB_OUT*/DisResult* dres, UInt insn)
9545 /* 31 29 23 21 20 15 14 10 9 4
9546 0 q 101110 op2 0 m 0 imm4 0 n d
9547 Decode fields: op2
9549 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9550 if (INSN(31,31) != 0
9551 || INSN(29,24) != BITS6(1,0,1,1,1,0)
9552 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(10,10) != 0) {
9553 return False;
9555 UInt bitQ = INSN(30,30);
9556 UInt op2 = INSN(23,22);
9557 UInt mm = INSN(20,16);
9558 UInt imm4 = INSN(14,11);
9559 UInt nn = INSN(9,5);
9560 UInt dd = INSN(4,0);
9562 if (op2 == BITS2(0,0)) {
9563 /* -------- 00: EXT 16b_16b_16b, 8b_8b_8b -------- */
9564 IRTemp sHi = newTempV128();
9565 IRTemp sLo = newTempV128();
9566 IRTemp res = newTempV128();
9567 assign(sHi, getQReg128(mm));
9568 assign(sLo, getQReg128(nn));
9569 if (bitQ == 1) {
9570 if (imm4 == 0) {
9571 assign(res, mkexpr(sLo));
9572 } else {
9573 vassert(imm4 >= 1 && imm4 <= 15);
9574 assign(res, triop(Iop_SliceV128,
9575 mkexpr(sHi), mkexpr(sLo), mkU8(imm4)));
9577 putQReg128(dd, mkexpr(res));
9578 DIP("ext v%u.16b, v%u.16b, v%u.16b, #%u\n", dd, nn, mm, imm4);
9579 } else {
9580 if (imm4 >= 8) return False;
9581 if (imm4 == 0) {
9582 assign(res, mkexpr(sLo));
9583 } else {
9584 vassert(imm4 >= 1 && imm4 <= 7);
9585 IRTemp hi64lo64 = newTempV128();
9586 assign(hi64lo64, binop(Iop_InterleaveLO64x2,
9587 mkexpr(sHi), mkexpr(sLo)));
9588 assign(res, triop(Iop_SliceV128,
9589 mkexpr(hi64lo64), mkexpr(hi64lo64), mkU8(imm4)));
9591 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
9592 DIP("ext v%u.8b, v%u.8b, v%u.8b, #%u\n", dd, nn, mm, imm4);
9594 return True;
9597 return False;
9598 # undef INSN
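/* Worked example (illustrative only): ext v0.16b, v1.16b, v2.16b, #3
   forms the 32-byte value v2:v1 and extracts bytes 3 .. 18 of it,
   i.e. the top 13 bytes of v1 followed by the bottom 3 bytes of v2 --
   exactly what Iop_SliceV128(sHi, sLo, imm4) produces above. */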
9602 static
9603 Bool dis_AdvSIMD_TBL_TBX(/*MB_OUT*/DisResult* dres, UInt insn)
9605 /* 31 29 23 21 20 15 14 12 11 9 4
9606 0 q 001110 op2 0 m 0 len op 00 n d
9607 Decode fields: op2,len,op
9609 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9610 if (INSN(31,31) != 0
9611 || INSN(29,24) != BITS6(0,0,1,1,1,0)
9612 || INSN(21,21) != 0
9613 || INSN(15,15) != 0
9614 || INSN(11,10) != BITS2(0,0)) {
9615 return False;
9617 UInt bitQ = INSN(30,30);
9618 UInt op2 = INSN(23,22);
9619 UInt mm = INSN(20,16);
9620 UInt len = INSN(14,13);
9621 UInt bitOP = INSN(12,12);
9622 UInt nn = INSN(9,5);
9623 UInt dd = INSN(4,0);
9625 if (op2 == X00) {
9626 /* -------- 00,xx,0 TBL, xx register table -------- */
9627 /* -------- 00,xx,1 TBX, xx register table -------- */
9628 /* 31 28 20 15 14 12 9 4
9629 0q0 01110 000 m 0 len 000 n d TBL Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
9630 0q0 01110 000 m 0 len 100 n d TBX Vd.Ta, {Vn .. V(n+len)%32}, Vm.Ta
9631 where Ta = 16b(q=1) or 8b(q=0)
9633 Bool isTBX = bitOP == 1;
9634 /* The out-of-range values to use. */
9635 IRTemp oor_values = newTempV128();
9636 assign(oor_values, isTBX ? getQReg128(dd) : mkV128(0));
9637 /* src value */
9638 IRTemp src = newTempV128();
9639 assign(src, getQReg128(mm));
9640 /* The table values */
9641 IRTemp tab[4];
9642 UInt i;
9643 for (i = 0; i <= len; i++) {
9644 vassert(i < 4);
9645 tab[i] = newTempV128();
9646 assign(tab[i], getQReg128((nn + i) % 32));
9648 IRTemp res = math_TBL_TBX(tab, len, src, oor_values);
9649 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9650 const HChar* Ta = bitQ ==1 ? "16b" : "8b";
9651 const HChar* nm = isTBX ? "tbx" : "tbl";
9652 DIP("%s %s.%s, {v%u.16b .. v%u.16b}, %s.%s\n",
9653 nm, nameQReg128(dd), Ta, nn, (nn + len) % 32, nameQReg128(mm), Ta);
9654 return True;
9657 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9658 return False;
9659 # undef INSN
9663 static
9664 Bool dis_AdvSIMD_ZIP_UZP_TRN(/*MB_OUT*/DisResult* dres, UInt insn)
9666 /* 31 29 23 21 20 15 14 11 9 4
9667 0 q 001110 size 0 m 0 opcode 10 n d
9668 Decode fields: opcode
9670 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9671 if (INSN(31,31) != 0
9672 || INSN(29,24) != BITS6(0,0,1,1,1,0)
9673 || INSN(21,21) != 0 || INSN(15,15) != 0 || INSN(11,10) != BITS2(1,0)) {
9674 return False;
9676 UInt bitQ = INSN(30,30);
9677 UInt size = INSN(23,22);
9678 UInt mm = INSN(20,16);
9679 UInt opcode = INSN(14,12);
9680 UInt nn = INSN(9,5);
9681 UInt dd = INSN(4,0);
9683 if (opcode == BITS3(0,0,1) || opcode == BITS3(1,0,1)) {
9684 /* -------- 001 UZP1 std7_std7_std7 -------- */
9685 /* -------- 101 UZP2 std7_std7_std7 -------- */
9686 if (bitQ == 0 && size == X11) return False; // implied 1d case
9687 Bool isUZP1 = opcode == BITS3(0,0,1);
9688 IROp op = isUZP1 ? mkVecCATEVENLANES(size)
9689 : mkVecCATODDLANES(size);
9690 IRTemp preL = newTempV128();
9691 IRTemp preR = newTempV128();
9692 IRTemp res = newTempV128();
9693 if (bitQ == 0) {
9694 assign(preL, binop(Iop_InterleaveLO64x2, getQReg128(mm),
9695 getQReg128(nn)));
9696 assign(preR, mkexpr(preL));
9697 } else {
9698 assign(preL, getQReg128(mm));
9699 assign(preR, getQReg128(nn));
9701 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
9702 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9703 const HChar* nm = isUZP1 ? "uzp1" : "uzp2";
9704 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9705 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
9706 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
9707 return True;
9710 if (opcode == BITS3(0,1,0) || opcode == BITS3(1,1,0)) {
9711 /* -------- 010 TRN1 std7_std7_std7 -------- */
9712 /* -------- 110 TRN2 std7_std7_std7 -------- */
9713 if (bitQ == 0 && size == X11) return False; // implied 1d case
9714 Bool isTRN1 = opcode == BITS3(0,1,0);
9715 IROp op1 = isTRN1 ? mkVecCATEVENLANES(size)
9716 : mkVecCATODDLANES(size);
9717 IROp op2 = mkVecINTERLEAVEHI(size);
9718 IRTemp srcM = newTempV128();
9719 IRTemp srcN = newTempV128();
9720 IRTemp res = newTempV128();
9721 assign(srcM, getQReg128(mm));
9722 assign(srcN, getQReg128(nn));
9723 assign(res, binop(op2, binop(op1, mkexpr(srcM), mkexpr(srcM)),
9724 binop(op1, mkexpr(srcN), mkexpr(srcN))));
9725 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9726 const HChar* nm = isTRN1 ? "trn1" : "trn2";
9727 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9728 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
9729 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
9730 return True;
9733 if (opcode == BITS3(0,1,1) || opcode == BITS3(1,1,1)) {
9734 /* -------- 011 ZIP1 std7_std7_std7 -------- */
9735 /* -------- 111 ZIP2 std7_std7_std7 -------- */
9736 if (bitQ == 0 && size == X11) return False; // implied 1d case
9737 Bool isZIP1 = opcode == BITS3(0,1,1);
9738 IROp op = isZIP1 ? mkVecINTERLEAVELO(size)
9739 : mkVecINTERLEAVEHI(size);
9740 IRTemp preL = newTempV128();
9741 IRTemp preR = newTempV128();
9742 IRTemp res = newTempV128();
9743 if (bitQ == 0 && !isZIP1) {
9744 IRTemp z128 = newTempV128();
9745 assign(z128, mkV128(0x0000));
9746 // preL = Vm shifted left 32 bits
9747 // preR = Vn shifted left 32 bits
9748 assign(preL, triop(Iop_SliceV128,
9749 getQReg128(mm), mkexpr(z128), mkU8(12)));
9750 assign(preR, triop(Iop_SliceV128,
9751 getQReg128(nn), mkexpr(z128), mkU8(12)));
9753 } else {
9754 assign(preL, getQReg128(mm));
9755 assign(preR, getQReg128(nn));
9757 assign(res, binop(op, mkexpr(preL), mkexpr(preR)));
9758 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9759 const HChar* nm = isZIP1 ? "zip1" : "zip2";
9760 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9761 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
9762 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
9763 return True;
9766 return False;
9767 # undef INSN
9771 static
9772 Bool dis_AdvSIMD_across_lanes(/*MB_OUT*/DisResult* dres, UInt insn)
9774 /* 31 28 23 21 16 11 9 4
9775 0 q u 01110 size 11000 opcode 10 n d
9776 Decode fields: u,size,opcode
9778 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9779 if (INSN(31,31) != 0
9780 || INSN(28,24) != BITS5(0,1,1,1,0)
9781 || INSN(21,17) != BITS5(1,1,0,0,0) || INSN(11,10) != BITS2(1,0)) {
9782 return False;
9784 UInt bitQ = INSN(30,30);
9785 UInt bitU = INSN(29,29);
9786 UInt size = INSN(23,22);
9787 UInt opcode = INSN(16,12);
9788 UInt nn = INSN(9,5);
9789 UInt dd = INSN(4,0);
9791 if (opcode == BITS5(0,0,0,1,1)) {
9792 /* -------- 0,xx,00011 SADDLV -------- */
9793 /* -------- 1,xx,00011 UADDLV -------- */
9794 /* size is the narrow size */
9795 if (size == X11 || (size == X10 && bitQ == 0)) return False;
9796 Bool isU = bitU == 1;
9797 IRTemp src = newTempV128();
9798 assign(src, getQReg128(nn));
9799 /* The basic plan is to widen the lower half, and if Q = 1,
9800 the upper half too. Add them together (if Q = 1), and in
9801 either case fold with add at twice the lane width.
9803 IRExpr* widened
9804 = mkexpr(math_WIDEN_LO_OR_HI_LANES(
9805 isU, False/*!fromUpperHalf*/, size, mkexpr(src)));
9806 if (bitQ == 1) {
9807 widened
9808 = binop(mkVecADD(size+1),
9809 widened,
9810 mkexpr(math_WIDEN_LO_OR_HI_LANES(
9811 isU, True/*fromUpperHalf*/, size, mkexpr(src)))
9814 /* Now fold. */
9815 IRTemp tWi = newTempV128();
9816 assign(tWi, widened);
9817 IRTemp res = math_FOLDV(tWi, mkVecADD(size+1));
9818 putQReg128(dd, mkexpr(res));
9819 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9820 const HChar ch = "bhsd"[size+1]; /* dest lane is twice the source width */
9821 DIP("%s %s.%c, %s.%s\n", isU ? "uaddlv" : "saddlv",
9822 nameQReg128(dd), ch, nameQReg128(nn), arr);
9823 return True;
9826 UInt ix = 0;
9827 /**/ if (opcode == BITS5(0,1,0,1,0)) { ix = bitU == 0 ? 1 : 2; }
9828 else if (opcode == BITS5(1,1,0,1,0)) { ix = bitU == 0 ? 3 : 4; }
9829 else if (opcode == BITS5(1,1,0,1,1) && bitU == 0) { ix = 5; }
9830 /**/
9831 if (ix != 0) {
9832 /* -------- 0,xx,01010: SMAXV -------- (1) */
9833 /* -------- 1,xx,01010: UMAXV -------- (2) */
9834 /* -------- 0,xx,11010: SMINV -------- (3) */
9835 /* -------- 1,xx,11010: UMINV -------- (4) */
9836 /* -------- 0,xx,11011: ADDV -------- (5) */
9837 vassert(ix >= 1 && ix <= 5);
9838 if (size == X11) return False; // 1d,2d cases not allowed
9839 if (size == X10 && bitQ == 0) return False; // 2s case not allowed
9840 const IROp opMAXS[3]
9841 = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4 };
9842 const IROp opMAXU[3]
9843 = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4 };
9844 const IROp opMINS[3]
9845 = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4 };
9846 const IROp opMINU[3]
9847 = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4 };
9848 const IROp opADD[3]
9849 = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4 };
9850 vassert(size < 3);
9851 IROp op = Iop_INVALID;
9852 const HChar* nm = NULL;
9853 switch (ix) {
9854 case 1: op = opMAXS[size]; nm = "smaxv"; break;
9855 case 2: op = opMAXU[size]; nm = "umaxv"; break;
9856 case 3: op = opMINS[size]; nm = "sminv"; break;
9857 case 4: op = opMINU[size]; nm = "uminv"; break;
9858 case 5: op = opADD[size]; nm = "addv"; break;
9859 default: vassert(0);
9861 vassert(op != Iop_INVALID && nm != NULL);
9862 IRTemp tN1 = newTempV128();
9863 assign(tN1, getQReg128(nn));
9864 /* If Q == 0, we're just folding lanes in the lower half of
9865 the value. In which case, copy the lower half of the
9866 source into the upper half, so we can then treat it the
9867 same as the full width case. Except for the addition case,
9868 in which we have to zero out the upper half. */
9869 IRTemp tN2 = newTempV128();
9870 assign(tN2, bitQ == 0
9871 ? (ix == 5 ? unop(Iop_ZeroHI64ofV128, mkexpr(tN1))
9872 : mk_CatEvenLanes64x2(tN1,tN1))
9873 : mkexpr(tN1));
9874 IRTemp res = math_FOLDV(tN2, op);
9875 if (res == IRTemp_INVALID)
9876 return False; /* means math_FOLDV
9877 doesn't handle this case yet */
9878 putQReg128(dd, mkexpr(res));
9879 const IRType tys[3] = { Ity_I8, Ity_I16, Ity_I32 };
9880 IRType laneTy = tys[size];
9881 const HChar* arr = nameArr_Q_SZ(bitQ, size);
9882 DIP("%s %s, %s.%s\n", nm,
9883 nameQRegLO(dd, laneTy), nameQReg128(nn), arr);
9884 return True;
9887 if ((size == X00 || size == X10)
9888 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
9889 /* -------- 0,00,01100: FMAXNMV s_4s -------- */
9890 /* -------- 0,10,01100: FMINNMV s_4s -------- */
9891 /* -------- 1,00,01111: FMAXV s_4s -------- */
9892 /* -------- 1,10,01111: FMINV s_4s -------- */
9893 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
9894 if (bitQ == 0) return False; // Only 4s is allowed
9895 Bool isMIN = (size & 2) == 2;
9896 Bool isNM = opcode == BITS5(0,1,1,0,0);
9897 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(2);
9898 IRTemp src = newTempV128();
9899 assign(src, getQReg128(nn));
9900 IRTemp res = math_FOLDV(src, opMXX);
9901 putQReg128(dd, mkexpr(res));
9902 DIP("%s%sv s%u, v%u.4s\n",
9903 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", dd, nn);
9904 return True;
9907 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9908 return False;
9909 # undef INSN
9913 static
9914 Bool dis_AdvSIMD_copy(/*MB_OUT*/DisResult* dres, UInt insn)
9916 /* 31 28 20 15 14 10 9 4
9917 0 q op 01110000 imm5 0 imm4 1 n d
9918 Decode fields: q,op,imm4
9920 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
9921 if (INSN(31,31) != 0
9922 || INSN(28,21) != BITS8(0,1,1,1,0,0,0,0)
9923 || INSN(15,15) != 0 || INSN(10,10) != 1) {
9924 return False;
9926 UInt bitQ = INSN(30,30);
9927 UInt bitOP = INSN(29,29);
9928 UInt imm5 = INSN(20,16);
9929 UInt imm4 = INSN(14,11);
9930 UInt nn = INSN(9,5);
9931 UInt dd = INSN(4,0);
9933 /* -------- x,0,0000: DUP (element, vector) -------- */
9934 /* 31 28 20 15 9 4
9935 0q0 01110000 imm5 000001 n d DUP Vd.T, Vn.Ts[index]
9937 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
9938 UInt laneNo = 0;
9939 UInt laneSzLg2 = 0;
9940 HChar laneCh = '?';
9941 IRTemp res = handle_DUP_VEC_ELEM(&laneNo, &laneSzLg2, &laneCh,
9942 getQReg128(nn), imm5);
9943 if (res == IRTemp_INVALID)
9944 return False;
9945 if (bitQ == 0 && laneSzLg2 == X11)
9946 return False; /* .1d case */
9947 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
9948 const HChar* arT = nameArr_Q_SZ(bitQ, laneSzLg2);
9949 DIP("dup %s.%s, %s.%c[%u]\n",
9950 nameQReg128(dd), arT, nameQReg128(nn), laneCh, laneNo);
9951 return True;
9954 /* -------- x,0,0001: DUP (general, vector) -------- */
9955 /* 31 28 20 15 9 4
9956 0q0 01110000 imm5 0 0001 1 n d DUP Vd.T, Rn
9957 Q=0 writes 64, Q=1 writes 128
9958 imm5: xxxx1 8B(q=0) or 16B(q=1), R=W
9959 xxx10 4H(q=0) or 8H(q=1), R=W
9960 xx100 2S(q=0) or 4S(q=1), R=W
9961 x1000 Invalid(q=0) or 2D(q=1), R=X
9962 x0000 Invalid(q=0) or Invalid(q=1)
9963 Require op=0, imm4=0001
9965 if (bitOP == 0 && imm4 == BITS4(0,0,0,1)) {
9966 Bool isQ = bitQ == 1;
9967 IRTemp w0 = newTemp(Ity_I64);
9968 const HChar* arT = "??";
9969 IRType laneTy = Ity_INVALID;
9970 if (imm5 & 1) {
9971 arT = isQ ? "16b" : "8b";
9972 laneTy = Ity_I8;
9973 assign(w0, unop(Iop_8Uto64, unop(Iop_64to8, getIReg64orZR(nn))));
9975 else if (imm5 & 2) {
9976 arT = isQ ? "8h" : "4h";
9977 laneTy = Ity_I16;
9978 assign(w0, unop(Iop_16Uto64, unop(Iop_64to16, getIReg64orZR(nn))));
9980 else if (imm5 & 4) {
9981 arT = isQ ? "4s" : "2s";
9982 laneTy = Ity_I32;
9983 assign(w0, unop(Iop_32Uto64, unop(Iop_64to32, getIReg64orZR(nn))));
9985 else if ((imm5 & 8) && isQ) {
9986 arT = "2d";
9987 laneTy = Ity_I64;
9988 assign(w0, getIReg64orZR(nn));
9990 else {
9991 /* invalid; leave laneTy unchanged. */
9993 /* */
9994 if (laneTy != Ity_INVALID) {
9995 IRTemp w1 = math_DUP_TO_64(w0, laneTy);
9996 putQReg128(dd, binop(Iop_64HLtoV128,
9997 isQ ? mkexpr(w1) : mkU64(0), mkexpr(w1)));
9998 DIP("dup %s.%s, %s\n",
9999 nameQReg128(dd), arT, nameIRegOrZR(laneTy == Ity_I64, nn));
10000 return True;
10002 /* invalid */
10003 return False;
10006 /* -------- 1,0,0011: INS (general) -------- */
10007 /* 31 28 20 15 9 4
10008 010 01110000 imm5 000111 n d INS Vd.Ts[ix], Rn
10009 where Ts,ix = case imm5 of xxxx1 -> B, xxxx
10010 xxx10 -> H, xxx
10011 xx100 -> S, xx
10012 x1000 -> D, x
10014 if (bitQ == 1 && bitOP == 0 && imm4 == BITS4(0,0,1,1)) {
10015 HChar ts = '?';
10016 UInt laneNo = 16;
10017 IRExpr* src = NULL;
10018 if (imm5 & 1) {
10019 src = unop(Iop_64to8, getIReg64orZR(nn));
10020 laneNo = (imm5 >> 1) & 15;
10021 ts = 'b';
10023 else if (imm5 & 2) {
10024 src = unop(Iop_64to16, getIReg64orZR(nn));
10025 laneNo = (imm5 >> 2) & 7;
10026 ts = 'h';
10028 else if (imm5 & 4) {
10029 src = unop(Iop_64to32, getIReg64orZR(nn));
10030 laneNo = (imm5 >> 3) & 3;
10031 ts = 's';
10033 else if (imm5 & 8) {
10034 src = getIReg64orZR(nn);
10035 laneNo = (imm5 >> 4) & 1;
10036 ts = 'd';
10038 /* */
10039 if (src) {
10040 vassert(laneNo < 16);
10041 putQRegLane(dd, laneNo, src);
10042 DIP("ins %s.%c[%u], %s\n",
10043 nameQReg128(dd), ts, laneNo, nameIReg64orZR(nn));
10044 return True;
10046 /* invalid */
10047 return False;
10050 /* -------- x,0,0101: SMOV -------- */
10051 /* -------- x,0,0111: UMOV -------- */
10052 /* 31 28 20 15 9 4
10053 0q0 01110 000 imm5 001111 n d UMOV Xd/Wd, Vn.Ts[index]
10054 0q0 01110 000 imm5 001011 n d SMOV Xd/Wd, Vn.Ts[index]
10055 dest is Xd when q==1, Wd when q==0
10056 UMOV:
10057 Ts,index,ops = case q:imm5 of
10058 0:xxxx1 -> B, xxxx, 8Uto64
10059 1:xxxx1 -> invalid
10060 0:xxx10 -> H, xxx, 16Uto64
10061 1:xxx10 -> invalid
10062 0:xx100 -> S, xx, 32Uto64
10063 1:xx100 -> invalid
10064 1:x1000 -> D, x, copy64
10065 other -> invalid
10066 SMOV:
10067 Ts,index,ops = case q:imm5 of
10068 0:xxxx1 -> B, xxxx, (32Uto64 . 8Sto32)
10069 1:xxxx1 -> B, xxxx, 8Sto64
10070 0:xxx10 -> H, xxx, (32Uto64 . 16Sto32)
10071 1:xxx10 -> H, xxx, 16Sto64
10072 0:xx100 -> invalid
10073 1:xx100 -> S, xx, 32Sto64
10074 1:x1000 -> invalid
10075 other -> invalid
10077 if (bitOP == 0 && (imm4 == BITS4(0,1,0,1) || imm4 == BITS4(0,1,1,1))) {
10078 Bool isU = (imm4 & 2) == 2;
10079 const HChar* arTs = "??";
10080 UInt laneNo = 16; /* invalid */
10081 // Setting 'res' to non-NULL determines valid/invalid
10082 IRExpr* res = NULL;
10083 if (!bitQ && (imm5 & 1)) { // 0:xxxx1
10084 laneNo = (imm5 >> 1) & 15;
10085 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
10086 res = isU ? unop(Iop_8Uto64, lane)
10087 : unop(Iop_32Uto64, unop(Iop_8Sto32, lane));
10088 arTs = "b";
10090 else if (bitQ && (imm5 & 1)) { // 1:xxxx1
10091 laneNo = (imm5 >> 1) & 15;
10092 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I8);
10093 res = isU ? NULL
10094 : unop(Iop_8Sto64, lane);
10095 arTs = "b";
10097 else if (!bitQ && (imm5 & 2)) { // 0:xxx10
10098 laneNo = (imm5 >> 2) & 7;
10099 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
10100 res = isU ? unop(Iop_16Uto64, lane)
10101 : unop(Iop_32Uto64, unop(Iop_16Sto32, lane));
10102 arTs = "h";
10104 else if (bitQ && (imm5 & 2)) { // 1:xxx10
10105 laneNo = (imm5 >> 2) & 7;
10106 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I16);
10107 res = isU ? NULL
10108 : unop(Iop_16Sto64, lane);
10109 arTs = "h";
10111 else if (!bitQ && (imm5 & 4)) { // 0:xx100
10112 laneNo = (imm5 >> 3) & 3;
10113 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
10114 res = isU ? unop(Iop_32Uto64, lane)
10115 : NULL;
10116 arTs = "s";
10118 else if (bitQ && (imm5 & 4)) { // 1:xx100
10119 laneNo = (imm5 >> 3) & 3;
10120 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I32);
10121 res = isU ? NULL
10122 : unop(Iop_32Sto64, lane);
10123 arTs = "s";
10125 else if (bitQ && (imm5 & 8)) { // 1:x1000
10126 laneNo = (imm5 >> 4) & 1;
10127 IRExpr* lane = getQRegLane(nn, laneNo, Ity_I64);
10128 res = isU ? lane
10129 : NULL;
10130 arTs = "d";
10132 /* */
10133 if (res) {
10134 vassert(laneNo < 16);
10135 putIReg64orZR(dd, res);
10136 DIP("%cmov %s, %s.%s[%u]\n", isU ? 'u' : 's',
10137 nameIRegOrZR(bitQ == 1, dd),
10138 nameQReg128(nn), arTs, laneNo);
10139 return True;
10141 /* invalid */
10142 return False;
10145 /* -------- 1,1,xxxx: INS (element) -------- */
10146 /* 31 28 20 14 9 4
10147 011 01110000 imm5 0 imm4 n d INS Vd.Ts[ix1], Vn.Ts[ix2]
10148 where Ts,ix1,ix2
10149 = case imm5 of xxxx1 -> B, xxxx, imm4[3:0]
10150 xxx10 -> H, xxx, imm4[3:1]
10151 xx100 -> S, xx, imm4[3:2]
10152 x1000 -> D, x, imm4[3:3]
10154 if (bitQ == 1 && bitOP == 1) {
10155 HChar ts = '?';
10156 IRType ity = Ity_INVALID;
10157 UInt ix1 = 16;
10158 UInt ix2 = 16;
10159 if (imm5 & 1) {
10160 ts = 'b';
10161 ity = Ity_I8;
10162 ix1 = (imm5 >> 1) & 15;
10163 ix2 = (imm4 >> 0) & 15;
10165 else if (imm5 & 2) {
10166 ts = 'h';
10167 ity = Ity_I16;
10168 ix1 = (imm5 >> 2) & 7;
10169 ix2 = (imm4 >> 1) & 7;
10171 else if (imm5 & 4) {
10172 ts = 's';
10173 ity = Ity_I32;
10174 ix1 = (imm5 >> 3) & 3;
10175 ix2 = (imm4 >> 2) & 3;
10177 else if (imm5 & 8) {
10178 ts = 'd';
10179 ity = Ity_I64;
10180 ix1 = (imm5 >> 4) & 1;
10181 ix2 = (imm4 >> 3) & 1;
10183 /* */
10184 if (ity != Ity_INVALID) {
10185 vassert(ix1 < 16);
10186 vassert(ix2 < 16);
10187 putQRegLane(dd, ix1, getQRegLane(nn, ix2, ity));
10188 DIP("ins %s.%c[%u], %s.%c[%u]\n",
10189 nameQReg128(dd), ts, ix1, nameQReg128(nn), ts, ix2);
10190 return True;
10192 /* invalid */
10193 return False;
10196 return False;
10197 # undef INSN
10201 static
10202 Bool dis_AdvSIMD_modified_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
10204 /* 31 28 18 15 11 9 4
10205 0q op 01111 00000 abc cmode 01 defgh d
10206 Decode fields: q,op,cmode
10207 Bit 11 is really "o2", but it is always zero.
10209 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10210 if (INSN(31,31) != 0
10211 || INSN(28,19) != BITS10(0,1,1,1,1,0,0,0,0,0)
10212 || INSN(11,10) != BITS2(0,1)) {
10213 return False;
10215 UInt bitQ = INSN(30,30);
10216 UInt bitOP = INSN(29,29);
10217 UInt cmode = INSN(15,12);
10218 UInt abcdefgh = (INSN(18,16) << 5) | INSN(9,5);
10219 UInt dd = INSN(4,0);
10221 ULong imm64lo = 0;
10222 UInt op_cmode = (bitOP << 4) | cmode;
10223 Bool ok = False;
10224 Bool isORR = False;
10225 Bool isBIC = False;
10226 Bool isMOV = False;
10227 Bool isMVN = False;
10228 Bool isFMOV = False;
10229 switch (op_cmode) {
10230 /* -------- x,0,0000 MOVI 32-bit shifted imm -------- */
10231 /* -------- x,0,0010 MOVI 32-bit shifted imm -------- */
10232 /* -------- x,0,0100 MOVI 32-bit shifted imm -------- */
10233 /* -------- x,0,0110 MOVI 32-bit shifted imm -------- */
10234 case BITS5(0,0,0,0,0): case BITS5(0,0,0,1,0):
10235 case BITS5(0,0,1,0,0): case BITS5(0,0,1,1,0): // 0:0xx0
10236 ok = True; isMOV = True; break;
10238 /* -------- x,0,0001 ORR (vector, immediate) 32-bit -------- */
10239 /* -------- x,0,0011 ORR (vector, immediate) 32-bit -------- */
10240 /* -------- x,0,0101 ORR (vector, immediate) 32-bit -------- */
10241 /* -------- x,0,0111 ORR (vector, immediate) 32-bit -------- */
10242 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,1):
10243 case BITS5(0,0,1,0,1): case BITS5(0,0,1,1,1): // 0:0xx1
10244 ok = True; isORR = True; break;
10246 /* -------- x,0,1000 MOVI 16-bit shifted imm -------- */
10247 /* -------- x,0,1010 MOVI 16-bit shifted imm -------- */
10248 case BITS5(0,1,0,0,0): case BITS5(0,1,0,1,0): // 0:10x0
10249 ok = True; isMOV = True; break;
10251 /* -------- x,0,1001 ORR (vector, immediate) 16-bit -------- */
10252 /* -------- x,0,1011 ORR (vector, immediate) 16-bit -------- */
10253 case BITS5(0,1,0,0,1): case BITS5(0,1,0,1,1): // 0:10x1
10254 ok = True; isORR = True; break;
10256 /* -------- x,0,1100 MOVI 32-bit shifting ones -------- */
10257 /* -------- x,0,1101 MOVI 32-bit shifting ones -------- */
10258 case BITS5(0,1,1,0,0): case BITS5(0,1,1,0,1): // 0:110x
10259 ok = True; isMOV = True; break;
10261 /* -------- x,0,1110 MOVI 8-bit -------- */
10262 case BITS5(0,1,1,1,0):
10263 ok = True; isMOV = True; break;
10265 /* -------- x,0,1111 FMOV (vector, immediate, F32) -------- */
10266 case BITS5(0,1,1,1,1): // 0:1111
10267 ok = True; isFMOV = True; break;
10269 /* -------- x,1,0000 MVNI 32-bit shifted imm -------- */
10270 /* -------- x,1,0010 MVNI 32-bit shifted imm -------- */
10271 /* -------- x,1,0100 MVNI 32-bit shifted imm -------- */
10272 /* -------- x,1,0110 MVNI 32-bit shifted imm -------- */
10273 case BITS5(1,0,0,0,0): case BITS5(1,0,0,1,0):
10274 case BITS5(1,0,1,0,0): case BITS5(1,0,1,1,0): // 1:0xx0
10275 ok = True; isMVN = True; break;
10277 /* -------- x,1,0001 BIC (vector, immediate) 32-bit -------- */
10278 /* -------- x,1,0011 BIC (vector, immediate) 32-bit -------- */
10279 /* -------- x,1,0101 BIC (vector, immediate) 32-bit -------- */
10280 /* -------- x,1,0111 BIC (vector, immediate) 32-bit -------- */
10281 case BITS5(1,0,0,0,1): case BITS5(1,0,0,1,1):
10282 case BITS5(1,0,1,0,1): case BITS5(1,0,1,1,1): // 1:0xx1
10283 ok = True; isBIC = True; break;
10285 /* -------- x,1,1000 MVNI 16-bit shifted imm -------- */
10286 /* -------- x,1,1010 MVNI 16-bit shifted imm -------- */
10287 case BITS5(1,1,0,0,0): case BITS5(1,1,0,1,0): // 1:10x0
10288 ok = True; isMVN = True; break;
10290 /* -------- x,1,1001 BIC (vector, immediate) 16-bit -------- */
10291 /* -------- x,1,1011 BIC (vector, immediate) 16-bit -------- */
10292 case BITS5(1,1,0,0,1): case BITS5(1,1,0,1,1): // 1:10x1
10293 ok = True; isBIC = True; break;
10295 /* -------- x,1,1100 MVNI 32-bit shifting ones -------- */
10296 /* -------- x,1,1101 MVNI 32-bit shifting ones -------- */
10297 case BITS5(1,1,1,0,0): case BITS5(1,1,1,0,1): // 1:110x
10298 ok = True; isMVN = True; break;
10300 /* -------- 0,1,1110 MOVI 64-bit scalar -------- */
10301 /* -------- 1,1,1110 MOVI 64-bit vector -------- */
10302 case BITS5(1,1,1,1,0):
10303 ok = True; isMOV = True; break;
10305 /* -------- 1,1,1111 FMOV (vector, immediate, F64) -------- */
10306 case BITS5(1,1,1,1,1): // 1:1111
10307 ok = bitQ == 1; isFMOV = True; break;
10309 default:
10310 break;
10312 if (ok) {
10313 vassert(1 == (isMOV ? 1 : 0) + (isMVN ? 1 : 0)
10314 + (isORR ? 1 : 0) + (isBIC ? 1 : 0) + (isFMOV ? 1 : 0));
10315 ok = AdvSIMDExpandImm(&imm64lo, bitOP, cmode, abcdefgh);
10317 if (ok) {
10318 if (isORR || isBIC) {
10319 ULong inv
10320 = isORR ? 0ULL : ~0ULL;
10321 IRExpr* immV128
10322 = binop(Iop_64HLtoV128, mkU64(inv ^ imm64lo), mkU64(inv ^ imm64lo));
10323 IRExpr* res
10324 = binop(isORR ? Iop_OrV128 : Iop_AndV128, getQReg128(dd), immV128);
10325 const HChar* nm = isORR ? "orr" : "bic";
10326 if (bitQ == 0) {
10327 putQReg128(dd, unop(Iop_ZeroHI64ofV128, res));
10328 DIP("%s %s.1d, #0x%016llx\n", nm, nameQReg128(dd), imm64lo);
10329 } else {
10330 putQReg128(dd, res);
10331 DIP("%s %s.2d, #0x%016llx'%016llx\n", nm,
10332 nameQReg128(dd), imm64lo, imm64lo);
10335 else if (isMOV || isMVN || isFMOV) {
10336 if (isMVN) imm64lo = ~imm64lo;
10337 ULong imm64hi = bitQ == 0 ? 0 : imm64lo;
10338 IRExpr* immV128 = binop(Iop_64HLtoV128, mkU64(imm64hi),
10339 mkU64(imm64lo));
10340 putQReg128(dd, immV128);
10341 DIP("mov %s, #0x%016llx'%016llx\n", nameQReg128(dd), imm64hi, imm64lo);
10343 return True;
10345 /* else fall through */
10347 return False;
10348 # undef INSN
10352 static
10353 Bool dis_AdvSIMD_scalar_copy(/*MB_OUT*/DisResult* dres, UInt insn)
10355 /* 31 28 20 15 14 10 9 4
10356 01 op 11110000 imm5 0 imm4 1 n d
10357 Decode fields: op,imm4
10359 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10360 if (INSN(31,30) != BITS2(0,1)
10361 || INSN(28,21) != BITS8(1,1,1,1,0,0,0,0)
10362 || INSN(15,15) != 0 || INSN(10,10) != 1) {
10363 return False;
10365 UInt bitOP = INSN(29,29);
10366 UInt imm5 = INSN(20,16);
10367 UInt imm4 = INSN(14,11);
10368 UInt nn = INSN(9,5);
10369 UInt dd = INSN(4,0);
10371 if (bitOP == 0 && imm4 == BITS4(0,0,0,0)) {
10372 /* -------- 0,0000 DUP (element, scalar) -------- */
10373 IRTemp w0 = newTemp(Ity_I64);
10374 const HChar* arTs = "??";
10375 IRType laneTy = Ity_INVALID;
10376 UInt laneNo = 16; /* invalid */
10377 if (imm5 & 1) {
10378 arTs = "b";
10379 laneNo = (imm5 >> 1) & 15;
10380 laneTy = Ity_I8;
10381 assign(w0, unop(Iop_8Uto64, getQRegLane(nn, laneNo, laneTy)));
10383 else if (imm5 & 2) {
10384 arTs = "h";
10385 laneNo = (imm5 >> 2) & 7;
10386 laneTy = Ity_I16;
10387 assign(w0, unop(Iop_16Uto64, getQRegLane(nn, laneNo, laneTy)));
10389 else if (imm5 & 4) {
10390 arTs = "s";
10391 laneNo = (imm5 >> 3) & 3;
10392 laneTy = Ity_I32;
10393 assign(w0, unop(Iop_32Uto64, getQRegLane(nn, laneNo, laneTy)));
10395 else if (imm5 & 8) {
10396 arTs = "d";
10397 laneNo = (imm5 >> 4) & 1;
10398 laneTy = Ity_I64;
10399 assign(w0, getQRegLane(nn, laneNo, laneTy));
10401 else {
10402 /* invalid; leave laneTy unchanged. */
10404 /* */
10405 if (laneTy != Ity_INVALID) {
10406 vassert(laneNo < 16);
10407 putQReg128(dd, binop(Iop_64HLtoV128, mkU64(0), mkexpr(w0)));
10408 DIP("dup %s, %s.%s[%u]\n",
10409 nameQRegLO(dd, laneTy), nameQReg128(nn), arTs, laneNo);
10410 return True;
10412 /* else fall through */
10415 return False;
10416 # undef INSN
10420 static
10421 Bool dis_AdvSIMD_scalar_pairwise(/*MB_OUT*/DisResult* dres, UInt insn,
10422 const VexArchInfo* archinfo)
10424 /* 31 28 23 21 16 11 9 4
10425 01 u 11110 sz 11000 opcode 10 n d
10426 Decode fields: u,sz,opcode
10428 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10429 if (INSN(31,30) != BITS2(0,1)
10430 || INSN(28,24) != BITS5(1,1,1,1,0)
10431 || INSN(21,17) != BITS5(1,1,0,0,0)
10432 || INSN(11,10) != BITS2(1,0)) {
10433 return False;
10435 UInt bitU = INSN(29,29);
10436 UInt sz = INSN(23,22);
10437 UInt opcode = INSN(16,12);
10438 UInt nn = INSN(9,5);
10439 UInt dd = INSN(4,0);
10441 if (bitU == 0 && sz == X11 && opcode == BITS5(1,1,0,1,1)) {
10442 /* -------- 0,11,11011 ADDP d_2d -------- */
10443 IRTemp xy = newTempV128();
10444 IRTemp xx = newTempV128();
10445 assign(xy, getQReg128(nn));
10446 assign(xx, binop(Iop_InterleaveHI64x2, mkexpr(xy), mkexpr(xy)));
10447 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
10448 binop(Iop_Add64x2, mkexpr(xy), mkexpr(xx))));
10449 DIP("addp d%u, %s.2d\n", dd, nameQReg128(nn));
10450 return True;
10453 if (bitU == 1 && sz <= X01 && opcode == BITS5(0,1,1,0,1)) {
10454 /* -------- 1,00,01101 FADDP s_2s -------- */
10455 /* -------- 1,01,01101 FADDP d_2d -------- */
10456 Bool isD = sz == X01;
10457 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
10458 IROp opADD = mkVecADDF(isD ? 3 : 2);
10459 IRTemp src = newTempV128();
10460 IRTemp argL = newTempV128();
10461 IRTemp argR = newTempV128();
10462 assign(src, getQReg128(nn));
10463 assign(argL, unop(opZHI, mkexpr(src)));
10464 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
10465 mkU8(isD ? 8 : 4))));
10466 putQReg128(dd, unop(opZHI,
10467 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
10468 mkexpr(argL), mkexpr(argR))));
10469 DIP(isD ? "faddp d%u, v%u.2d\n" : "faddp s%u, v%u.2s\n", dd, nn);
10470 return True;
10473 /* Half-precision floating point ADDP (v8.2). */
10474 if (bitU == 0 && sz <= X00 && opcode == BITS5(0,1,1,0,1)) {
10475 /* -------- 0,00,01101 ADDP h_2h -------- */
10476 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
10477 return False;
10478 IROp opZHI = mkVecZEROHIxxOFV128(1);
10479 IROp opADD = mkVecADDF(1);
10480 IRTemp src = newTempV128();
10481 IRTemp argL = newTempV128();
10482 IRTemp argR = newTempV128();
10483 assign(src, getQReg128(nn));
10484 assign(argL, unop(opZHI, mkexpr(src)));
10485 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
10486 mkU8(2))));
10487 putQReg128(dd, unop(opZHI,
10488 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
10489 mkexpr(argL), mkexpr(argR))));
10490 DIP("faddp h%u, v%u.2h\n", dd, nn);
10491 return True;
10494 if (bitU == 1
10495 && (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,1,1))) {
10496 /* -------- 1,0x,01100 FMAXNMP d_2d, s_2s -------- */
10497 /* -------- 1,1x,01100 FMINNMP d_2d, s_2s -------- */
10498 /* -------- 1,0x,01111 FMAXP d_2d, s_2s -------- */
10499 /* -------- 1,1x,01111 FMINP d_2d, s_2s -------- */
10500 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
10501 Bool isD = (sz & 1) == 1;
10502 Bool isMIN = (sz & 2) == 2;
10503 Bool isNM = opcode == BITS5(0,1,1,0,0);
10504 IROp opZHI = mkVecZEROHIxxOFV128(isD ? 3 : 2);
10505 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
10506 IRTemp src = newTempV128();
10507 IRTemp argL = newTempV128();
10508 IRTemp argR = newTempV128();
10509 assign(src, getQReg128(nn));
10510 assign(argL, unop(opZHI, mkexpr(src)));
10511 assign(argR, unop(opZHI, triop(Iop_SliceV128, mkexpr(src), mkexpr(src),
10512 mkU8(isD ? 8 : 4))));
10513 putQReg128(dd, unop(opZHI,
10514 binop(opMXX, mkexpr(argL), mkexpr(argR))));
10515 HChar c = isD ? 'd' : 's';
10516 DIP("%s%sp %c%u, v%u.2%c\n",
10517 isMIN ? "fmin" : "fmax", isNM ? "nm" : "", c, dd, nn, c);
10518 return True;
10521 return False;
10522 # undef INSN
10526 static
10527 Bool dis_AdvSIMD_scalar_shift_by_imm(/*MB_OUT*/DisResult* dres, UInt insn)
10529 /* 31 28 22 18 15 10 9 4
10530 01 u 111110 immh immb opcode 1 n d
10531 Decode fields: u,immh,opcode
10533 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10534 if (INSN(31,30) != BITS2(0,1)
10535 || INSN(28,23) != BITS6(1,1,1,1,1,0) || INSN(10,10) != 1) {
10536 return False;
10538 UInt bitU = INSN(29,29);
10539 UInt immh = INSN(22,19);
10540 UInt immb = INSN(18,16);
10541 UInt opcode = INSN(15,11);
10542 UInt nn = INSN(9,5);
10543 UInt dd = INSN(4,0);
10544 UInt immhb = (immh << 3) | immb;
10546 if ((immh & 8) == 8
10547 && (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0))) {
10548 /* -------- 0,1xxx,00000 SSHR d_d_#imm -------- */
10549 /* -------- 1,1xxx,00000 USHR d_d_#imm -------- */
10550 /* -------- 0,1xxx,00010 SSRA d_d_#imm -------- */
10551 /* -------- 1,1xxx,00010 USRA d_d_#imm -------- */
10552 Bool isU = bitU == 1;
10553 Bool isAcc = opcode == BITS5(0,0,0,1,0);
10554 UInt sh = 128 - immhb;
10555 vassert(sh >= 1 && sh <= 64);
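/* Example: with immh = 1xxx, immh:immb lies in 64..127, so sh = 128 - immhb
   is a right-shift amount in 1..64; "sshr d0, d1, #3" encodes immh:immb = 125. */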
10556 IROp op = isU ? Iop_ShrN64x2 : Iop_SarN64x2;
10557 IRExpr* src = getQReg128(nn);
10558 IRTemp shf = newTempV128();
10559 IRTemp res = newTempV128();
10560 if (sh == 64 && isU) {
10561 assign(shf, mkV128(0x0000));
10562 } else {
10563 UInt nudge = 0;
10564 if (sh == 64) {
10565 vassert(!isU);
10566 nudge = 1;
10567 }
10568 assign(shf, binop(op, src, mkU8(sh - nudge)));
10569 }
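/* The nudge handles the #64 case: USHR #64 yields zero (handled above), while
   SSHR #64 replicates the sign bit, which an arithmetic shift by 63 produces
   equally well, so the IR shift op never needs an amount of 64. */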
10570 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
10571 : mkexpr(shf));
10572 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10573 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
10574 : (isU ? "ushr" : "sshr");
10575 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
10576 return True;
10579 if ((immh & 8) == 8
10580 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0))) {
10581 /* -------- 0,1xxx,00100 SRSHR d_d_#imm -------- */
10582 /* -------- 1,1xxx,00100 URSHR d_d_#imm -------- */
10583 /* -------- 0,1xxx,00110 SRSRA d_d_#imm -------- */
10584 /* -------- 1,1xxx,00110 URSRA d_d_#imm -------- */
10585 Bool isU = bitU == 1;
10586 Bool isAcc = opcode == BITS5(0,0,1,1,0);
10587 UInt sh = 128 - immhb;
10588 vassert(sh >= 1 && sh <= 64);
10589 IROp op = isU ? Iop_Rsh64Ux2 : Iop_Rsh64Sx2;
10590 vassert(sh >= 1 && sh <= 64);
10591 IRExpr* src = getQReg128(nn);
10592 IRTemp imm8 = newTemp(Ity_I8);
10593 assign(imm8, mkU8((UChar)(-sh)));
10594 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
10595 IRTemp shf = newTempV128();
10596 IRTemp res = newTempV128();
10597 assign(shf, binop(op, src, amt));
10598 assign(res, isAcc ? binop(Iop_Add64x2, getQReg128(dd), mkexpr(shf))
10599 : mkexpr(shf));
10600 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10601 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
10602 : (isU ? "urshr" : "srshr");
10603 DIP("%s d%u, d%u, #%u\n", nm, dd, nn, sh);
10604 return True;
10607 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,0,0)) {
10608 /* -------- 1,1xxx,01000 SRI d_d_#imm -------- */
10609 UInt sh = 128 - immhb;
10610 vassert(sh >= 1 && sh <= 64);
10611 if (sh == 64) {
10612 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(dd)));
10613 } else {
10614 /* sh is in range 1 .. 63 */
10615 ULong nmask = (ULong)(((Long)0x8000000000000000ULL) >> (sh-1));
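/* Worked example: sh == 3 gives nmask = 0xE000000000000000 (the arithmetic
   shift smears the sign bit), i.e. the top 3 bits of the destination, which
   SRI must preserve while the remaining bits come from nn >> 3. */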
10616 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
10617 IRTemp res = newTempV128();
10618 assign(res, binop(Iop_OrV128,
10619 binop(Iop_AndV128, getQReg128(dd), nmaskV),
10620 binop(Iop_ShrN64x2, getQReg128(nn), mkU8(sh))));
10621 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10622 }
10623 DIP("sri d%u, d%u, #%u\n", dd, nn, sh);
10624 return True;
10627 if (bitU == 0 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
10628 /* -------- 0,1xxx,01010 SHL d_d_#imm -------- */
10629 UInt sh = immhb - 64;
10630 vassert(sh < 64);
10631 putQReg128(dd,
10632 unop(Iop_ZeroHI64ofV128,
10633 sh == 0 ? getQReg128(nn)
10634 : binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
10635 DIP("shl d%u, d%u, #%u\n", dd, nn, sh);
10636 return True;
10639 if (bitU == 1 && (immh & 8) == 8 && opcode == BITS5(0,1,0,1,0)) {
10640 /* -------- 1,1xxx,01010 SLI d_d_#imm -------- */
10641 UInt sh = immhb - 64;
10642 vassert(sh < 64);
10643 if (sh == 0) {
10644 putQReg128(dd, unop(Iop_ZeroHI64ofV128, getQReg128(nn)));
10645 } else {
10646 /* sh is in range 1 .. 63 */
10647 ULong nmask = (1ULL << sh) - 1;
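/* Worked example: sh == 3 gives nmask = 0x7, i.e. the low 3 bits of the
   destination, which SLI preserves while ORing in nn << 3 above them. */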
10648 IRExpr* nmaskV = binop(Iop_64HLtoV128, mkU64(nmask), mkU64(nmask));
10649 IRTemp res = newTempV128();
10650 assign(res, binop(Iop_OrV128,
10651 binop(Iop_AndV128, getQReg128(dd), nmaskV),
10652 binop(Iop_ShlN64x2, getQReg128(nn), mkU8(sh))));
10653 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10654 }
10655 DIP("sli d%u, d%u, #%u\n", dd, nn, sh);
10656 return True;
10659 if (opcode == BITS5(0,1,1,1,0)
10660 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
10661 /* -------- 0,01110 SQSHL #imm -------- */
10662 /* -------- 1,01110 UQSHL #imm -------- */
10663 /* -------- 1,01100 SQSHLU #imm -------- */
10664 UInt size = 0;
10665 UInt shift = 0;
10666 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10667 if (!ok) return False;
10668 vassert(size <= 3);
10669 /* The shift encoding has opposite sign for the leftwards case.
10670 Adjust shift to compensate. */
10671 UInt lanebits = 8 << size;
10672 shift = lanebits - shift;
10673 vassert(shift < lanebits);
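/* Example, assuming getLaneInfo_IMMH_IMMB returns the right-shift-style
   amount described for SSHR elsewhere in this file: for H lanes a left
   shift of #5 is encoded as immh:immb = 16 + 5 = 21, the helper returns
   32 - 21 = 11, and the adjustment above recovers 16 - 11 = 5. */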
10674 const HChar* nm = NULL;
10675 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
10676 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
10677 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
10678 else vassert(0);
10679 IRTemp qDiff1 = IRTemp_INVALID;
10680 IRTemp qDiff2 = IRTemp_INVALID;
10681 IRTemp res = IRTemp_INVALID;
10682 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn));
10683 /* This relies on the fact that the zeroed out lanes generate zeroed
10684 result lanes and don't saturate, so there's no point in trimming
10685 the resulting res, qDiff1 or qDiff2 values. */
10686 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
10687 putQReg128(dd, mkexpr(res));
10688 updateQCFLAGwithDifference(qDiff1, qDiff2);
10689 const HChar arr = "bhsd"[size];
10690 DIP("%s %c%u, %c%u, #%u\n", nm, arr, dd, arr, nn, shift);
10691 return True;
10694 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
10695 || (bitU == 1
10696 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
10697 /* -------- 0,10010 SQSHRN #imm -------- */
10698 /* -------- 1,10010 UQSHRN #imm -------- */
10699 /* -------- 0,10011 SQRSHRN #imm -------- */
10700 /* -------- 1,10011 UQRSHRN #imm -------- */
10701 /* -------- 1,10000 SQSHRUN #imm -------- */
10702 /* -------- 1,10001 SQRSHRUN #imm -------- */
10703 UInt size = 0;
10704 UInt shift = 0;
10705 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
10706 if (!ok || size == X11) return False;
10707 // always true, size is unsigned int
10708 //vassert(size >= X00);
10709 vassert(size <= X10);
10710 vassert(shift >= 1 && shift <= (8 << size));
10711 const HChar* nm = "??";
10712 IROp op = Iop_INVALID;
10713 /* Decide on the name and the operation. */
10714 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
10715 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
10716 }
10717 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
10718 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
10719 }
10720 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
10721 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
10722 }
10723 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
10724 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
10725 }
10726 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
10727 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
10728 }
10729 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
10730 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
10731 }
10732 else vassert(0);
10733 /* Compute the result (Q, shifted value) pair. */
10734 IRTemp src128 = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size+1, getQReg128(nn));
10735 IRTemp pair = newTempV128();
10736 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
10737 /* Update the result reg */
10738 IRTemp res64in128 = newTempV128();
10739 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
10740 putQReg128(dd, mkexpr(res64in128));
10741 /* Update the Q flag. */
10742 IRTemp q64q64 = newTempV128();
10743 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
10744 IRTemp z128 = newTempV128();
10745 assign(z128, mkV128(0x0000));
10746 updateQCFLAGwithDifference(q64q64, z128);
10747 /* */
10748 const HChar arrNarrow = "bhsd"[size];
10749 const HChar arrWide = "bhsd"[size+1];
10750 DIP("%s %c%u, %c%u, #%u\n", nm, arrNarrow, dd, arrWide, nn, shift);
10751 return True;
10754 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,0,0)) {
10755 /* -------- 0,!=00xx,11100 SCVTF d_d_imm, s_s_imm -------- */
10756 /* -------- 1,!=00xx,11100 UCVTF d_d_imm, s_s_imm -------- */
10757 UInt size = 0;
10758 UInt fbits = 0;
10759 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
10760 /* The following holds because immh is never zero. */
10761 vassert(ok);
10762 /* The following holds because immh >= 0100. */
10763 vassert(size == X10 || size == X11);
10764 Bool isD = size == X11;
10765 Bool isU = bitU == 1;
10766 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
10767 Double scale = two_to_the_minus(fbits);
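/* Fixed-point to FP: convert the integer, then multiply by 2^-fbits.
   Example: fbits == 8 makes scale = 1/256, so a source value of 384
   (a fixed-point number with 8 fraction bits) converts to 1.5. */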
10768 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
10769 : IRExpr_Const(IRConst_F32( (Float)scale ));
10770 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
10771 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
10772 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
10773 IRType tyF = isD ? Ity_F64 : Ity_F32;
10774 IRType tyI = isD ? Ity_I64 : Ity_I32;
10775 IRTemp src = newTemp(tyI);
10776 IRTemp res = newTemp(tyF);
10777 IRTemp rm = mk_get_IR_rounding_mode();
10778 assign(src, getQRegLane(nn, 0, tyI));
10779 assign(res, triop(opMUL, mkexpr(rm),
10780 binop(opCVT, mkexpr(rm), mkexpr(src)), scaleE));
10781 putQRegLane(dd, 0, mkexpr(res));
10782 if (!isD) {
10783 putQRegLane(dd, 1, mkU32(0));
10784 }
10785 putQRegLane(dd, 1, mkU64(0));
10786 const HChar ch = isD ? 'd' : 's';
10787 DIP("%s %c%u, %c%u, #%u\n", isU ? "ucvtf" : "scvtf",
10788 ch, dd, ch, nn, fbits);
10789 return True;
10792 if (immh >= BITS4(0,1,0,0) && opcode == BITS5(1,1,1,1,1)) {
10793 /* -------- 0,!=00xx,11111 FCVTZS d_d_imm, s_s_imm -------- */
10794 /* -------- 1,!=00xx,11111 FCVTZU d_d_imm, s_s_imm -------- */
10795 UInt size = 0;
10796 UInt fbits = 0;
10797 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
10798 /* The following holds because immh is never zero. */
10799 vassert(ok);
10800 /* The following holds because immh >= 0100. */
10801 vassert(size == X10 || size == X11);
10802 Bool isD = size == X11;
10803 Bool isU = bitU == 1;
10804 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
10805 Double scale = two_to_the_plus(fbits);
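/* FP to fixed-point: multiply by 2^fbits, then convert toward zero.
   Example: fbits == 8 makes scale = 256, so 1.5 becomes 384, i.e. the
   fixed-point encoding with 8 fraction bits. */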
10806 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
10807 : IRExpr_Const(IRConst_F32( (Float)scale ));
10808 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
10809 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
10810 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
10811 IRType tyF = isD ? Ity_F64 : Ity_F32;
10812 IRType tyI = isD ? Ity_I64 : Ity_I32;
10813 IRTemp src = newTemp(tyF);
10814 IRTemp res = newTemp(tyI);
10815 IRTemp rm = newTemp(Ity_I32);
10816 assign(src, getQRegLane(nn, 0, tyF));
10817 assign(rm, mkU32(Irrm_ZERO));
10818 assign(res, binop(opCVT, mkexpr(rm),
10819 triop(opMUL, mkexpr(rm), mkexpr(src), scaleE)));
10820 putQRegLane(dd, 0, mkexpr(res));
10821 if (!isD) {
10822 putQRegLane(dd, 1, mkU32(0));
10823 }
10824 putQRegLane(dd, 1, mkU64(0));
10825 const HChar ch = isD ? 'd' : 's';
10826 DIP("%s %c%u, %c%u, #%u\n", isU ? "fcvtzu" : "fcvtzs",
10827 ch, dd, ch, nn, fbits);
10828 return True;
10831 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10832 return False;
10833 # undef INSN
10837 static
10838 Bool dis_AdvSIMD_scalar_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
10840 /* 31 29 28 23 21 20 15 11 9 4
10841 01 U 11110 size 1 m opcode 00 n d
10842 Decode fields: u,opcode
10844 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10845 if (INSN(31,30) != BITS2(0,1)
10846 || INSN(28,24) != BITS5(1,1,1,1,0)
10847 || INSN(21,21) != 1
10848 || INSN(11,10) != BITS2(0,0)) {
10849 return False;
10851 UInt bitU = INSN(29,29);
10852 UInt size = INSN(23,22);
10853 UInt mm = INSN(20,16);
10854 UInt opcode = INSN(15,12);
10855 UInt nn = INSN(9,5);
10856 UInt dd = INSN(4,0);
10857 vassert(size < 4);
10859 if (bitU == 0
10860 && (opcode == BITS4(1,1,0,1)
10861 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
10862 /* -------- 0,1101 SQDMULL -------- */ // 0 (ks)
10863 /* -------- 0,1001 SQDMLAL -------- */ // 1
10864 /* -------- 0,1011 SQDMLSL -------- */ // 2
10865 /* Widens, and size refers to the narrowed lanes. */
10866 UInt ks = 3;
10867 switch (opcode) {
10868 case BITS4(1,1,0,1): ks = 0; break;
10869 case BITS4(1,0,0,1): ks = 1; break;
10870 case BITS4(1,0,1,1): ks = 2; break;
10871 default: vassert(0);
10873 vassert(ks <= 2);
10874 if (size == X00 || size == X11) return False;
10875 vassert(size <= 2);
10876 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
10877 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
10878 newTempsV128_3(&vecN, &vecM, &vecD);
10879 assign(vecN, getQReg128(nn));
10880 assign(vecM, getQReg128(mm));
10881 assign(vecD, getQReg128(dd));
10882 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
10883 False/*!is2*/, size, "mas"[ks],
10884 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
10885 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
10886 putQReg128(dd, unop(opZHI, mkexpr(res)));
10887 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
10888 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
10889 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
10890 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
10892 const HChar* nm = ks == 0 ? "sqdmull"
10893 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
10894 const HChar arrNarrow = "bhsd"[size];
10895 const HChar arrWide = "bhsd"[size+1];
10896 DIP("%s %c%u, %c%u, %c%u\n",
10897 nm, arrWide, dd, arrNarrow, nn, arrNarrow, mm);
10898 return True;
10901 return False;
10902 # undef INSN
10906 static
10907 Bool dis_AdvSIMD_scalar_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
10909 /* 31 29 28 23 21 20 15 10 9 4
10910 01 U 11110 size 1 m opcode 1 n d
10911 Decode fields: u,size,opcode
10913 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
10914 if (INSN(31,30) != BITS2(0,1)
10915 || INSN(28,24) != BITS5(1,1,1,1,0)
10916 || INSN(21,21) != 1
10917 || INSN(10,10) != 1) {
10918 return False;
10920 UInt bitU = INSN(29,29);
10921 UInt size = INSN(23,22);
10922 UInt mm = INSN(20,16);
10923 UInt opcode = INSN(15,11);
10924 UInt nn = INSN(9,5);
10925 UInt dd = INSN(4,0);
10926 vassert(size < 4);
10928 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
10929 /* -------- 0,xx,00001 SQADD std4_std4_std4 -------- */
10930 /* -------- 1,xx,00001 UQADD std4_std4_std4 -------- */
10931 /* -------- 0,xx,00101 SQSUB std4_std4_std4 -------- */
10932 /* -------- 1,xx,00101 UQSUB std4_std4_std4 -------- */
10933 Bool isADD = opcode == BITS5(0,0,0,0,1);
10934 Bool isU = bitU == 1;
10935 IROp qop = Iop_INVALID;
10936 IROp nop = Iop_INVALID;
10937 if (isADD) {
10938 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
10939 nop = mkVecADD(size);
10940 } else {
10941 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
10942 nop = mkVecSUB(size);
10944 IRTemp argL = newTempV128();
10945 IRTemp argR = newTempV128();
10946 IRTemp qres = newTempV128();
10947 IRTemp nres = newTempV128();
10948 assign(argL, getQReg128(nn));
10949 assign(argR, getQReg128(mm));
10950 assign(qres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10951 size, binop(qop, mkexpr(argL), mkexpr(argR)))));
10952 assign(nres, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(
10953 size, binop(nop, mkexpr(argL), mkexpr(argR)))));
10954 putQReg128(dd, mkexpr(qres));
10955 updateQCFLAGwithDifference(qres, nres);
10956 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
10957 : (isU ? "uqsub" : "sqsub");
10958 const HChar arr = "bhsd"[size];
10959 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
10960 return True;
10963 if (size == X11 && opcode == BITS5(0,0,1,1,0)) {
10964 /* -------- 0,11,00110 CMGT d_d_d -------- */ // >s
10965 /* -------- 1,11,00110 CMHI d_d_d -------- */ // >u
10966 Bool isGT = bitU == 0;
10967 IRExpr* argL = getQReg128(nn);
10968 IRExpr* argR = getQReg128(mm);
10969 IRTemp res = newTempV128();
10970 assign(res,
10971 isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
10972 : binop(Iop_CmpGT64Ux2, argL, argR));
10973 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10974 DIP("%s %s, %s, %s\n",isGT ? "cmgt" : "cmhi",
10975 nameQRegLO(dd, Ity_I64),
10976 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10977 return True;
10980 if (size == X11 && opcode == BITS5(0,0,1,1,1)) {
10981 /* -------- 0,11,00111 CMGE d_d_d -------- */ // >=s
10982 /* -------- 1,11,00111 CMHS d_d_d -------- */ // >=u
10983 Bool isGE = bitU == 0;
10984 IRExpr* argL = getQReg128(nn);
10985 IRExpr* argR = getQReg128(mm);
10986 IRTemp res = newTempV128();
10987 assign(res,
10988 isGE ? unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL))
10989 : unop(Iop_NotV128, binop(Iop_CmpGT64Ux2, argR, argL)));
10990 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
10991 DIP("%s %s, %s, %s\n", isGE ? "cmge" : "cmhs",
10992 nameQRegLO(dd, Ity_I64),
10993 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
10994 return True;
10997 if (size == X11 && (opcode == BITS5(0,1,0,0,0)
10998 || opcode == BITS5(0,1,0,1,0))) {
10999 /* -------- 0,xx,01000 SSHL d_d_d -------- */
11000 /* -------- 0,xx,01010 SRSHL d_d_d -------- */
11001 /* -------- 1,xx,01000 USHL d_d_d -------- */
11002 /* -------- 1,xx,01010 URSHL d_d_d -------- */
11003 Bool isU = bitU == 1;
11004 Bool isR = opcode == BITS5(0,1,0,1,0);
11005 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
11006 : (isU ? mkVecSHU(size) : mkVecSHS(size));
11007 IRTemp res = newTempV128();
11008 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
11009 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
11010 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
11011 : (isU ? "ushl" : "sshl");
11012 DIP("%s %s, %s, %s\n", nm,
11013 nameQRegLO(dd, Ity_I64),
11014 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
11015 return True;
11018 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
11019 /* -------- 0,xx,01001 SQSHL std4_std4_std4 -------- */
11020 /* -------- 0,xx,01011 SQRSHL std4_std4_std4 -------- */
11021 /* -------- 1,xx,01001 UQSHL std4_std4_std4 -------- */
11022 /* -------- 1,xx,01011 UQRSHL std4_std4_std4 -------- */
11023 Bool isU = bitU == 1;
11024 Bool isR = opcode == BITS5(0,1,0,1,1);
11025 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
11026 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
11027 /* This is a bit tricky. Since we're only interested in the lowest
11028 lane of the result, we zero out all the rest in the operands, so
11029 as to ensure that other lanes don't pollute the returned Q value.
11030 This works because it means, for the lanes we don't care about, we
11031 are shifting zero by zero, which can never saturate. */
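/* The 256-bit result is split below: the low 128 bits (V256toV128_0) hold
   the shifted value that is written back, while the high 128 bits
   (V256toV128_1) carry the saturation indication, which is compared
   against zero to update the QC flag. */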
11032 IRTemp res256 = newTemp(Ity_V256);
11033 IRTemp resSH = newTempV128();
11034 IRTemp resQ = newTempV128();
11035 IRTemp zero = newTempV128();
11036 assign(
11037 res256,
11038 binop(op,
11039 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(nn))),
11040 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, getQReg128(mm)))));
11041 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
11042 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
11043 assign(zero, mkV128(0x0000));
11044 putQReg128(dd, mkexpr(resSH));
11045 updateQCFLAGwithDifference(resQ, zero);
11046 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
11047 : (isU ? "uqshl" : "sqshl");
11048 const HChar arr = "bhsd"[size];
11049 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
11050 return True;
11053 if (size == X11 && opcode == BITS5(1,0,0,0,0)) {
11054 /* -------- 0,11,10000 ADD d_d_d -------- */
11055 /* -------- 1,11,10000 SUB d_d_d -------- */
11056 Bool isSUB = bitU == 1;
11057 IRTemp res = newTemp(Ity_I64);
11058 assign(res, binop(isSUB ? Iop_Sub64 : Iop_Add64,
11059 getQRegLane(nn, 0, Ity_I64),
11060 getQRegLane(mm, 0, Ity_I64)));
11061 putQRegLane(dd, 0, mkexpr(res));
11062 putQRegLane(dd, 1, mkU64(0));
11063 DIP("%s %s, %s, %s\n", isSUB ? "sub" : "add",
11064 nameQRegLO(dd, Ity_I64),
11065 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
11066 return True;
11069 if (size == X11 && opcode == BITS5(1,0,0,0,1)) {
11070 /* -------- 0,11,10001 CMTST d_d_d -------- */ // &, != 0
11071 /* -------- 1,11,10001 CMEQ d_d_d -------- */ // ==
11072 Bool isEQ = bitU == 1;
11073 IRExpr* argL = getQReg128(nn);
11074 IRExpr* argR = getQReg128(mm);
11075 IRTemp res = newTempV128();
11076 assign(res,
11077 isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
11078 : unop(Iop_NotV128, binop(Iop_CmpEQ64x2,
11079 binop(Iop_AndV128, argL, argR),
11080 mkV128(0x0000))));
11081 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
11082 DIP("%s %s, %s, %s\n", isEQ ? "cmeq" : "cmtst",
11083 nameQRegLO(dd, Ity_I64),
11084 nameQRegLO(nn, Ity_I64), nameQRegLO(mm, Ity_I64));
11085 return True;
11088 if (opcode == BITS5(1,0,1,1,0)) {
11089 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
11090 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
11091 if (size == X00 || size == X11) return False;
11092 Bool isR = bitU == 1;
11093 IRTemp res, sat1q, sat1n, vN, vM;
11094 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
11095 newTempsV128_2(&vN, &vM);
11096 assign(vN, getQReg128(nn));
11097 assign(vM, getQReg128(mm));
11098 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
11099 putQReg128(dd,
11100 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
11101 updateQCFLAGwithDifference(
11102 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1q)),
11103 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(sat1n)));
11104 const HChar arr = "bhsd"[size];
11105 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
11106 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
11107 return True;
11110 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
11111 /* -------- 1,1x,11010 FABD d_d_d, s_s_s -------- */
11112 IRType ity = size == X11 ? Ity_F64 : Ity_F32;
11113 IRTemp res = newTemp(ity);
11114 assign(res, unop(mkABSF(ity),
11115 triop(mkSUBF(ity),
11116 mkexpr(mk_get_IR_rounding_mode()),
11117 getQRegLO(nn,ity), getQRegLO(mm,ity))));
11118 putQReg128(dd, mkV128(0x0000));
11119 putQRegLO(dd, mkexpr(res));
11120 DIP("fabd %s, %s, %s\n",
11121 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
11122 return True;
11125 if (bitU == 0 && size <= X01 && opcode == BITS5(1,1,0,1,1)) {
11126 /* -------- 0,0x,11011 FMULX d_d_d, s_s_s -------- */
11127 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
11128 IRType ity = size == X01 ? Ity_F64 : Ity_F32;
11129 IRTemp res = newTemp(ity);
11130 assign(res, triop(mkMULF(ity),
11131 mkexpr(mk_get_IR_rounding_mode()),
11132 getQRegLO(nn,ity), getQRegLO(mm,ity)));
11133 putQReg128(dd, mkV128(0x0000));
11134 putQRegLO(dd, mkexpr(res));
11135 DIP("fmulx %s, %s, %s\n",
11136 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
11137 return True;
11140 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
11141 /* -------- 0,0x,11100 FCMEQ d_d_d, s_s_s -------- */
11142 /* -------- 1,0x,11100 FCMGE d_d_d, s_s_s -------- */
11143 Bool isD = size == X01;
11144 IRType ity = isD ? Ity_F64 : Ity_F32;
11145 Bool isGE = bitU == 1;
11146 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
11147 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
11148 IRTemp res = newTempV128();
11149 assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
11150 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
11151 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
11152 mkexpr(res))));
11153 DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
11154 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
11155 return True;
11158 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
11159 /* -------- 1,1x,11100 FCMGT d_d_d, s_s_s -------- */
11160 Bool isD = size == X11;
11161 IRType ity = isD ? Ity_F64 : Ity_F32;
11162 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
11163 IRTemp res = newTempV128();
11164 assign(res, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
11165 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
11166 mkexpr(res))));
11167 DIP("%s %s, %s, %s\n", "fcmgt",
11168 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
11169 return True;
11172 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
11173 /* -------- 1,0x,11101 FACGE d_d_d, s_s_s -------- */
11174 /* -------- 1,1x,11101 FACGT d_d_d, s_s_s -------- */
11175 Bool isD = (size & 1) == 1;
11176 IRType ity = isD ? Ity_F64 : Ity_F32;
11177 Bool isGT = (size & 2) == 2;
11178 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
11179 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
11180 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
11181 IRTemp res = newTempV128();
11182 assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
11183 unop(opABS, getQReg128(nn)))); // swapd
11184 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
11185 mkexpr(res))));
11186 DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
11187 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
11188 return True;
11191 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
11192 /* -------- 0,0x,11111: FRECPS d_d_d, s_s_s -------- */
11193 /* -------- 0,1x,11111: FRSQRTS d_d_d, s_s_s -------- */
11194 Bool isSQRT = (size & 2) == 2;
11195 Bool isD = (size & 1) == 1;
11196 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
11197 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
11198 IRTemp res = newTempV128();
11199 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
11200 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
11201 mkexpr(res))));
11202 HChar c = isD ? 'd' : 's';
11203 DIP("%s %c%u, %c%u, %c%u\n", isSQRT ? "frsqrts" : "frecps",
11204 c, dd, c, nn, c, mm);
11205 return True;
11208 return False;
11209 # undef INSN
11212 static
11213 Bool dis_AdvSIMD_scalar_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn,
11214 const VexArchInfo* archinfo)
11216 /* 31 29 28 23 21 20 15 10 9 4
11217 01 U 11110 size 0 m opcode 1 n d
11218 Decode fields: u,size,opcode
11220 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11221 if (INSN(31,30) != BITS2(0,1)
11222 || INSN(28,24) != BITS5(1,1,1,1,0)
11223 || INSN(21,21) != 0
11224 || INSN(10,10) != 1) {
11225 return False;
11227 UInt bitU = INSN(29,29);
11228 UInt size = INSN(23,22);
11229 UInt mm = INSN(20,16);
11230 UInt opcode = INSN(15,11);
11231 UInt nn = INSN(9,5);
11232 UInt dd = INSN(4,0);
11233 vassert(size < 4);
11234 vassert(mm < 32 && nn < 32 && dd < 32);
11236 if (bitU == 1 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
11237 /* -------- xx,10000 SQRDMLAH s and h variants only -------- */
11238 /* -------- xx,10001 SQRDMLSH s and h variants only -------- */
11239 if (size == X00 || size == X11) return False;
11240 Bool isAdd = opcode == BITS5(1,0,0,0,0);
11242 IRTemp res, res_nosat, vD, vN, vM;
11243 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
11244 newTempsV128_3(&vD, &vN, &vM);
11245 assign(vD, getQReg128(dd));
11246 assign(vN, getQReg128(nn));
11247 assign(vM, getQReg128(mm));
11249 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
11250 putQReg128(dd,
11251 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res))));
11252 updateQCFLAGwithDifference(
11253 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res)),
11254 math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(res_nosat)));
11256 const HChar arr = "bhsd"[size]; /* size is X01 (h) or X10 (s) here */
11257 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
11258 DIP("%s %c%u, %c%u, %c%u\n", nm, arr, dd, arr, nn, arr, mm);
11259 return True;
11262 if (bitU == 1 && size == X11 && opcode == BITS5(0,0,0,1,0)) {
11263 /* -------- 1,11,00010 FABD h_h_h -------- */
11264 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
11265 return False;
11266 IRTemp res = newTemp(Ity_F16);
11267 assign(res, unop(mkABSF(Ity_F16),
11268 triop(mkSUBF(Ity_F16),
11269 mkexpr(mk_get_IR_rounding_mode()),
11270 getQRegLO(nn,Ity_F16), getQRegLO(mm,Ity_F16))));
11271 putQReg128(dd, mkV128(0x0000));
11272 putQRegLO(dd, mkexpr(res));
11273 DIP("fabd %s, %s, %s\n",
11274 nameQRegLO(dd, Ity_F16), nameQRegLO(nn, Ity_F16), nameQRegLO(mm, Ity_F16));
11275 return True;
11278 if (size == X01 && opcode == BITS5(0,0,1,0,0)) {
11279 /* -------- 0,01,00100 FCMEQ h_h_h -------- */
11280 /* -------- 1,01,00100 FCMGE h_h_h -------- */
11281 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
11282 return False;
11283 Bool isGE = bitU == 1;
11284 IROp opCMP = isGE ? Iop_CmpLE16Fx8 : Iop_CmpEQ16Fx8;
11285 IRTemp res = newTempV128();
11286 /* Swap source and destination in order to use existing LE IR op for GE. */
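/* That is, (nn >= mm) is the same predicate as (mm <= nn). */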
11287 assign(res, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn))
11288 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
11289 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01, mkexpr(res))));
11290 DIP("%s %s, %s, %s\n", isGE ? "fcmge" : "fcmeq",
11291 nameQRegLO(dd, Ity_F16), nameQRegLO(nn, Ity_F16), nameQRegLO(mm, Ity_F16));
11292 return True;
11295 if (bitU == 1 && size == X11 && opcode == BITS5(0,0,1,0,0)) {
11296 /* -------- 1,11,00100 FCMGT h_h_h -------- */
11297 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
11298 return False;
11299 IRTemp res = newTempV128();
11300 /* Swap source and destination in order to use existing LT IR op for GT. */
11301 assign(res, binop(Iop_CmpLT16Fx8, getQReg128(mm), getQReg128(nn)));
11302 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01, mkexpr(res))));
11303 DIP("%s %s, %s, %s\n", "fcmgt",
11304 nameQRegLO(dd, Ity_F16), nameQRegLO(nn, Ity_F16), nameQRegLO(mm, Ity_F16));
11305 return True;
11308 if (bitU == 1 && opcode == BITS5(0,0,1,0,1)) {
11309 /* -------- 1,01,00101 FACGE h_h_h -------- */
11310 /* -------- 1,11,00101 FACGT h_h_h -------- */
11311 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
11312 return False;
11313 IRType ity = Ity_F16;
11314 Bool isGT = (size & 2) == 2;
11315 IROp opCMP = isGT ? Iop_CmpLT16Fx8 : Iop_CmpLE16Fx8;
11316 IROp opABS = Iop_Abs16Fx8;
11317 IRTemp res = newTempV128();
11318 assign(res, binop(opCMP, unop(opABS, getQReg128(mm)),
11319 unop(opABS, getQReg128(nn))));
11320 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01,
11321 mkexpr(res))));
11322 DIP("%s %s, %s, %s\n", isGT ? "facgt" : "facge",
11323 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
11324 return True;
11327 return False;
11328 # undef INSN
11332 static
11333 Bool dis_AdvSIMD_scalar_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
11335 /* 31 29 28 23 21 16 11 9 4
11336 01 U 11110 size 10000 opcode 10 n d
11337 Decode fields: u,size,opcode
11339 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11340 if (INSN(31,30) != BITS2(0,1)
11341 || INSN(28,24) != BITS5(1,1,1,1,0)
11342 || INSN(21,17) != BITS5(1,0,0,0,0)
11343 || INSN(11,10) != BITS2(1,0)) {
11344 return False;
11346 UInt bitU = INSN(29,29);
11347 UInt size = INSN(23,22);
11348 UInt opcode = INSN(16,12);
11349 UInt nn = INSN(9,5);
11350 UInt dd = INSN(4,0);
11351 vassert(size < 4);
11353 if (opcode == BITS5(0,0,0,1,1)) {
11354 /* -------- 0,xx,00011: SUQADD std4_std4 -------- */
11355 /* -------- 1,xx,00011: USQADD std4_std4 -------- */
11356 /* These are a bit tricky (to say the least). See comments on
11357 the vector variants (in dis_AdvSIMD_two_reg_misc) below for
11358 details. */
11359 Bool isUSQADD = bitU == 1;
11360 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
11361 : mkVecQADDEXTUSSATSS(size);
11362 IROp nop = mkVecADD(size);
11363 IRTemp argL = newTempV128();
11364 IRTemp argR = newTempV128();
11365 assign(argL, getQReg128(nn));
11366 assign(argR, getQReg128(dd));
11367 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11368 size, binop(qop, mkexpr(argL), mkexpr(argR)));
11369 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11370 size, binop(nop, mkexpr(argL), mkexpr(argR)));
11371 putQReg128(dd, mkexpr(qres));
11372 updateQCFLAGwithDifference(qres, nres);
11373 const HChar arr = "bhsd"[size];
11374 DIP("%s %c%u, %c%u\n", isUSQADD ? "usqadd" : "suqadd", arr, dd, arr, nn);
11375 return True;
11378 if (opcode == BITS5(0,0,1,1,1)) {
11379 /* -------- 0,xx,00111 SQABS std4_std4 -------- */
11380 /* -------- 1,xx,00111 SQNEG std4_std4 -------- */
11381 Bool isNEG = bitU == 1;
11382 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
11383 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
11384 getQReg128(nn), size );
11385 IRTemp qres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(qresFW));
11386 IRTemp nres = math_ZERO_ALL_EXCEPT_LOWEST_LANE(size, mkexpr(nresFW));
11387 putQReg128(dd, mkexpr(qres));
11388 updateQCFLAGwithDifference(qres, nres);
11389 const HChar arr = "bhsd"[size];
11390 DIP("%s %c%u, %c%u\n", isNEG ? "sqneg" : "sqabs", arr, dd, arr, nn);
11391 return True;
11394 if (size == X11 && opcode == BITS5(0,1,0,0,0)) {
11395 /* -------- 0,11,01000: CMGT d_d_#0 -------- */ // >s 0
11396 /* -------- 1,11,01000: CMGE d_d_#0 -------- */ // >=s 0
11397 Bool isGT = bitU == 0;
11398 IRExpr* argL = getQReg128(nn);
11399 IRExpr* argR = mkV128(0x0000);
11400 IRTemp res = newTempV128();
11401 assign(res, isGT ? binop(Iop_CmpGT64Sx2, argL, argR)
11402 : unop(Iop_NotV128, binop(Iop_CmpGT64Sx2, argR, argL)));
11403 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
11404 DIP("cm%s d%u, d%u, #0\n", isGT ? "gt" : "ge", dd, nn);
11405 return True;
11408 if (size == X11 && opcode == BITS5(0,1,0,0,1)) {
11409 /* -------- 0,11,01001: CMEQ d_d_#0 -------- */ // == 0
11410 /* -------- 1,11,01001: CMLE d_d_#0 -------- */ // <=s 0
11411 Bool isEQ = bitU == 0;
11412 IRExpr* argL = getQReg128(nn);
11413 IRExpr* argR = mkV128(0x0000);
11414 IRTemp res = newTempV128();
11415 assign(res, isEQ ? binop(Iop_CmpEQ64x2, argL, argR)
11416 : unop(Iop_NotV128,
11417 binop(Iop_CmpGT64Sx2, argL, argR)));
11418 putQReg128(dd, unop(Iop_ZeroHI64ofV128, mkexpr(res)));
11419 DIP("cm%s d%u, d%u, #0\n", isEQ ? "eq" : "le", dd, nn);
11420 return True;
11423 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,0)) {
11424 /* -------- 0,11,01010: CMLT d_d_#0 -------- */ // <s 0
11425 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
11426 binop(Iop_CmpGT64Sx2, mkV128(0x0000),
11427 getQReg128(nn))));
11428 DIP("cm%s d%u, d%u, #0\n", "lt", dd, nn);
11429 return True;
11432 if (bitU == 0 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
11433 /* -------- 0,11,01011 ABS d_d -------- */
11434 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
11435 unop(Iop_Abs64x2, getQReg128(nn))));
11436 DIP("abs d%u, d%u\n", dd, nn);
11437 return True;
11440 if (bitU == 1 && size == X11 && opcode == BITS5(0,1,0,1,1)) {
11441 /* -------- 1,11,01011 NEG d_d -------- */
11442 putQReg128(dd, unop(Iop_ZeroHI64ofV128,
11443 binop(Iop_Sub64x2, mkV128(0x0000), getQReg128(nn))));
11444 DIP("neg d%u, d%u\n", dd, nn);
11445 return True;
11448 UInt ix = 0; /*INVALID*/
11449 if (size >= X10) {
11450 switch (opcode) {
11451 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
11452 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
11453 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
11454 default: break;
11457 if (ix > 0) {
11458 /* -------- 0,1x,01100 FCMGT d_d_#0.0, s_s_#0.0 (ix 1) -------- */
11459 /* -------- 0,1x,01101 FCMEQ d_d_#0.0, s_s_#0.0 (ix 2) -------- */
11460 /* -------- 0,1x,01110 FCMLT d_d_#0.0, s_s_#0.0 (ix 3) -------- */
11461 /* -------- 1,1x,01100 FCMGE d_d_#0.0, s_s_#0.0 (ix 4) -------- */
11462 /* -------- 1,1x,01101 FCMLE d_d_#0.0, s_s_#0.0 (ix 5) -------- */
11463 Bool isD = size == X11;
11464 IRType ity = isD ? Ity_F64 : Ity_F32;
11465 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
11466 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
11467 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
11468 IROp opCmp = Iop_INVALID;
11469 Bool swap = False;
11470 const HChar* nm = "??";
11471 switch (ix) {
11472 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
11473 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
11474 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
11475 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
11476 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
11477 default: vassert(0);
11479 IRExpr* zero = mkV128(0x0000);
11480 IRTemp res = newTempV128();
11481 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
11482 : binop(opCmp, getQReg128(nn), zero));
11483 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
11484 mkexpr(res))));
11486 DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
11487 return True;
11490 if (opcode == BITS5(1,0,1,0,0)
11491 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
11492 /* -------- 0,xx,10100: SQXTN -------- */
11493 /* -------- 1,xx,10100: UQXTN -------- */
11494 /* -------- 1,xx,10010: SQXTUN -------- */
11495 if (size == X11) return False;
11496 vassert(size < 3);
11497 IROp opN = Iop_INVALID;
11498 Bool zWiden = True;
11499 const HChar* nm = "??";
11500 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
11501 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
11502 }
11503 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
11504 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
11505 }
11506 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
11507 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
11508 }
11509 else vassert(0);
11510 IRTemp src = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11511 size+1, getQReg128(nn));
11512 IRTemp resN = math_ZERO_ALL_EXCEPT_LOWEST_LANE(
11513 size, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
11514 putQReg128(dd, mkexpr(resN));
11515 /* This widens zero lanes to zero, and compares it against zero, so all
11516 of the non-participating lanes make no contribution to the
11517 Q flag state. */
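/* Example of the mechanism: for sqxtn h<-s, a source lane of 0x00012345
   saturates to 0x7FFF; re-widening gives 0x00007FFF, which differs from
   the original, so the comparison below raises QC. */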
11518 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
11519 size, mkexpr(resN));
11520 updateQCFLAGwithDifference(src, resW);
11521 const HChar arrNarrow = "bhsd"[size];
11522 const HChar arrWide = "bhsd"[size+1];
11523 DIP("%s %c%u, %c%u\n", nm, arrNarrow, dd, arrWide, nn);
11524 return True;
11527 if (opcode == BITS5(1,0,1,1,0) && bitU == 1 && size == X01) {
11528 /* -------- 1,01,10110 FCVTXN s_d -------- */
11529 /* Using Irrm_NEAREST here isn't right. The docs say "round to
11530 odd" but I don't know what that really means. */
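/* For reference, "round to odd" (von Neumann rounding) truncates and then
   forces the result's lowest mantissa bit to 1 whenever the conversion was
   inexact; it exists so that a later narrowing does not double-round. */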
11531 putQRegLO(dd,
11532 binop(Iop_F64toF32, mkU32(Irrm_NEAREST),
11533 getQRegLO(nn, Ity_F64)));
11534 putQRegLane(dd, 1, mkU32(0));
11535 putQRegLane(dd, 1, mkU64(0));
11536 DIP("fcvtxn s%u, d%u\n", dd, nn);
11537 return True;
11540 ix = 0; /*INVALID*/
11541 switch (opcode) {
11542 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
11543 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
11544 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
11545 default: break;
11547 if (ix > 0) {
11548 /* -------- 0,0x,11010 FCVTNS d_d, s_s (ix 1) -------- */
11549 /* -------- 0,0x,11011 FCVTMS d_d, s_s (ix 2) -------- */
11550 /* -------- 0,0x,11100 FCVTAS d_d, s_s (ix 3) -------- */
11551 /* -------- 0,1x,11010 FCVTPS d_d, s_s (ix 4) -------- */
11552 /* -------- 0,1x,11011 FCVTZS d_d, s_s (ix 5) -------- */
11553 /* -------- 1,0x,11010 FCVTNU d_d, s_s (ix 1) -------- */
11554 /* -------- 1,0x,11011 FCVTMU d_d, s_s (ix 2) -------- */
11555 /* -------- 1,0x,11100 FCVTAU d_d, s_s (ix 3) -------- */
11556 /* -------- 1,1x,11010 FCVTPU d_d, s_s (ix 4) -------- */
11557 /* -------- 1,1x,11011 FCVTZU d_d, s_s (ix 5) -------- */
11558 Bool isD = (size & 1) == 1;
11559 IRType tyF = isD ? Ity_F64 : Ity_F32;
11560 IRType tyI = isD ? Ity_I64 : Ity_I32;
11561 IRRoundingMode irrm = 8; /*impossible*/
11562 HChar ch = '?';
11563 switch (ix) {
11564 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
11565 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
11566 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge? */
11567 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
11568 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
11569 default: vassert(0);
11571 IROp cvt = Iop_INVALID;
11572 if (bitU == 1) {
11573 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
11574 } else {
11575 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
11577 IRTemp src = newTemp(tyF);
11578 IRTemp res = newTemp(tyI);
11579 assign(src, getQRegLane(nn, 0, tyF));
11580 assign(res, binop(cvt, mkU32(irrm), mkexpr(src)));
11581 putQRegLane(dd, 0, mkexpr(res)); /* bits 31-0 or 63-0 */
11582 if (!isD) {
11583 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
11584 }
11585 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
11586 HChar sOrD = isD ? 'd' : 's';
11587 DIP("fcvt%c%c %c%u, %c%u\n", ch, bitU == 1 ? 'u' : 's',
11588 sOrD, dd, sOrD, nn);
11589 return True;
11592 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
11593 /* -------- 0,0x,11101: SCVTF d_d, s_s -------- */
11594 /* -------- 1,0x,11101: UCVTF d_d, s_s -------- */
11595 Bool isU = bitU == 1;
11596 Bool isD = (size & 1) == 1;
11597 IRType tyI = isD ? Ity_I64 : Ity_I32;
11598 IROp iop = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
11599 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
11600 IRTemp rm = mk_get_IR_rounding_mode();
11601 putQRegLO(dd, binop(iop, mkexpr(rm), getQRegLO(nn, tyI)));
11602 if (!isD) {
11603 putQRegLane(dd, 1, mkU32(0)); /* bits 63-32 */
11604 }
11605 putQRegLane(dd, 1, mkU64(0)); /* bits 127-64 */
11606 HChar c = isD ? 'd' : 's';
11607 DIP("%ccvtf %c%u, %c%u\n", isU ? 'u' : 's', c, dd, c, nn);
11608 return True;
11611 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
11612 /* -------- 0,1x,11101: FRECPE d_d, s_s -------- */
11613 /* -------- 1,1x,11101: FRSQRTE d_d, s_s -------- */
11614 Bool isSQRT = bitU == 1;
11615 Bool isD = (size & 1) == 1;
11616 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
11617 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
11618 IRTemp resV = newTempV128();
11619 assign(resV, unop(op, getQReg128(nn)));
11620 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? X11 : X10,
11621 mkexpr(resV))));
11622 HChar c = isD ? 'd' : 's';
11623 DIP("%s %c%u, %c%u\n", isSQRT ? "frsqrte" : "frecpe", c, dd, c, nn);
11624 return True;
11627 if (bitU == 0 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
11628 /* -------- 0,1x,11111: FRECPX d_d, s_s -------- */
11629 Bool isD = (size & 1) == 1;
11630 IRType ty = isD ? Ity_F64 : Ity_F32;
11631 IROp op = isD ? Iop_RecpExpF64 : Iop_RecpExpF32;
11632 IRTemp res = newTemp(ty);
11633 IRTemp rm = mk_get_IR_rounding_mode();
11634 assign(res, binop(op, mkexpr(rm), getQRegLane(nn, 0, ty)));
11635 putQReg128(dd, mkV128(0x0000));
11636 putQRegLane(dd, 0, mkexpr(res));
11637 HChar c = isD ? 'd' : 's';
11638 DIP("%s %c%u, %c%u\n", "frecpx", c, dd, c, nn);
11639 return True;
11642 return False;
11643 # undef INSN
11647 static
11648 Bool dis_AdvSIMD_scalar_two_reg_misc_fp16(/*MB_OUT*/DisResult* dres, UInt insn,
11649 const VexArchInfo* archinfo)
11651 /* This decode function only handles instructions with half-precision
11652 floating-point (fp16) operands.
11654 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
11655 return False;
11657 /* 31 29 28 23 21 16 11 9 4
11658 01 U 11110 size 11100 opcode 10 n d
11659 Decode fields: u,size,opcode
11661 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11662 if (INSN(31,30) != BITS2(0,1)
11663 || INSN(28,24) != BITS5(1,1,1,1,0)
11664 || INSN(21,17) != BITS5(1,1,1,0,0)
11665 || INSN(11,10) != BITS2(1,0)) {
11666 return False;
11668 UInt bitU = INSN(29,29);
11669 UInt size = INSN(23,22);
11670 UInt opcode = INSN(16,12);
11671 UInt nn = INSN(9,5);
11672 UInt dd = INSN(4,0);
11673 vassert(size == 3);
11675 /* Decoding FCM<condition> based on opcode and bitU. ix is used to select
11676 * <condition>
11677 */
11678 UInt ix = 0; // Invalid <condition>
11679 switch (opcode) {
11680 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 4 : 1; break; // FCMLE=4,FCMEQ=1
11681 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 5 : 2; break; // FCMGE=5,FCMGT=2
11682 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break; // FCMLT=3
11683 default: break;
11685 if (ix > 0) {
11686 /* -------- 0,01101 FCMEQ h_h_#0.0 (ix 1) -------- */
11687 /* -------- 0,01100 FCMGT h_h_#0.0 (ix 2) -------- */
11688 /* -------- 0,01110 FCMLT h_h_#0.0 (ix 3) -------- */
11689 /* -------- 1,01101 FCMLE h_h_#0.0 (ix 4) -------- */
11690 /* -------- 1,01100 FCMGE h_h_#0.0 (ix 5) -------- */
11691 IRType ity = Ity_F16;
11692 IROp opCmp = Iop_INVALID;
11693 Bool swap = False;
11694 const HChar* nm = "??";
11695 switch (ix) {
11696 case 1: nm = "fcmeq"; opCmp = Iop_CmpEQ16Fx8; break;
11697 case 2: nm = "fcmgt"; opCmp = Iop_CmpLT16Fx8; swap = True; break;
11698 case 3: nm = "fcmlt"; opCmp = Iop_CmpLT16Fx8; break;
11699 case 4: nm = "fcmle"; opCmp = Iop_CmpLE16Fx8; break;
11700 case 5: nm = "fcmge"; opCmp = Iop_CmpLE16Fx8; swap = True; break;
11701 default: vassert(0);
11703 IRExpr* zero = mkV128(0x0000);
11704 IRTemp res = newTempV128();
11705 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
11706 : binop(opCmp, getQReg128(nn), zero));
11707 putQReg128(dd, mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(X01, mkexpr(res))));
11709 DIP("%s %s, %s, #0.0\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
11710 return True;
11713 return False;
11714 # undef INSN
11718 static
11719 Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
11721 /* 31 28 23 21 20 19 15 11 9 4
11722 01 U 11111 size L M m opcode H 0 n d
11723 Decode fields are: u,size,opcode
11724 M is really part of the mm register number. Individual
11725 cases need to inspect L and H though.
11726 */
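/* Concretely, in the cases below: the FP d/s cases always take m = M:Rm[3:0],
   with the element index H:L (s) or just H (d); the integer h/s cases take
   index H:L:M with m = Rm[3:0] for h lanes, and index H:L with m = M:Rm[3:0]
   for s lanes. */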
11727 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11728 if (INSN(31,30) != BITS2(0,1)
11729 || INSN(28,24) != BITS5(1,1,1,1,1) || INSN(10,10) != 0) {
11730 return False;
11732 UInt bitU = INSN(29,29);
11733 UInt size = INSN(23,22);
11734 UInt bitL = INSN(21,21);
11735 UInt bitM = INSN(20,20);
11736 UInt mmLO4 = INSN(19,16);
11737 UInt opcode = INSN(15,12);
11738 UInt bitH = INSN(11,11);
11739 UInt nn = INSN(9,5);
11740 UInt dd = INSN(4,0);
11741 vassert(size < 4);
11742 vassert(bitH < 2 && bitM < 2 && bitL < 2);
11744 if (bitU == 0 && size >= X10
11745 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
11746 /* -------- 0,1x,0001 FMLA d_d_d[], s_s_s[] -------- */
11747 /* -------- 0,1x,0101 FMLS d_d_d[], s_s_s[] -------- */
11748 Bool isD = (size & 1) == 1;
11749 Bool isSUB = opcode == BITS4(0,1,0,1);
11750 UInt index;
11751 if (!isD) index = (bitH << 1) | bitL;
11752 else if (isD && bitL == 0) index = bitH;
11753 else return False; // sz:L == x11 => unallocated encoding
11754 vassert(index < (isD ? 2 : 4));
11755 IRType ity = isD ? Ity_F64 : Ity_F32;
11756 IRTemp elem = newTemp(ity);
11757 UInt mm = (bitM << 4) | mmLO4;
11758 assign(elem, getQRegLane(mm, index, ity));
11759 IRTemp dupd = math_DUP_TO_V128(elem, ity);
11760 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
11761 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
11762 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
11763 IRTemp rm = mk_get_IR_rounding_mode();
11764 IRTemp t1 = newTempV128();
11765 IRTemp t2 = newTempV128();
11766 // FIXME: double rounding; use FMA primops instead
11767 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
11768 assign(t2, triop(isSUB ? opSUB : opADD,
11769 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
11770 putQReg128(dd,
11771 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
11772 mkexpr(t2))));
11773 const HChar c = isD ? 'd' : 's';
11774 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
11775 c, dd, c, nn, nameQReg128(mm), c, index);
11776 return True;
11779 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
11780 /* -------- 0,1x,1001 FMUL d_d_d[], s_s_s[] -------- */
11781 /* -------- 1,1x,1001 FMULX d_d_d[], s_s_s[] -------- */
11782 Bool isD = (size & 1) == 1;
11783 Bool isMULX = bitU == 1;
11784 UInt index;
11785 if (!isD) index = (bitH << 1) | bitL;
11786 else if (isD && bitL == 0) index = bitH;
11787 else return False; // sz:L == x11 => unallocated encoding
11788 vassert(index < (isD ? 2 : 4));
11789 IRType ity = isD ? Ity_F64 : Ity_F32;
11790 IRTemp elem = newTemp(ity);
11791 UInt mm = (bitM << 4) | mmLO4;
11792 assign(elem, getQRegLane(mm, index, ity));
11793 IRTemp dupd = math_DUP_TO_V128(elem, ity);
11794 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
11795 IRTemp rm = mk_get_IR_rounding_mode();
11796 IRTemp t1 = newTempV128();
11797 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
11798 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
11799 putQReg128(dd,
11800 mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
11801 mkexpr(t1))));
11802 const HChar c = isD ? 'd' : 's';
11803 DIP("%s %c%u, %c%u, %s.%c[%u]\n", isMULX ? "fmulx" : "fmul",
11804 c, dd, c, nn, nameQReg128(mm), c, index);
11805 return True;
11808 if (bitU == 0
11809 && (opcode == BITS4(1,0,1,1)
11810 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
11811 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
11812 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
11813 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
11814 /* Widens, and size refers to the narrowed lanes. */
11815 UInt ks = 3;
11816 switch (opcode) {
11817 case BITS4(1,0,1,1): ks = 0; break;
11818 case BITS4(0,0,1,1): ks = 1; break;
11819 case BITS4(0,1,1,1): ks = 2; break;
11820 default: vassert(0);
11822 vassert(ks <= 2);
11823 UInt mm = 32; // invalid
11824 UInt ix = 16; // invalid
11825 switch (size) {
11826 case X00:
11827 return False; // h_b_b[] case is not allowed
11828 case X01:
11829 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
11830 case X10:
11831 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
11832 case X11:
11833 return False; // q_d_d[] case is not allowed
11834 default:
11835 vassert(0);
11837 vassert(mm < 32 && ix < 16);
11838 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
11839 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
11840 newTempsV128_2(&vecN, &vecD);
11841 assign(vecN, getQReg128(nn));
11842 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
11843 assign(vecD, getQReg128(dd));
11844 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
11845 False/*!is2*/, size, "mas"[ks],
11846 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
11847 IROp opZHI = mkVecZEROHIxxOFV128(size+1);
11848 putQReg128(dd, unop(opZHI, mkexpr(res)));
11849 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
11850 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
11851 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
11852 updateQCFLAGwithDifferenceZHI(sat2q, sat2n, opZHI);
11854 const HChar* nm = ks == 0 ? "sqdmull"
11855 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
11856 const HChar arrNarrow = "bhsd"[size];
11857 const HChar arrWide = "bhsd"[size+1];
11858 DIP("%s %c%u, %c%u, v%u.%c[%u]\n",
11859 nm, arrWide, dd, arrNarrow, nn, mm, arrNarrow, ix);
11860 return True;
11863 if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
11864 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
11865 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
11866 UInt mm = 32; // invalid
11867 UInt ix = 16; // invalid
11868 switch (size) {
11869 case X00:
11870 return False; // b case is not allowed
11871 case X01:
11872 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
11873 case X10:
11874 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
11875 case X11:
11876 return False; // q case is not allowed
11877 default:
11878 vassert(0);
11880 vassert(mm < 32 && ix < 16);
11881 Bool isR = opcode == BITS4(1,1,0,1);
11882 IRTemp res, sat1q, sat1n, vN, vM;
11883 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
11884 vN = newTempV128();
11885 assign(vN, getQReg128(nn));
11886 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
11887 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
11888 IROp opZHI = mkVecZEROHIxxOFV128(size);
11889 putQReg128(dd, unop(opZHI, mkexpr(res)));
11890 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
11891 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
11892 HChar ch = size == X01 ? 'h' : 's';
11893 DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
11894 return True;
11897 if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
11898 /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
11899 /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
11900 UInt mm = 32; // invalid
11901 UInt ix = 16; // invalid
11902 switch (size) {
11903 case X00:
11904 return False; // b case is not allowed
11905 case X01:
11906 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
11907 case X10:
11908 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
11909 case X11:
11910 return False; // d case is not allowed
11911 default:
11912 vassert(0);
11914 vassert(size < 4);
11915 vassert(mm < 32 && ix < 16);
11916 Bool isAdd = opcode == BITS4(1,1,0,1);
11918 IRTemp res, res_nosat, vD, vN, vM;
11919 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
11920 newTempsV128_2(&vD, &vN);
11921 assign(vD, getQReg128(dd));
11922 assign(vN, getQReg128(nn));
11923 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
11925 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
11926 IROp opZHI = mkVecZEROHIxxOFV128(size);
11927 putQReg128(dd, unop(opZHI, mkexpr(res)));
11928 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
11930 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
11931 HChar ch = size == X01 ? 'h' : 's';
11932 DIP("%s %c%u, %c%u, v%d.%c[%u]\n", nm, ch, dd, ch, nn, (Int)mm, ch, ix);
11933 return True;
11936 return False;
11937 # undef INSN
11941 static
11942 Bool dis_AdvSIMD_shift_by_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
11944 /* 31 28 22 18 15 10 9 4
11945 0 q u 011110 immh immb opcode 1 n d
11946 Decode fields: u,opcode
11948 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
11949 if (INSN(31,31) != 0
11950 || INSN(28,23) != BITS6(0,1,1,1,1,0) || INSN(10,10) != 1) {
11951 return False;
11953 UInt bitQ = INSN(30,30);
11954 UInt bitU = INSN(29,29);
11955 UInt immh = INSN(22,19);
11956 UInt immb = INSN(18,16);
11957 UInt opcode = INSN(15,11);
11958 UInt nn = INSN(9,5);
11959 UInt dd = INSN(4,0);
11961 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,0,1,0)) {
11962 /* -------- 0,00000 SSHR std7_std7_#imm -------- */
11963 /* -------- 1,00000 USHR std7_std7_#imm -------- */
11964 /* -------- 0,00010 SSRA std7_std7_#imm -------- */
11965 /* -------- 1,00010 USRA std7_std7_#imm -------- */
11966 /* laneTy, shift = case immh:immb of
11967 0001:xxx -> B, SHR:8-xxx
11968 001x:xxx -> H, SHR:16-xxxx
11969 01xx:xxx -> S, SHR:32-xxxxx
11970 1xxx:xxx -> D, SHR:64-xxxxxx
11971 other -> invalid
11973 UInt size = 0;
11974 UInt shift = 0;
11975 Bool isQ = bitQ == 1;
11976 Bool isU = bitU == 1;
11977 Bool isAcc = opcode == BITS5(0,0,0,1,0);
11978 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
11979 if (!ok || (bitQ == 0 && size == X11)) return False;
11980 vassert(size <= 3);
11981 UInt lanebits = 8 << size;
11982 vassert(shift >= 1 && shift <= lanebits);
11983 IROp op = isU ? mkVecSHRN(size) : mkVecSARN(size);
11984 IRExpr* src = getQReg128(nn);
11985 IRTemp shf = newTempV128();
11986 IRTemp res = newTempV128();
11987 if (shift == lanebits && isU) {
11988 assign(shf, mkV128(0x0000));
11989 } else {
11990 UInt nudge = 0;
11991 if (shift == lanebits) {
11992 vassert(!isU);
11993 nudge = 1;
11995 assign(shf, binop(op, src, mkU8(shift - nudge)));
11997 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
11998 : mkexpr(shf));
11999 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12000 HChar laneCh = "bhsd"[size];
12001 UInt nLanes = (isQ ? 128 : 64) / lanebits;
12002 const HChar* nm = isAcc ? (isU ? "usra" : "ssra")
12003 : (isU ? "ushr" : "sshr");
12004 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
12005 nameQReg128(dd), nLanes, laneCh,
12006 nameQReg128(nn), nLanes, laneCh, shift);
12007 return True;
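      /* A standalone sketch of the immh:immb decode used by the right-shift
         cases (illustrative only -- the decoder itself calls
         getLaneInfo_IMMH_IMMB; the function name below is made up):

            static Bool exampleShrImmDecode ( UInt* lanebits, UInt* shift,
                                              UInt immh, UInt immb )
            {
               if (immh == 0) return False;             // invalid in this class
               UInt esize = 8, h = immh;
               while (h > 1) { h >>= 1; esize <<= 1; }  // 8 << HighestSetBit(immh)
               *lanebits = esize;                       // 8, 16, 32 or 64
               *shift    = 2 * esize - ((immh << 3) | immb); // 1 .. esize
               return True;
            }

         e.g. immh:immb = 0011:010 decodes as 16-bit lanes with shift 32-26 = 6. */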
12010 if (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,1,0)) {
12011 /* -------- 0,00100 SRSHR std7_std7_#imm -------- */
12012 /* -------- 1,00100 URSHR std7_std7_#imm -------- */
12013 /* -------- 0,00110 SRSRA std7_std7_#imm -------- */
12014 /* -------- 1,00110 URSRA std7_std7_#imm -------- */
12015 /* laneTy, shift = case immh:immb of
12016 0001:xxx -> B, SHR:8-xxx
12017 001x:xxx -> H, SHR:16-xxxx
12018 01xx:xxx -> S, SHR:32-xxxxx
12019 1xxx:xxx -> D, SHR:64-xxxxxx
12020 other -> invalid
12022 UInt size = 0;
12023 UInt shift = 0;
12024 Bool isQ = bitQ == 1;
12025 Bool isU = bitU == 1;
12026 Bool isAcc = opcode == BITS5(0,0,1,1,0);
12027 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
12028 if (!ok || (bitQ == 0 && size == X11)) return False;
12029 vassert(size <= 3);
12030 UInt lanebits = 8 << size;
12031 vassert(shift >= 1 && shift <= lanebits);
12032 IROp op = isU ? mkVecRSHU(size) : mkVecRSHS(size);
12033 IRExpr* src = getQReg128(nn);
12034 IRTemp imm8 = newTemp(Ity_I8);
12035 assign(imm8, mkU8((UChar)(-shift)));
12036 IRExpr* amt = mkexpr(math_DUP_TO_V128(imm8, Ity_I8));
12037 IRTemp shf = newTempV128();
12038 IRTemp res = newTempV128();
12039 assign(shf, binop(op, src, amt));
12040 assign(res, isAcc ? binop(mkVecADD(size), getQReg128(dd), mkexpr(shf))
12041 : mkexpr(shf));
12042 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12043 HChar laneCh = "bhsd"[size];
12044 UInt nLanes = (isQ ? 128 : 64) / lanebits;
12045 const HChar* nm = isAcc ? (isU ? "ursra" : "srsra")
12046 : (isU ? "urshr" : "srshr");
12047 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
12048 nameQReg128(dd), nLanes, laneCh,
12049 nameQReg128(nn), nLanes, laneCh, shift);
12050 return True;
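      /* Note on the negated immediate above: the rounding-shift ops take a
         per-lane signed shift amount (positive = left, negative = right), so
         -shift is duplicated into every lane.  Worked example (illustrative):
         URSHR Vd.8b, Vn.8b, #3 computes, per byte lane, the ARM rounding
         shift (x + (1 << (3-1))) >> 3, i.e. (x + 4) >> 3, with the addition
         performed at sufficient precision not to overflow. */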
12053 if (bitU == 1 && opcode == BITS5(0,1,0,0,0)) {
12054 /* -------- 1,01000 SRI std7_std7_#imm -------- */
12055 /* laneTy, shift = case immh:immb of
12056 0001:xxx -> B, SHR:8-xxx
12057 001x:xxx -> H, SHR:16-xxxx
12058 01xx:xxx -> S, SHR:32-xxxxx
12059 1xxx:xxx -> D, SHR:64-xxxxxx
12060 other -> invalid
12062 UInt size = 0;
12063 UInt shift = 0;
12064 Bool isQ = bitQ == 1;
12065 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
12066 if (!ok || (bitQ == 0 && size == X11)) return False;
12067 vassert(size <= 3);
12068 UInt lanebits = 8 << size;
12069 vassert(shift >= 1 && shift <= lanebits);
12070 IRExpr* src = getQReg128(nn);
12071 IRTemp res = newTempV128();
12072 if (shift == lanebits) {
12073 assign(res, getQReg128(dd));
12074 } else {
12075 assign(res, binop(mkVecSHRN(size), src, mkU8(shift)));
12076 IRExpr* nmask = binop(mkVecSHLN(size),
12077 mkV128(0xFFFF), mkU8(lanebits - shift));
12078 IRTemp tmp = newTempV128();
12079 assign(tmp, binop(Iop_OrV128,
12080 mkexpr(res),
12081 binop(Iop_AndV128, getQReg128(dd), nmask)));
12082 res = tmp;
12084 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12085 HChar laneCh = "bhsd"[size];
12086 UInt nLanes = (isQ ? 128 : 64) / lanebits;
12087 DIP("%s %s.%u%c, %s.%u%c, #%u\n", "sri",
12088 nameQReg128(dd), nLanes, laneCh,
12089 nameQReg128(nn), nLanes, laneCh, shift);
12090 return True;
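      /* Worked example for the insert mask above (illustrative): for byte
         lanes with shift == 3, nmask is 0xFF << (8-3) = 0xE0 per lane, so
         each result byte is (src >> 3) | (dd & 0xE0) -- the top three bits
         of the existing destination lane survive, as SRI requires. */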
12093 if (opcode == BITS5(0,1,0,1,0)) {
12094 /* -------- 0,01010 SHL std7_std7_#imm -------- */
12095 /* -------- 1,01010 SLI std7_std7_#imm -------- */
12096 /* laneTy, shift = case immh:immb of
12097 0001:xxx -> B, xxx
12098 001x:xxx -> H, xxxx
12099 01xx:xxx -> S, xxxxx
12100 1xxx:xxx -> D, xxxxxx
12101 other -> invalid
12103 UInt size = 0;
12104 UInt shift = 0;
12105 Bool isSLI = bitU == 1;
12106 Bool isQ = bitQ == 1;
12107 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
12108 if (!ok || (bitQ == 0 && size == X11)) return False;
12109 vassert(size <= 3);
12110 /* The shift encoding has opposite sign for the leftwards case.
12111 Adjust shift to compensate. */
12112 UInt lanebits = 8 << size;
12113 shift = lanebits - shift;
12114 vassert(shift < lanebits);
12115 IROp op = mkVecSHLN(size);
12116 IRExpr* src = getQReg128(nn);
12117 IRTemp res = newTempV128();
12118 if (shift == 0) {
12119 assign(res, src);
12120 } else {
12121 assign(res, binop(op, src, mkU8(shift)));
12122 if (isSLI) {
12123 IRExpr* nmask = binop(mkVecSHRN(size),
12124 mkV128(0xFFFF), mkU8(lanebits - shift));
12125 IRTemp tmp = newTempV128();
12126 assign(tmp, binop(Iop_OrV128,
12127 mkexpr(res),
12128 binop(Iop_AndV128, getQReg128(dd), nmask)));
12129 res = tmp;
12132 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12133 HChar laneCh = "bhsd"[size];
12134 UInt nLanes = (isQ ? 128 : 64) / lanebits;
12135 const HChar* nm = isSLI ? "sli" : "shl";
12136 DIP("%s %s.%u%c, %s.%u%c, #%u\n", nm,
12137 nameQReg128(dd), nLanes, laneCh,
12138 nameQReg128(nn), nLanes, laneCh, shift);
12139 return True;
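      /* Worked example (illustrative): SHL Vd.8b, Vn.8b, #3 is encoded with
         immh:immb = 0001:011; getLaneInfo_IMMH_IMMB reports 8-bit lanes with
         a "right shift" of 5, and the adjustment above recovers the left
         shift 8 - 5 = 3.  For SLI with the same operands the per-lane mask
         is 0xFF >> (8-3) = 0x07, so each result byte is
         (src << 3) | (dd & 0x07): the low three destination bits survive. */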
12142 if (opcode == BITS5(0,1,1,1,0)
12143 || (bitU == 1 && opcode == BITS5(0,1,1,0,0))) {
12144 /* -------- 0,01110 SQSHL std7_std7_#imm -------- */
12145 /* -------- 1,01110 UQSHL std7_std7_#imm -------- */
12146 /* -------- 1,01100 SQSHLU std7_std7_#imm -------- */
12147 UInt size = 0;
12148 UInt shift = 0;
12149 Bool isQ = bitQ == 1;
12150 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
12151 if (!ok || (bitQ == 0 && size == X11)) return False;
12152 vassert(size >= 0 && size <= 3);
12153 /* The shift encoding has opposite sign for the leftwards case.
12154 Adjust shift to compensate. */
12155 UInt lanebits = 8 << size;
12156 shift = lanebits - shift;
12157 vassert(shift >= 0 && shift < lanebits);
12158 const HChar* nm = NULL;
12159 /**/ if (bitU == 0 && opcode == BITS5(0,1,1,1,0)) nm = "sqshl";
12160 else if (bitU == 1 && opcode == BITS5(0,1,1,1,0)) nm = "uqshl";
12161 else if (bitU == 1 && opcode == BITS5(0,1,1,0,0)) nm = "sqshlu";
12162 else vassert(0);
12163 IRTemp qDiff1 = IRTemp_INVALID;
12164 IRTemp qDiff2 = IRTemp_INVALID;
12165 IRTemp res = IRTemp_INVALID;
12166 IRTemp src = newTempV128();
12167 assign(src, getQReg128(nn));
12168 math_QSHL_IMM(&res, &qDiff1, &qDiff2, src, size, shift, nm);
12169 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12170 updateQCFLAGwithDifferenceZHI(qDiff1, qDiff2,
12171 isQ ? Iop_INVALID : Iop_ZeroHI64ofV128);
12172 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12173 DIP("%s %s.%s, %s.%s, #%u\n", nm,
12174 nameQReg128(dd), arr, nameQReg128(nn), arr, shift);
12175 return True;
12178 if (bitU == 0
12179 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1))) {
12180 /* -------- 0,10000 SHRN{,2} #imm -------- */
12181 /* -------- 0,10001 RSHRN{,2} #imm -------- */
12182 /* Narrows, and size is the narrow size. */
12183 UInt size = 0;
12184 UInt shift = 0;
12185 Bool is2 = bitQ == 1;
12186 Bool isR = opcode == BITS5(1,0,0,0,1);
12187 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
12188 if (!ok || size == X11) return False;
12189 vassert(shift >= 1);
12190 IRTemp t1 = newTempV128();
12191 IRTemp t2 = newTempV128();
12192 IRTemp t3 = newTempV128();
12193 assign(t1, getQReg128(nn));
12194 assign(t2, isR ? binop(mkVecADD(size+1),
12195 mkexpr(t1),
12196 mkexpr(math_VEC_DUP_IMM(size+1, 1ULL<<(shift-1))))
12197 : mkexpr(t1));
12198 assign(t3, binop(mkVecSHRN(size+1), mkexpr(t2), mkU8(shift)));
12199 IRTemp t4 = math_NARROW_LANES(t3, t3, size);
12200 putLO64andZUorPutHI64(is2, dd, t4);
12201 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12202 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12203 DIP("%s %s.%s, %s.%s, #%u\n", isR ? "rshrn" : "shrn",
12204 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
12205 return True;
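      /* Worked example (illustrative): RSHRN Vd.8b, Vn.8h, #4 on a source
         lane holding 0x00FF first adds the rounding constant 1 << (4-1) = 8,
         giving 0x0107, then shifts right by 4 and keeps the low byte, 0x10;
         plain SHRN on the same lane yields 0x0F. */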
12208 if (opcode == BITS5(1,0,0,1,0) || opcode == BITS5(1,0,0,1,1)
12209 || (bitU == 1
12210 && (opcode == BITS5(1,0,0,0,0) || opcode == BITS5(1,0,0,0,1)))) {
12211 /* -------- 0,10010 SQSHRN{,2} #imm -------- */
12212 /* -------- 1,10010 UQSHRN{,2} #imm -------- */
12213 /* -------- 0,10011 SQRSHRN{,2} #imm -------- */
12214 /* -------- 1,10011 UQRSHRN{,2} #imm -------- */
12215 /* -------- 1,10000 SQSHRUN{,2} #imm -------- */
12216 /* -------- 1,10001 SQRSHRUN{,2} #imm -------- */
12217 UInt size = 0;
12218 UInt shift = 0;
12219 Bool is2 = bitQ == 1;
12220 Bool ok = getLaneInfo_IMMH_IMMB(&shift, &size, immh, immb);
12221 if (!ok || size == X11) return False;
12222 vassert(shift >= 1 && shift <= (8 << size));
12223 const HChar* nm = "??";
12224 IROp op = Iop_INVALID;
12225 /* Decide on the name and the operation. */
12226 /**/ if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
12227 nm = "sqshrn"; op = mkVecQANDqsarNNARROWSS(size);
12229 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
12230 nm = "uqshrn"; op = mkVecQANDqshrNNARROWUU(size);
12232 else if (bitU == 0 && opcode == BITS5(1,0,0,1,1)) {
12233 nm = "sqrshrn"; op = mkVecQANDqrsarNNARROWSS(size);
12235 else if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
12236 nm = "uqrshrn"; op = mkVecQANDqrshrNNARROWUU(size);
12238 else if (bitU == 1 && opcode == BITS5(1,0,0,0,0)) {
12239 nm = "sqshrun"; op = mkVecQANDqsarNNARROWSU(size);
12241 else if (bitU == 1 && opcode == BITS5(1,0,0,0,1)) {
12242 nm = "sqrshrun"; op = mkVecQANDqrsarNNARROWSU(size);
12244 else vassert(0);
12245 /* Compute the result (Q, shifted value) pair. */
12246 IRTemp src128 = newTempV128();
12247 assign(src128, getQReg128(nn));
12248 IRTemp pair = newTempV128();
12249 assign(pair, binop(op, mkexpr(src128), mkU8(shift)));
12250 /* Update the result reg */
12251 IRTemp res64in128 = newTempV128();
12252 assign(res64in128, unop(Iop_ZeroHI64ofV128, mkexpr(pair)));
12253 putLO64andZUorPutHI64(is2, dd, res64in128);
12254 /* Update the Q flag. */
12255 IRTemp q64q64 = newTempV128();
12256 assign(q64q64, binop(Iop_InterleaveHI64x2, mkexpr(pair), mkexpr(pair)));
12257 IRTemp z128 = newTempV128();
12258 assign(z128, mkV128(0x0000));
12259 updateQCFLAGwithDifference(q64q64, z128);
12260 /* */
12261 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12262 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12263 DIP("%s %s.%s, %s.%s, #%u\n", nm,
12264 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide, shift);
12265 return True;
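      /* Reading of the code above (not the ARM ARM): each mkVecQANDq...NARROW
         op returns a packed pair in one V128 -- the narrowed lanes in the low
         64 bits and, in the high 64 bits, a value that is nonzero iff some
         lane saturated.  InterleaveHI64x2(pair, pair) copies that high half
         into both halves so it can be compared against zero to decide
         whether QC must be set. */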
12268 if (opcode == BITS5(1,0,1,0,0)) {
12269 /* -------- 0,10100 SSHLL{,2} #imm -------- */
12270 /* -------- 1,10100 USHLL{,2} #imm -------- */
12271 /* 31 28 22 18 15 9 4
12272 0q0 011110 immh immb 101001 n d SSHLL Vd.Ta, Vn.Tb, #sh
12273 0q1 011110 immh immb 101001 n d USHLL Vd.Ta, Vn.Tb, #sh
12274 where Ta,Tb,sh
12275 = case immh of 1xxx -> invalid
12276 01xx -> 2d, 2s(q0)/4s(q1), immh:immb - 32 (0..31)
12277 001x -> 4s, 4h(q0)/8h(q1), immh:immb - 16 (0..15)
12278 0001 -> 8h, 8b(q0)/16b(q1), immh:immb - 8 (0..7)
12279 0000 -> AdvSIMD modified immediate (???)
12281 Bool isQ = bitQ == 1;
12282 Bool isU = bitU == 1;
12283 UInt immhb = (immh << 3) | immb;
12284 IRTemp src = newTempV128();
12285 IRTemp zero = newTempV128();
12286 IRExpr* res = NULL;
12287 UInt sh = 0;
12288 const HChar* ta = "??";
12289 const HChar* tb = "??";
12290 assign(src, getQReg128(nn));
12291 assign(zero, mkV128(0x0000));
12292 if (immh & 8) {
12293 /* invalid; don't assign to res */
12295 else if (immh & 4) {
12296 sh = immhb - 32;
12297 vassert(sh < 32); /* so 32-sh is 1..32 */
12298 ta = "2d";
12299 tb = isQ ? "4s" : "2s";
12300 IRExpr* tmp = isQ ? mk_InterleaveHI32x4(src, zero)
12301 : mk_InterleaveLO32x4(src, zero);
12302 res = binop(isU ? Iop_ShrN64x2 : Iop_SarN64x2, tmp, mkU8(32-sh));
12304 else if (immh & 2) {
12305 sh = immhb - 16;
12306 vassert(sh < 16); /* so 16-sh is 1..16 */
12307 ta = "4s";
12308 tb = isQ ? "8h" : "4h";
12309 IRExpr* tmp = isQ ? mk_InterleaveHI16x8(src, zero)
12310 : mk_InterleaveLO16x8(src, zero);
12311 res = binop(isU ? Iop_ShrN32x4 : Iop_SarN32x4, tmp, mkU8(16-sh));
12313 else if (immh & 1) {
12314 sh = immhb - 8;
12315 vassert(sh < 8); /* so 8-sh is 1..8 */
12316 ta = "8h";
12317 tb = isQ ? "16b" : "8b";
12318 IRExpr* tmp = isQ ? mk_InterleaveHI8x16(src, zero)
12319 : mk_InterleaveLO8x16(src, zero);
12320 res = binop(isU ? Iop_ShrN16x8 : Iop_SarN16x8, tmp, mkU8(8-sh));
12321 } else {
12322 vassert(immh == 0);
12323 /* invalid; don't assign to res */
12325 /* */
12326 if (res) {
12327 putQReg128(dd, res);
12328 DIP("%cshll%s %s.%s, %s.%s, #%u\n",
12329 isU ? 'u' : 's', isQ ? "2" : "",
12330 nameQReg128(dd), ta, nameQReg128(nn), tb, sh);
12331 return True;
12333 return False;
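      /* The interleave-then-shift trick above widens and shifts in one step.
         Worked example (illustrative): SSHLL Vd.4s, Vn.4h, #3 interleaves
         the selected 16-bit source lanes with zero, so each value sits in
         the top half of a 32-bit lane, then arithmetic-shifts each 32-bit
         lane right by 16-3 = 13, which sign-extends the value and leaves it
         multiplied by 2^3.  The unsigned form uses a logical shift and so
         zero-extends instead. */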
12336 if (opcode == BITS5(1,1,1,0,0)) {
12337 /* -------- 0,11100 SCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
12338 /* -------- 1,11100 UCVTF {2d_2d,4s_4s,2s_2s}_imm -------- */
12339 /* If immh is of the form 00xx, the insn is invalid. */
12340 if (immh < BITS4(0,1,0,0)) return False;
12341 UInt size = 0;
12342 UInt fbits = 0;
12343 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
12344 /* The following holds because immh is never zero. */
12345 vassert(ok);
12346 /* The following holds because immh >= 0100. */
12347 vassert(size == X10 || size == X11);
12348 Bool isD = size == X11;
12349 Bool isU = bitU == 1;
12350 Bool isQ = bitQ == 1;
12351 if (isD && !isQ) return False; /* reject .1d case */
12352 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
12353 Double scale = two_to_the_minus(fbits);
12354 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
12355 : IRExpr_Const(IRConst_F32( (Float)scale ));
12356 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
12357 IROp opCVT = isU ? (isD ? Iop_I64UtoF64 : Iop_I32UtoF32)
12358 : (isD ? Iop_I64StoF64 : Iop_I32StoF32);
12359 IRType tyF = isD ? Ity_F64 : Ity_F32;
12360 IRType tyI = isD ? Ity_I64 : Ity_I32;
12361 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
12362 vassert(nLanes == 2 || nLanes == 4);
12363 for (UInt i = 0; i < nLanes; i++) {
12364 IRTemp src = newTemp(tyI);
12365 IRTemp res = newTemp(tyF);
12366 IRTemp rm = mk_get_IR_rounding_mode();
12367 assign(src, getQRegLane(nn, i, tyI));
12368 assign(res, triop(opMUL, mkexpr(rm),
12369 binop(opCVT, mkexpr(rm), mkexpr(src)),
12370 scaleE));
12371 putQRegLane(dd, i, mkexpr(res));
12373 if (!isQ) {
12374 putQRegLane(dd, 1, mkU64(0));
12376 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12377 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "ucvtf" : "scvtf",
12378 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
12379 return True;
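      /* Worked example (illustrative): UCVTF Vd.4s, Vn.4s, #8 converts each
         32-bit integer lane to single precision and then multiplies by the
         scale 2^-8, so a lane holding 300 becomes 300/256 = 1.171875. */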
12382 if (opcode == BITS5(1,1,1,1,1)) {
12383 /* -------- 0,11111 FCVTZS {2d_2d,4s_4s,2s_2s}_imm -------- */
12384 /* -------- 1,11111 FCVTZU {2d_2d,4s_4s,2s_2s}_imm -------- */
12385 /* If immh is of the form 00xx, the insn is invalid. */
12386 if (immh < BITS4(0,1,0,0)) return False;
12387 UInt size = 0;
12388 UInt fbits = 0;
12389 Bool ok = getLaneInfo_IMMH_IMMB(&fbits, &size, immh, immb);
12390 /* The following holds because immh is never zero. */
12391 vassert(ok);
12392 /* The following holds because immh >= 0100. */
12393 vassert(size == X10 || size == X11);
12394 Bool isD = size == X11;
12395 Bool isU = bitU == 1;
12396 Bool isQ = bitQ == 1;
12397 if (isD && !isQ) return False; /* reject .1d case */
12398 vassert(fbits >= 1 && fbits <= (isD ? 64 : 32));
12399 Double scale = two_to_the_plus(fbits);
12400 IRExpr* scaleE = isD ? IRExpr_Const(IRConst_F64(scale))
12401 : IRExpr_Const(IRConst_F32( (Float)scale ));
12402 IROp opMUL = isD ? Iop_MulF64 : Iop_MulF32;
12403 IROp opCVT = isU ? (isD ? Iop_F64toI64U : Iop_F32toI32U)
12404 : (isD ? Iop_F64toI64S : Iop_F32toI32S);
12405 IRType tyF = isD ? Ity_F64 : Ity_F32;
12406 IRType tyI = isD ? Ity_I64 : Ity_I32;
12407 UInt nLanes = (isQ ? 2 : 1) * (isD ? 1 : 2);
12408 vassert(nLanes == 2 || nLanes == 4);
12409 for (UInt i = 0; i < nLanes; i++) {
12410 IRTemp src = newTemp(tyF);
12411 IRTemp res = newTemp(tyI);
12412 IRTemp rm = newTemp(Ity_I32);
12413 assign(src, getQRegLane(nn, i, tyF));
12414 assign(rm, mkU32(Irrm_ZERO));
12415 assign(res, binop(opCVT, mkexpr(rm),
12416 triop(opMUL, mkexpr(rm),
12417 mkexpr(src), scaleE)));
12418 putQRegLane(dd, i, mkexpr(res));
12420 if (!isQ) {
12421 putQRegLane(dd, 1, mkU64(0));
12423 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12424 DIP("%s %s.%s, %s.%s, #%u\n", isU ? "fcvtzu" : "fcvtzs",
12425 nameQReg128(dd), arr, nameQReg128(nn), arr, fbits);
12426 return True;
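      /* Worked example (illustrative): FCVTZS Vd.4s, Vn.4s, #8 multiplies
         each lane by the scale 2^8 and converts with round-towards-zero, so
         a lane holding 1.171875 becomes trunc(1.171875 * 256) = 300. */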
12429 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12430 return False;
12431 # undef INSN
12435 static
12436 Bool dis_AdvSIMD_three_different(/*MB_OUT*/DisResult* dres, UInt insn)
12438 /* 31 30 29 28 23 21 20 15 11 9 4
12439 0 Q U 01110 size 1 m opcode 00 n d
12440 Decode fields: u,opcode
12442 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12443 if (INSN(31,31) != 0
12444 || INSN(28,24) != BITS5(0,1,1,1,0)
12445 || INSN(21,21) != 1
12446 || INSN(11,10) != BITS2(0,0)) {
12447 return False;
12449 UInt bitQ = INSN(30,30);
12450 UInt bitU = INSN(29,29);
12451 UInt size = INSN(23,22);
12452 UInt mm = INSN(20,16);
12453 UInt opcode = INSN(15,12);
12454 UInt nn = INSN(9,5);
12455 UInt dd = INSN(4,0);
12456 vassert(size < 4);
12457 Bool is2 = bitQ == 1;
12459 if (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,1,0)) {
12460 /* -------- 0,0000 SADDL{2} -------- */
12461 /* -------- 1,0000 UADDL{2} -------- */
12462 /* -------- 0,0010 SSUBL{2} -------- */
12463 /* -------- 1,0010 USUBL{2} -------- */
12464 /* Widens, and size refers to the narrow lanes. */
12465 if (size == X11) return False;
12466 vassert(size <= 2);
12467 Bool isU = bitU == 1;
12468 Bool isADD = opcode == BITS4(0,0,0,0);
12469 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
12470 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
12471 IRTemp res = newTempV128();
12472 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
12473 mkexpr(argL), mkexpr(argR)));
12474 putQReg128(dd, mkexpr(res));
12475 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12476 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12477 const HChar* nm = isADD ? (isU ? "uaddl" : "saddl")
12478 : (isU ? "usubl" : "ssubl");
12479 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
12480 nameQReg128(dd), arrWide,
12481 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
12482 return True;
12485 if (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,0,1,1)) {
12486 /* -------- 0,0001 SADDW{2} -------- */
12487 /* -------- 1,0001 UADDW{2} -------- */
12488 /* -------- 0,0011 SSUBW{2} -------- */
12489 /* -------- 1,0011 USUBW{2} -------- */
12490 /* Widens, and size refers to the narrow lanes. */
12491 if (size == X11) return False;
12492 vassert(size <= 2);
12493 Bool isU = bitU == 1;
12494 Bool isADD = opcode == BITS4(0,0,0,1);
12495 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
12496 IRTemp res = newTempV128();
12497 assign(res, binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
12498 getQReg128(nn), mkexpr(argR)));
12499 putQReg128(dd, mkexpr(res));
12500 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12501 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12502 const HChar* nm = isADD ? (isU ? "uaddw" : "saddw")
12503 : (isU ? "usubw" : "ssubw");
12504 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
12505 nameQReg128(dd), arrWide,
12506 nameQReg128(nn), arrWide, nameQReg128(mm), arrNarrow);
12507 return True;
12510 if (opcode == BITS4(0,1,0,0) || opcode == BITS4(0,1,1,0)) {
12511 /* -------- 0,0100 ADDHN{2} -------- */
12512 /* -------- 1,0100 RADDHN{2} -------- */
12513 /* -------- 0,0110 SUBHN{2} -------- */
12514 /* -------- 1,0110 RSUBHN{2} -------- */
12515 /* Narrows, and size refers to the narrowed lanes. */
12516 if (size == X11) return False;
12517 vassert(size <= 2);
12518 const UInt shift[3] = { 8, 16, 32 };
12519 Bool isADD = opcode == BITS4(0,1,0,0);
12520 Bool isR = bitU == 1;
12521 /* Combined elements in wide lanes */
12522 IRTemp wide = newTempV128();
12523 IRExpr* wideE = binop(isADD ? mkVecADD(size+1) : mkVecSUB(size+1),
12524 getQReg128(nn), getQReg128(mm));
12525 if (isR) {
12526 wideE = binop(mkVecADD(size+1),
12527 wideE,
12528 mkexpr(math_VEC_DUP_IMM(size+1,
12529 1ULL << (shift[size]-1))));
12531 assign(wide, wideE);
12532 /* Top halves of elements, still in wide lanes */
12533 IRTemp shrd = newTempV128();
12534 assign(shrd, binop(mkVecSHRN(size+1), mkexpr(wide), mkU8(shift[size])));
12535 /* Elements now compacted into lower 64 bits */
12536 IRTemp new64 = newTempV128();
12537 assign(new64, binop(mkVecCATEVENLANES(size), mkexpr(shrd), mkexpr(shrd)));
12538 putLO64andZUorPutHI64(is2, dd, new64);
12539 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12540 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12541 const HChar* nm = isADD ? (isR ? "raddhn" : "addhn")
12542 : (isR ? "rsubhn" : "subhn");
12543 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
12544 nameQReg128(dd), arrNarrow,
12545 nameQReg128(nn), arrWide, nameQReg128(mm), arrWide);
12546 return True;
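      /* Worked example (illustrative): ADDHN Vd.8b, Vn.8h, Vm.8h adds the
         16-bit lanes and keeps only the top byte of each sum, so
         0x1234 + 0x0F00 = 0x2134 narrows to 0x21.  RADDHN adds the rounding
         constant 1 << 7 = 0x80 first, so 0x1234 + 0x0FC0 = 0x21F4 rounds up
         and narrows to 0x22 instead of 0x21. */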
12549 if (opcode == BITS4(0,1,0,1) || opcode == BITS4(0,1,1,1)) {
12550 /* -------- 0,0101 SABAL{2} -------- */
12551 /* -------- 1,0101 UABAL{2} -------- */
12552 /* -------- 0,0111 SABDL{2} -------- */
12553 /* -------- 1,0111 UABDL{2} -------- */
12554 /* Widens, and size refers to the narrow lanes. */
12555 if (size == X11) return False;
12556 vassert(size <= 2);
12557 Bool isU = bitU == 1;
12558 Bool isACC = opcode == BITS4(0,1,0,1);
12559 IRTemp argL = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(nn));
12560 IRTemp argR = math_WIDEN_LO_OR_HI_LANES(isU, is2, size, getQReg128(mm));
12561 IRTemp abd = math_ABD(isU, size+1, mkexpr(argL), mkexpr(argR));
12562 IRTemp res = newTempV128();
12563 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(abd), getQReg128(dd))
12564 : mkexpr(abd));
12565 putQReg128(dd, mkexpr(res));
12566 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12567 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12568 const HChar* nm = isACC ? (isU ? "uabal" : "sabal")
12569 : (isU ? "uabdl" : "sabdl");
12570 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
12571 nameQReg128(dd), arrWide,
12572 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
12573 return True;
12576 if (opcode == BITS4(1,1,0,0)
12577 || opcode == BITS4(1,0,0,0) || opcode == BITS4(1,0,1,0)) {
12578 /* -------- 0,1100 SMULL{2} -------- */ // 0 (ks)
12579 /* -------- 1,1100 UMULL{2} -------- */ // 0
12580 /* -------- 0,1000 SMLAL{2} -------- */ // 1
12581 /* -------- 1,1000 UMLAL{2} -------- */ // 1
12582 /* -------- 0,1010 SMLSL{2} -------- */ // 2
12583 /* -------- 1,1010 UMLSL{2} -------- */ // 2
12584 /* Widens, and size refers to the narrow lanes. */
12585 UInt ks = 3;
12586 switch (opcode) {
12587 case BITS4(1,1,0,0): ks = 0; break;
12588 case BITS4(1,0,0,0): ks = 1; break;
12589 case BITS4(1,0,1,0): ks = 2; break;
12590 default: vassert(0);
12592 vassert(ks <= 2);
12593 if (size == X11) return False;
12594 vassert(size <= 2);
12595 Bool isU = bitU == 1;
12596 IRTemp vecN = newTempV128();
12597 IRTemp vecM = newTempV128();
12598 IRTemp vecD = newTempV128();
12599 assign(vecN, getQReg128(nn));
12600 assign(vecM, getQReg128(mm));
12601 assign(vecD, getQReg128(dd));
12602 IRTemp res = IRTemp_INVALID;
12603 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
12604 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
12605 putQReg128(dd, mkexpr(res));
12606 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12607 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12608 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
12609 DIP("%c%s%s %s.%s, %s.%s, %s.%s\n", isU ? 'u' : 's', nm, is2 ? "2" : "",
12610 nameQReg128(dd), arrWide,
12611 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
12612 return True;
12615 if (bitU == 0
12616 && (opcode == BITS4(1,1,0,1)
12617 || opcode == BITS4(1,0,0,1) || opcode == BITS4(1,0,1,1))) {
12618 /* -------- 0,1101 SQDMULL{2} -------- */ // 0 (ks)
12619 /* -------- 0,1001 SQDMLAL{2} -------- */ // 1
12620 /* -------- 0,1011 SQDMLSL{2} -------- */ // 2
12621 /* Widens, and size refers to the narrow lanes. */
12622 UInt ks = 3;
12623 switch (opcode) {
12624 case BITS4(1,1,0,1): ks = 0; break;
12625 case BITS4(1,0,0,1): ks = 1; break;
12626 case BITS4(1,0,1,1): ks = 2; break;
12627 default: vassert(0);
12629 vassert(ks <= 2);
12630 if (size == X00 || size == X11) return False;
12631 vassert(size <= 2);
12632 IRTemp vecN, vecM, vecD, res, sat1q, sat1n, sat2q, sat2n;
12633 vecN = vecM = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
12634 newTempsV128_3(&vecN, &vecM, &vecD);
12635 assign(vecN, getQReg128(nn));
12636 assign(vecM, getQReg128(mm));
12637 assign(vecD, getQReg128(dd));
12638 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
12639 is2, size, "mas"[ks],
12640 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
12641 putQReg128(dd, mkexpr(res));
12642 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
12643 updateQCFLAGwithDifference(sat1q, sat1n);
12644 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
12645 updateQCFLAGwithDifference(sat2q, sat2n);
12647 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
12648 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
12649 const HChar* nm = ks == 0 ? "sqdmull"
12650 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
12651 DIP("%s%s %s.%s, %s.%s, %s.%s\n", nm, is2 ? "2" : "",
12652 nameQReg128(dd), arrWide,
12653 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
12654 return True;
12657 if (bitU == 0 && opcode == BITS4(1,1,1,0)) {
12658 /* -------- 0,1110 PMULL{2} -------- */
12659 /* Widens, and size refers to the narrow lanes. */
12660 if (size != X00 && size != X11) return False;
12661 IRTemp res = IRTemp_INVALID;
12662 IRExpr* srcN = getQReg128(nn);
12663 IRExpr* srcM = getQReg128(mm);
12664 const HChar* arrNarrow = NULL;
12665 const HChar* arrWide = NULL;
12666 if (size == X00) {
12667 res = math_BINARY_WIDENING_V128(is2, Iop_PolynomialMull8x8,
12668 srcN, srcM);
12669 arrNarrow = nameArr_Q_SZ(bitQ, size);
12670 arrWide = nameArr_Q_SZ(1, size+1);
12671 } else {
12672 /* The same thing as the X00 case, except we have to call
12673 a helper to do it. */
12674 vassert(size == X11);
12675 res = newTemp(Ity_V128);
12676 IROp slice
12677 = is2 ? Iop_V128HIto64 : Iop_V128to64;
12678 IRExpr** args
12679 = mkIRExprVec_3( IRExpr_VECRET(),
12680 unop(slice, srcN), unop(slice, srcM));
12681 IRDirty* di
12682 = unsafeIRDirty_1_N( res, 0/*regparms*/,
12683 "arm64g_dirtyhelper_PMULLQ",
12684 &arm64g_dirtyhelper_PMULLQ, args);
12685 stmt(IRStmt_Dirty(di));
12686 /* We can't use nameArr_Q_SZ for this because it can't deal with
12687 Q-sized (128 bit) results. Hence do it by hand. */
12688 arrNarrow = bitQ == 0 ? "1d" : "2d";
12689 arrWide = "1q";
12691 putQReg128(dd, mkexpr(res));
12692 DIP("%s%s %s.%s, %s.%s, %s.%s\n", "pmull", is2 ? "2" : "",
12693 nameQReg128(dd), arrWide,
12694 nameQReg128(nn), arrNarrow, nameQReg128(mm), arrNarrow);
12695 return True;
12698 return False;
12699 # undef INSN
12703 static
12704 Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
12706 /* 31 30 29 28 23 21 20 15 10 9 4
12707 0 Q U 01110 size 1 m opcode 1 n d
12708 Decode fields: u,size,opcode
12710 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
12711 if (INSN(31,31) != 0
12712 || INSN(28,24) != BITS5(0,1,1,1,0)
12713 || INSN(21,21) != 1
12714 || INSN(10,10) != 1) {
12715 return False;
12717 UInt bitQ = INSN(30,30);
12718 UInt bitU = INSN(29,29);
12719 UInt size = INSN(23,22);
12720 UInt mm = INSN(20,16);
12721 UInt opcode = INSN(15,11);
12722 UInt nn = INSN(9,5);
12723 UInt dd = INSN(4,0);
12724 vassert(size < 4);
12726 if (opcode == BITS5(0,0,0,0,0) || opcode == BITS5(0,0,1,0,0)) {
12727 /* -------- 0,xx,00000 SHADD std6_std6_std6 -------- */
12728 /* -------- 1,xx,00000 UHADD std6_std6_std6 -------- */
12729 /* -------- 0,xx,00100 SHSUB std6_std6_std6 -------- */
12730 /* -------- 1,xx,00100 UHSUB std6_std6_std6 -------- */
12731 if (size == X11) return False;
12732 Bool isADD = opcode == BITS5(0,0,0,0,0);
12733 Bool isU = bitU == 1;
12734 /* Widen both args out, do the math, narrow to final result. */
12735 IRTemp argL = newTempV128();
12736 IRTemp argLhi = IRTemp_INVALID;
12737 IRTemp argLlo = IRTemp_INVALID;
12738 IRTemp argR = newTempV128();
12739 IRTemp argRhi = IRTemp_INVALID;
12740 IRTemp argRlo = IRTemp_INVALID;
12741 IRTemp resHi = newTempV128();
12742 IRTemp resLo = newTempV128();
12743 IRTemp res = IRTemp_INVALID;
12744 assign(argL, getQReg128(nn));
12745 argLlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argL));
12746 argLhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argL));
12747 assign(argR, getQReg128(mm));
12748 argRlo = math_WIDEN_LO_OR_HI_LANES(isU, False, size, mkexpr(argR));
12749 argRhi = math_WIDEN_LO_OR_HI_LANES(isU, True, size, mkexpr(argR));
12750 IROp opADDSUB = isADD ? mkVecADD(size+1) : mkVecSUB(size+1);
12751 IROp opSxR = isU ? mkVecSHRN(size+1) : mkVecSARN(size+1);
12752 assign(resHi, binop(opSxR,
12753 binop(opADDSUB, mkexpr(argLhi), mkexpr(argRhi)),
12754 mkU8(1)));
12755 assign(resLo, binop(opSxR,
12756 binop(opADDSUB, mkexpr(argLlo), mkexpr(argRlo)),
12757 mkU8(1)));
12758 res = math_NARROW_LANES ( resHi, resLo, size );
12759 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12760 const HChar* nm = isADD ? (isU ? "uhadd" : "shadd")
12761 : (isU ? "uhsub" : "shsub");
12762 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12763 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12764 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12765 return True;
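      /* Worked example (illustrative): SHADD Vd.8b, Vn.8b, Vm.8b on lanes
         holding 0x7F and 0x7F widens both to 16 bits, adds to get 0x00FE,
         shifts right by one to 0x007F and narrows back to 0x7F -- the
         widening step is what keeps the intermediate sum from overflowing. */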
12768 if (opcode == BITS5(0,0,0,1,0)) {
12769 /* -------- 0,xx,00010 SRHADD std7_std7_std7 -------- */
12770 /* -------- 1,xx,00010 URHADD std7_std7_std7 -------- */
12771 if (bitQ == 0 && size == X11) return False; // implied 1d case
12772 Bool isU = bitU == 1;
12773 IRTemp argL = newTempV128();
12774 IRTemp argR = newTempV128();
12775 assign(argL, getQReg128(nn));
12776 assign(argR, getQReg128(mm));
12777 IRTemp res = math_RHADD(size, isU, argL, argR);
12778 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12779 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12780 DIP("%s %s.%s, %s.%s, %s.%s\n", isU ? "urhadd" : "srhadd",
12781 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12782 return True;
12785 if (opcode == BITS5(0,0,0,0,1) || opcode == BITS5(0,0,1,0,1)) {
12786 /* -------- 0,xx,00001 SQADD std7_std7_std7 -------- */
12787 /* -------- 1,xx,00001 UQADD std7_std7_std7 -------- */
12788 /* -------- 0,xx,00101 SQSUB std7_std7_std7 -------- */
12789 /* -------- 1,xx,00101 UQSUB std7_std7_std7 -------- */
12790 if (bitQ == 0 && size == X11) return False; // implied 1d case
12791 Bool isADD = opcode == BITS5(0,0,0,0,1);
12792 Bool isU = bitU == 1;
12793 IROp qop = Iop_INVALID;
12794 IROp nop = Iop_INVALID;
12795 if (isADD) {
12796 qop = isU ? mkVecQADDU(size) : mkVecQADDS(size);
12797 nop = mkVecADD(size);
12798 } else {
12799 qop = isU ? mkVecQSUBU(size) : mkVecQSUBS(size);
12800 nop = mkVecSUB(size);
12802 IRTemp argL = newTempV128();
12803 IRTemp argR = newTempV128();
12804 IRTemp qres = newTempV128();
12805 IRTemp nres = newTempV128();
12806 assign(argL, getQReg128(nn));
12807 assign(argR, getQReg128(mm));
12808 assign(qres, math_MAYBE_ZERO_HI64_fromE(
12809 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
12810 assign(nres, math_MAYBE_ZERO_HI64_fromE(
12811 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
12812 putQReg128(dd, mkexpr(qres));
12813 updateQCFLAGwithDifference(qres, nres);
12814 const HChar* nm = isADD ? (isU ? "uqadd" : "sqadd")
12815 : (isU ? "uqsub" : "sqsub");
12816 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12817 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12818 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12819 return True;
12822 if (bitU == 0 && opcode == BITS5(0,0,0,1,1)) {
12823 /* -------- 0,00,00011 AND 16b_16b_16b, 8b_8b_8b -------- */
12824 /* -------- 0,01,00011 BIC 16b_16b_16b, 8b_8b_8b -------- */
12825 /* -------- 0,10,00011 ORR 16b_16b_16b, 8b_8b_8b -------- */
12826       /* -------- 0,11,00011 ORN 16b_16b_16b, 8b_8b_8b -------- */
12827 Bool isORx = (size & 2) == 2;
12828 Bool invert = (size & 1) == 1;
12829 IRTemp res = newTempV128();
12830 assign(res, binop(isORx ? Iop_OrV128 : Iop_AndV128,
12831 getQReg128(nn),
12832 invert ? unop(Iop_NotV128, getQReg128(mm))
12833 : getQReg128(mm)));
12834 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12835 const HChar* names[4] = { "and", "bic", "orr", "orn" };
12836 const HChar* ar = bitQ == 1 ? "16b" : "8b";
12837 DIP("%s %s.%s, %s.%s, %s.%s\n", names[INSN(23,22)],
12838 nameQReg128(dd), ar, nameQReg128(nn), ar, nameQReg128(mm), ar);
12839 return True;
12842 if (bitU == 1 && opcode == BITS5(0,0,0,1,1)) {
12843 /* -------- 1,00,00011 EOR 16b_16b_16b, 8b_8b_8b -------- */
12844 /* -------- 1,01,00011 BSL 16b_16b_16b, 8b_8b_8b -------- */
12845 /* -------- 1,10,00011 BIT 16b_16b_16b, 8b_8b_8b -------- */
12846       /* -------- 1,11,00011 BIF 16b_16b_16b, 8b_8b_8b -------- */
12847 IRTemp argD = newTempV128();
12848 IRTemp argN = newTempV128();
12849 IRTemp argM = newTempV128();
12850 assign(argD, getQReg128(dd));
12851 assign(argN, getQReg128(nn));
12852 assign(argM, getQReg128(mm));
12853 const IROp opXOR = Iop_XorV128;
12854 const IROp opAND = Iop_AndV128;
12855 const IROp opNOT = Iop_NotV128;
12856 IRTemp res = newTempV128();
12857 switch (size) {
12858 case BITS2(0,0): /* EOR */
12859 assign(res, binop(opXOR, mkexpr(argM), mkexpr(argN)));
12860 break;
12861 case BITS2(0,1): /* BSL */
12862 assign(res, binop(opXOR, mkexpr(argM),
12863 binop(opAND,
12864 binop(opXOR, mkexpr(argM), mkexpr(argN)),
12865 mkexpr(argD))));
12866 break;
12867 case BITS2(1,0): /* BIT */
12868 assign(res, binop(opXOR, mkexpr(argD),
12869 binop(opAND,
12870 binop(opXOR, mkexpr(argD), mkexpr(argN)),
12871 mkexpr(argM))));
12872 break;
12873 case BITS2(1,1): /* BIF */
12874 assign(res, binop(opXOR, mkexpr(argD),
12875 binop(opAND,
12876 binop(opXOR, mkexpr(argD), mkexpr(argN)),
12877 unop(opNOT, mkexpr(argM)))));
12878 break;
12879 default:
12880 vassert(0);
12882 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12883 const HChar* nms[4] = { "eor", "bsl", "bit", "bif" };
12884 const HChar* arr = bitQ == 1 ? "16b" : "8b";
12885 DIP("%s %s.%s, %s.%s, %s.%s\n", nms[size],
12886 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12887 return True;
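      /* The XOR/AND forms above are the usual branch-free bit-select
         identities.  For BSL, argM ^ ((argM ^ argN) & argD) equals
         (argD & argN) | (~argD & argM): destination bits that are 1 select
         from Vn, bits that are 0 select from Vm.  BIT and BIF are the same
         identity with Vd as the data and Vm (respectively ~Vm) as the
         selector. */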
12890 if (opcode == BITS5(0,0,1,1,0)) {
12891 /* -------- 0,xx,00110 CMGT std7_std7_std7 -------- */ // >s
12892 /* -------- 1,xx,00110 CMHI std7_std7_std7 -------- */ // >u
12893 if (bitQ == 0 && size == X11) return False; // implied 1d case
12894 Bool isGT = bitU == 0;
12895 IRExpr* argL = getQReg128(nn);
12896 IRExpr* argR = getQReg128(mm);
12897 IRTemp res = newTempV128();
12898 assign(res,
12899 isGT ? binop(mkVecCMPGTS(size), argL, argR)
12900 : binop(mkVecCMPGTU(size), argL, argR));
12901 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12902 const HChar* nm = isGT ? "cmgt" : "cmhi";
12903 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12904 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12905 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12906 return True;
12909 if (opcode == BITS5(0,0,1,1,1)) {
12910 /* -------- 0,xx,00111 CMGE std7_std7_std7 -------- */ // >=s
12911 /* -------- 1,xx,00111 CMHS std7_std7_std7 -------- */ // >=u
12912 if (bitQ == 0 && size == X11) return False; // implied 1d case
12913 Bool isGE = bitU == 0;
12914 IRExpr* argL = getQReg128(nn);
12915 IRExpr* argR = getQReg128(mm);
12916 IRTemp res = newTempV128();
12917 assign(res,
12918 isGE ? unop(Iop_NotV128, binop(mkVecCMPGTS(size), argR, argL))
12919 : unop(Iop_NotV128, binop(mkVecCMPGTU(size), argR, argL)));
12920 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12921 const HChar* nm = isGE ? "cmge" : "cmhs";
12922 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12923 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12924 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12925 return True;
12928 if (opcode == BITS5(0,1,0,0,0) || opcode == BITS5(0,1,0,1,0)) {
12929 /* -------- 0,xx,01000 SSHL std7_std7_std7 -------- */
12930 /* -------- 0,xx,01010 SRSHL std7_std7_std7 -------- */
12931 /* -------- 1,xx,01000 USHL std7_std7_std7 -------- */
12932 /* -------- 1,xx,01010 URSHL std7_std7_std7 -------- */
12933 if (bitQ == 0 && size == X11) return False; // implied 1d case
12934 Bool isU = bitU == 1;
12935 Bool isR = opcode == BITS5(0,1,0,1,0);
12936 IROp op = isR ? (isU ? mkVecRSHU(size) : mkVecRSHS(size))
12937 : (isU ? mkVecSHU(size) : mkVecSHS(size));
12938 IRTemp res = newTempV128();
12939 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
12940 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
12941 const HChar* nm = isR ? (isU ? "urshl" : "srshl")
12942 : (isU ? "ushl" : "sshl");
12943 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12944 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12945 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12946 return True;
12949 if (opcode == BITS5(0,1,0,0,1) || opcode == BITS5(0,1,0,1,1)) {
12950 /* -------- 0,xx,01001 SQSHL std7_std7_std7 -------- */
12951 /* -------- 0,xx,01011 SQRSHL std7_std7_std7 -------- */
12952 /* -------- 1,xx,01001 UQSHL std7_std7_std7 -------- */
12953 /* -------- 1,xx,01011 UQRSHL std7_std7_std7 -------- */
12954 if (bitQ == 0 && size == X11) return False; // implied 1d case
12955 Bool isU = bitU == 1;
12956 Bool isR = opcode == BITS5(0,1,0,1,1);
12957 IROp op = isR ? (isU ? mkVecQANDUQRSH(size) : mkVecQANDSQRSH(size))
12958 : (isU ? mkVecQANDUQSH(size) : mkVecQANDSQSH(size));
12959 /* This is a bit tricky. If we're only interested in the lowest 64 bits
12960 of the result (viz, bitQ == 0), then we must adjust the operands to
12961 ensure that the upper part of the result, that we don't care about,
12962 doesn't pollute the returned Q value. To do this, zero out the upper
12963 operand halves beforehand. This works because it means, for the
12964 lanes we don't care about, we are shifting zero by zero, which can
12965 never saturate. */
12966 IRTemp res256 = newTemp(Ity_V256);
12967 IRTemp resSH = newTempV128();
12968 IRTemp resQ = newTempV128();
12969 IRTemp zero = newTempV128();
12970 assign(res256, binop(op,
12971 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn)),
12972 math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(mm))));
12973 assign(resSH, unop(Iop_V256toV128_0, mkexpr(res256)));
12974 assign(resQ, unop(Iop_V256toV128_1, mkexpr(res256)));
12975 assign(zero, mkV128(0x0000));
12976 putQReg128(dd, mkexpr(resSH));
12977 updateQCFLAGwithDifference(resQ, zero);
12978 const HChar* nm = isR ? (isU ? "uqrshl" : "sqrshl")
12979 : (isU ? "uqshl" : "sqshl");
12980 const HChar* arr = nameArr_Q_SZ(bitQ, size);
12981 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
12982 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
12983 return True;
12986 if (opcode == BITS5(0,1,1,0,0) || opcode == BITS5(0,1,1,0,1)) {
12987 /* -------- 0,xx,01100 SMAX std7_std7_std7 -------- */
12988 /* -------- 1,xx,01100 UMAX std7_std7_std7 -------- */
12989 /* -------- 0,xx,01101 SMIN std7_std7_std7 -------- */
12990 /* -------- 1,xx,01101 UMIN std7_std7_std7 -------- */
12991 if (bitQ == 0 && size == X11) return False; // implied 1d case
12992 Bool isU = bitU == 1;
12993 Bool isMAX = (opcode & 1) == 0;
12994 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
12995 : (isU ? mkVecMINU(size) : mkVecMINS(size));
12996 IRTemp t = newTempV128();
12997 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
12998 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
12999 const HChar* nm = isMAX ? (isU ? "umax" : "smax")
13000 : (isU ? "umin" : "smin");
13001 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13002 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
13003 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13004 return True;
13007 if (opcode == BITS5(0,1,1,1,0) || opcode == BITS5(0,1,1,1,1)) {
13008 /* -------- 0,xx,01110 SABD std6_std6_std6 -------- */
13009 /* -------- 1,xx,01110 UABD std6_std6_std6 -------- */
13010 /* -------- 0,xx,01111 SABA std6_std6_std6 -------- */
13011 /* -------- 1,xx,01111 UABA std6_std6_std6 -------- */
13012 if (size == X11) return False; // 1d/2d cases not allowed
13013 Bool isU = bitU == 1;
13014 Bool isACC = opcode == BITS5(0,1,1,1,1);
13015 vassert(size <= 2);
13016 IRTemp t1 = math_ABD(isU, size, getQReg128(nn), getQReg128(mm));
13017 IRTemp t2 = newTempV128();
13018 assign(t2, isACC ? binop(mkVecADD(size), mkexpr(t1), getQReg128(dd))
13019 : mkexpr(t1));
13020 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
13021 const HChar* nm = isACC ? (isU ? "uaba" : "saba")
13022 : (isU ? "uabd" : "sabd");
13023 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13024 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
13025 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13026 return True;
13029 if (opcode == BITS5(1,0,0,0,0)) {
13030 /* -------- 0,xx,10000 ADD std7_std7_std7 -------- */
13031 /* -------- 1,xx,10000 SUB std7_std7_std7 -------- */
13032 if (bitQ == 0 && size == X11) return False; // implied 1d case
13033 Bool isSUB = bitU == 1;
13034 IROp op = isSUB ? mkVecSUB(size) : mkVecADD(size);
13035 IRTemp t = newTempV128();
13036 assign(t, binop(op, getQReg128(nn), getQReg128(mm)));
13037 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t));
13038 const HChar* nm = isSUB ? "sub" : "add";
13039 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13040 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
13041 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13042 return True;
13045 if (opcode == BITS5(1,0,0,0,1)) {
13046 /* -------- 0,xx,10001 CMTST std7_std7_std7 -------- */ // &, != 0
13047 /* -------- 1,xx,10001 CMEQ std7_std7_std7 -------- */ // ==
13048 if (bitQ == 0 && size == X11) return False; // implied 1d case
13049 Bool isEQ = bitU == 1;
13050 IRExpr* argL = getQReg128(nn);
13051 IRExpr* argR = getQReg128(mm);
13052 IRTemp res = newTempV128();
13053 assign(res,
13054 isEQ ? binop(mkVecCMPEQ(size), argL, argR)
13055 : unop(Iop_NotV128, binop(mkVecCMPEQ(size),
13056 binop(Iop_AndV128, argL, argR),
13057 mkV128(0x0000))));
13058 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13059 const HChar* nm = isEQ ? "cmeq" : "cmtst";
13060 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13061 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
13062 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13063 return True;
13066 if (opcode == BITS5(1,0,0,1,0)) {
13067 /* -------- 0,xx,10010 MLA std7_std7_std7 -------- */
13068 /* -------- 1,xx,10010 MLS std7_std7_std7 -------- */
13069 if (bitQ == 0 && size == X11) return False; // implied 1d case
13070 Bool isMLS = bitU == 1;
13071 IROp opMUL = mkVecMUL(size);
13072 IROp opADDSUB = isMLS ? mkVecSUB(size) : mkVecADD(size);
13073 IRTemp res = newTempV128();
13074 if (opMUL != Iop_INVALID && opADDSUB != Iop_INVALID) {
13075 assign(res, binop(opADDSUB,
13076 getQReg128(dd),
13077 binop(opMUL, getQReg128(nn), getQReg128(mm))));
13078 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13079 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13080 DIP("%s %s.%s, %s.%s, %s.%s\n", isMLS ? "mls" : "mla",
13081 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13082 return True;
13084 return False;
13087 if (opcode == BITS5(1,0,0,1,1)) {
13088 /* -------- 0,xx,10011 MUL std7_std7_std7 -------- */
13089 /* -------- 1,xx,10011 PMUL 16b_16b_16b, 8b_8b_8b -------- */
13090 if (bitQ == 0 && size == X11) return False; // implied 1d case
13091 Bool isPMUL = bitU == 1;
13092 const IROp opsPMUL[4]
13093 = { Iop_PolynomialMul8x16, Iop_INVALID, Iop_INVALID, Iop_INVALID };
13094 IROp opMUL = isPMUL ? opsPMUL[size] : mkVecMUL(size);
13095 IRTemp res = newTempV128();
13096 if (opMUL != Iop_INVALID) {
13097 assign(res, binop(opMUL, getQReg128(nn), getQReg128(mm)));
13098 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13099 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13100 DIP("%s %s.%s, %s.%s, %s.%s\n", isPMUL ? "pmul" : "mul",
13101 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13102 return True;
13104 return False;
13107 if (opcode == BITS5(1,0,1,0,0) || opcode == BITS5(1,0,1,0,1)) {
13108 /* -------- 0,xx,10100 SMAXP std6_std6_std6 -------- */
13109 /* -------- 1,xx,10100 UMAXP std6_std6_std6 -------- */
13110 /* -------- 0,xx,10101 SMINP std6_std6_std6 -------- */
13111 /* -------- 1,xx,10101 UMINP std6_std6_std6 -------- */
13112 if (size == X11) return False;
13113 Bool isU = bitU == 1;
13114 Bool isMAX = opcode == BITS5(1,0,1,0,0);
13115 IRTemp vN = newTempV128();
13116 IRTemp vM = newTempV128();
13117 IROp op = isMAX ? (isU ? mkVecMAXU(size) : mkVecMAXS(size))
13118 : (isU ? mkVecMINU(size) : mkVecMINS(size));
13119 assign(vN, getQReg128(nn));
13120 assign(vM, getQReg128(mm));
13121 IRTemp res128 = newTempV128();
13122 assign(res128,
13123 binop(op,
13124 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
13125 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
13126 /* In the half-width case, use CatEL32x4 to extract the half-width
13127 result from the full-width result. */
13128 IRExpr* res
13129 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
13130 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
13131 mkexpr(res128)))
13132 : mkexpr(res128);
13133 putQReg128(dd, res);
13134 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13135 const HChar* nm = isMAX ? (isU ? "umaxp" : "smaxp")
13136 : (isU ? "uminp" : "sminp");
13137 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
13138 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13139 return True;
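      /* The CatEven/CatOdd pairing above builds the pairwise result without
         a dedicated pairwise IROp.  Worked example (illustrative), 32-bit
         lanes, reading lanes high to low with Vn = [n3 n2 n1 n0] and
         Vm = [m3 m2 m1 m0]:
            even lanes of (Vm,Vn) = [m2 m0 n2 n0]
            odd  lanes of (Vm,Vn) = [m3 m1 n3 n1]
         so the lane-wise min/max of the two is
            [op(m3,m2) op(m1,m0) op(n3,n2) op(n1,n0)]
         which is exactly the layout SMAXP/SMINP/UMAXP/UMINP require: Vn
         pairs in the low half, Vm pairs in the high half. */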
13142 if (opcode == BITS5(1,0,1,1,0)) {
13143 /* -------- 0,xx,10110 SQDMULH s and h variants only -------- */
13144 /* -------- 1,xx,10110 SQRDMULH s and h variants only -------- */
13145 if (size == X00 || size == X11) return False;
13146 Bool isR = bitU == 1;
13147 IRTemp res, sat1q, sat1n, vN, vM;
13148 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
13149 newTempsV128_2(&vN, &vM);
13150 assign(vN, getQReg128(nn));
13151 assign(vM, getQReg128(mm));
13152 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
13153 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13154 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13155 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
13156 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13157 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
13158 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
13159 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13160 return True;
13163 if (bitU == 0 && opcode == BITS5(1,0,1,1,1)) {
13164 /* -------- 0,xx,10111 ADDP std7_std7_std7 -------- */
13165 if (bitQ == 0 && size == X11) return False; // implied 1d case
13166 IRTemp vN = newTempV128();
13167 IRTemp vM = newTempV128();
13168 assign(vN, getQReg128(nn));
13169 assign(vM, getQReg128(mm));
13170 IRTemp res128 = newTempV128();
13171 assign(res128,
13172 binop(mkVecADD(size),
13173 binop(mkVecCATEVENLANES(size), mkexpr(vM), mkexpr(vN)),
13174 binop(mkVecCATODDLANES(size), mkexpr(vM), mkexpr(vN))));
13175 /* In the half-width case, use CatEL32x4 to extract the half-width
13176 result from the full-width result. */
13177 IRExpr* res
13178 = bitQ == 0 ? unop(Iop_ZeroHI64ofV128,
13179 binop(Iop_CatEvenLanes32x4, mkexpr(res128),
13180 mkexpr(res128)))
13181 : mkexpr(res128);
13182 putQReg128(dd, res);
13183 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13184 DIP("addp %s.%s, %s.%s, %s.%s\n",
13185 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13186 return True;
13189 if (bitU == 0
13190 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
13191 /* -------- 0,0x,11000 FMAXNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13192 /* -------- 0,1x,11000 FMINNM 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13193 /* -------- 0,0x,11110 FMAX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13194 /* -------- 0,1x,11110 FMIN 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13195 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
13196 Bool isD = (size & 1) == 1;
13197 if (bitQ == 0 && isD) return False; // implied 1d case
13198 Bool isMIN = (size & 2) == 2;
13199 Bool isNM = opcode == BITS5(1,1,0,0,0);
13200 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? X11 : X10);
13201 IRTemp res = newTempV128();
13202 assign(res, binop(opMXX, getQReg128(nn), getQReg128(mm)));
13203 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13204 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13205 DIP("%s%s %s.%s, %s.%s, %s.%s\n",
13206 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
13207 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13208 return True;
13211 if (bitU == 0 && opcode == BITS5(1,1,0,0,1)) {
13212 /* -------- 0,0x,11001 FMLA 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13213 /* -------- 0,1x,11001 FMLS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13214 Bool isD = (size & 1) == 1;
13215 Bool isSUB = (size & 2) == 2;
13216 if (bitQ == 0 && isD) return False; // implied 1d case
13217 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
13218 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
13219 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
13220 IRTemp rm = mk_get_IR_rounding_mode();
13221 IRTemp t1 = newTempV128();
13222 IRTemp t2 = newTempV128();
13223 // FIXME: double rounding; use FMA primops instead
13224 assign(t1, triop(opMUL,
13225 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
13226 assign(t2, triop(isSUB ? opSUB : opADD,
13227 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
13228 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
13229 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13230 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
13231 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13232 return True;
13235 if (bitU == 0 && opcode == BITS5(1,1,0,1,0)) {
13236 /* -------- 0,0x,11010 FADD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13237 /* -------- 0,1x,11010 FSUB 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13238 Bool isD = (size & 1) == 1;
13239 Bool isSUB = (size & 2) == 2;
13240 if (bitQ == 0 && isD) return False; // implied 1d case
13241 const IROp ops[4]
13242 = { Iop_Add32Fx4, Iop_Add64Fx2, Iop_Sub32Fx4, Iop_Sub64Fx2 };
13243 IROp op = ops[size];
13244 IRTemp rm = mk_get_IR_rounding_mode();
13245 IRTemp t1 = newTempV128();
13246 IRTemp t2 = newTempV128();
13247 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
13248 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
13249 putQReg128(dd, mkexpr(t2));
13250 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13251 DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fsub" : "fadd",
13252 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13253 return True;
13256 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,0,1,0)) {
13257 /* -------- 1,1x,11010 FABD 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13258 Bool isD = (size & 1) == 1;
13259 if (bitQ == 0 && isD) return False; // implied 1d case
13260 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
13261 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
13262 IRTemp rm = mk_get_IR_rounding_mode();
13263 IRTemp t1 = newTempV128();
13264 IRTemp t2 = newTempV128();
13265 // FIXME: use Abd primop instead?
13266 assign(t1, triop(opSUB, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
13267 assign(t2, unop(opABS, mkexpr(t1)));
13268 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
13269 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13270 DIP("fabd %s.%s, %s.%s, %s.%s\n",
13271 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13272 return True;
13275 if (size <= X01 && opcode == BITS5(1,1,0,1,1)) {
13276 /* -------- 0,0x,11011 FMULX 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13277 /* -------- 1,0x,11011 FMUL 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13278 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
13279 Bool isD = (size & 1) == 1;
13280 Bool isMULX = bitU == 0;
13281 if (bitQ == 0 && isD) return False; // implied 1d case
13282 IRTemp rm = mk_get_IR_rounding_mode();
13283 IRTemp t1 = newTempV128();
13284 assign(t1, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
13285 mkexpr(rm), getQReg128(nn), getQReg128(mm)));
13286 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
13287 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13288 DIP("%s %s.%s, %s.%s, %s.%s\n", isMULX ? "fmulx" : "fmul",
13289 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13290 return True;
13293 if (size <= X01 && opcode == BITS5(1,1,1,0,0)) {
13294 /* -------- 0,0x,11100 FCMEQ 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13295 /* -------- 1,0x,11100 FCMGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13296 Bool isD = (size & 1) == 1;
13297 if (bitQ == 0 && isD) return False; // implied 1d case
13298 Bool isGE = bitU == 1;
13299 IROp opCMP = isGE ? (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4)
13300 : (isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4);
13301 IRTemp t1 = newTempV128();
13302 assign(t1, isGE ? binop(opCMP, getQReg128(mm), getQReg128(nn)) // swapd
13303 : binop(opCMP, getQReg128(nn), getQReg128(mm)));
13304 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
13305 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13306 DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
13307 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13308 return True;
13311 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,0,0)) {
13312 /* -------- 1,1x,11100 FCMGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13313 Bool isD = (size & 1) == 1;
13314 if (bitQ == 0 && isD) return False; // implied 1d case
13315 IROp opCMP = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
13316 IRTemp t1 = newTempV128();
13317 assign(t1, binop(opCMP, getQReg128(mm), getQReg128(nn))); // swapd
13318 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
13319 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13320 DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
13321 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13322 return True;
13325 if (bitU == 1 && opcode == BITS5(1,1,1,0,1)) {
13326 /* -------- 1,0x,11101 FACGE 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13327 /* -------- 1,1x,11101 FACGT 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13328 Bool isD = (size & 1) == 1;
13329 Bool isGT = (size & 2) == 2;
13330 if (bitQ == 0 && isD) return False; // implied 1d case
13331 IROp opCMP = isGT ? (isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4)
13332 : (isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4);
13333 IROp opABS = isD ? Iop_Abs64Fx2 : Iop_Abs32Fx4;
13334 IRTemp t1 = newTempV128();
13335 assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
13336 unop(opABS, getQReg128(nn)))); // swapd
13337 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
13338 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13339 DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
13340 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13341 return True;
13344 if (bitU == 1
13345 && (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,1,1,0))) {
13346 /* -------- 1,0x,11000 FMAXNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13347 /* -------- 1,1x,11000 FMINNMP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13348 /* -------- 1,0x,11110 FMAXP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13349 /* -------- 1,1x,11110 FMINP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13350 /* FMAXNM, FMINNM: FIXME -- KLUDGED */
13351 Bool isD = (size & 1) == 1;
13352 if (bitQ == 0 && isD) return False; // implied 1d case
13353 Bool isMIN = (size & 2) == 2;
13354 Bool isNM = opcode == BITS5(1,1,0,0,0);
13355 IROp opMXX = (isMIN ? mkVecMINF : mkVecMAXF)(isD ? 3 : 2);
13356 IRTemp srcN = newTempV128();
13357 IRTemp srcM = newTempV128();
13358 IRTemp preL = IRTemp_INVALID;
13359 IRTemp preR = IRTemp_INVALID;
13360 assign(srcN, getQReg128(nn));
13361 assign(srcM, getQReg128(mm));
13362 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
13363 isD ? ARM64VSizeD : ARM64VSizeS, bitQ);
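/* Pairwise max/min is reduced to an ordinary vertical operation here:
   math_REARRANGE_FOR_FLOATING_PAIRWISE is expected to gather one vector of
   the even-numbered lanes and one of the odd-numbered lanes of the
   concatenation of the two sources, so that a single lane-wise opMXX
   produces the result for each adjacent pair. */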
13364 putQReg128(
13365 dd, math_MAYBE_ZERO_HI64_fromE(
13366 bitQ,
13367 binop(opMXX, mkexpr(preL), mkexpr(preR))));
13368 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13369 DIP("%s%sp %s.%s, %s.%s, %s.%s\n",
13370 isMIN ? "fmin" : "fmax", isNM ? "nm" : "",
13371 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13372 return True;
13375 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,0,1,0)) {
13376 /* -------- 1,0x,11010 FADDP 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13377 Bool isD = size == X01;
13378 if (bitQ == 0 && isD) return False; // implied 1d case
13379 IRTemp srcN = newTempV128();
13380 IRTemp srcM = newTempV128();
13381 IRTemp preL = IRTemp_INVALID;
13382 IRTemp preR = IRTemp_INVALID;
13383 assign(srcN, getQReg128(nn));
13384 assign(srcM, getQReg128(mm));
13385 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
13386 isD ? ARM64VSizeD : ARM64VSizeS, bitQ);
13387 putQReg128(
13388 dd, math_MAYBE_ZERO_HI64_fromE(
13389 bitQ,
13390 triop(mkVecADDF(isD ? 3 : 2),
13391 mkexpr(mk_get_IR_rounding_mode()),
13392 mkexpr(preL), mkexpr(preR))));
13393 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13394 DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
13395 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13396 return True;
13399 if (bitU == 1 && size <= X01 && opcode == BITS5(1,1,1,1,1)) {
13400 /* -------- 1,0x,11111 FDIV 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13401 Bool isD = (size & 1) == 1;
13402 if (bitQ == 0 && isD) return False; // implied 1d case
13403 vassert(size <= 1);
13404 const IROp ops[2] = { Iop_Div32Fx4, Iop_Div64Fx2 };
13405 IROp op = ops[size];
13406 IRTemp rm = mk_get_IR_rounding_mode();
13407 IRTemp t1 = newTempV128();
13408 IRTemp t2 = newTempV128();
13409 assign(t1, triop(op, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
13410 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
13411 putQReg128(dd, mkexpr(t2));
13412 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13413 DIP("%s %s.%s, %s.%s, %s.%s\n", "fdiv",
13414 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13415 return True;
13418 if (bitU == 0 && opcode == BITS5(1,1,1,1,1)) {
13419 /* -------- 0,0x,11111: FRECPS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13420 /* -------- 0,1x,11111: FRSQRTS 2d_2d_2d, 4s_4s_4s, 2s_2s_2s -------- */
13421 Bool isSQRT = (size & 2) == 2;
13422 Bool isD = (size & 1) == 1;
13423 if (bitQ == 0 && isD) return False; // implied 1d case
13424 IROp op = isSQRT ? (isD ? Iop_RSqrtStep64Fx2 : Iop_RSqrtStep32Fx4)
13425 : (isD ? Iop_RecipStep64Fx2 : Iop_RecipStep32Fx4);
13426 IRTemp res = newTempV128();
13427 assign(res, binop(op, getQReg128(nn), getQReg128(mm)));
13428 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13429 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
13430 DIP("%s %s.%s, %s.%s, %s.%s\n", isSQRT ? "frsqrts" : "frecps",
13431 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13432 return True;
13435 return False;
13436 # undef INSN
13440 static
13441 Bool dis_AdvSIMD_three_same_extra(/*MB_OUT*/DisResult* dres, UInt insn)
13443 /* 31 30 29 28 23 21 20 15 14 10 9 4
13444 0 Q U 01110 size 0 m 1 opcode 1 n d
13445 Decode fields: u,size,opcode
13447 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13448 if (INSN(31,31) != 0
13449 || INSN(28,24) != BITS5(0,1,1,1,0)
13450 || INSN(21,21) != 0
13451 || INSN(15,15) != 1
13452 || INSN(10,10) != 1) {
13453 return False;
13455 UInt bitQ = INSN(30,30);
13456 UInt bitU = INSN(29,29);
13457 UInt size = INSN(23,22);
13458 UInt mm = INSN(20,16);
13459 UInt opcode = INSN(14,11);
13460 UInt nn = INSN(9,5);
13461 UInt dd = INSN(4,0);
13462 vassert(size < 4);
13463 vassert(mm < 32 && nn < 32 && dd < 32);
13465 if (bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,0,0,1))) {
13466 /* -------- 1,xx,0000 SQRDMLAH s and h variants only -------- */
13467 /* -------- 1,xx,0001 SQRDMLSH s and h variants only -------- */
13468 if (size == X00 || size == X11) return False;
13469 Bool isAdd = opcode == BITS4(0,0,0,0);
13471 IRTemp res, res_nosat, vD, vN, vM;
13472 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
13473 newTempsV128_3(&vD, &vN, &vM);
13474 assign(vD, getQReg128(dd));
13475 assign(vN, getQReg128(nn));
13476 assign(vM, getQReg128(mm));
13478 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
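/* QC handling: math_SQRDMLAH computes both the saturating result and a
   plain, non-saturating one.  If they differ in any live lane (the upper
   half being ignored via Iop_ZeroHI64ofV128 when Q == 0), FPSR.QC gets
   set below. */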
13479 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
13480 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
13481 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13483 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13484 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
13485 DIP("%s %s.%s, %s.%s, %s.%s\n", nm,
13486 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13487 return True;
13490 return False;
13491 # undef INSN
13494 static
13495 Bool dis_AdvSIMD_three_same_fp16(/*MB_OUT*/DisResult* dres, UInt insn,
13496 const VexArchInfo* archinfo)
13498 /* This decode function only handles instructions with half-precision
13499 floating-point (fp16) operands.
13501 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
13502 return False;
13504 /* 31 30 29 28 23 21 20 15 10 9 4
13505 0 Q U 01110 size 0 m opcode 1 n d
13506 Decode fields: u,size,opcode
13508 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13509 if (INSN(31,31) != 0
13510 || INSN(28,24) != BITS5(0,1,1,1,0)
13511 || INSN(21,21) != 0
13512 || INSN(10,10) != 1) {
13513 return False;
13515 UInt bitQ = INSN(30,30);
13516 UInt bitU = INSN(29,29);
13517 UInt size = INSN(23,22);
13518 UInt mm = INSN(20,16);
13519 UInt opcode = INSN(15,11);
13520 UInt nn = INSN(9,5);
13521 UInt dd = INSN(4,0);
13522 vassert(size < 4);
13523 vassert(mm < 32 && nn < 32 && dd < 32);
13525 if (bitU == 1 && size == X01 && opcode == BITS5(0,0,0,1,0)) {
13526 /* -------- 1,01,00010 FADDP 4h_4h_4h, 8h_8h_8h -------- */
13527 IROp opADD = mkVecADDF(1); /* Add16Fx8, for both the 4h and 8h forms */
13528 IRTemp srcN = newTempV128();
13529 IRTemp srcM = newTempV128();
13530 IRTemp preL = IRTemp_INVALID;
13531 IRTemp preR = IRTemp_INVALID;
13532 assign(srcN, getQReg128(nn));
13533 assign(srcM, getQReg128(mm));
13534 math_REARRANGE_FOR_FLOATING_PAIRWISE(&preL, &preR, srcM, srcN,
13535 ARM64VSizeH, bitQ);
13536 putQReg128(
13537 dd, math_MAYBE_ZERO_HI64_fromE(
13538 bitQ,
13539 triop(opADD, mkexpr(mk_get_IR_rounding_mode()),
13540 mkexpr(preL), mkexpr(preR))));
13541 const HChar* arr = bitQ == 0 ? "4h" : "8h";
13542 DIP("%s %s.%s, %s.%s, %s.%s\n", "faddp",
13543 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13544 return True;
13547 if (bitU == 1 && size == X11 && opcode == BITS5(0,0,0,1,0)) {
13548 /* -------- 1,11,00010 FABD 4h_4h_4h, 8h_8h_8h -------- */
13549 IRTemp rm = mk_get_IR_rounding_mode();
13550 IRTemp t1 = newTempV128();
13551 IRTemp t2 = newTempV128();
13552 assign(t1, triop(Iop_Sub16Fx8, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
13553 assign(t2, unop(Iop_Abs16Fx8, mkexpr(t1)));
13554 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
13555 const HChar* arr = bitQ == 0 ? "4h" : "8h";
13556 DIP("%s %s.%s, %s.%s, %s.%s\n", "fabd",
13557 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13558 return True;
13561 if (size == X01 && opcode == BITS5(0,0,1,0,0)) {
13562 /* -------- 0,01,00100 FCMEQ 4h_4h_4h, 8h_8h_8h -------- */
13563 /* -------- 1,01,00100 FCMGE 4h_4h_4h, 8h_8h_8h -------- */
13564 Bool isGE = bitU == 1;
13565 IRTemp t1 = newTempV128();
13566 /* Swap source and destination in order to use existing LE IR op for GE. */
13567 assign(t1, isGE ? binop(Iop_CmpLE16Fx8, getQReg128(mm), getQReg128(nn))
13568 : binop(Iop_CmpEQ16Fx8, getQReg128(nn), getQReg128(mm)));
13569 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
13570 const HChar* arr = bitQ == 0 ? "4h" : "8h";
13571 DIP("%s %s.%s, %s.%s, %s.%s\n", isGE ? "fcmge" : "fcmeq",
13572 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13573 return True;
13576 if (size == X11 && opcode == BITS5(0,0,1,0,0)) {
13577 /* -------- 1,11,00100 FCMGT 4h_4h_4h, 8h_8h_8h -------- */
13578 IRTemp t1 = newTempV128();
13579 /* Swap source and destination in order to use existing LT IR op for GT. */
13580 assign(t1, binop(Iop_CmpLT16Fx8, getQReg128(mm), getQReg128(nn)));
13581 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
13582 const HChar* arr = bitQ == 0 ? "4h" : "8h";
13583 DIP("%s %s.%s, %s.%s, %s.%s\n", "fcmgt",
13584 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13585 return True;
13588 if (bitU == 1 && opcode == BITS5(0,0,1,0,1)) {
13589 /* -------- 1,01,00101 FACGE 4h_4h_4h 8h_8h_8h -------- */
13590 /* -------- 1,11,00101 FACGT 4h_4h_4h 8h_8h_8h -------- */
13591 Bool isGT = (size & 3) == 3;
13592 IROp opCMP = isGT ? Iop_CmpLT16Fx8 : Iop_CmpLE16Fx8;
13593 IROp opABS = Iop_Abs16Fx8;
13594 IRTemp t1 = newTempV128();
13595 assign(t1, binop(opCMP, unop(opABS, getQReg128(mm)),
13596 unop(opABS, getQReg128(nn))));
13597 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t1));
13598 const HChar* arr = bitQ == 0 ? "4h" : "8h";
13599 DIP("%s %s.%s, %s.%s, %s.%s\n", isGT ? "facgt" : "facge",
13600 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13601 return True;
13604 if (bitU == 0 && size == X01 && opcode == BITS5(0,0,0,1,0)) {
13605 /* -------- 0,01,00010 FADD 4h_4h_4h, 8h_8h_8h -------- */
13606 IRTemp rm = mk_get_IR_rounding_mode();
13607 IRTemp t1 = newTempV128();
13608 IRTemp t2 = newTempV128();
13609 assign(t1, triop(Iop_Add16Fx8, mkexpr(rm), getQReg128(nn), getQReg128(mm)));
13610 assign(t2, math_MAYBE_ZERO_HI64(bitQ, t1));
13611 putQReg128(dd, mkexpr(t2));
13612 const HChar* arr = bitQ == 0 ? "4h" : "8h";
13613 DIP("%s %s.%s, %s.%s, %s.%s\n", "fadd",
13614 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
13615 return True;
13618 return False;
13619 # undef INSN
13623 static
13624 Bool dis_AdvSIMD_two_reg_misc(/*MB_OUT*/DisResult* dres, UInt insn)
13626 /* 31 30 29 28 23 21 16 11 9 4
13627 0 Q U 01110 size 10000 opcode 10 n d
13628 Decode fields: U,size,opcode
13630 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
13631 if (INSN(31,31) != 0
13632 || INSN(28,24) != BITS5(0,1,1,1,0)
13633 || INSN(21,17) != BITS5(1,0,0,0,0)
13634 || INSN(11,10) != BITS2(1,0)) {
13635 return False;
13637 UInt bitQ = INSN(30,30);
13638 UInt bitU = INSN(29,29);
13639 UInt size = INSN(23,22);
13640 UInt opcode = INSN(16,12);
13641 UInt nn = INSN(9,5);
13642 UInt dd = INSN(4,0);
13643 vassert(size < 4);
13645 if (bitU == 0 && size <= X10 && opcode == BITS5(0,0,0,0,0)) {
13646 /* -------- 0,00,00000: REV64 16b_16b, 8b_8b -------- */
13647 /* -------- 0,01,00000: REV64 8h_8h, 4h_4h -------- */
13648 /* -------- 0,10,00000: REV64 4s_4s, 2s_2s -------- */
13649 const IROp iops[3] = { Iop_Reverse8sIn64_x2,
13650 Iop_Reverse16sIn64_x2, Iop_Reverse32sIn64_x2 };
13651 vassert(size <= 2);
13652 IRTemp res = newTempV128();
13653 assign(res, unop(iops[size], getQReg128(nn)));
13654 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13655 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13656 DIP("%s %s.%s, %s.%s\n", "rev64",
13657 nameQReg128(dd), arr, nameQReg128(nn), arr);
13658 return True;
13661 if (bitU == 1 && size <= X01 && opcode == BITS5(0,0,0,0,0)) {
13662 /* -------- 1,00,00000: REV32 16b_16b, 8b_8b -------- */
13663 /* -------- 1,01,00000: REV32 8h_8h, 4h_4h -------- */
13664 Bool isH = size == X01;
13665 IRTemp res = newTempV128();
13666 IROp iop = isH ? Iop_Reverse16sIn32_x4 : Iop_Reverse8sIn32_x4;
13667 assign(res, unop(iop, getQReg128(nn)));
13668 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13669 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13670 DIP("%s %s.%s, %s.%s\n", "rev32",
13671 nameQReg128(dd), arr, nameQReg128(nn), arr);
13672 return True;
13675 if (bitU == 0 && size == X00 && opcode == BITS5(0,0,0,0,1)) {
13676 /* -------- 0,00,00001: REV16 16b_16b, 8b_8b -------- */
13677 IRTemp res = newTempV128();
13678 assign(res, unop(Iop_Reverse8sIn16_x8, getQReg128(nn)));
13679 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13680 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13681 DIP("%s %s.%s, %s.%s\n", "rev16",
13682 nameQReg128(dd), arr, nameQReg128(nn), arr);
13683 return True;
13686 if (opcode == BITS5(0,0,0,1,0) || opcode == BITS5(0,0,1,1,0)) {
13687 /* -------- 0,xx,00010: SADDLP std6_std6 -------- */
13688 /* -------- 1,xx,00010: UADDLP std6_std6 -------- */
13689 /* -------- 0,xx,00110: SADALP std6_std6 -------- */
13690 /* -------- 1,xx,00110: UADALP std6_std6 -------- */
13691 /* Widens, and size refers to the narrow size. */
13692 if (size == X11) return False; // no 1d or 2d cases
13693 Bool isU = bitU == 1;
13694 Bool isACC = opcode == BITS5(0,0,1,1,0);
13695 IRTemp src = newTempV128();
13696 IRTemp sum = newTempV128();
13697 IRTemp res = newTempV128();
13698 assign(src, getQReg128(nn));
13699 sum = math_ADDLP(size, isU, src);
13700 assign(res, isACC ? binop(mkVecADD(size+1), mkexpr(sum), getQReg128(dd))
13701 : mkexpr(sum));
13702 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13703 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13704 const HChar* arrWide = nameArr_Q_SZ(bitQ, size+1);
13705 DIP("%s %s.%s, %s.%s\n", isACC ? (isU ? "uadalp" : "sadalp")
13706 : (isU ? "uaddlp" : "saddlp"),
13707 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
13708 return True;
13711 if (opcode == BITS5(0,0,0,1,1)) {
13712 /* -------- 0,xx,00011: SUQADD std7_std7 -------- */
13713 /* -------- 1,xx,00011: USQADD std7_std7 -------- */
13714 if (bitQ == 0 && size == X11) return False; // implied 1d case
13715 Bool isUSQADD = bitU == 1;
13716 /* This is switched (in the US vs SU sense) deliberately.
13717 SUQADD corresponds to the ExtUSsatSS variants and
13718 USQADD corresponds to the ExtSUsatUU variants.
13719 See libvex_ir for more details. */
13720 IROp qop = isUSQADD ? mkVecQADDEXTSUSATUU(size)
13721 : mkVecQADDEXTUSSATSS(size);
13722 IROp nop = mkVecADD(size);
13723 IRTemp argL = newTempV128();
13724 IRTemp argR = newTempV128();
13725 IRTemp qres = newTempV128();
13726 IRTemp nres = newTempV128();
13727 /* Because the two arguments to the addition are implicitly
13728 extended differently (one signedly, the other unsignedly) it is
13729 important to present them to the primop in the correct order. */
13730 assign(argL, getQReg128(nn));
13731 assign(argR, getQReg128(dd));
13732 assign(qres, math_MAYBE_ZERO_HI64_fromE(
13733 bitQ, binop(qop, mkexpr(argL), mkexpr(argR))));
13734 assign(nres, math_MAYBE_ZERO_HI64_fromE(
13735 bitQ, binop(nop, mkexpr(argL), mkexpr(argR))));
13736 putQReg128(dd, mkexpr(qres));
13737 updateQCFLAGwithDifference(qres, nres);
13738 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13739 DIP("%s %s.%s, %s.%s\n", isUSQADD ? "usqadd" : "suqadd",
13740 nameQReg128(dd), arr, nameQReg128(nn), arr);
13741 return True;
13744 if (opcode == BITS5(0,0,1,0,0)) {
13745 /* -------- 0,xx,00100: CLS std6_std6 -------- */
13746 /* -------- 1,xx,00100: CLZ std6_std6 -------- */
13747 if (size == X11) return False; // no 1d or 2d cases
13748 const IROp opsCLS[3] = { Iop_Cls8x16, Iop_Cls16x8, Iop_Cls32x4 };
13749 const IROp opsCLZ[3] = { Iop_Clz8x16, Iop_Clz16x8, Iop_Clz32x4 };
13750 Bool isCLZ = bitU == 1;
13751 IRTemp res = newTempV128();
13752 vassert(size <= 2);
13753 assign(res, unop(isCLZ ? opsCLZ[size] : opsCLS[size], getQReg128(nn)));
13754 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13755 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13756 DIP("%s %s.%s, %s.%s\n", isCLZ ? "clz" : "cls",
13757 nameQReg128(dd), arr, nameQReg128(nn), arr);
13758 return True;
13761 if (size == X00 && opcode == BITS5(0,0,1,0,1)) {
13762 /* -------- 0,00,00101: CNT 16b_16b, 8b_8b -------- */
13763 /* -------- 1,00,00101: NOT 16b_16b, 8b_8b -------- */
13764 IRTemp res = newTempV128();
13765 assign(res, unop(bitU == 0 ? Iop_Cnt8x16 : Iop_NotV128, getQReg128(nn)));
13766 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13767 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
13768 DIP("%s %s.%s, %s.%s\n", bitU == 0 ? "cnt" : "not",
13769 nameQReg128(dd), arr, nameQReg128(nn), arr);
13770 return True;
13773 if (bitU == 1 && size == X01 && opcode == BITS5(0,0,1,0,1)) {
13774 /* -------- 1,01,00101 RBIT 16b_16b, 8b_8b -------- */
13775 IRTemp res = newTempV128();
13776 assign(res, unop(Iop_Reverse1sIn8_x16, getQReg128(nn)));
13777 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13778 const HChar* arr = nameArr_Q_SZ(bitQ, 0);
13779 DIP("%s %s.%s, %s.%s\n", "rbit",
13780 nameQReg128(dd), arr, nameQReg128(nn), arr);
13781 return True;
13784 if (opcode == BITS5(0,0,1,1,1)) {
13785 /* -------- 0,xx,00111 SQABS std7_std7 -------- */
13786 /* -------- 1,xx,00111 SQNEG std7_std7 -------- */
13787 if (bitQ == 0 && size == X11) return False; // implied 1d case
13788 Bool isNEG = bitU == 1;
13789 IRTemp qresFW = IRTemp_INVALID, nresFW = IRTemp_INVALID;
13790 (isNEG ? math_SQNEG : math_SQABS)( &qresFW, &nresFW,
13791 getQReg128(nn), size );
13792 IRTemp qres = newTempV128(), nres = newTempV128();
13793 assign(qres, math_MAYBE_ZERO_HI64(bitQ, qresFW));
13794 assign(nres, math_MAYBE_ZERO_HI64(bitQ, nresFW));
13795 putQReg128(dd, mkexpr(qres));
13796 updateQCFLAGwithDifference(qres, nres);
13797 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13798 DIP("%s %s.%s, %s.%s\n", isNEG ? "sqneg" : "sqabs",
13799 nameQReg128(dd), arr, nameQReg128(nn), arr);
13800 return True;
13803 if (opcode == BITS5(0,1,0,0,0)) {
13804 /* -------- 0,xx,01000: CMGT std7_std7_#0 -------- */ // >s 0
13805 /* -------- 1,xx,01000: CMGE std7_std7_#0 -------- */ // >=s 0
13806 if (bitQ == 0 && size == X11) return False; // implied 1d case
13807 Bool isGT = bitU == 0;
13808 IRExpr* argL = getQReg128(nn);
13809 IRExpr* argR = mkV128(0x0000);
13810 IRTemp res = newTempV128();
13811 IROp opGTS = mkVecCMPGTS(size);
13812 assign(res, isGT ? binop(opGTS, argL, argR)
13813 : unop(Iop_NotV128, binop(opGTS, argR, argL)));
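/* CMGE #0 is derived from the signed-GT primop: x >=s 0  <=>  !(0 >s x).
   The CMEQ/CMLE and CMLT cases below are handled in the same spirit, using
   only the EQ and GT primops. */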
13814 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13815 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13816 DIP("cm%s %s.%s, %s.%s, #0\n", isGT ? "gt" : "ge",
13817 nameQReg128(dd), arr, nameQReg128(nn), arr);
13818 return True;
13821 if (opcode == BITS5(0,1,0,0,1)) {
13822 /* -------- 0,xx,01001: CMEQ std7_std7_#0 -------- */ // == 0
13823 /* -------- 1,xx,01001: CMLE std7_std7_#0 -------- */ // <=s 0
13824 if (bitQ == 0 && size == X11) return False; // implied 1d case
13825 Bool isEQ = bitU == 0;
13826 IRExpr* argL = getQReg128(nn);
13827 IRExpr* argR = mkV128(0x0000);
13828 IRTemp res = newTempV128();
13829 assign(res, isEQ ? binop(mkVecCMPEQ(size), argL, argR)
13830 : unop(Iop_NotV128,
13831 binop(mkVecCMPGTS(size), argL, argR)));
13832 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13833 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13834 DIP("cm%s %s.%s, %s.%s, #0\n", isEQ ? "eq" : "le",
13835 nameQReg128(dd), arr, nameQReg128(nn), arr);
13836 return True;
13839 if (bitU == 0 && opcode == BITS5(0,1,0,1,0)) {
13840 /* -------- 0,xx,01010: CMLT std7_std7_#0 -------- */ // <s 0
13841 if (bitQ == 0 && size == X11) return False; // implied 1d case
13842 IRExpr* argL = getQReg128(nn);
13843 IRExpr* argR = mkV128(0x0000);
13844 IRTemp res = newTempV128();
13845 assign(res, binop(mkVecCMPGTS(size), argR, argL));
13846 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13847 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13848 DIP("cm%s %s.%s, %s.%s, #0\n", "lt",
13849 nameQReg128(dd), arr, nameQReg128(nn), arr);
13850 return True;
13853 if (bitU == 0 && opcode == BITS5(0,1,0,1,1)) {
13854 /* -------- 0,xx,01011: ABS std7_std7 -------- */
13855 if (bitQ == 0 && size == X11) return False; // implied 1d case
13856 IRTemp res = newTempV128();
13857 assign(res, unop(mkVecABS(size), getQReg128(nn)));
13858 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13859 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13860 DIP("abs %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
13861 return True;
13864 if (bitU == 1 && opcode == BITS5(0,1,0,1,1)) {
13865 /* -------- 1,xx,01011: NEG std7_std7 -------- */
13866 if (bitQ == 0 && size == X11) return False; // implied 1d case
13867 IRTemp res = newTempV128();
13868 assign(res, binop(mkVecSUB(size), mkV128(0x0000), getQReg128(nn)));
13869 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13870 const HChar* arr = nameArr_Q_SZ(bitQ, size);
13871 DIP("neg %s.%s, %s.%s\n", nameQReg128(dd), arr, nameQReg128(nn), arr);
13872 return True;
13875 UInt ix = 0; /*INVALID*/
13876 if (size >= X10) {
13877 switch (opcode) {
13878 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 4 : 1; break;
13879 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 5 : 2; break;
13880 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break;
13881 default: break;
13884 if (ix > 0) {
13885 /* -------- 0,1x,01100 FCMGT 2d_2d,4s_4s,2s_2s _#0.0 (ix 1) -------- */
13886 /* -------- 0,1x,01101 FCMEQ 2d_2d,4s_4s,2s_2s _#0.0 (ix 2) -------- */
13887 /* -------- 0,1x,01110 FCMLT 2d_2d,4s_4s,2s_2s _#0.0 (ix 3) -------- */
13888 /* -------- 1,1x,01100 FCMGE 2d_2d,4s_4s,2s_2s _#0.0 (ix 4) -------- */
13889 /* -------- 1,1x,01101 FCMLE 2d_2d,4s_4s,2s_2s _#0.0 (ix 5) -------- */
13890 if (bitQ == 0 && size == X11) return False; // implied 1d case
13891 Bool isD = size == X11;
13892 IROp opCmpEQ = isD ? Iop_CmpEQ64Fx2 : Iop_CmpEQ32Fx4;
13893 IROp opCmpLE = isD ? Iop_CmpLE64Fx2 : Iop_CmpLE32Fx4;
13894 IROp opCmpLT = isD ? Iop_CmpLT64Fx2 : Iop_CmpLT32Fx4;
13895 IROp opCmp = Iop_INVALID;
13896 Bool swap = False;
13897 const HChar* nm = "??";
13898 switch (ix) {
13899 case 1: nm = "fcmgt"; opCmp = opCmpLT; swap = True; break;
13900 case 2: nm = "fcmeq"; opCmp = opCmpEQ; break;
13901 case 3: nm = "fcmlt"; opCmp = opCmpLT; break;
13902 case 4: nm = "fcmge"; opCmp = opCmpLE; swap = True; break;
13903 case 5: nm = "fcmle"; opCmp = opCmpLE; break;
13904 default: vassert(0);
13906 IRExpr* zero = mkV128(0x0000);
13907 IRTemp res = newTempV128();
13908 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
13909 : binop(opCmp, getQReg128(nn), zero));
13910 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13911 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13912 DIP("%s %s.%s, %s.%s, #0.0\n", nm,
13913 nameQReg128(dd), arr, nameQReg128(nn), arr);
13914 return True;
13917 if (size >= X10 && opcode == BITS5(0,1,1,1,1)) {
13918 /* -------- 0,1x,01111: FABS 2d_2d, 4s_4s, 2s_2s -------- */
13919 /* -------- 1,1x,01111: FNEG 2d_2d, 4s_4s, 2s_2s -------- */
13920 if (bitQ == 0 && size == X11) return False; // implied 1d case
13921 Bool isFNEG = bitU == 1;
13922 IROp op = isFNEG ? (size == X10 ? Iop_Neg32Fx4 : Iop_Neg64Fx2)
13923 : (size == X10 ? Iop_Abs32Fx4 : Iop_Abs64Fx2);
13924 IRTemp res = newTempV128();
13925 assign(res, unop(op, getQReg128(nn)));
13926 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
13927 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
13928 DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
13929 nameQReg128(dd), arr, nameQReg128(nn), arr);
13930 return True;
13933 if (bitU == 0 && opcode == BITS5(1,0,0,1,0)) {
13934 /* -------- 0,xx,10010: XTN{,2} -------- */
13935 if (size == X11) return False;
13936 vassert(size < 3);
13937 Bool is2 = bitQ == 1;
13938 IROp opN = mkVecNARROWUN(size);
13939 IRTemp resN = newTempV128();
13940 assign(resN, unop(Iop_64UtoV128, unop(opN, getQReg128(nn))));
13941 putLO64andZUorPutHI64(is2, dd, resN);
13942 const HChar* nm = "xtn";
13943 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13944 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13945 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
13946 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
13947 return True;
13950 if (opcode == BITS5(1,0,1,0,0)
13951 || (bitU == 1 && opcode == BITS5(1,0,0,1,0))) {
13952 /* -------- 0,xx,10100: SQXTN{,2} -------- */
13953 /* -------- 1,xx,10100: UQXTN{,2} -------- */
13954 /* -------- 1,xx,10010: SQXTUN{,2} -------- */
13955 if (size == X11) return False;
13956 vassert(size < 3);
13957 Bool is2 = bitQ == 1;
13958 IROp opN = Iop_INVALID;
13959 Bool zWiden = True;
13960 const HChar* nm = "??";
13961 /**/ if (bitU == 0 && opcode == BITS5(1,0,1,0,0)) {
13962 opN = mkVecQNARROWUNSS(size); nm = "sqxtn"; zWiden = False;
13964 else if (bitU == 1 && opcode == BITS5(1,0,1,0,0)) {
13965 opN = mkVecQNARROWUNUU(size); nm = "uqxtn";
13967 else if (bitU == 1 && opcode == BITS5(1,0,0,1,0)) {
13968 opN = mkVecQNARROWUNSU(size); nm = "sqxtun";
13970 else vassert(0);
13971 IRTemp src = newTempV128();
13972 assign(src, getQReg128(nn));
13973 IRTemp resN = newTempV128();
13974 assign(resN, unop(Iop_64UtoV128, unop(opN, mkexpr(src))));
13975 putLO64andZUorPutHI64(is2, dd, resN);
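/* Saturation detection: the narrowed result is widened back up (zero- or
   sign-extending according to zWiden) and compared with the original
   source; any lane that saturated will differ, and QC is set accordingly
   below. */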
13976 IRTemp resW = math_WIDEN_LO_OR_HI_LANES(zWiden, False/*!fromUpperHalf*/,
13977 size, mkexpr(resN));
13978 updateQCFLAGwithDifference(src, resW);
13979 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
13980 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
13981 DIP("%s%s %s.%s, %s.%s\n", is2 ? "2" : "", nm,
13982 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
13983 return True;
13986 if (bitU == 1 && opcode == BITS5(1,0,0,1,1)) {
13987 /* -------- 1,xx,10011 SHLL{2} #lane-width -------- */
13988 /* Widens, and size is the narrow size. */
13989 if (size == X11) return False;
13990 Bool is2 = bitQ == 1;
13991 IROp opINT = is2 ? mkVecINTERLEAVEHI(size) : mkVecINTERLEAVELO(size);
13992 IROp opSHL = mkVecSHLN(size+1);
13993 IRTemp src = newTempV128();
13994 IRTemp res = newTempV128();
13995 assign(src, getQReg128(nn));
13996 assign(res, binop(opSHL, binop(opINT, mkexpr(src), mkexpr(src)),
13997 mkU8(8 << size)));
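/* Widening trick: interleaving src with itself leaves each wide lane
   holding the narrow value in both of its halves -- e.g. for 8-bit lanes
   0x12 becomes the 16-bit lane 0x1212.  Shifting that wide lane left by
   the narrow lane width (8 << size bits) discards the upper copy and
   yields 0x1200, i.e. zero-extend then shift left by the lane width,
   which is exactly what SHLL produces. */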
13998 putQReg128(dd, mkexpr(res));
13999 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
14000 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
14001 DIP("shll%s %s.%s, %s.%s, #%d\n", is2 ? "2" : "",
14002 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow, 8 << size);
14003 return True;
14006 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,0)) {
14007 /* -------- 0,0x,10110: FCVTN 4h/8h_4s, 2s/4s_2d -------- */
14008 UInt nLanes = size == X00 ? 4 : 2;
14009 IRType srcTy = size == X00 ? Ity_F32 : Ity_F64;
14010 IROp opCvt = size == X00 ? Iop_F32toF16 : Iop_F64toF32;
14011 IRTemp rm = mk_get_IR_rounding_mode();
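/* The source lanes are converted one at a time; destination lane numbers
   nLanes*bitQ + i place the results in the low half for FCVTN and in the
   high half for FCVTN2, and for the non-2 form the upper 64 bits of Dd
   are cleared afterwards. */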
14012 IRTemp src[nLanes];
14013 for (UInt i = 0; i < nLanes; i++) {
14014 src[i] = newTemp(srcTy);
14015 assign(src[i], getQRegLane(nn, i, srcTy));
14017 for (UInt i = 0; i < nLanes; i++) {
14018 putQRegLane(dd, nLanes * bitQ + i,
14019 binop(opCvt, mkexpr(rm), mkexpr(src[i])));
14021 if (bitQ == 0) {
14022 putQRegLane(dd, 1, mkU64(0));
14024 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
14025 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
14026 DIP("fcvtn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
14027 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
14028 return True;
14031 if (bitU == 1 && size == X01 && opcode == BITS5(1,0,1,1,0)) {
14032 /* -------- 1,01,10110: FCVTXN 2s/4s_2d -------- */
14033 /* Using Irrm_NEAREST here isn't right. The docs say "round to
14034 odd" but I don't know what that really means. */
14035 IRType srcTy = Ity_F64;
14036 IROp opCvt = Iop_F64toF32;
14037 IRTemp src[2];
14038 for (UInt i = 0; i < 2; i++) {
14039 src[i] = newTemp(srcTy);
14040 assign(src[i], getQRegLane(nn, i, srcTy));
14042 for (UInt i = 0; i < 2; i++) {
14043 putQRegLane(dd, 2 * bitQ + i,
14044 binop(opCvt, mkU32(Irrm_NEAREST), mkexpr(src[i])));
14046 if (bitQ == 0) {
14047 putQRegLane(dd, 1, mkU64(0));
14049 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
14050 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
14051 DIP("fcvtxn%s %s.%s, %s.%s\n", bitQ ? "2" : "",
14052 nameQReg128(dd), arrNarrow, nameQReg128(nn), arrWide);
14053 return True;
14056 if (bitU == 0 && size <= X01 && opcode == BITS5(1,0,1,1,1)) {
14057 /* -------- 0,0x,10111: FCVTL 4s_4h/8h, 2d_2s/4s -------- */
14058 UInt nLanes = size == X00 ? 4 : 2;
14059 IRType srcTy = size == X00 ? Ity_F16 : Ity_F32;
14060 IROp opCvt = size == X00 ? Iop_F16toF32 : Iop_F32toF64;
14061 IRTemp src[nLanes];
14062 for (UInt i = 0; i < nLanes; i++) {
14063 src[i] = newTemp(srcTy);
14064 assign(src[i], getQRegLane(nn, nLanes * bitQ + i, srcTy));
14066 for (UInt i = 0; i < nLanes; i++) {
14067 putQRegLane(dd, i, unop(opCvt, mkexpr(src[i])));
14069 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, 1+size);
14070 const HChar* arrWide = nameArr_Q_SZ(1, 1+size+1);
14071 DIP("fcvtl%s %s.%s, %s.%s\n", bitQ ? "2" : "",
14072 nameQReg128(dd), arrWide, nameQReg128(nn), arrNarrow);
14073 return True;
14076 ix = 0;
14077 if (opcode == BITS5(1,1,0,0,0) || opcode == BITS5(1,1,0,0,1)) {
14078 ix = 1 + ((((bitU & 1) << 2) | ((size & 2) << 0)) | ((opcode & 1) << 0));
14079 // = 1 + bitU[0]:size[1]:opcode[0]
14080 vassert(ix >= 1 && ix <= 8);
14081 if (ix == 7) ix = 0;
14083 if (ix > 0) {
14084 /* -------- 0,0x,11000 FRINTN 2d_2d, 4s_4s, 2s_2s (1) -------- */
14085 /* -------- 0,0x,11001 FRINTM 2d_2d, 4s_4s, 2s_2s (2) -------- */
14086 /* -------- 0,1x,11000 FRINTP 2d_2d, 4s_4s, 2s_2s (3) -------- */
14087 /* -------- 0,1x,11001 FRINTZ 2d_2d, 4s_4s, 2s_2s (4) -------- */
14088 /* -------- 1,0x,11000 FRINTA 2d_2d, 4s_4s, 2s_2s (5) -------- */
14089 /* -------- 1,0x,11001 FRINTX 2d_2d, 4s_4s, 2s_2s (6) -------- */
14090 /* -------- 1,1x,11000 (apparently unassigned) (7) -------- */
14091 /* -------- 1,1x,11001 FRINTI 2d_2d, 4s_4s, 2s_2s (8) -------- */
14092 /* rm plan:
14093 FRINTN: tieeven
14094 FRINTM: -inf
14095 FRINTP: +inf
14096 FRINTZ: zero
14097 FRINTA: tieaway
14098 FRINTX: per FPCR + "exact = TRUE"
14099 FRINTI: per FPCR
14101 Bool isD = (size & 1) == 1;
14102 if (bitQ == 0 && isD) return False; // implied 1d case
14104 UChar ch = '?';
14105 IROp op = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
14106 Bool isBinop = True;
14107 IRExpr* irrmE = NULL;
14108 switch (ix) {
14109 case 1: ch = 'n'; isBinop = False; op = isD ? Iop_RoundF64toIntE : Iop_RoundF32toIntE; break;
14110 case 2: ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
14111 case 3: ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
14112 case 4: ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
14113 case 5: ch = 'a'; isBinop = False; op = isD ? Iop_RoundF64toIntA0 : Iop_RoundF32toIntA0; break;
14114 // I am unsure about the following, due to the "integral exact"
14115 // description in the manual. What does it mean? (frintx, that is)
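// (Architecturally FRINTX rounds according to FPCR, like FRINTI, but in
// addition raises the Inexact exception when the result differs from the
// input.  That extra FPSR update is not modelled here.)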
14116 case 6: ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
14117 case 8: ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
14118 default: vassert(0);
14121 if (isD) {
14122 for (UInt i = 0; i < 2; i++) {
14123 if (isBinop) {
14124 IRTemp irrm = newTemp(Ity_I32);
14125 assign(irrm, irrmE);
14126 putQRegLane(dd, i, binop(op, mkexpr(irrm),
14127 getQRegLane(nn, i, Ity_F64)));
14128 } else {
14129 putQRegLane(dd, i, unop(op, getQRegLane(nn, i, Ity_F64)));
14132 } else {
14133 UInt n = bitQ==1 ? 4 : 2;
14134 for (UInt i = 0; i < n; i++) {
14135 if (isBinop) {
14136 IRTemp irrm = newTemp(Ity_I32);
14137 assign(irrm, irrmE);
14138 putQRegLane(dd, i, binop(op, mkexpr(irrm),
14139 getQRegLane(nn, i, Ity_F32)));
14140 } else {
14141 putQRegLane(dd, i, unop(op, getQRegLane(nn, i, Ity_F32)));
14144 if (bitQ == 0)
14145 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
14147 const HChar* arr = nameArr_Q_SZ(bitQ, size);
14148 DIP("frint%c %s.%s, %s.%s\n", ch,
14149 nameQReg128(dd), arr, nameQReg128(nn), arr);
14150 return True;
14153 ix = 0; /*INVALID*/
14154 switch (opcode) {
14155 case BITS5(1,1,0,1,0): ix = ((size & 2) == 2) ? 4 : 1; break;
14156 case BITS5(1,1,0,1,1): ix = ((size & 2) == 2) ? 5 : 2; break;
14157 case BITS5(1,1,1,0,0): if ((size & 2) == 0) ix = 3; break;
14158 default: break;
14160 if (ix > 0) {
14161 /* -------- 0,0x,11010 FCVTNS 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
14162 /* -------- 0,0x,11011 FCVTMS 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
14163 /* -------- 0,0x,11100 FCVTAS 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
14164 /* -------- 0,1x,11010 FCVTPS 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
14165 /* -------- 0,1x,11011 FCVTZS 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
14166 /* -------- 1,0x,11010 FCVTNU 2d_2d, 4s_4s, 2s_2s (ix 1) -------- */
14167 /* -------- 1,0x,11011 FCVTMU 2d_2d, 4s_4s, 2s_2s (ix 2) -------- */
14168 /* -------- 1,0x,11100 FCVTAU 2d_2d, 4s_4s, 2s_2s (ix 3) -------- */
14169 /* -------- 1,1x,11010 FCVTPU 2d_2d, 4s_4s, 2s_2s (ix 4) -------- */
14170 /* -------- 1,1x,11011 FCVTZU 2d_2d, 4s_4s, 2s_2s (ix 5) -------- */
14171 Bool isD = (size & 1) == 1;
14172 if (bitQ == 0 && isD) return False; // implied 1d case
14174 IRRoundingMode irrm = 8; /*impossible*/
14175 HChar ch = '?';
14176 switch (ix) {
14177 case 1: ch = 'n'; irrm = Irrm_NEAREST; break;
14178 case 2: ch = 'm'; irrm = Irrm_NegINF; break;
14179 case 3: ch = 'a'; irrm = Irrm_NEAREST; break; /* kludge: FCVTA* wants ties-away-from-zero, not ties-to-even */
14180 case 4: ch = 'p'; irrm = Irrm_PosINF; break;
14181 case 5: ch = 'z'; irrm = Irrm_ZERO; break;
14182 default: vassert(0);
14184 IROp cvt = Iop_INVALID;
14185 if (bitU == 1) {
14186 cvt = isD ? Iop_F64toI64U : Iop_F32toI32U;
14187 } else {
14188 cvt = isD ? Iop_F64toI64S : Iop_F32toI32S;
14190 if (isD) {
14191 for (UInt i = 0; i < 2; i++) {
14192 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
14193 getQRegLane(nn, i, Ity_F64)));
14195 } else {
14196 UInt n = bitQ==1 ? 4 : 2;
14197 for (UInt i = 0; i < n; i++) {
14198 putQRegLane(dd, i, binop(cvt, mkU32(irrm),
14199 getQRegLane(nn, i, Ity_F32)));
14201 if (bitQ == 0)
14202 putQRegLane(dd, 1, mkU64(0)); // zero out lanes 2 and 3
14204 const HChar* arr = nameArr_Q_SZ(bitQ, size);
14205 DIP("fcvt%c%c %s.%s, %s.%s\n", ch, bitU == 1 ? 'u' : 's',
14206 nameQReg128(dd), arr, nameQReg128(nn), arr);
14207 return True;
14210 if (size == X10 && opcode == BITS5(1,1,1,0,0)) {
14211 /* -------- 0,10,11100: URECPE 4s_4s, 2s_2s -------- */
14212 /* -------- 1,10,11100: URSQRTE 4s_4s, 2s_2s -------- */
14213 Bool isREC = bitU == 0;
14214 IROp op = isREC ? Iop_RecipEst32Ux4 : Iop_RSqrtEst32Ux4;
14215 IRTemp res = newTempV128();
14216 assign(res, unop(op, getQReg128(nn)));
14217 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
14218 const HChar* nm = isREC ? "urecpe" : "ursqrte";
14219 const HChar* arr = nameArr_Q_SZ(bitQ, size);
14220 DIP("%s %s.%s, %s.%s\n", nm,
14221 nameQReg128(dd), arr, nameQReg128(nn), arr);
14222 return True;
14225 if (size <= X01 && opcode == BITS5(1,1,1,0,1)) {
14226 /* -------- 0,0x,11101: SCVTF -------- */
14227 /* -------- 1,0x,11101: UCVTF -------- */
14228 /* 31 28 22 21 15 9 4
14229 0q0 01110 0 sz 1 00001 110110 n d SCVTF Vd, Vn
14230 0q1 01110 0 sz 1 00001 110110 n d UCVTF Vd, Vn
14231 with laneage:
14232 case sz:Q of 00 -> 2S, zero upper, 01 -> 4S, 10 -> illegal, 11 -> 2D
14234 Bool isQ = bitQ == 1;
14235 Bool isU = bitU == 1;
14236 Bool isF64 = (size & 1) == 1;
14237 if (isQ || !isF64) {
14238 IRType tyF = Ity_INVALID, tyI = Ity_INVALID;
14239 UInt nLanes = 0;
14240 Bool zeroHI = False;
14241 const HChar* arrSpec = NULL;
14242 Bool ok = getLaneInfo_Q_SZ(&tyI, &tyF, &nLanes, &zeroHI, &arrSpec,
14243 isQ, isF64 );
14244 IROp iop = isU ? (isF64 ? Iop_I64UtoF64 : Iop_I32UtoF32)
14245 : (isF64 ? Iop_I64StoF64 : Iop_I32StoF32);
14246 IRTemp rm = mk_get_IR_rounding_mode();
14247 UInt i;
14248 vassert(ok); /* the 'if' above should ensure this */
14249 for (i = 0; i < nLanes; i++) {
14250 putQRegLane(dd, i,
14251 binop(iop, mkexpr(rm), getQRegLane(nn, i, tyI)));
14253 if (zeroHI) {
14254 putQRegLane(dd, 1, mkU64(0));
14256 DIP("%ccvtf %s.%s, %s.%s\n", isU ? 'u' : 's',
14257 nameQReg128(dd), arrSpec, nameQReg128(nn), arrSpec);
14258 return True;
14260 /* else fall through */
14263 if (size >= X10 && opcode == BITS5(1,1,1,0,1)) {
14264 /* -------- 0,1x,11101: FRECPE 2d_2d, 4s_4s, 2s_2s -------- */
14265 /* -------- 1,1x,11101: FRSQRTE 2d_2d, 4s_4s, 2s_2s -------- */
14266 Bool isSQRT = bitU == 1;
14267 Bool isD = (size & 1) == 1;
14268 IROp op = isSQRT ? (isD ? Iop_RSqrtEst64Fx2 : Iop_RSqrtEst32Fx4)
14269 : (isD ? Iop_RecipEst64Fx2 : Iop_RecipEst32Fx4);
14270 if (bitQ == 0 && isD) return False; // implied 1d case
14271 IRTemp resV = newTempV128();
14272 assign(resV, unop(op, getQReg128(nn)));
14273 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
14274 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
14275 DIP("%s %s.%s, %s.%s\n", isSQRT ? "frsqrte" : "frecpe",
14276 nameQReg128(dd), arr, nameQReg128(nn), arr);
14277 return True;
14280 if (bitU == 1 && size >= X10 && opcode == BITS5(1,1,1,1,1)) {
14281 /* -------- 1,1x,11111: FSQRT 2d_2d, 4s_4s, 2s_2s -------- */
14282 Bool isD = (size & 1) == 1;
14283 IROp op = isD ? Iop_Sqrt64Fx2 : Iop_Sqrt32Fx4;
14284 if (bitQ == 0 && isD) return False; // implied 1d case
14285 IRTemp resV = newTempV128();
14286 assign(resV, binop(op, mkexpr(mk_get_IR_rounding_mode()),
14287 getQReg128(nn)));
14288 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
14289 const HChar* arr = bitQ == 0 ? "2s" : (size == X11 ? "2d" : "4s");
14290 DIP("%s %s.%s, %s.%s\n", "fsqrt",
14291 nameQReg128(dd), arr, nameQReg128(nn), arr);
14292 return True;
14295 return False;
14296 # undef INSN
14300 static
14301 Bool dis_AdvSIMD_two_reg_misc_fp16(/*MB_OUT*/DisResult* dres, UInt insn,
14302 const VexArchInfo* archinfo)
14304 /* This decode function only handles instructions with half-precision
14305 floating-point (fp16) operands.
14307 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
14308 return False;
14310 /* 31 30 29 28 23 21 16 11 9 4
14311 0 Q U 01110 size 11100 opcode 10 n d
14312 Decode fields: U,size,opcode
14314 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14315 if (INSN(31,31) != 0
14316 || INSN(28,24) != BITS5(0,1,1,1,0)
14317 || INSN(21,17) != BITS5(1,1,1,0,0)
14318 || INSN(11,10) != BITS2(1,0)) {
14319 return False;
14321 UInt bitQ = INSN(30,30);
14322 UInt bitU = INSN(29,29);
14323 UInt size = INSN(23,22);
14324 UInt opcode = INSN(16,12);
14325 UInt nn = INSN(9,5);
14326 UInt dd = INSN(4,0);
14327 vassert(size < 4);
14329 if (size == X11 && opcode == BITS5(0,1,1,1,1)) {
14330 /* -------- Q,0,11,01111: FABS 4h_4h, 8h_8h -------- */
14331 /* -------- Q,1,11,01111: FNEG 4h_4h, 8h_8h -------- */
14332 Bool isFNEG = bitU == 1;
14333 IROp op = isFNEG ? Iop_Neg16Fx8 : Iop_Abs16Fx8;
14334 IRTemp res = newTempV128();
14335 assign(res, unop(op, getQReg128(nn)));
14336 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
14337 const HChar* arr = bitQ == 0 ? "4h" : "8h";
14338 DIP("%s %s.%s, %s.%s\n", isFNEG ? "fneg" : "fabs",
14339 nameQReg128(dd), arr, nameQReg128(nn), arr);
14340 return True;
14343 if (bitU == 1 && size == X11 && opcode == BITS5(1,1,1,1,1)) {
14344 /* -------- 1,11,11111: FSQRT 4h_4h, 8h_8h -------- */
14345 IRTemp resV = newTempV128();
14346 assign(resV, binop(Iop_Sqrt16Fx8, mkexpr(mk_get_IR_rounding_mode()),
14347 getQReg128(nn)));
14348 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, resV));
14349 const HChar* arr = bitQ == 0 ? "4h" : "8h";
14350 DIP("%s %s.%s, %s.%s\n", "fsqrt",
14351 nameQReg128(dd), arr, nameQReg128(nn), arr);
14352 return True;
14355 /* Decoding FCM<condition> based on opcode and bitU. ix is used to select
14356 * the <condition>.
14358 UInt ix = 0; // Invalid <condition>
14359 switch (opcode) {
14360 case BITS5(0,1,1,0,1): ix = (bitU == 1) ? 4 : 1; break; // FCMLE=4,FCMEQ=1
14361 case BITS5(0,1,1,0,0): ix = (bitU == 1) ? 5 : 2; break; // FCMGE=5,FCMGT=2
14362 case BITS5(0,1,1,1,0): if (bitU == 0) ix = 3; break; // FCMLT=3
14363 default: break;
14365 if (ix > 0) {
14366 /* -------- 0,01101 FCMEQ 4h_4h,8h_8h _#0.0 (ix 1) -------- */
14367 /* -------- 0,01100 FCMGT 4h_4h,8h_8h _#0.0 (ix 2) -------- */
14368 /* -------- 0,01110 FCMLT 4h_4h,8h_8h _#0.0 (ix 3) -------- */
14369 /* -------- 1,01101 FCMLE 4h_4h,8h_8h _#0.0 (ix 4) -------- */
14370 /* -------- 1,01100 FCMGE 4h_4h,8h_8h _#0.0 (ix 5) -------- */
14371 IROp opCmp = Iop_INVALID;
14372 Bool swap = False;
14373 const HChar* nm = "??";
14374 switch (ix) {
14375 case 1: nm = "fcmeq"; opCmp = Iop_CmpEQ16Fx8; break;
14376 case 2: nm = "fcmgt"; opCmp = Iop_CmpLT16Fx8; swap = True; break;
14377 case 3: nm = "fcmlt"; opCmp = Iop_CmpLT16Fx8; break;
14378 case 4: nm = "fcmle"; opCmp = Iop_CmpLE16Fx8; break;
14379 case 5: nm = "fcmge"; opCmp = Iop_CmpLE16Fx8; swap = True; break;
14380 default: vassert(0);
14382 IRExpr* zero = mkV128(0x0000);
14383 IRTemp res = newTempV128();
14384 assign(res, swap ? binop(opCmp, zero, getQReg128(nn))
14385 : binop(opCmp, getQReg128(nn), zero));
14386 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
14387 const HChar* arr = bitQ == 0 ? "4h" : "8h";
14388 DIP("%s %s.%s, %s.%s, #0.0\n", nm,
14389 nameQReg128(dd), arr, nameQReg128(nn), arr);
14390 return True;
14393 return False;
14394 # undef INSN
14397 static
14398 Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
14400 /* 31 28 23 21 20 19 15 11 9 4
14401 0 Q U 01111 size L M m opcode H 0 n d
14402 Decode fields are: u,size,opcode
14403 M is really part of the mm register number. Individual
14404 cases need to inspect L and H though.
14406 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14407 if (INSN(31,31) != 0
14408 || INSN(28,24) != BITS5(0,1,1,1,1) || INSN(10,10) != 0) {
14409 return False;
14411 UInt bitQ = INSN(30,30);
14412 UInt bitU = INSN(29,29);
14413 UInt size = INSN(23,22);
14414 UInt bitL = INSN(21,21);
14415 UInt bitM = INSN(20,20);
14416 UInt mmLO4 = INSN(19,16);
14417 UInt opcode = INSN(15,12);
14418 UInt bitH = INSN(11,11);
14419 UInt nn = INSN(9,5);
14420 UInt dd = INSN(4,0);
14421 vassert(size < 4);
14422 vassert(bitH < 2 && bitM < 2 && bitL < 2);
14424 if (bitU == 0 && size >= X10
14425 && (opcode == BITS4(0,0,0,1) || opcode == BITS4(0,1,0,1))) {
14426 /* -------- 0,1x,0001 FMLA 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14427 /* -------- 0,1x,0101 FMLS 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14428 if (bitQ == 0 && size == X11) return False; // implied 1d case
14429 Bool isD = (size & 1) == 1;
14430 Bool isSUB = opcode == BITS4(0,1,0,1);
14431 UInt index;
14432 if (!isD) index = (bitH << 1) | bitL;
14433 else if (isD && bitL == 0) index = bitH;
14434 else return False; // sz:L == x11 => unallocated encoding
14435 vassert(index < (isD ? 2 : 4));
14436 IRType ity = isD ? Ity_F64 : Ity_F32;
14437 IRTemp elem = newTemp(ity);
14438 UInt mm = (bitM << 4) | mmLO4;
14439 assign(elem, getQRegLane(mm, index, ity));
14440 IRTemp dupd = math_DUP_TO_V128(elem, ity);
14441 IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
14442 IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
14443 IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
14444 IRTemp rm = mk_get_IR_rounding_mode();
14445 IRTemp t1 = newTempV128();
14446 IRTemp t2 = newTempV128();
14447 // FIXME: double rounding; use FMA primops instead
14448 assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
14449 assign(t2, triop(isSUB ? opSUB : opADD,
14450 mkexpr(rm), getQReg128(dd), mkexpr(t1)));
14451 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
14452 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
14453 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
14454 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
14455 isD ? 'd' : 's', index);
14456 return True;
14459 if (size >= X10 && opcode == BITS4(1,0,0,1)) {
14460 /* -------- 0,1x,1001 FMUL 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14461 /* -------- 1,1x,1001 FMULX 2d_2d_d[], 4s_4s_s[], 2s_2s_s[] -------- */
14462 if (bitQ == 0 && size == X11) return False; // implied 1d case
14463 Bool isD = (size & 1) == 1;
14464 Bool isMULX = bitU == 1;
14465 UInt index;
14466 if (!isD) index = (bitH << 1) | bitL;
14467 else if (isD && bitL == 0) index = bitH;
14468 else return False; // sz:L == x11 => unallocated encoding
14469 vassert(index < (isD ? 2 : 4));
14470 IRType ity = isD ? Ity_F64 : Ity_F32;
14471 IRTemp elem = newTemp(ity);
14472 UInt mm = (bitM << 4) | mmLO4;
14473 assign(elem, getQRegLane(mm, index, ity));
14474 IRTemp dupd = math_DUP_TO_V128(elem, ity);
14475 // KLUDGE: FMULX is treated the same way as FMUL. That can't be right.
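// (Architecturally FMULX differs from FMUL only when one operand is a zero
// and the other an infinity: FMULX then returns +/-2.0 rather than the
// default NaN.  That special case is not handled by this kludge.)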
14476 IRTemp res = newTempV128();
14477 assign(res, triop(isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4,
14478 mkexpr(mk_get_IR_rounding_mode()),
14479 getQReg128(nn), mkexpr(dupd)));
14480 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
14481 const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
14482 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n",
14483 isMULX ? "fmulx" : "fmul", nameQReg128(dd), arr,
14484 nameQReg128(nn), arr, nameQReg128(mm), isD ? 'd' : 's', index);
14485 return True;
14488 if ((bitU == 1 && (opcode == BITS4(0,0,0,0) || opcode == BITS4(0,1,0,0)))
14489 || (bitU == 0 && opcode == BITS4(1,0,0,0))) {
14490 /* -------- 1,xx,0000 MLA s/h variants only -------- */
14491 /* -------- 1,xx,0100 MLS s/h variants only -------- */
14492 /* -------- 0,xx,1000 MUL s/h variants only -------- */
14493 Bool isMLA = opcode == BITS4(0,0,0,0);
14494 Bool isMLS = opcode == BITS4(0,1,0,0);
14495 UInt mm = 32; // invalid
14496 UInt ix = 16; // invalid
14497 switch (size) {
14498 case X00:
14499 return False; // b case is not allowed
14500 case X01:
14501 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
14502 case X10:
14503 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
14504 case X11:
14505 return False; // d case is not allowed
14506 default:
14507 vassert(0);
14509 vassert(mm < 32 && ix < 16);
14510 IROp opMUL = mkVecMUL(size);
14511 IROp opADD = mkVecADD(size);
14512 IROp opSUB = mkVecSUB(size);
14513 HChar ch = size == X01 ? 'h' : 's';
14514 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
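/* The selected element of Vm is (presumably, given the helper's name)
   broadcast to every lane of vecM, so the by-element forms can simply
   reuse the plain vector multiply/accumulate path below. */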
14515 IRTemp vecD = newTempV128();
14516 IRTemp vecN = newTempV128();
14517 IRTemp res = newTempV128();
14518 assign(vecD, getQReg128(dd));
14519 assign(vecN, getQReg128(nn));
14520 IRExpr* prod = binop(opMUL, mkexpr(vecN), mkexpr(vecM));
14521 if (isMLA || isMLS) {
14522 assign(res, binop(isMLA ? opADD : opSUB, mkexpr(vecD), prod));
14523 } else {
14524 assign(res, prod);
14526 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
14527 const HChar* arr = nameArr_Q_SZ(bitQ, size);
14528 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isMLA ? "mla"
14529 : (isMLS ? "mls" : "mul"),
14530 nameQReg128(dd), arr,
14531 nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
14532 return True;
14535 if (opcode == BITS4(1,0,1,0)
14536 || opcode == BITS4(0,0,1,0) || opcode == BITS4(0,1,1,0)) {
14537 /* -------- 0,xx,1010 SMULL s/h variants only -------- */ // 0 (ks)
14538 /* -------- 1,xx,1010 UMULL s/h variants only -------- */ // 0
14539 /* -------- 0,xx,0010 SMLAL s/h variants only -------- */ // 1
14540 /* -------- 1,xx,0010 UMLAL s/h variants only -------- */ // 1
14541 /* -------- 0,xx,0110 SMLSL s/h variants only -------- */ // 2
14542 /* -------- 1,xx,0110 UMLSL s/h variants only -------- */ // 2
14543 /* Widens, and size refers to the narrowed lanes. */
14544 UInt ks = 3;
14545 switch (opcode) {
14546 case BITS4(1,0,1,0): ks = 0; break;
14547 case BITS4(0,0,1,0): ks = 1; break;
14548 case BITS4(0,1,1,0): ks = 2; break;
14549 default: vassert(0);
14551 vassert(ks <= 2);
14552 Bool isU = bitU == 1;
14553 Bool is2 = bitQ == 1;
14554 UInt mm = 32; // invalid
14555 UInt ix = 16; // invalid
14556 switch (size) {
14557 case X00:
14558 return False; // h_b_b[] case is not allowed
14559 case X01:
14560 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
14561 case X10:
14562 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
14563 case X11:
14564 return False; // q_d_d[] case is not allowed
14565 default:
14566 vassert(0);
14568 vassert(mm < 32 && ix < 16);
14569 IRTemp vecN = newTempV128();
14570 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
14571 IRTemp vecD = newTempV128();
14572 assign(vecN, getQReg128(nn));
14573 assign(vecD, getQReg128(dd));
14574 IRTemp res = IRTemp_INVALID;
14575 math_MULL_ACC(&res, is2, isU, size, "mas"[ks],
14576 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
14577 putQReg128(dd, mkexpr(res));
14578 const HChar* nm = ks == 0 ? "mull" : (ks == 1 ? "mlal" : "mlsl");
14579 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
14580 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
14581 HChar ch = size == X01 ? 'h' : 's';
14582 DIP("%c%s%s %s.%s, %s.%s, %s.%c[%u]\n",
14583 isU ? 'u' : 's', nm, is2 ? "2" : "",
14584 nameQReg128(dd), arrWide,
14585 nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
14586 return True;
14589 if (bitU == 0
14590 && (opcode == BITS4(1,0,1,1)
14591 || opcode == BITS4(0,0,1,1) || opcode == BITS4(0,1,1,1))) {
14592 /* -------- 0,xx,1011 SQDMULL s/h variants only -------- */ // 0 (ks)
14593 /* -------- 0,xx,0011 SQDMLAL s/h variants only -------- */ // 1
14594 /* -------- 0,xx,0111 SQDMLSL s/h variants only -------- */ // 2
14595 /* Widens, and size refers to the narrowed lanes. */
14596 UInt ks = 3;
14597 switch (opcode) {
14598 case BITS4(1,0,1,1): ks = 0; break;
14599 case BITS4(0,0,1,1): ks = 1; break;
14600 case BITS4(0,1,1,1): ks = 2; break;
14601 default: vassert(0);
14603 vassert(ks <= 2);
14604 Bool is2 = bitQ == 1;
14605 UInt mm = 32; // invalid
14606 UInt ix = 16; // invalid
14607 switch (size) {
14608 case X00:
14609 return False; // h_b_b[] case is not allowed
14610 case X01:
14611 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
14612 case X10:
14613 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
14614 case X11:
14615 return False; // q_d_d[] case is not allowed
14616 default:
14617 vassert(0);
14619 vassert(mm < 32 && ix < 16);
14620 IRTemp vecN, vecD, res, sat1q, sat1n, sat2q, sat2n;
14621 vecN = vecD = res = sat1q = sat1n = sat2q = sat2n = IRTemp_INVALID;
14622 newTempsV128_2(&vecN, &vecD);
14623 assign(vecN, getQReg128(nn));
14624 IRTemp vecM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
14625 assign(vecD, getQReg128(dd));
14626 math_SQDMULL_ACC(&res, &sat1q, &sat1n, &sat2q, &sat2n,
14627 is2, size, "mas"[ks],
14628 vecN, vecM, ks == 0 ? IRTemp_INVALID : vecD);
14629 putQReg128(dd, mkexpr(res));
14630 vassert(sat1q != IRTemp_INVALID && sat1n != IRTemp_INVALID);
14631 updateQCFLAGwithDifference(sat1q, sat1n);
14632 if (sat2q != IRTemp_INVALID || sat2n != IRTemp_INVALID) {
14633 updateQCFLAGwithDifference(sat2q, sat2n);
14635 const HChar* nm = ks == 0 ? "sqdmull"
14636 : (ks == 1 ? "sqdmlal" : "sqdmlsl");
14637 const HChar* arrNarrow = nameArr_Q_SZ(bitQ, size);
14638 const HChar* arrWide = nameArr_Q_SZ(1, size+1);
14639 HChar ch = size == X01 ? 'h' : 's';
14640 DIP("%s%s %s.%s, %s.%s, %s.%c[%u]\n",
14641 nm, is2 ? "2" : "",
14642 nameQReg128(dd), arrWide,
14643 nameQReg128(nn), arrNarrow, nameQReg128(mm), ch, ix);
14644 return True;
14647 if (bitU == 0 && (opcode == BITS4(1,1,0,0) || opcode == BITS4(1,1,0,1))) {
14648 /* -------- 0,xx,1100 SQDMULH s and h variants only -------- */
14649 /* -------- 0,xx,1101 SQRDMULH s and h variants only -------- */
14650 UInt mm = 32; // invalid
14651 UInt ix = 16; // invalid
14652 switch (size) {
14653 case X00:
14654 return False; // b case is not allowed
14655 case X01:
14656 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
14657 case X10:
14658 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
14659 case X11:
14660 return False; // q case is not allowed
14661 default:
14662 vassert(0);
14664 vassert(mm < 32 && ix < 16);
14665 Bool isR = opcode == BITS4(1,1,0,1);
14666 IRTemp res, sat1q, sat1n, vN, vM;
14667 res = sat1q = sat1n = vN = vM = IRTemp_INVALID;
14668 vN = newTempV128();
14669 assign(vN, getQReg128(nn));
14670 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
14671 math_SQDMULH(&res, &sat1q, &sat1n, isR, size, vN, vM);
14672 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
14673 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
14674 updateQCFLAGwithDifferenceZHI(sat1q, sat1n, opZHI);
14675 const HChar* nm = isR ? "sqrdmulh" : "sqdmulh";
14676 const HChar* arr = nameArr_Q_SZ(bitQ, size);
14677 HChar ch = size == X01 ? 'h' : 's';
14678 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
14679 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
14680 return True;
14683 if (bitU == 1 && (opcode == BITS4(1,1,0,1) || opcode == BITS4(1,1,1,1))) {
14684 /* -------- 1,xx,1101 SQRDMLAH s and h variants only -------- */
14685 /* -------- 1,xx,1111 SQRDMLSH s and h variants only -------- */
14686 UInt mm = 32; // invalid
14687 UInt ix = 16; // invalid
14688 switch (size) {
14689 case X00:
14690 return False; // b case is not allowed
14691 case X01: // h
14692 mm = mmLO4; ix = (bitH << 2) | (bitL << 1) | (bitM << 0); break;
14693 case X10: // s
14694 mm = (bitM << 4) | mmLO4; ix = (bitH << 1) | (bitL << 0); break;
14695 case X11:
14696 return False; // d case is not allowed
14697 default:
14698 vassert(0);
14700 vassert(mm < 32 && ix < 16);
14702 IRTemp res, res_nosat, vD, vN, vM;
14703 res = res_nosat = vD = vN = vM = IRTemp_INVALID;
14704 newTempsV128_2(&vD, &vN);
14705 assign(vD, getQReg128(dd));
14706 assign(vN, getQReg128(nn));
14708 vM = math_DUP_VEC_ELEM(getQReg128(mm), size, ix);
14709 Bool isAdd = opcode == BITS4(1,1,0,1);
14710 math_SQRDMLAH(&res, &res_nosat, isAdd, size, vD, vN, vM);
14711 IROp opZHI = bitQ == 0 ? Iop_ZeroHI64ofV128 : Iop_INVALID;
14712 updateQCFLAGwithDifferenceZHI(res, res_nosat, opZHI);
14713 putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
14715 const HChar* arr = nameArr_Q_SZ(bitQ, size);
14716 const HChar* nm = isAdd ? "sqrdmlah" : "sqrdmlsh";
14717 HChar ch = size == X01 ? 'h' : 's';
14718 DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", nm,
14719 nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), ch, ix);
14720 return True;
14723 return False;
14724 # undef INSN
14728 static
14729 Bool dis_AdvSIMD_crypto_aes(/*MB_OUT*/DisResult* dres, UInt insn)
14731 /* 31 23 21 16 11 9 4
14732 0100 1110 size 10100 opcode 10 n d
14733 Decode fields are: size,opcode
14734 Size is always 00 in ARMv8, it appears.
14736 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14737 if (INSN(31,24) != BITS8(0,1,0,0,1,1,1,0)
14738 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
14739 return False;
14741 UInt size = INSN(23,22);
14742 UInt opcode = INSN(16,12);
14743 UInt nn = INSN(9,5);
14744 UInt dd = INSN(4,0);
14746 if (size == BITS2(0,0)
14747 && (opcode == BITS5(0,0,1,0,0) || opcode == BITS5(0,0,1,0,1))) {
14748 /* -------- 00,00100: AESE Vd.16b, Vn.16b -------- */
14749 /* -------- 00,00101: AESD Vd.16b, Vn.16b -------- */
14750 Bool isD = opcode == BITS5(0,0,1,0,1);
14751 IRTemp op1 = newTemp(Ity_V128);
14752 IRTemp op2 = newTemp(Ity_V128);
14753 IRTemp xord = newTemp(Ity_V128);
14754 IRTemp res = newTemp(Ity_V128);
14755 void* helper = isD ? &arm64g_dirtyhelper_AESD
14756 : &arm64g_dirtyhelper_AESE;
14757 const HChar* hname = isD ? "arm64g_dirtyhelper_AESD"
14758 : "arm64g_dirtyhelper_AESE";
14759 assign(op1, getQReg128(dd));
14760 assign(op2, getQReg128(nn));
14761 assign(xord, binop(Iop_XorV128, mkexpr(op1), mkexpr(op2)));
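/* Architecturally AESE/AESD start with AddRoundKey, i.e. an XOR of the
   state (Vd) with the round key (Vn).  That XOR is done inline here, and
   the dirty helper performs the remaining SubBytes/ShiftRows steps (or
   their inverses for AESD) on the result. */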
14762 IRDirty* di
14763 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
14764 mkIRExprVec_3(
14765 IRExpr_VECRET(),
14766 unop(Iop_V128HIto64, mkexpr(xord)),
14767 unop(Iop_V128to64, mkexpr(xord)) ) );
14768 stmt(IRStmt_Dirty(di));
14769 putQReg128(dd, mkexpr(res));
14770 DIP("aes%c %s.16b, %s.16b\n", isD ? 'd' : 'e',
14771 nameQReg128(dd), nameQReg128(nn));
14772 return True;
14775 if (size == BITS2(0,0)
14776 && (opcode == BITS5(0,0,1,1,0) || opcode == BITS5(0,0,1,1,1))) {
14777 /* -------- 00,00110: AESMC Vd.16b, Vn.16b -------- */
14778 /* -------- 00,00111: AESIMC Vd.16b, Vn.16b -------- */
14779 Bool isI = opcode == BITS5(0,0,1,1,1);
14780 IRTemp src = newTemp(Ity_V128);
14781 IRTemp res = newTemp(Ity_V128);
14782 void* helper = isI ? &arm64g_dirtyhelper_AESIMC
14783 : &arm64g_dirtyhelper_AESMC;
14784 const HChar* hname = isI ? "arm64g_dirtyhelper_AESIMC"
14785 : "arm64g_dirtyhelper_AESMC";
14786 assign(src, getQReg128(nn));
14787 IRDirty* di
14788 = unsafeIRDirty_1_N( res, 0/*regparms*/, hname, helper,
14789 mkIRExprVec_3(
14790 IRExpr_VECRET(),
14791 unop(Iop_V128HIto64, mkexpr(src)),
14792 unop(Iop_V128to64, mkexpr(src)) ) );
14793 stmt(IRStmt_Dirty(di));
14794 putQReg128(dd, mkexpr(res));
14795 DIP("aes%s %s.16b, %s.16b\n", isI ? "imc" : "mc",
14796 nameQReg128(dd), nameQReg128(nn));
14797 return True;
14800 return False;
14801 # undef INSN
14805 static
14806 Bool dis_AdvSIMD_crypto_three_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
14808 /* 31 27 23 21 20 15 14 11 9 4
14809 0101 1110 sz 0 m 0 opc 00 n d
14810 Decode fields are: sz,opc
14812 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14813 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0) || INSN(21,21) != 0
14814 || INSN(15,15) != 0 || INSN(11,10) != BITS2(0,0)) {
14815 return False;
14817 UInt sz = INSN(23,22);
14818 UInt mm = INSN(20,16);
14819 UInt opc = INSN(14,12);
14820 UInt nn = INSN(9,5);
14821 UInt dd = INSN(4,0);
14822 if (sz == BITS2(0,0) && opc <= BITS3(1,1,0)) {
14823 /* -------- 00,000 SHA1C Qd, Sn, Vm.4S -------- */
14824 /* -------- 00,001 SHA1P Qd, Sn, Vm.4S -------- */
14825 /* -------- 00,010 SHA1M Qd, Sn, Vm.4S -------- */
14826 /* -------- 00,011 SHA1SU0 Vd.4S, Vn.4S, Vm.4S -------- */
14827 /* -------- 00,100 SHA256H Qd, Qn, Vm.4S -------- */
14828 /* -------- 00,101 SHA256H2 Qd, Qn, Vm.4S -------- */
14829 /* -------- 00,110 SHA256SU1 Vd.4S, Vn.4S, Vm.4S -------- */
14830 vassert(opc < 7);
14831 const HChar* inames[7]
14832 = { "sha1c", "sha1p", "sha1m", "sha1su0",
14833 "sha256h", "sha256h2", "sha256su1" };
14834 void(*helpers[7])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
14835 = { &arm64g_dirtyhelper_SHA1C, &arm64g_dirtyhelper_SHA1P,
14836 &arm64g_dirtyhelper_SHA1M, &arm64g_dirtyhelper_SHA1SU0,
14837 &arm64g_dirtyhelper_SHA256H, &arm64g_dirtyhelper_SHA256H2,
14838 &arm64g_dirtyhelper_SHA256SU1 };
14839 const HChar* hnames[7]
14840 = { "arm64g_dirtyhelper_SHA1C", "arm64g_dirtyhelper_SHA1P",
14841 "arm64g_dirtyhelper_SHA1M", "arm64g_dirtyhelper_SHA1SU0",
14842 "arm64g_dirtyhelper_SHA256H", "arm64g_dirtyhelper_SHA256H2",
14843 "arm64g_dirtyhelper_SHA256SU1" };
14844 IRTemp vD = newTemp(Ity_V128);
14845 IRTemp vN = newTemp(Ity_V128);
14846 IRTemp vM = newTemp(Ity_V128);
14847 IRTemp vDhi = newTemp(Ity_I64);
14848 IRTemp vDlo = newTemp(Ity_I64);
14849 IRTemp vNhiPre = newTemp(Ity_I64);
14850 IRTemp vNloPre = newTemp(Ity_I64);
14851 IRTemp vNhi = newTemp(Ity_I64);
14852 IRTemp vNlo = newTemp(Ity_I64);
14853 IRTemp vMhi = newTemp(Ity_I64);
14854 IRTemp vMlo = newTemp(Ity_I64);
14855 assign(vD, getQReg128(dd));
14856 assign(vN, getQReg128(nn));
14857 assign(vM, getQReg128(mm));
14858 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
14859 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
14860 assign(vNhiPre, unop(Iop_V128HIto64, mkexpr(vN)));
14861 assign(vNloPre, unop(Iop_V128to64, mkexpr(vN)));
14862 assign(vMhi, unop(Iop_V128HIto64, mkexpr(vM)));
14863 assign(vMlo, unop(Iop_V128to64, mkexpr(vM)));
14864 /* Mask off any bits of the N register operand that aren't actually
14865 needed, so that Memcheck doesn't complain unnecessarily. */
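      /* For SHA1C/SHA1P/SHA1M (opc 000..010) the N operand is really the
         32-bit Sn, so only the low 32 bits of the N register carry data and
         the rest is forced to a defined zero.  The other four insns take the
         full 128-bit Vn/Qn and so use both halves unchanged. */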
14866 switch (opc) {
14867 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
14868 assign(vNhi, mkU64(0));
14869 assign(vNlo, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(vNloPre))));
14870 break;
14871 case BITS3(0,1,1): case BITS3(1,0,0):
14872 case BITS3(1,0,1): case BITS3(1,1,0):
14873 assign(vNhi, mkexpr(vNhiPre));
14874 assign(vNlo, mkexpr(vNloPre));
14875 break;
14876 default:
14877 vassert(0);
14879 IRTemp res = newTemp(Ity_V128);
14880 IRDirty* di
14881 = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
14882 mkIRExprVec_7(
14883 IRExpr_VECRET(),
14884 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
14885 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
14886 stmt(IRStmt_Dirty(di));
14887 putQReg128(dd, mkexpr(res));
14888 switch (opc) {
14889 case BITS3(0,0,0): case BITS3(0,0,1): case BITS3(0,1,0):
14890 DIP("%s q%u, s%u, v%u.4s\n", inames[opc], dd, nn, mm);
14891 break;
14892 case BITS3(0,1,1): case BITS3(1,1,0):
14893 DIP("%s v%u.4s, v%u.4s, v%u.4s\n", inames[opc], dd, nn, mm);
14894 break;
14895 case BITS3(1,0,0): case BITS3(1,0,1):
14896 DIP("%s q%u, q%u, v%u.4s\n", inames[opc], dd, nn, mm);
14897 break;
14898 default:
14899 vassert(0);
14901 return True;
14904 return False;
14905 # undef INSN
14909 static
14910 Bool dis_AdvSIMD_crypto_two_reg_sha(/*MB_OUT*/DisResult* dres, UInt insn)
14912 /* 31 27 23 21 16 11 9 4
14913 0101 1110 sz 10100 opc 10 n d
14914 Decode fields are: sz,opc
14916 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
14917 if (INSN(31,24) != BITS8(0,1,0,1,1,1,1,0)
14918 || INSN(21,17) != BITS5(1,0,1,0,0) || INSN(11,10) != BITS2(1,0)) {
14919 return False;
14921 UInt sz = INSN(23,22);
14922 UInt opc = INSN(16,12);
14923 UInt nn = INSN(9,5);
14924 UInt dd = INSN(4,0);
14925 if (sz == BITS2(0,0) && opc <= BITS5(0,0,0,1,0)) {
14926 /* -------- 00,00000 SHA1H Sd, Sn -------- */
14927 /* -------- 00,00001 SHA1SU1 Vd.4S, Vn.4S -------- */
14928 /* -------- 00,00010 SHA256SU0 Vd.4S, Vn.4S -------- */
14929 vassert(opc < 3);
14930 const HChar* inames[3] = { "sha1h", "sha1su1", "sha256su0" };
14931 IRTemp vD = newTemp(Ity_V128);
14932 IRTemp vN = newTemp(Ity_V128);
14933 IRTemp vDhi = newTemp(Ity_I64);
14934 IRTemp vDlo = newTemp(Ity_I64);
14935 IRTemp vNhi = newTemp(Ity_I64);
14936 IRTemp vNlo = newTemp(Ity_I64);
14937 assign(vD, getQReg128(dd));
14938 assign(vN, getQReg128(nn));
14939 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
14940 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
14941 assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
14942 assign(vNlo, unop(Iop_V128to64, mkexpr(vN)));
14943 /* Mask off any bits of the N register operand that aren't actually
14944 needed, so that Memcheck doesn't complain unnecessarily. Also
14945 construct the calls, given that the helper functions don't take
14946 the same number of arguments. */
14947 IRDirty* di = NULL;
14948 IRTemp res = newTemp(Ity_V128);
14949 switch (opc) {
14950 case BITS5(0,0,0,0,0): {
14951 IRExpr* vNloMasked = unop(Iop_32Uto64,
14952 unop(Iop_64to32, mkexpr(vNlo)));
14953 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
14954 "arm64g_dirtyhelper_SHA1H",
14955 &arm64g_dirtyhelper_SHA1H,
14956 mkIRExprVec_3(
14957 IRExpr_VECRET(),
14958 mkU64(0), vNloMasked) );
14959 break;
14961 case BITS5(0,0,0,0,1):
14962 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
14963 "arm64g_dirtyhelper_SHA1SU1",
14964 &arm64g_dirtyhelper_SHA1SU1,
14965 mkIRExprVec_5(
14966 IRExpr_VECRET(),
14967 mkexpr(vDhi), mkexpr(vDlo),
14968 mkexpr(vNhi), mkexpr(vNlo)) );
14969 break;
14970 case BITS5(0,0,0,1,0):
14971 di = unsafeIRDirty_1_N( res, 0/*regparms*/,
14972 "arm64g_dirtyhelper_SHA256SU0",
14973 &arm64g_dirtyhelper_SHA256SU0,
14974 mkIRExprVec_5(
14975 IRExpr_VECRET(),
14976 mkexpr(vDhi), mkexpr(vDlo),
14977 mkexpr(vNhi), mkexpr(vNlo)) );
14978 break;
14979 default:
14980 vassert(0);
14982 stmt(IRStmt_Dirty(di));
14983 putQReg128(dd, mkexpr(res));
14984 switch (opc) {
14985 case BITS5(0,0,0,0,0):
14986 DIP("%s s%u, s%u\n", inames[opc], dd, nn);
14987 break;
14988 case BITS5(0,0,0,0,1): case BITS5(0,0,0,1,0):
14989 DIP("%s v%u.4s, v%u.4s\n", inames[opc], dd, nn);
14990 break;
14991 default:
14992 vassert(0);
14994 return True;
14997 return False;
14998 # undef INSN
15002 static
15003 Bool dis_AdvSIMD_crypto_three_reg_sha512(/*MB_OUT*/DisResult* dres, UInt insn)
15005 /* 31 27 23 20 15 14 13 11 9 4
15006 1100 1110 011 m 1 o 00 opc n d
15007 Decode fields are: o,opc
15009 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15010 if (INSN(31,21) != BITS11(1,1,0,0,1,1,1,0,0,1,1) || INSN(15,15) != 1
15011 || INSN(13,12) != BITS2(0,0)) {
15012 return False;
15014 UInt mm = INSN(20,16);
15015 UInt bitO = INSN(14,14);
15016 UInt opc = INSN(11,10);
15017 UInt nn = INSN(9,5);
15018 UInt dd = INSN(4,0);
15019 if (bitO == 0 && opc <= BITS2(1,0)) {
15020 /* -------- 0,00 SHA512H Qd, Qn, Vm.2D -------- */
15021 /* -------- 0,01 SHA512H2 Qd, Qn, Vm.2D -------- */
15022 /* -------- 0,10 SHA512SU1 Vd.2D, Vn.2D, Vm.2D -------- */
15023 vassert(opc < 3);
15024 const HChar* inames[3] = { "sha512h", "sha512h2", "sha512su1" };
15025 void(*helpers[3])(V128*,ULong,ULong,ULong,ULong,ULong,ULong)
15026 = { &arm64g_dirtyhelper_SHA512H, &arm64g_dirtyhelper_SHA512H2,
15027 &arm64g_dirtyhelper_SHA512SU1 };
15028 const HChar* hnames[3]
15029 = { "arm64g_dirtyhelper_SHA512H", "arm64g_dirtyhelper_SHA512H2",
15030 "arm64g_dirtyhelper_SHA512SU1" };
15031 IRTemp vD = newTemp(Ity_V128);
15032 IRTemp vN = newTemp(Ity_V128);
15033 IRTemp vM = newTemp(Ity_V128);
15034 IRTemp vDhi = newTemp(Ity_I64);
15035 IRTemp vDlo = newTemp(Ity_I64);
15036 IRTemp vNhi = newTemp(Ity_I64);
15037 IRTemp vNlo = newTemp(Ity_I64);
15038 IRTemp vMhi = newTemp(Ity_I64);
15039 IRTemp vMlo = newTemp(Ity_I64);
15040 assign(vD, getQReg128(dd));
15041 assign(vN, getQReg128(nn));
15042 assign(vM, getQReg128(mm));
15043 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
15044 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
15045 /* vNhi is initialized below. */
15046 assign(vNlo, unop(Iop_V128to64, mkexpr(vN)));
15047 assign(vMhi, unop(Iop_V128HIto64, mkexpr(vM)));
15048 assign(vMlo, unop(Iop_V128to64, mkexpr(vM)));
15049 /* SHA512H2 does not use the upper half of the N register. Mask it off so
15050 that Memcheck doesn't complain unnecessarily. */
15051 switch (opc) {
15052 case BITS2(0,1):
15053 assign(vNhi, mkU64(0));
15054 break;
15055 case BITS2(0,0): case BITS2(1,0):
15056 assign(vNhi, unop(Iop_V128HIto64, mkexpr(vN)));
15057 break;
15058 default:
15059 vassert(0);
15061 IRTemp res = newTemp(Ity_V128);
15062 IRDirty* di
15063 = unsafeIRDirty_1_N( res, 0/*regparms*/, hnames[opc], helpers[opc],
15064 mkIRExprVec_7(
15065 IRExpr_VECRET(),
15066 mkexpr(vDhi), mkexpr(vDlo), mkexpr(vNhi),
15067 mkexpr(vNlo), mkexpr(vMhi), mkexpr(vMlo)));
15068 stmt(IRStmt_Dirty(di));
15069 putQReg128(dd, mkexpr(res));
15070 switch (opc) {
15071 case BITS2(0,0): case BITS2(0,1):
15072 DIP("%s q%u, q%u, v%u.2d\n", inames[opc], dd, nn, mm);
15073 break;
15074 case BITS2(1,0):
15075 DIP("%s v%u.2d, v%u.2d, v%u.2d\n", inames[opc], dd, nn, mm);
15076 break;
15077 default:
15078 vassert(0);
15080 return True;
15083 return False;
15084 # undef INSN
15088 static
15089 Bool dis_AdvSIMD_crypto_two_reg_sha512(/*MB_OUT*/DisResult* dres, UInt insn)
15091 /* 31 27 23 19 15 11 9 4
15092 1100 1110 1100 0000 1000 opc n d
15093 Decode fields are: opc
15095 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15096 if (INSN(31,20) != BITS12(1,1,0,0,1,1,1,0,1,1,0,0)
15097 || INSN(19,12) != BITS8(0,0,0,0,1,0,0,0)) {
15098 return False;
15100 UInt opc = INSN(11,10);
15101 UInt nn = INSN(9,5);
15102 UInt dd = INSN(4,0);
15103 if (opc == BITS2(0,0)) {
15104 /* -------- 00 SHA512SU0 Vd.2D, Vn.2D -------- */
15105 IRTemp vD = newTemp(Ity_V128);
15106 IRTemp vN = newTemp(Ity_V128);
15107 IRTemp vDhi = newTemp(Ity_I64);
15108 IRTemp vDlo = newTemp(Ity_I64);
15109 IRTemp vNhi = newTemp(Ity_I64);
15110 IRTemp vNlo = newTemp(Ity_I64);
15111 assign(vD, getQReg128(dd));
15112 assign(vN, getQReg128(nn));
15113 /* SHA512SU0 ignores the upper half of the N register. Mask it off, so
15114 that Memcheck doesn't complain unnecessarily. */
15115 assign(vDhi, unop(Iop_V128HIto64, mkexpr(vD)));
15116 assign(vDlo, unop(Iop_V128to64, mkexpr(vD)));
15117 assign(vNhi, mkU64(0));
15118 assign(vNlo, unop(Iop_V128to64, mkexpr(vN)));
15119 IRTemp res = newTemp(Ity_V128);
15120 IRDirty* di = unsafeIRDirty_1_N( res, 0/*regparms*/,
15121 "arm64g_dirtyhelper_SHA512SU0",
15122 &arm64g_dirtyhelper_SHA512SU0,
15123 mkIRExprVec_5(
15124 IRExpr_VECRET(),
15125 mkexpr(vDhi), mkexpr(vDlo),
15126 mkexpr(vNhi), mkexpr(vNlo)) );
15127 stmt(IRStmt_Dirty(di));
15128 putQReg128(dd, mkexpr(res));
15129 DIP("sha512su0 v%u.2d, v%u.2d\n", dd, nn);
15130 return True;
15133 return False;
15134 # undef INSN
15138 static
15139 Bool dis_AdvSIMD_fp_compare(/*MB_OUT*/DisResult* dres, UInt insn)
15141 /* 31 28 23 21 20 15 13 9 4
15142 000 11110 ty 1 m op 1000 n opcode2
15143 The first 3 bits are really "M 0 S", but M and S are always zero.
15144 Decode fields are: ty,op,opcode2
15146 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15147 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
15148 || INSN(21,21) != 1 || INSN(13,10) != BITS4(1,0,0,0)) {
15149 return False;
15151 UInt ty = INSN(23,22);
15152 UInt mm = INSN(20,16);
15153 UInt op = INSN(15,14);
15154 UInt nn = INSN(9,5);
15155 UInt opcode2 = INSN(4,0);
15156 vassert(ty < 4);
15158 if (ty <= X01 && op == X00
15159 && (opcode2 & BITS5(0,0,1,1,1)) == BITS5(0,0,0,0,0)) {
15160 /* -------- 0x,00,00000 FCMP d_d, s_s -------- */
15161 /* -------- 0x,00,01000 FCMP d_#0, s_#0 -------- */
15162 /* -------- 0x,00,10000 FCMPE d_d, s_s -------- */
15163 /* -------- 0x,00,11000 FCMPE d_#0, s_#0 -------- */
15164 /* 31 23 20 15 9 4
15165 000 11110 01 1 m 00 1000 n 10 000 FCMPE Dn, Dm
15166 000 11110 01 1 00000 00 1000 n 11 000 FCMPE Dn, #0.0
15167 000 11110 01 1 m 00 1000 n 00 000 FCMP Dn, Dm
15168 000 11110 01 1 00000 00 1000 n 01 000 FCMP Dn, #0.0
15170 000 11110 00 1 m 00 1000 n 10 000 FCMPE Sn, Sm
15171 000 11110 00 1 00000 00 1000 n 11 000 FCMPE Sn, #0.0
15172 000 11110 00 1 m 00 1000 n 00 000 FCMP Sn, Sm
15173 000 11110 00 1 00000 00 1000 n 01 000 FCMP Sn, #0.0
15175 FCMPE generates Invalid Operation exn if either arg is any kind
15176 of NaN. FCMP generates Invalid Operation exn if either arg is a
15177 signalling NaN. We ignore this detail here and produce the same
15178 IR for both.
15180 Bool isD = (ty & 1) == 1;
15181 Bool isCMPE = (opcode2 & 16) == 16;
15182 Bool cmpZero = (opcode2 & 8) == 8;
15183 IRType ity = isD ? Ity_F64 : Ity_F32;
15184 Bool valid = True;
15185 if (cmpZero && mm != 0) valid = False;
15186 if (valid) {
15187 IRTemp argL = newTemp(ity);
15188 IRTemp argR = newTemp(ity);
15189 IRTemp irRes = newTemp(Ity_I32);
15190 assign(argL, getQRegLO(nn, ity));
15191 assign(argR,
15192 cmpZero
15193 ? (IRExpr_Const(isD ? IRConst_F64i(0) : IRConst_F32i(0)))
15194 : getQRegLO(mm, ity));
15195 assign(irRes, binop(isD ? Iop_CmpF64 : Iop_CmpF32,
15196 mkexpr(argL), mkexpr(argR)));
15197 IRTemp nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
15198 IRTemp nzcv_28x0 = newTemp(Ity_I64);
15199 assign(nzcv_28x0, binop(Iop_Shl64, mkexpr(nzcv), mkU8(28)));
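         /* The NZCV nibble produced by mk_convert_IRCmpF64Result_to_NZCV is
            shifted left by 28 so the flags end up at bits 31:28 -- the
            PSTATE N,Z,C,V positions -- before being copied into the flags
            thunk by setFlags_COPY. */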
15200 setFlags_COPY(nzcv_28x0);
15201 DIP("fcmp%s %s, %s\n", isCMPE ? "e" : "", nameQRegLO(nn, ity),
15202 cmpZero ? "#0.0" : nameQRegLO(mm, ity));
15203 return True;
15205 return False;
15208 return False;
15209 # undef INSN
15213 static
15214 Bool dis_AdvSIMD_fp_conditional_compare(/*MB_OUT*/DisResult* dres, UInt insn,
15215 const VexArchInfo* archinfo, Bool sigill_diag)
15217 /* 31 28 23 21 20 15 11 9 4 3
15218 000 11110 ty 1 m cond 01 n op nzcv
15219 The first 3 bits are really "M 0 S", but M and S are always zero.
15220 Decode fields are: ty,op
15222 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15223 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
15224 || INSN(21,21) != 1 || INSN(11,10) != BITS2(0,1)) {
15225 return False;
15227 UInt ty = INSN(23,22);
15228 UInt mm = INSN(20,16);
15229 UInt cond = INSN(15,12);
15230 UInt nn = INSN(9,5);
15231 UInt op = INSN(4,4);
15232 UInt nzcv = INSN(3,0);
15233 vassert(ty < 4 && op <= 1);
15235 /* -------- 00,0 FCCMP s_s -------- */
15236 /* -------- 00,1 FCCMPE s_s -------- */
15237 /* -------- 01,0 FCCMP d_d -------- */
15238 /* -------- 01,1 FCCMPE d_d -------- */
15239 /* -------- 11,0 FCCMP h_h -------- */
15240 /* -------- 11,1 FCCMPE h_h -------- */
15242 /* FCCMPE generates Invalid Operation exn if either arg is any kind
15243 of NaN. FCCMP generates Invalid Operation exn if either arg is a
15244 signalling NaN. We ignore this detail here and produce the same
15245 IR for both.
15247 Bool isCMPE = op == 1;
15248 IRType ity;
15249 IROp irop;
15250 if (ty == 0) {
15251 ity = Ity_F32;
15252 irop = Iop_CmpF32;
15254 else if (ty == 1) {
15255 ity = Ity_F64;
15256 irop = Iop_CmpF64;
15258 else if (ty == 3) {
15259 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
15260 return False;
15261 ity = Ity_F16;
15262 irop = Iop_CmpF16;
15264 else {
15265 /* ty = 2 is an illegal encoding */
15266 if (sigill_diag) {
15267 vex_printf("ARM64 front end: dis_AdvSIMD_fp_conditional_compare\n");
15269 return False;
15271 IRTemp argL = newTemp(ity);
15272 IRTemp argR = newTemp(ity);
15273 IRTemp irRes = newTemp(Ity_I32);
15274 assign(argL, getQRegLO(nn, ity));
15275 assign(argR, getQRegLO(mm, ity));
15276 assign(irRes, binop(irop, mkexpr(argL), mkexpr(argR)));
15277 IRTemp condT = newTemp(Ity_I1);
15278 assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));
15279 IRTemp nzcvT = mk_convert_IRCmpF64Result_to_NZCV(irRes);
15281 IRTemp nzcvT_28x0 = newTemp(Ity_I64);
15282 assign(nzcvT_28x0, binop(Iop_Shl64, mkexpr(nzcvT), mkU8(28)));
15284 IRExpr* nzcvF_28x0 = mkU64(((ULong)nzcv) << 28);
15286 IRTemp nzcv_28x0 = newTemp(Ity_I64);
15287 assign(nzcv_28x0, IRExpr_ITE(mkexpr(condT),
15288 mkexpr(nzcvT_28x0), nzcvF_28x0));
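      /* So: if |cond| holds, the flags produced by the FP comparison are
         used; otherwise the literal nzcv immediate from the instruction is
         used instead.  In both cases the value is positioned at bits 31:28. */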
15289 setFlags_COPY(nzcv_28x0);
15290 DIP("fccmp%s %s, %s, #%u, %s\n", isCMPE ? "e" : "",
15291 nameQRegLO(nn, ity), nameQRegLO(mm, ity), nzcv, nameCC(cond));
15292 return True;
15294 # undef INSN
15298 static
15299 Bool dis_AdvSIMD_fp_conditional_select(/*MB_OUT*/DisResult* dres, UInt insn)
15301 /* 31 23 21 20 15 11 9 5
15302 000 11110 ty 1 m cond 11 n d
15303 The first 3 bits are really "M 0 S", but M and S are always zero.
15304 Decode fields: ty
15306 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15307 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0) || INSN(21,21) != 1
15308 || INSN(11,10) != BITS2(1,1)) {
15309 return False;
15311 UInt ty = INSN(23,22);
15312 UInt mm = INSN(20,16);
15313 UInt cond = INSN(15,12);
15314 UInt nn = INSN(9,5);
15315 UInt dd = INSN(4,0);
15316 if (ty <= X01) {
15317 /* -------- 00: FCSEL s_s -------- */
15318       /* -------- 01: FCSEL d_d -------- */
15319 IRType ity = ty == X01 ? Ity_F64 : Ity_F32;
15320 IRTemp srcT = newTemp(ity);
15321 IRTemp srcF = newTemp(ity);
15322 IRTemp res = newTemp(ity);
15323 assign(srcT, getQRegLO(nn, ity));
15324 assign(srcF, getQRegLO(mm, ity));
15325 assign(res, IRExpr_ITE(
15326 unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
15327 mkexpr(srcT), mkexpr(srcF)));
15328 putQReg128(dd, mkV128(0x0000));
15329 putQRegLO(dd, mkexpr(res));
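      /* Writing a scalar FP result zeroes the rest of the 128-bit vector
         register (AArch64 scalar FP semantics), hence the mkV128(0) store
         followed by the low-lane write.  The same pattern is used by the
         other scalar FP cases below. */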
15330 DIP("fcsel %s, %s, %s, %s\n",
15331 nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity),
15332 nameCC(cond));
15333 return True;
15335 return False;
15336 # undef INSN
15340 static
15341 Bool dis_AdvSIMD_fp_data_proc_1_source(/*MB_OUT*/DisResult* dres, UInt insn)
15343 /* 31 28 23 21 20 14 9 4
15344 000 11110 ty 1 opcode 10000 n d
15345 The first 3 bits are really "M 0 S", but M and S are always zero.
15346 Decode fields: ty,opcode
15348 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15349 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
15350 || INSN(21,21) != 1 || INSN(14,10) != BITS5(1,0,0,0,0)) {
15351 return False;
15353 UInt ty = INSN(23,22);
15354 UInt opcode = INSN(20,15);
15355 UInt nn = INSN(9,5);
15356 UInt dd = INSN(4,0);
15358 if (opcode <= BITS6(0,0,0,0,1,1)) {
15359 /* -------- 0x,000000: FMOV d_d, s_s -------- */
15360 /* -------- 0x,000001: FABS d_d, s_s, h_h --- */
15361 /* -------- 0x,000010: FNEG d_d, s_s, h_h --- */
15362 /* -------- 0x,000011: FSQRT d_d, s_s, h_h --- */
15363 IRType ity;
15364 if (ty == X01) ity = Ity_F64;
15365 else if (ty == X00) ity = Ity_F32;
15366 else if (ty == X11) ity = Ity_F16;
15367 else vassert(0);
15368 IRTemp src = newTemp(ity);
15369 IRTemp res = newTemp(ity);
15370 const HChar* nm = "??";
15371 assign(src, getQRegLO(nn, ity));
15372 switch (opcode) {
15373 case BITS6(0,0,0,0,0,0):
15374 nm = "fmov"; assign(res, mkexpr(src)); break;
15375 case BITS6(0,0,0,0,0,1):
15376 nm = "fabs"; assign(res, unop(mkABSF(ity), mkexpr(src))); break;
15377 case BITS6(0,0,0,0,1,0):
15378 nm = "fneg"; assign(res, unop(mkNEGF(ity), mkexpr(src))); break;
15379 case BITS6(0,0,0,0,1,1):
15380 nm = "fsqrt";
15381 assign(res, binop(mkSQRTF(ity),
15382 mkexpr(mk_get_IR_rounding_mode()),
15383 mkexpr(src))); break;
15384 default:
15385 vassert(0);
15387 putQReg128(dd, mkV128(0x0000));
15388 putQRegLO(dd, mkexpr(res));
15389 DIP("%s %s, %s\n", nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
15390 return True;
15393 if ( (ty == X11 && (opcode == BITS6(0,0,0,1,0,0)
15394 || opcode == BITS6(0,0,0,1,0,1)))
15395 || (ty == X00 && (opcode == BITS6(0,0,0,1,1,1)
15396 || opcode == BITS6(0,0,0,1,0,1)))
15397 || (ty == X01 && (opcode == BITS6(0,0,0,1,1,1)
15398 || opcode == BITS6(0,0,0,1,0,0)))) {
15399 /* -------- 11,000100: FCVT s_h -------- */
15400 /* -------- 11,000101: FCVT d_h -------- */
15401 /* -------- 00,000111: FCVT h_s -------- */
15402 /* -------- 00,000101: FCVT d_s -------- */
15403 /* -------- 01,000111: FCVT h_d -------- */
15404 /* -------- 01,000100: FCVT s_d -------- */
15405 /* 31 23 21 16 14 9 4
15406 000 11110 11 10001 00 10000 n d FCVT Sd, Hn
15407 --------- 11 ----- 01 --------- FCVT Dd, Hn
15408 --------- 00 ----- 11 --------- FCVT Hd, Sn
15409 --------- 00 ----- 01 --------- FCVT Dd, Sn
15410 --------- 01 ----- 11 --------- FCVT Hd, Dn
15411 --------- 01 ----- 00 --------- FCVT Sd, Dn
15412 Rounding, when dst is smaller than src, is per the FPCR.
15414 UInt b2322 = ty;
15415 UInt b1615 = opcode & BITS2(1,1);
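      /* b2322 is the source-type field (ty) and b1615 is opcode<1:0>, which
         selects the destination type.  For example the selector 0b0001 means
         ty=00 (single precision source) with opcode<1:0>=01, i.e.
         FCVT Dd, Sn. */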
15416 switch ((b2322 << 2) | b1615) {
15417 case BITS4(0,0,0,1): // S -> D
15418 case BITS4(1,1,0,1): { // H -> D
15419 Bool srcIsH = b2322 == BITS2(1,1);
15420 IRType srcTy = srcIsH ? Ity_F16 : Ity_F32;
15421 IRTemp res = newTemp(Ity_F64);
15422 assign(res, unop(srcIsH ? Iop_F16toF64 : Iop_F32toF64,
15423 getQRegLO(nn, srcTy)));
15424 putQReg128(dd, mkV128(0x0000));
15425 putQRegLO(dd, mkexpr(res));
15426 DIP("fcvt %s, %s\n",
15427 nameQRegLO(dd, Ity_F64), nameQRegLO(nn, srcTy));
15428 return True;
15430 case BITS4(0,1,0,0): // D -> S
15431 case BITS4(0,1,1,1): { // D -> H
15432 Bool dstIsH = b1615 == BITS2(1,1);
15433 IRType dstTy = dstIsH ? Ity_F16 : Ity_F32;
15434 IRTemp res = newTemp(dstTy);
15435 assign(res, binop(dstIsH ? Iop_F64toF16 : Iop_F64toF32,
15436 mkexpr(mk_get_IR_rounding_mode()),
15437 getQRegLO(nn, Ity_F64)));
15438 putQReg128(dd, mkV128(0x0000));
15439 putQRegLO(dd, mkexpr(res));
15440 DIP("fcvt %s, %s\n",
15441 nameQRegLO(dd, dstTy), nameQRegLO(nn, Ity_F64));
15442 return True;
15444 case BITS4(0,0,1,1): // S -> H
15445 case BITS4(1,1,0,0): { // H -> S
15446 Bool toH = b1615 == BITS2(1,1);
15447 IRType srcTy = toH ? Ity_F32 : Ity_F16;
15448 IRType dstTy = toH ? Ity_F16 : Ity_F32;
15449 IRTemp res = newTemp(dstTy);
15450 if (toH) {
15451 assign(res, binop(Iop_F32toF16,
15452 mkexpr(mk_get_IR_rounding_mode()),
15453 getQRegLO(nn, srcTy)));
15455 } else {
15456 assign(res, unop(Iop_F16toF32,
15457 getQRegLO(nn, srcTy)));
15459 putQReg128(dd, mkV128(0x0000));
15460 putQRegLO(dd, mkexpr(res));
15461 DIP("fcvt %s, %s\n",
15462 nameQRegLO(dd, dstTy), nameQRegLO(nn, srcTy));
15463 return True;
15465 default:
15466 break;
15468 /* else unhandled */
15469 return False;
15472 if (ty <= X01
15473 && opcode >= BITS6(0,0,1,0,0,0) && opcode <= BITS6(0,0,1,1,1,1)
15474 && opcode != BITS6(0,0,1,1,0,1)) {
15475 /* -------- 0x,001000 FRINTN d_d, s_s -------- */
15476 /* -------- 0x,001001 FRINTP d_d, s_s -------- */
15477 /* -------- 0x,001010 FRINTM d_d, s_s -------- */
15478 /* -------- 0x,001011 FRINTZ d_d, s_s -------- */
15479 /* -------- 0x,001100 FRINTA d_d, s_s -------- */
15480 /* -------- 0x,001110 FRINTX d_d, s_s -------- */
15481 /* -------- 0x,001111 FRINTI d_d, s_s -------- */
15482 /* 31 23 21 17 14 9 4
15483         000 11110 0x 1001 111 10000 n d  FRINTI Fd, Fn (round per FPCR)
15485 x==0 => S-registers, x==1 => D-registers
15486 rm (17:15) encodings:
15487 111 per FPCR (FRINTI)
15488 001 +inf (FRINTP)
15489 010 -inf (FRINTM)
15490 011 zero (FRINTZ)
15491 000 tieeven (FRINTN)
15492 100 tieaway (FRINTA)
15493 110 per FPCR + "exact = TRUE" (FRINTX)
15494 101 unallocated
15496 Bool isD = (ty & 1) == 1;
15497 UInt rm = opcode & BITS6(0,0,0,1,1,1);
15498 IRType ity = isD ? Ity_F64 : Ity_F32;
15499 IRExpr* irrmE = NULL;
15500 UChar ch = '?';
15501 IROp op = isD ? Iop_RoundF64toInt : Iop_RoundF32toInt;
15502 Bool isBinop = True;
15503 switch (rm) {
15504 case BITS3(0,1,1): ch = 'z'; irrmE = mkU32(Irrm_ZERO); break;
15505 case BITS3(0,1,0): ch = 'm'; irrmE = mkU32(Irrm_NegINF); break;
15506 case BITS3(0,0,1): ch = 'p'; irrmE = mkU32(Irrm_PosINF); break;
15507 case BITS3(1,0,0): ch = 'a'; isBinop = False; op = isD ? Iop_RoundF64toIntA0 : Iop_RoundF32toIntA0; break;
15508         // "Integral exact" (FRINTX): if rounding changes the value --
15509         // that is, the result is not numerically equal to the source --
15510         // an Inexact exception is raised / the IXC bit gets set in the
15511         // FPSR.  This implementation does not model that: FRINTX is
15512         // treated like FRINTI and simply rounds per the FPCR mode.
15513 case BITS3(1,1,0):
15514 ch = 'x'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
15515 case BITS3(1,1,1):
15516 ch = 'i'; irrmE = mkexpr(mk_get_IR_rounding_mode()); break;
15517 case BITS3(0,0,0): ch = 'n'; isBinop = False; op = isD ? Iop_RoundF64toIntE : Iop_RoundF32toIntE; break;
15518 default: break;
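         /* For FRINTN (ties to even) and FRINTA (ties away from zero) no IR
            rounding-mode argument is passed; instead dedicated unary ops
            (Iop_RoundF64toIntE/A0 and the F32 equivalents) that bake the
            tie-handling in are used, which is why isBinop is cleared for
            those two cases. */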
15520 if (irrmE || !isBinop) {
15521 IRTemp src = newTemp(ity);
15522 IRTemp dst = newTemp(ity);
15523 assign(src, getQRegLO(nn, ity));
15524 if (isBinop) {
15525 assign(dst, binop(op, irrmE, mkexpr(src)));
15526 } else {
15527 assign(dst, unop(op, mkexpr(src)));
15529 putQReg128(dd, mkV128(0x0000));
15530 putQRegLO(dd, mkexpr(dst));
15531 DIP("frint%c %s, %s\n",
15532 ch, nameQRegLO(dd, ity), nameQRegLO(nn, ity));
15533 return True;
15535 return False;
15538 return False;
15539 # undef INSN
15543 static
15544 Bool dis_AdvSIMD_fp_data_proc_2_source(/*MB_OUT*/DisResult* dres, UInt insn,
15545 const VexArchInfo* archinfo)
15547 /* 31 28 23 21 20 15 11 9 4
15548 000 11110 ty 1 m opcode 10 n d
15549 The first 3 bits are really "M 0 S", but M and S are always zero.
15550 Decode fields: ty, opcode
15552 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15553 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
15554 || INSN(21,21) != 1 || INSN(11,10) != BITS2(1,0)) {
15555 return False;
15557 UInt ty = INSN(23,22);
15558 UInt mm = INSN(20,16);
15559 UInt opcode = INSN(15,12);
15560 UInt nn = INSN(9,5);
15561 UInt dd = INSN(4,0);
15563 if (ty <= X01 && opcode <= BITS4(0,1,1,1)) {
15564 /* ------- 0x,0000: FMUL d_d, s_s ------- */
15565 /* ------- 0x,0001: FDIV d_d, s_s ------- */
15566 /* ------- 0x,0010: FADD d_d, s_s ------- */
15567 /* ------- 0x,0011: FSUB d_d, s_s ------- */
15568 /* ------- 0x,0100: FMAX d_d, s_s ------- */
15569 /* ------- 0x,0101: FMIN d_d, s_s ------- */
15570 /* ------- 0x,0110: FMAXNM d_d, s_s ------- (FIXME KLUDGED) */
15571 /* ------- 0x,0111: FMINNM d_d, s_s ------- (FIXME KLUDGED) */
15572 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
15573 IROp iop = Iop_INVALID;
15574 const HChar* nm = "???";
15575 switch (opcode) {
15576 case BITS4(0,0,0,0): nm = "fmul"; iop = mkMULF(ity); break;
15577 case BITS4(0,0,0,1): nm = "fdiv"; iop = mkDIVF(ity); break;
15578 case BITS4(0,0,1,0): nm = "fadd"; iop = mkADDF(ity); break;
15579 case BITS4(0,0,1,1): nm = "fsub"; iop = mkSUBF(ity); break;
15580 case BITS4(0,1,0,0): nm = "fmax"; iop = mkVecMAXF(ty+2); break;
15581 case BITS4(0,1,0,1): nm = "fmin"; iop = mkVecMINF(ty+2); break;
15582 case BITS4(0,1,1,0): nm = "fmaxnm"; iop = mkVecMAXF(ty+2); break; //!!
15583 case BITS4(0,1,1,1): nm = "fminnm"; iop = mkVecMINF(ty+2); break; //!!
15584 default: vassert(0);
15586 if (opcode <= BITS4(0,0,1,1)) {
15587 // This is really not good code. TODO: avoid width-changing
15588 IRTemp res = newTemp(ity);
15589 assign(res, triop(iop, mkexpr(mk_get_IR_rounding_mode()),
15590 getQRegLO(nn, ity), getQRegLO(mm, ity)));
15591 putQReg128(dd, mkV128(0));
15592 putQRegLO(dd, mkexpr(res));
15593 } else {
15594 putQReg128(dd, unop(mkVecZEROHIxxOFV128(ty+2),
15595 binop(iop, getQReg128(nn), getQReg128(mm))));
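      /* For FMAX/FMIN/FMAXNM/FMINNM the scalar operation is done with the
         corresponding vector IROp over the whole Q registers (lane size
         ty+2, i.e. 32- or 64-bit lanes) and everything above the low lane
         is then zeroed again by mkVecZEROHIxxOFV128. */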
15597 DIP("%s %s, %s, %s\n",
15598 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
15599 return True;
15602 if (ty <= X01 && opcode == BITS4(1,0,0,0)) {
15603 /* ------- 0x,1000: FNMUL d_d, s_s ------- */
15604 IRType ity = ty == X00 ? Ity_F32 : Ity_F64;
15605 IROp iop = mkMULF(ity);
15606 IROp iopn = mkNEGF(ity);
15607 const HChar* nm = "fnmul";
15608 IRExpr* resE = unop(iopn,
15609 triop(iop, mkexpr(mk_get_IR_rounding_mode()),
15610 getQRegLO(nn, ity), getQRegLO(mm, ity)));
15611 IRTemp res = newTemp(ity);
15612 assign(res, resE);
15613 putQReg128(dd, mkV128(0));
15614 putQRegLO(dd, mkexpr(res));
15615 DIP("%s %s, %s, %s\n",
15616 nm, nameQRegLO(dd, ity), nameQRegLO(nn, ity), nameQRegLO(mm, ity));
15617 return True;
15620 if (ty == X11 && opcode <= BITS4(0,0,1,0)) {
15621 /* ------- 11,0010: FADD h_h ------- */
15622 if ((archinfo->hwcaps & VEX_HWCAPS_ARM64_FP16) == 0)
15623 return False;
15624 IRTemp res = newTemp(Ity_F16);
15625 assign(res, triop(mkADDF(Ity_F16), mkexpr(mk_get_IR_rounding_mode()),
15626 getQRegLO(nn, Ity_F16), getQRegLO(mm, Ity_F16)));
15627 putQReg128(dd, mkV128(0));
15628 putQRegLO(dd, mkexpr(res));
15629 DIP("fadd %s, %s, %s\n",
15630 nameQRegLO(dd, Ity_F16), nameQRegLO(nn, Ity_F16), nameQRegLO(mm, Ity_F16));
15631 return True;
15634 return False;
15635 # undef INSN
15639 static
15640 Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
15642 /* 31 28 23 21 20 15 14 9 4
15643 000 11111 ty o1 m o0 a n d
15644 The first 3 bits are really "M 0 S", but M and S are always zero.
15645 Decode fields: ty,o1,o0
15647 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15648 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,1)) {
15649 return False;
15651 UInt ty = INSN(23,22);
15652 UInt bitO1 = INSN(21,21);
15653 UInt mm = INSN(20,16);
15654 UInt bitO0 = INSN(15,15);
15655 UInt aa = INSN(14,10);
15656 UInt nn = INSN(9,5);
15657 UInt dd = INSN(4,0);
15658 vassert(ty < 4);
15660 if (ty <= X01) {
15661 /* -------- 0x,0,0 FMADD d_d_d_d, s_s_s_s -------- */
15662 /* -------- 0x,0,1 FMSUB d_d_d_d, s_s_s_s -------- */
15663 /* -------- 0x,1,0 FNMADD d_d_d_d, s_s_s_s -------- */
15664 /* -------- 0x,1,1 FNMSUB d_d_d_d, s_s_s_s -------- */
15665 /* -------------------- F{N}M{ADD,SUB} -------------------- */
15666 /* 31 22 20 15 14 9 4 ix
15667 000 11111 0 sz 0 m 0 a n d 0 FMADD Fd,Fn,Fm,Fa
15668 000 11111 0 sz 0 m 1 a n d 1 FMSUB Fd,Fn,Fm,Fa
15669 000 11111 0 sz 1 m 0 a n d 2 FNMADD Fd,Fn,Fm,Fa
15670 000 11111 0 sz 1 m 1 a n d 3 FNMSUB Fd,Fn,Fm,Fa
15671 where Fx=Dx when sz=1, Fx=Sx when sz=0
15673 -----SPEC------ ----IMPL----
15674 fmadd a + n * m fmadd (a, n, m)
15675 fmsub a + (-n) * m fmsub (a, n, m)
15676 fnmadd (-a) + (-n) * m fmadd (-a, -n, m)
15677 fnmsub (-a) + n * m fmadd (-a, n, m)
15679 Note Iop_MAdd/SubF32/64 take arguments in the order: rm, N, M, A
15681 Bool isD = (ty & 1) == 1;
15682 UInt ix = (bitO1 << 1) | bitO0;
15683 IRType ity = isD ? Ity_F64 : Ity_F32;
15684 IROp opFMADD = mkFMADDF(ity);
15685 IROp opFMSUB = mkFMSUBF(ity);
15686 IROp opNEG = mkNEGF(ity);
15687 IRTemp res = newTemp(ity);
15688 IRExpr* eA = getQRegLO(aa, ity);
15689 IRExpr* eN = getQRegLO(nn, ity);
15690 IRExpr* eM = getQRegLO(mm, ity);
15691 IRExpr* rm = mkexpr(mk_get_IR_rounding_mode());
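      /* Quick sanity check of the negated-operand forms, ignoring rounding:
         with a=1, n=2, m=3 the spec gives fmadd=7, fmsub=-5, fnmadd=-7,
         fnmsub=5; and indeed fmadd(-a,-n,m) = -1 + (-2)*3 = -7 while
         fmadd(-a,n,m) = -1 + 2*3 = 5. */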
15692 switch (ix) {
15693 case 0: /* FMADD */
15694 assign(res, qop(opFMADD, rm, eN, eM, eA));
15695 break;
15696 case 1: /* FMSUB */
15697 assign(res, qop(opFMSUB, rm, eN, eM, eA));
15698 break;
15699 case 2: /* FNMADD */
15700 assign(res, qop(opFMADD, rm, unop(opNEG, eN), eM,
15701 unop(opNEG,eA)));
15702 break;
15703 case 3: /* FNMSUB */
15704 assign(res, qop(opFMADD, rm, eN, eM, unop(opNEG, eA)));
15705 break;
15706 default:
15707 vassert(0);
15709 putQReg128(dd, mkV128(0x0000));
15710 putQRegLO(dd, mkexpr(res));
15711 const HChar* names[4] = { "fmadd", "fmsub", "fnmadd", "fnmsub" };
15712 DIP("%s %s, %s, %s, %s\n",
15713 names[ix], nameQRegLO(dd, ity), nameQRegLO(nn, ity),
15714 nameQRegLO(mm, ity), nameQRegLO(aa, ity));
15715 return True;
15718 return False;
15719 # undef INSN
15723 static
15724 Bool dis_AdvSIMD_fp_immediate(/*MB_OUT*/DisResult* dres, UInt insn)
15726 /* 31 28 23 21 20 12 9 4
15727 000 11110 ty 1 imm8 100 imm5 d
15728 The first 3 bits are really "M 0 S", but M and S are always zero.
15730 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15731 if (INSN(31,24) != BITS8(0,0,0,1,1,1,1,0)
15732 || INSN(21,21) != 1 || INSN(12,10) != BITS3(1,0,0)) {
15733 return False;
15735 UInt ty = INSN(23,22);
15736 UInt imm8 = INSN(20,13);
15737 UInt imm5 = INSN(9,5);
15738 UInt dd = INSN(4,0);
15740 /* ------- 00,00000: FMOV s_imm ------- */
15741 /* ------- 01,00000: FMOV d_imm ------- */
15742 if (ty <= X01 && imm5 == BITS5(0,0,0,0,0)) {
15743 Bool isD = (ty & 1) == 1;
15744 ULong imm = VFPExpandImm(imm8, isD ? 64 : 32);
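      /* VFPExpandImm expands the 8-bit immediate abcdefgh into a full
         sign/exponent/fraction bit pattern.  For instance imm8 == 0x70 should
         expand to 1.0 in both the 32- and 64-bit cases (FMOV #1.0). */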
15745 if (!isD) {
15746 vassert(0 == (imm & 0xFFFFFFFF00000000ULL));
15748 putQReg128(dd, mkV128(0));
15749 putQRegLO(dd, isD ? mkU64(imm) : mkU32(imm & 0xFFFFFFFFULL));
15750 DIP("fmov %s, #0x%llx\n",
15751 nameQRegLO(dd, isD ? Ity_F64 : Ity_F32), imm);
15752 return True;
15755 return False;
15756 # undef INSN
15760 static
15761 Bool dis_AdvSIMD_fp_to_from_fixedp_conv(/*MB_OUT*/DisResult* dres, UInt insn)
15763 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15764 /* 31 30 29 28 23 21 20 18 15 9 4
15765 sf 0 0 11110 type 0 rmode opcode scale n d
15766 The first 3 bits are really "sf 0 S", but S is always zero.
15767 Decode fields: sf,type,rmode,opcode
15769 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15770 if (INSN(30,29) != BITS2(0,0)
15771 || INSN(28,24) != BITS5(1,1,1,1,0)
15772 || INSN(21,21) != 0) {
15773 return False;
15775 UInt bitSF = INSN(31,31);
15776 UInt ty = INSN(23,22); // type
15777 UInt rm = INSN(20,19); // rmode
15778 UInt op = INSN(18,16); // opcode
15779 UInt sc = INSN(15,10); // scale
15780 UInt nn = INSN(9,5);
15781 UInt dd = INSN(4,0);
15783 if (ty <= X01 && rm == X11
15784 && (op == BITS3(0,0,0) || op == BITS3(0,0,1))) {
15785 /* -------- (ix) sf ty rm opc -------- */
15786 /* -------- 0 0 00 11 000: FCVTZS w_s_#fbits -------- */
15787 /* -------- 1 0 01 11 000: FCVTZS w_d_#fbits -------- */
15788 /* -------- 2 1 00 11 000: FCVTZS x_s_#fbits -------- */
15789 /* -------- 3 1 01 11 000: FCVTZS x_d_#fbits -------- */
15791 /* -------- 4 0 00 11 001: FCVTZU w_s_#fbits -------- */
15792 /* -------- 5 0 01 11 001: FCVTZU w_d_#fbits -------- */
15793 /* -------- 6 1 00 11 001: FCVTZU x_s_#fbits -------- */
15794 /* -------- 7 1 01 11 001: FCVTZU x_d_#fbits -------- */
15795 Bool isI64 = bitSF == 1;
15796 Bool isF64 = (ty & 1) == 1;
15797 Bool isU = (op & 1) == 1;
15798 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
15800 Int fbits = 64 - sc;
15801 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
15803 Double scale = two_to_the_plus(fbits);
15804 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
15805 : IRExpr_Const(IRConst_F32( (Float)scale ));
15806 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
15808 const IROp ops[8]
15809 = { Iop_F32toI32S, Iop_F64toI32S, Iop_F32toI64S, Iop_F64toI64S,
15810 Iop_F32toI32U, Iop_F64toI32U, Iop_F32toI64U, Iop_F64toI64U };
15811 IRTemp irrm = newTemp(Ity_I32);
15812 assign(irrm, mkU32(Irrm_ZERO));
15814 IRExpr* src = getQRegLO(nn, isF64 ? Ity_F64 : Ity_F32);
15815 IRExpr* res = binop(ops[ix], mkexpr(irrm),
15816 triop(opMUL, mkexpr(irrm), src, scaleE));
15817 putIRegOrZR(isI64, dd, res);
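      /* In effect the value is scaled up by 2^fbits and then converted with
         round-to-zero.  For example FCVTZS Wd, Sn, #4 with Sn = 2.5 computes
         2.5 * 16 = 40.0 and truncates to 40. */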
15819 DIP("fcvtz%c %s, %s, #%d\n",
15820 isU ? 'u' : 's', nameIRegOrZR(isI64, dd),
15821 nameQRegLO(nn, isF64 ? Ity_F64 : Ity_F32), fbits);
15822 return True;
15825 /* ------ sf,ty,rm,opc ------ */
15826 /* ------ x,0x,00,010 SCVTF s/d, w/x, #fbits ------ */
15827 /* ------ x,0x,00,011 UCVTF s/d, w/x, #fbits ------ */
15828 /* (ix) sf S 28 ty rm opc 15 9 4
15829 0 0 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Wn, #fbits
15830 1 0 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Wn, #fbits
15831 2 1 0 0 11110 00 0 00 010 scale n d SCVTF Sd, Xn, #fbits
15832 3 1 0 0 11110 01 0 00 010 scale n d SCVTF Dd, Xn, #fbits
15834 4 0 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Wn, #fbits
15835 5 0 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Wn, #fbits
15836 6 1 0 0 11110 00 0 00 011 scale n d UCVTF Sd, Xn, #fbits
15837 7 1 0 0 11110 01 0 00 011 scale n d UCVTF Dd, Xn, #fbits
15839      These are signed/unsigned conversions from integer registers to
15840 FP registers, all 4 32/64-bit combinations, rounded per FPCR,
15841 scaled per |scale|.
15843 if (ty <= X01 && rm == X00
15844 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))
15845 && (bitSF == 1 || ((sc >> 5) & 1) == 1)) {
15846 Bool isI64 = bitSF == 1;
15847 Bool isF64 = (ty & 1) == 1;
15848 Bool isU = (op & 1) == 1;
15849 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
15851 Int fbits = 64 - sc;
15852 vassert(fbits >= 1 && fbits <= (isI64 ? 64 : 32));
15854 Double scale = two_to_the_minus(fbits);
15855 IRExpr* scaleE = isF64 ? IRExpr_Const(IRConst_F64(scale))
15856 : IRExpr_Const(IRConst_F32( (Float)scale ));
15857 IROp opMUL = isF64 ? Iop_MulF64 : Iop_MulF32;
15859 const IROp ops[8]
15860 = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
15861 Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
15862 IRExpr* src = getIRegOrZR(isI64, nn);
15863 IRExpr* res = (isF64 && !isI64)
15864 ? unop(ops[ix], src)
15865 : binop(ops[ix],
15866 mkexpr(mk_get_IR_rounding_mode()), src);
15867 putQReg128(dd, mkV128(0));
15868 putQRegLO(dd, triop(opMUL, mkU32(Irrm_NEAREST), res, scaleE));
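      /* Conversely, here the integer is converted to FP and then scaled down
         by 2^fbits.  For example UCVTF Sd, Wn, #8 with Wn = 640 gives
         640 * 2^-8 = 2.5. */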
15870 DIP("%ccvtf %s, %s, #%d\n",
15871 isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
15872 nameIRegOrZR(isI64, nn), fbits);
15873 return True;
15876 return False;
15877 # undef INSN
15881 static
15882 Bool dis_AdvSIMD_fp_to_from_int_conv(/*MB_OUT*/DisResult* dres, UInt insn)
15884 /* 31 30 29 28 23 21 20 18 15 9 4
15885 sf 0 0 11110 type 1 rmode opcode 000000 n d
15886 The first 3 bits are really "sf 0 S", but S is always zero.
15887 Decode fields: sf,type,rmode,opcode
15889 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
15890 if (INSN(30,29) != BITS2(0,0)
15891 || INSN(28,24) != BITS5(1,1,1,1,0)
15892 || INSN(21,21) != 1
15893 || INSN(15,10) != BITS6(0,0,0,0,0,0)) {
15894 return False;
15896 UInt bitSF = INSN(31,31);
15897 UInt ty = INSN(23,22); // type
15898 UInt rm = INSN(20,19); // rmode
15899 UInt op = INSN(18,16); // opcode
15900 UInt nn = INSN(9,5);
15901 UInt dd = INSN(4,0);
15903 // op = 000, 001
15904 /* -------- FCVT{N,P,M,Z,A}{S,U} (scalar, integer) -------- */
15905 /* 30 23 20 18 15 9 4
15906 sf 00 11110 0x 1 00 000 000000 n d FCVTNS Rd, Fn (round to
15907 sf 00 11110 0x 1 00 001 000000 n d FCVTNU Rd, Fn nearest)
15908 ---------------- 01 -------------- FCVTP-------- (round to +inf)
15909 ---------------- 10 -------------- FCVTM-------- (round to -inf)
15910 ---------------- 11 -------------- FCVTZ-------- (round to zero)
15911 ---------------- 00 100 ---------- FCVTAS------- (nearest, ties away)
15912 ---------------- 00 101 ---------- FCVTAU------- (nearest, ties away)
15914 Rd is Xd when sf==1, Wd when sf==0
15915 Fn is Dn when x==1, Sn when x==0
15916 20:19 carry the rounding mode, using the same encoding as FPCR
15918 if (ty <= X01
15919 && ( ((op == BITS3(0,0,0) || op == BITS3(0,0,1)) && True)
15920 || ((op == BITS3(1,0,0) || op == BITS3(1,0,1)) && rm == BITS2(0,0))
15923 Bool isI64 = bitSF == 1;
15924 Bool isF64 = (ty & 1) == 1;
15925 Bool isU = (op & 1) == 1;
15926 /* Decide on the IR rounding mode to use. */
15927 IRRoundingMode irrm = 8; /*impossible*/
15928 HChar ch = '?';
15929 if (op == BITS3(0,0,0) || op == BITS3(0,0,1)) {
15930 switch (rm) {
15931 case BITS2(0,0): ch = 'n'; irrm = Irrm_NEAREST; break;
15932 case BITS2(0,1): ch = 'p'; irrm = Irrm_PosINF; break;
15933 case BITS2(1,0): ch = 'm'; irrm = Irrm_NegINF; break;
15934 case BITS2(1,1): ch = 'z'; irrm = Irrm_ZERO; break;
15935 default: vassert(0);
15937 } else {
15938 vassert(op == BITS3(1,0,0) || op == BITS3(1,0,1));
15939 switch (rm) {
15940 case BITS2(0,0): ch = 'a'; irrm = Irrm_NEAREST_TIE_AWAY_0; break;
15941 default: vassert(0);
15944 vassert(irrm != 8);
15945 /* Decide on the conversion primop, based on the source size,
15946 dest size and signedness (8 possibilities). Case coding:
15947 F32 ->s I32 0
15948 F32 ->u I32 1
15949 F32 ->s I64 2
15950 F32 ->u I64 3
15951 F64 ->s I32 4
15952 F64 ->u I32 5
15953 F64 ->s I64 6
15954 F64 ->u I64 7
15956 UInt ix = (isF64 ? 4 : 0) | (isI64 ? 2 : 0) | (isU ? 1 : 0);
15957 vassert(ix < 8);
15958 const IROp iops[8]
15959 = { Iop_F32toI32S, Iop_F32toI32U, Iop_F32toI64S, Iop_F32toI64U,
15960 Iop_F64toI32S, Iop_F64toI32U, Iop_F64toI64S, Iop_F64toI64U };
15961 IROp iop = iops[ix];
15962 // A bit of ATCery: bounce all cases we haven't seen an example of.
15963 if (/* F32toI32S */
15964 (iop == Iop_F32toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Sn */
15965 || (iop == Iop_F32toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Sn */
15966 || (iop == Iop_F32toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Sn */
15967 || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST) /* FCVTNS W,S */
15968 || (iop == Iop_F32toI32S && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAS W,S */
15969 /* F32toI32U */
15970 || (iop == Iop_F32toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Sn */
15971 || (iop == Iop_F32toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Sn */
15972 || (iop == Iop_F32toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Sn */
15973 || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST) /* FCVTNU W,S */
15974 || (iop == Iop_F32toI32U && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAU W,S */
15975 /* F32toI64S */
15976 || (iop == Iop_F32toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Sn */
15977 || (iop == Iop_F32toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Sn */
15978 || (iop == Iop_F32toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Sn */
15979 || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST) /* FCVTNS X,S */
15980 || (iop == Iop_F32toI64S && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAS X,S */
15981 /* F32toI64U */
15982 || (iop == Iop_F32toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Sn */
15983 || (iop == Iop_F32toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Sn */
15984 || (iop == Iop_F32toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Sn */
15985 || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST) /* FCVTNU X,S */
15986 || (iop == Iop_F32toI64U && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAU X,S */
15987 /* F64toI32S */
15988 || (iop == Iop_F64toI32S && irrm == Irrm_ZERO) /* FCVTZS Wd,Dn */
15989 || (iop == Iop_F64toI32S && irrm == Irrm_NegINF) /* FCVTMS Wd,Dn */
15990 || (iop == Iop_F64toI32S && irrm == Irrm_PosINF) /* FCVTPS Wd,Dn */
15991 || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST) /* FCVTNS W,D */
15992 || (iop == Iop_F64toI32S && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAS W,D */
15993 /* F64toI32U */
15994 || (iop == Iop_F64toI32U && irrm == Irrm_ZERO) /* FCVTZU Wd,Dn */
15995 || (iop == Iop_F64toI32U && irrm == Irrm_NegINF) /* FCVTMU Wd,Dn */
15996 || (iop == Iop_F64toI32U && irrm == Irrm_PosINF) /* FCVTPU Wd,Dn */
15997 || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST) /* FCVTNU W,D */
15998 || (iop == Iop_F64toI32U && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAU W,D */
15999 /* F64toI64S */
16000 || (iop == Iop_F64toI64S && irrm == Irrm_ZERO) /* FCVTZS Xd,Dn */
16001 || (iop == Iop_F64toI64S && irrm == Irrm_NegINF) /* FCVTMS Xd,Dn */
16002 || (iop == Iop_F64toI64S && irrm == Irrm_PosINF) /* FCVTPS Xd,Dn */
16003 || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST) /* FCVTNS X,D */
16004 || (iop == Iop_F64toI64S && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAS X,D */
16005 /* F64toI64U */
16006 || (iop == Iop_F64toI64U && irrm == Irrm_ZERO) /* FCVTZU Xd,Dn */
16007 || (iop == Iop_F64toI64U && irrm == Irrm_NegINF) /* FCVTMU Xd,Dn */
16008 || (iop == Iop_F64toI64U && irrm == Irrm_PosINF) /* FCVTPU Xd,Dn */
16009 || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST) /* FCVTNU X,D */
16010 || (iop == Iop_F64toI64U && irrm == Irrm_NEAREST_TIE_AWAY_0)/* FCVTAU X,D */
16012 /* validated */
16013 } else {
16014 return False;
16016 IRType srcTy = isF64 ? Ity_F64 : Ity_F32;
16017 IRType dstTy = isI64 ? Ity_I64 : Ity_I32;
16018 IRTemp src = newTemp(srcTy);
16019 IRTemp dst = newTemp(dstTy);
16020 assign(src, getQRegLO(nn, srcTy));
16021 assign(dst, binop(iop, mkU32(irrm), mkexpr(src)));
16022 putIRegOrZR(isI64, dd, mkexpr(dst));
16023 DIP("fcvt%c%c %s, %s\n", ch, isU ? 'u' : 's',
16024 nameIRegOrZR(isI64, dd), nameQRegLO(nn, srcTy));
16025 return True;
16028 // op = 010, 011
16029 /* -------------- {S,U}CVTF (scalar, integer) -------------- */
16030 /* (ix) sf S 28 ty rm op 15 9 4
16031 0 0 0 0 11110 00 1 00 010 000000 n d SCVTF Sd, Wn
16032 1 0 0 0 11110 01 1 00 010 000000 n d SCVTF Dd, Wn
16033 2 1 0 0 11110 00 1 00 010 000000 n d SCVTF Sd, Xn
16034 3 1 0 0 11110 01 1 00 010 000000 n d SCVTF Dd, Xn
16036 4 0 0 0 11110 00 1 00 011 000000 n d UCVTF Sd, Wn
16037 5 0 0 0 11110 01 1 00 011 000000 n d UCVTF Dd, Wn
16038 6 1 0 0 11110 00 1 00 011 000000 n d UCVTF Sd, Xn
16039 7 1 0 0 11110 01 1 00 011 000000 n d UCVTF Dd, Xn
16041      These are signed/unsigned conversions from integer registers to
16042 FP registers, all 4 32/64-bit combinations, rounded per FPCR.
16044 if (ty <= X01 && rm == X00 && (op == BITS3(0,1,0) || op == BITS3(0,1,1))) {
16045 Bool isI64 = bitSF == 1;
16046 Bool isF64 = (ty & 1) == 1;
16047 Bool isU = (op & 1) == 1;
16048 UInt ix = (isU ? 4 : 0) | (isI64 ? 2 : 0) | (isF64 ? 1 : 0);
16049 const IROp ops[8]
16050 = { Iop_I32StoF32, Iop_I32StoF64, Iop_I64StoF32, Iop_I64StoF64,
16051 Iop_I32UtoF32, Iop_I32UtoF64, Iop_I64UtoF32, Iop_I64UtoF64 };
16052 IRExpr* src = getIRegOrZR(isI64, nn);
16053 IRExpr* res = (isF64 && !isI64)
16054 ? unop(ops[ix], src)
16055 : binop(ops[ix],
16056 mkexpr(mk_get_IR_rounding_mode()), src);
16057 putQReg128(dd, mkV128(0));
16058 putQRegLO(dd, res);
16059 DIP("%ccvtf %s, %s\n",
16060 isU ? 'u' : 's', nameQRegLO(dd, isF64 ? Ity_F64 : Ity_F32),
16061 nameIRegOrZR(isI64, nn));
16062 return True;
16065 // op = 110, 111
16066 /* -------- FMOV (general) -------- */
16067 /* case sf S ty rm op 15 9 4
16068 (1) 0 0 0 11110 00 1 00 111 000000 n d FMOV Sd, Wn
16069 (2) 1 0 0 11110 01 1 00 111 000000 n d FMOV Dd, Xn
16070 (3) 1 0 0 11110 10 1 01 111 000000 n d FMOV Vd.D[1], Xn
16072 (4) 0 0 0 11110 00 1 00 110 000000 n d FMOV Wd, Sn
16073 (5) 1 0 0 11110 01 1 00 110 000000 n d FMOV Xd, Dn
16074 (6) 1 0 0 11110 10 1 01 110 000000 n d FMOV Xd, Vn.D[1]
16076 if (1) {
16077 UInt ix = 0; // case
16078 if (bitSF == 0) {
16079 if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,1))
16080 ix = 1;
16081 else
16082 if (ty == BITS2(0,0) && rm == BITS2(0,0) && op == BITS3(1,1,0))
16083 ix = 4;
16084 } else {
16085 vassert(bitSF == 1);
16086 if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,1))
16087 ix = 2;
16088 else
16089 if (ty == BITS2(0,1) && rm == BITS2(0,0) && op == BITS3(1,1,0))
16090 ix = 5;
16091 else
16092 if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,1))
16093 ix = 3;
16094 else
16095 if (ty == BITS2(1,0) && rm == BITS2(0,1) && op == BITS3(1,1,0))
16096 ix = 6;
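         /* Cases 1..3 move from the integer register file into the FP/SIMD
            file (case 3 writes only the upper 64-bit lane, D[1]); cases 4..6
            move in the opposite direction. */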
16098 if (ix > 0) {
16099 switch (ix) {
16100 case 1:
16101 putQReg128(dd, mkV128(0));
16102 putQRegLO(dd, getIReg32orZR(nn));
16103 DIP("fmov s%u, w%u\n", dd, nn);
16104 break;
16105 case 2:
16106 putQReg128(dd, mkV128(0));
16107 putQRegLO(dd, getIReg64orZR(nn));
16108 DIP("fmov d%u, x%u\n", dd, nn);
16109 break;
16110 case 3:
16111 putQRegHI64(dd, getIReg64orZR(nn));
16112 DIP("fmov v%u.d[1], x%u\n", dd, nn);
16113 break;
16114 case 4:
16115 putIReg32orZR(dd, getQRegLO(nn, Ity_I32));
16116 DIP("fmov w%u, s%u\n", dd, nn);
16117 break;
16118 case 5:
16119 putIReg64orZR(dd, getQRegLO(nn, Ity_I64));
16120 DIP("fmov x%u, d%u\n", dd, nn);
16121 break;
16122 case 6:
16123 putIReg64orZR(dd, getQRegHI64(nn));
16124 DIP("fmov x%u, v%u.d[1]\n", dd, nn);
16125 break;
16126 default:
16127 vassert(0);
16129 return True;
16131 /* undecodable; fall through */
16134 return False;
16135 # undef INSN
16139 static
16140 Bool dis_AdvSIMD_dot_product(/*MB_OUT*/DisResult* dres, UInt insn)
16142 /* by element
16143 31 30 29 28 23 21 20 15 11 10 9 4
16144 0 Q U 01111 size L m 1110 H 0 n d
16145 vector
16146 31 30 29 28 23 21 20 15 11 10 9 4
16147 0 Q U 01110 size 0 m 1001 0 1 n d
16149 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
16150 if (INSN(31,31) != 0) {
16151 return False;
16153 UInt bitQ = INSN(30,30);
16154 UInt bitU = INSN(29,29);
16155 UInt opcode1 = INSN(28,24);
16156 UInt size = INSN(23,22);
16157 UInt bitL = INSN(21,21);
16158 UInt mm = INSN(20,16);
16159 UInt opcode2 = INSN(15,12);
16160 UInt bitH = INSN(11,11);
16161 UInt opcode3 = INSN(10,10);
16162 UInt nn = INSN(9,5);
16163 UInt dd = INSN(4,0);
16164 UInt index = (bitH << 1) + bitL;
16165 vassert(index <= 3);
16167 Bool byElement;
16168 if (opcode1 == BITS5(0,1,1,1,1)
16169 && opcode2 == BITS4(1,1,1,0)
16170 && opcode3 == 0) {
16171 byElement = True;
16172 } else if (opcode1 == BITS5(0,1,1,1,0)
16173 && opcode2 == BITS4(1,0,0,1)
16174 && opcode3 == 1
16175 && bitL == 0 && bitH == 0) {
16176 byElement = False;
16177 } else {
16178 return False;
16181 // '10' is the only valid size
16182 if (size != X10) return False;
16184 IRExpr* src1 = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(nn));
16185 IRExpr* src2 = getQReg128(mm);
16186 if (byElement) {
16187 src2 = mkexpr(math_DUP_VEC_ELEM(src2, X10, index));
16190 IROp mulOp = bitU ? Iop_Mull8Ux8 : Iop_Mull8Sx8;
16191 IRTemp loProductSums = math_ADDLP(
16192 X01, bitU, math_BINARY_WIDENING_V128(False, mulOp, src1, src2));
16193 IRTemp hiProductSums = math_ADDLP(
16194 X01, bitU, math_BINARY_WIDENING_V128(True, mulOp, src1, src2));
16196 IRTemp res = newTempV128();
16197 assign(res, binop(Iop_Add32x4,
16198 mk_CatEvenLanes32x4(hiProductSums, loProductSums),
16199 mk_CatOddLanes32x4(hiProductSums, loProductSums)));
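   /* In effect each 32-bit lane of |res| ends up holding the sum of four
      adjacent byte products: the widening multiplies give 16-bit products,
      math_ADDLP pairwise-adds those into 32-bit partial sums, and the
      even/odd-lane concatenation plus Iop_Add32x4 folds together the two
      partial sums belonging to each group of four bytes. */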
16201    // These instructions accumulate into the destination; in the non-Q
16202    // (64-bit) form the upper 64 bits of the result are forced to zero.
16203 IRExpr* accVal = math_MAYBE_ZERO_HI64_fromE(bitQ, getQReg128(dd));
16204 putQReg128(dd, binop(mkVecADD(size), mkexpr(res), accVal));
16206 const HChar* nm = bitU ? "udot" : "sdot";
16207 const HChar* destWidth = nameArr_Q_SZ(bitQ, size);
16208 const HChar* srcWidth = nameArr_Q_SZ(bitQ, X00);
16209 if (byElement) {
16210 DIP("%s v%u.%s, v%u.%s, v%u.4b[%u]\n", nm,
16211 dd, destWidth,
16212 nn, srcWidth, mm, index);
16213 } else {
16214 DIP("%s v%u.%s, v%u.%s, v%u.%s\n", nm,
16215 dd, destWidth,
16216 nn, srcWidth, mm, srcWidth);
16219 return True;
16220 # undef INSN
16224 static
16225 Bool dis_ARM64_simd_and_fp(/*MB_OUT*/DisResult* dres, UInt insn,
16226 const VexArchInfo* archinfo, Bool sigill_diag)
16228 Bool ok;
16229 ok = dis_AdvSIMD_EXT(dres, insn);
16230 if (UNLIKELY(ok)) return True;
16231 ok = dis_AdvSIMD_TBL_TBX(dres, insn);
16232 if (UNLIKELY(ok)) return True;
16233 ok = dis_AdvSIMD_ZIP_UZP_TRN(dres, insn);
16234 if (UNLIKELY(ok)) return True;
16235 ok = dis_AdvSIMD_across_lanes(dres, insn);
16236 if (UNLIKELY(ok)) return True;
16237 ok = dis_AdvSIMD_copy(dres, insn);
16238 if (UNLIKELY(ok)) return True;
16239 ok = dis_AdvSIMD_modified_immediate(dres, insn);
16240 if (UNLIKELY(ok)) return True;
16241 ok = dis_AdvSIMD_scalar_copy(dres, insn);
16242 if (UNLIKELY(ok)) return True;
16243 ok = dis_AdvSIMD_scalar_pairwise(dres, insn, archinfo);
16244 if (UNLIKELY(ok)) return True;
16245 ok = dis_AdvSIMD_scalar_shift_by_imm(dres, insn);
16246 if (UNLIKELY(ok)) return True;
16247 ok = dis_AdvSIMD_scalar_three_different(dres, insn);
16248 if (UNLIKELY(ok)) return True;
16249 ok = dis_AdvSIMD_scalar_three_same(dres, insn);
16250 if (UNLIKELY(ok)) return True;
16251 ok = dis_AdvSIMD_scalar_three_same_extra(dres, insn, archinfo);
16252 if (UNLIKELY(ok)) return True;
16253 ok = dis_AdvSIMD_scalar_two_reg_misc(dres, insn);
16254 if (UNLIKELY(ok)) return True;
16255 ok = dis_AdvSIMD_scalar_two_reg_misc_fp16(dres, insn, archinfo);
16256 if (UNLIKELY(ok)) return True;
16257 ok = dis_AdvSIMD_scalar_x_indexed_element(dres, insn);
16258 if (UNLIKELY(ok)) return True;
16259 ok = dis_AdvSIMD_shift_by_immediate(dres, insn);
16260 if (UNLIKELY(ok)) return True;
16261 ok = dis_AdvSIMD_three_different(dres, insn);
16262 if (UNLIKELY(ok)) return True;
16263 ok = dis_AdvSIMD_three_same(dres, insn);
16264 if (UNLIKELY(ok)) return True;
16265 ok = dis_AdvSIMD_three_same_extra(dres, insn);
16266 if (UNLIKELY(ok)) return True;
16267 ok = dis_AdvSIMD_three_same_fp16(dres, insn, archinfo);
16268 if (UNLIKELY(ok)) return True;
16269 ok = dis_AdvSIMD_two_reg_misc(dres, insn);
16270 if (UNLIKELY(ok)) return True;
16271 ok = dis_AdvSIMD_two_reg_misc_fp16(dres, insn, archinfo);
16272 if (UNLIKELY(ok)) return True;
16273 ok = dis_AdvSIMD_vector_x_indexed_elem(dres, insn);
16274 if (UNLIKELY(ok)) return True;
16275 ok = dis_AdvSIMD_crypto_aes(dres, insn);
16276 if (UNLIKELY(ok)) return True;
16277 ok = dis_AdvSIMD_crypto_three_reg_sha(dres, insn);
16278 if (UNLIKELY(ok)) return True;
16279 ok = dis_AdvSIMD_crypto_two_reg_sha(dres, insn);
16280 if (UNLIKELY(ok)) return True;
16281 ok = dis_AdvSIMD_crypto_three_reg_sha512(dres, insn);
16282 if (UNLIKELY(ok)) return True;
16283 ok = dis_AdvSIMD_crypto_two_reg_sha512(dres, insn);
16284 if (UNLIKELY(ok)) return True;
16285 ok = dis_AdvSIMD_fp_compare(dres, insn);
16286 if (UNLIKELY(ok)) return True;
16287 ok = dis_AdvSIMD_fp_conditional_compare(dres, insn, archinfo, sigill_diag);
16288 if (UNLIKELY(ok)) return True;
16289 ok = dis_AdvSIMD_fp_conditional_select(dres, insn);
16290 if (UNLIKELY(ok)) return True;
16291 ok = dis_AdvSIMD_fp_data_proc_1_source(dres, insn);
16292 if (UNLIKELY(ok)) return True;
16293 ok = dis_AdvSIMD_fp_data_proc_2_source(dres, insn, archinfo);
16294 if (UNLIKELY(ok)) return True;
16295 ok = dis_AdvSIMD_fp_data_proc_3_source(dres, insn);
16296 if (UNLIKELY(ok)) return True;
16297 ok = dis_AdvSIMD_fp_immediate(dres, insn);
16298 if (UNLIKELY(ok)) return True;
16299 ok = dis_AdvSIMD_fp_to_from_fixedp_conv(dres, insn);
16300 if (UNLIKELY(ok)) return True;
16301 ok = dis_AdvSIMD_fp_to_from_int_conv(dres, insn);
16302 if (UNLIKELY(ok)) return True;
16303 ok = dis_AdvSIMD_dot_product(dres, insn);
16304 if (UNLIKELY(ok)) return True;
16305 return False;
16309 /*------------------------------------------------------------*/
16310 /*--- Disassemble a single ARM64 instruction ---*/
16311 /*------------------------------------------------------------*/
16313 /* Disassemble a single ARM64 instruction into IR. The instruction
16314    is located at |guest_instr| and has guest IP of
16315 |guest_PC_curr_instr|, which will have been set before the call
16316 here. Returns True iff the instruction was decoded, in which case
16317 *dres will be set accordingly, or False, in which case *dres should
16318 be ignored by the caller. */
16320 static
16321 Bool disInstr_ARM64_WRK (
16322 /*MB_OUT*/DisResult* dres,
16323 const UChar* guest_instr,
16324 const VexArchInfo* archinfo,
16325 const VexAbiInfo* abiinfo,
16326 Bool sigill_diag
16329 // A macro to fish bits out of 'insn'.
16330 # define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
16332 //ZZ DisResult dres;
16333 //ZZ UInt insn;
16334 //ZZ //Bool allow_VFP = False;
16335 //ZZ //UInt hwcaps = archinfo->hwcaps;
16336 //ZZ IRTemp condT; /* :: Ity_I32 */
16337 //ZZ UInt summary;
16338 //ZZ HChar dis_buf[128]; // big enough to hold LDMIA etc text
16339 //ZZ
16340 //ZZ /* What insn variants are we supporting today? */
16341 //ZZ //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
16342 //ZZ // etc etc
16344 /* Set result defaults. */
16345 dres->whatNext = Dis_Continue;
16346 dres->len = 4;
16347 dres->jk_StopHere = Ijk_INVALID;
16348 dres->hint = Dis_HintNone;
16350 /* At least this is simple on ARM64: insns are all 4 bytes long, and
16351 4-aligned. So just fish the whole thing out of memory right now
16352 and have done. */
16353 UInt insn = getUIntLittleEndianly( guest_instr );
16355 if (0) vex_printf("insn: 0x%x\n", insn);
16357 DIP("\t(arm64) 0x%llx: ", (ULong)guest_PC_curr_instr);
16359 vassert(0 == (guest_PC_curr_instr & 3ULL));
16361 /* ----------------------------------------------------------- */
16363 /* Spot "Special" instructions (see comment at top of file). */
16365 const UChar* code = guest_instr;
16366 /* Spot the 16-byte preamble:
16367 93CC0D8C ror x12, x12, #3
16368 93CC358C ror x12, x12, #13
16369 93CCCD8C ror x12, x12, #51
16370 93CCF58C ror x12, x12, #61
16372 UInt word1 = 0x93CC0D8C;
16373 UInt word2 = 0x93CC358C;
16374 UInt word3 = 0x93CCCD8C;
16375 UInt word4 = 0x93CCF58C;
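   /* The four rotate amounts (3 + 13 + 51 + 61) sum to 128, i.e. two full
      64-bit rotations, so the preamble leaves x12 unchanged and the whole
      sequence is a no-op on real hardware.  The preamble (16 bytes) plus the
      following marker ORR (4 bytes) is why the continuation PC and dres->len
      below use a length of 20. */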
16376 if (getUIntLittleEndianly(code+ 0) == word1 &&
16377 getUIntLittleEndianly(code+ 4) == word2 &&
16378 getUIntLittleEndianly(code+ 8) == word3 &&
16379 getUIntLittleEndianly(code+12) == word4) {
16380 /* Got a "Special" instruction preamble. Which one is it? */
16381 if (getUIntLittleEndianly(code+16) == 0xAA0A014A
16382 /* orr x10,x10,x10 */) {
16383 /* X3 = client_request ( X4 ) */
16384 DIP("x3 = client_request ( x4 )\n");
16385 putPC(mkU64( guest_PC_curr_instr + 20 ));
16386 dres->jk_StopHere = Ijk_ClientReq;
16387 dres->whatNext = Dis_StopHere;
16388 return True;
16390 else
16391 if (getUIntLittleEndianly(code+16) == 0xAA0B016B
16392 /* orr x11,x11,x11 */) {
16393 /* X3 = guest_NRADDR */
16394 DIP("x3 = guest_NRADDR\n");
16395 dres->len = 20;
16396 putIReg64orZR(3, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
16397 return True;
16399 else
16400 if (getUIntLittleEndianly(code+16) == 0xAA0C018C
16401 /* orr x12,x12,x12 */) {
16402 /* branch-and-link-to-noredir X8 */
16403 DIP("branch-and-link-to-noredir x8\n");
16404 putIReg64orZR(30, mkU64(guest_PC_curr_instr + 20));
16405 putPC(getIReg64orZR(8));
16406 dres->jk_StopHere = Ijk_NoRedir;
16407 dres->whatNext = Dis_StopHere;
16408 return True;
16409 }
16410 else
16411 if (getUIntLittleEndianly(code+16) == 0xAA090129
16412 /* orr x9,x9,x9 */) {
16413 /* IR injection */
16414 DIP("IR injection\n");
16415 vex_inject_ir(irsb, Iend_LE);
16416 // Invalidate the current insn. The reason is that the IRop we're
16417 // injecting here can change, in which case the translation has
16418 // to be redone. For ease of handling, we simply invalidate all
16419 // the time.
16420 stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_PC_curr_instr)));
16421 stmt(IRStmt_Put(OFFB_CMLEN, mkU64(20)));
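// (CMSTART/CMLEN tell the dispatcher which guest address range to
// invalidate when the Ijk_InvalICache exit is taken -- here the whole
// 20-byte special sequence starting at guest_PC_curr_instr.)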
16422 putPC(mkU64( guest_PC_curr_instr + 20 ));
16423 dres->whatNext = Dis_StopHere;
16424 dres->jk_StopHere = Ijk_InvalICache;
16425 return True;
16426 }
16427 /* We don't know what it is. */
16428 return False;
16429 /*NOTREACHED*/
16430 }
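/* Summary sketch (informational only; nothing below depends on it):
   a complete client-request marker is the 20-byte little-endian word
   sequence

      93CC0D8C 93CC358C 93CCCD8C 93CCF58C AA0A014A

   i.e. the four ROR preamble words followed by "orr x10,x10,x10"; the
   other markers differ only in the final word.  That is why offsets of
   20 appear in the cases above (PC, return address, or dres->len are
   advanced past the whole sequence).  The client-side macros that emit
   these sequences (e.g. in valgrind.h) are the authoritative
   reference. */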
16433 /* ----------------------------------------------------------- */
16435 /* Main ARM64 instruction decoder starts here. */
16437 Bool ok = False;
16439 /* insn[28:25] determines the top-level grouping, so let's start
16440 off with that.
16442 For all of these dis_ARM64_ functions, we pass *dres with the
16443 normal default results "insn OK, 4 bytes long, keep decoding" so
16444 they don't need to change it. However, decodes of control-flow
16445 insns may cause *dres to change.
16446 */
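/* Worked example (illustrative): 0x91000420, i.e. "add x0, x1, #1",
   has INSN(28,25) == BITS4(1,0,0,0), so it is handed to
   dis_ARM64_data_processing_immediate below.  The probe that follows
   is a debugging aid in the same spirit as the other "if (0)" printfs
   in this file and is normally compiled out. */
if (0) vex_printf("top-level group = 0x%x\n", INSN(28,25));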
16447 switch (INSN(28,25)) {
16448 case BITS4(1,0,0,0): case BITS4(1,0,0,1):
16449 // Data processing - immediate
16450 ok = dis_ARM64_data_processing_immediate(dres, insn, sigill_diag);
16451 break;
16452 case BITS4(1,0,1,0): case BITS4(1,0,1,1):
16453 // Branch, exception generation and system instructions
16454 ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo, sigill_diag);
16455 break;
16456 case BITS4(0,1,0,0): case BITS4(0,1,1,0):
16457 case BITS4(1,1,0,0): case BITS4(1,1,1,0):
16458 // Loads and stores
16459 ok = dis_ARM64_load_store(dres, insn, abiinfo, sigill_diag);
16460 break;
16461 case BITS4(0,1,0,1): case BITS4(1,1,0,1):
16462 // Data processing - register
16463 ok = dis_ARM64_data_processing_register(dres, insn, sigill_diag);
16464 break;
16465 case BITS4(0,1,1,1): case BITS4(1,1,1,1):
16466 // Data processing - SIMD and floating point
16467 ok = dis_ARM64_simd_and_fp(dres, insn, archinfo, sigill_diag);
16468 break;
16469 case BITS4(0,0,0,0): case BITS4(0,0,0,1):
16470 case BITS4(0,0,1,0): case BITS4(0,0,1,1):
16471 // UNALLOCATED
16472 break;
16473 default:
16474 vassert(0); /* Can't happen */
16475 }
16477 /* If the next-level down decoders failed, make sure |dres| didn't
16478 get changed. */
16479 if (!ok) {
16480 vassert(dres->whatNext == Dis_Continue);
16481 vassert(dres->len == 4);
16482 vassert(dres->jk_StopHere == Ijk_INVALID);
16483 }
16485 return ok;
16487 # undef INSN
16488 }
16491 /*------------------------------------------------------------*/
16492 /*--- Top-level fn ---*/
16493 /*------------------------------------------------------------*/
16495 /* Disassemble a single instruction into IR. The instruction
16496 is located in host memory at &guest_code[delta]. */
16498 DisResult disInstr_ARM64 ( IRSB* irsb_IN,
16499 const UChar* guest_code_IN,
16500 Long delta_IN,
16501 Addr guest_IP,
16502 VexArch guest_arch,
16503 const VexArchInfo* archinfo,
16504 const VexAbiInfo* abiinfo,
16505 VexEndness host_endness_IN,
16506 Bool sigill_diag_IN )
16507 {
16508 DisResult dres;
16509 vex_bzero(&dres, sizeof(dres));
16511 /* Set globals (see top of this file) */
16512 vassert(guest_arch == VexArchARM64);
16514 irsb = irsb_IN;
16515 host_endness = host_endness_IN;
16516 guest_PC_curr_instr = (Addr64)guest_IP;
16518 /* Sanity checks */
16519 /* (x::UInt - 2) <= 15 === x >= 2 && x <= 17 (I hope) */
16520 vassert((archinfo->arm64_dMinLine_lg2_szB - 2) <= 15);
16521 vassert((archinfo->arm64_iMinLine_lg2_szB - 2) <= 15);
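/* For example: a 64-byte minimum cache line gives lg2 szB == 6, and
   (6 - 2) == 4 <= 15, so the check passes; a bogus value of 1 would
   wrap to 0xFFFFFFFF in the unsigned subtraction and fail. */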
16523 /* Try to decode */
16524 Bool ok = disInstr_ARM64_WRK( &dres,
16525 &guest_code_IN[delta_IN],
16526 archinfo, abiinfo, sigill_diag_IN );
16527 if (ok) {
16528 /* All decode successes end up here. */
16529 vassert(dres.len == 4 || dres.len == 20);
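/* (A length of 20 can only come from the "special" preamble handling
   in disInstr_ARM64_WRK, e.g. the guest_NRADDR case; all genuine
   ARM64 instructions are 4 bytes.) */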
16530 switch (dres.whatNext) {
16531 case Dis_Continue:
16532 putPC( mkU64(dres.len + guest_PC_curr_instr) );
16533 break;
16534 case Dis_StopHere:
16535 break;
16536 default:
16537 vassert(0);
16538 }
16539 DIP("\n");
16540 } else {
16541 /* All decode failures end up here. */
16542 if (sigill_diag_IN) {
16543 Int i, j;
16544 UChar buf[64];
16545 UInt insn
16546 = getUIntLittleEndianly( &guest_code_IN[delta_IN] );
16547 vex_bzero(buf, sizeof(buf));
16548 for (i = j = 0; i < 32; i++) {
16549 if (i > 0) {
16550 if ((i & 7) == 0) buf[j++] = ' ';
16551 else if ((i & 3) == 0) buf[j++] = '\'';
16552 }
16553 buf[j++] = (insn & (1<<(31-i))) ? '1' : '0';
16554 }
16555 vex_printf("disInstr(arm64): unhandled instruction 0x%08x\n", insn);
16556 vex_printf("disInstr(arm64): %s\n", buf);
16557 }
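/* For instance, an all-zero word (which falls into the UNALLOCATED
   group above) would be reported as:
      disInstr(arm64): unhandled instruction 0x00000000
      disInstr(arm64): 0000'0000 0000'0000 0000'0000 0000'0000
   (illustrative only; the exact wording comes from the printfs
   above). */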
16559 /* Tell the dispatcher that this insn cannot be decoded, and so
16560 has not been executed, and (is currently) the next to be
16561 executed. PC should be up-to-date since it is made so at the
16562 start of each insn, but nevertheless be paranoid and update
16563 it again right now. */
16564 putPC( mkU64(guest_PC_curr_instr) );
16565 dres.len = 0;
16566 dres.whatNext = Dis_StopHere;
16567 dres.jk_StopHere = Ijk_NoDecode;
16568 }
16569 return dres;
16570 }
16573 /*--------------------------------------------------------------------*/
16574 /*--- end guest_arm64_toIR.c ---*/
16575 /*--------------------------------------------------------------------*/