VEX/priv/guest_arm64_helpers.c

   1
   2 /*---------------------------------------------------------------*/
   3 /*--- begin                             guest_arm64_helpers.c ---*/
   4 /*---------------------------------------------------------------*/
   5
   6 /*
   7    This file is part of Valgrind, a dynamic binary instrumentation
   8    framework.
   9
  10    Copyright (C) 2013-2017 OpenWorks
  11       info@open-works.net
  12
  13    This program is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU General Public License as
  15    published by the Free Software Foundation; either version 2 of the
  16    License, or (at your option) any later version.
  17
  18    This program is distributed in the hope that it will be useful, but
  19    WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    General Public License for more details.
  22
  23    You should have received a copy of the GNU General Public License
  24    along with this program; if not, see <http://www.gnu.org/licenses/>.
  25
  26    The GNU General Public License is contained in the file COPYING.
  27 */
  28
  29 #include "libvex_basictypes.h"
  30 #include "libvex_emnote.h"
  31 #include "libvex_guest_arm64.h"
  32 #include "libvex_ir.h"
  33 #include "libvex.h"
  34
  35 #include "main_util.h"
  36 #include "main_globals.h"
  37 #include "guest_generic_bb_to_IR.h"
  38 #include "guest_arm64_defs.h"
  39
  40
  41 /* This file contains helper functions for arm guest code.  Calls to
  42    these functions are generated by the back end.  These calls are of
  43    course in the host machine code and this file will be compiled to
  44    host machine code, so that all makes sense.
  45
  46    Only change the signatures of these helper functions very
  47    carefully.  If you change the signature here, you'll have to change
  48    the parameters passed to it in the IR calls constructed by
  49    guest_arm64_toIR.c.
  50 */
  51
  52
  53 /* Set to 1 to get detailed profiling info about individual N, Z, C
  54    and V flag evaluation. */
  55 #define PROFILE_NZCV_FLAGS 0
  56
  57 #if PROFILE_NZCV_FLAGS
  58
  59 static UInt tab_eval[ARM64G_CC_OP_NUMBER][16];
  60 static UInt initted = 0;
  61 static UInt tot_evals = 0;
  62
  63 static void initCounts ( void )
  64 {
  65    UInt i, j;
  66    for (i = 0; i < ARM64G_CC_OP_NUMBER; i++) {
  67       for (j = 0; j < 16; j++) {
  68          tab_eval[i][j] = 0;
  69       }
  70    }
  71    initted = 1;
  72 }
  73
  74 static void showCounts ( void )
  75 {
  76    const HChar* nameCC[16]
  77       = { "EQ", "NE", "CS", "CC", "MI", "PL", "VS", "VC",
  78           "HI", "LS", "GE", "LT", "GT", "LE", "AL", "NV" };
  79    UInt i, j;
  80    ULong sum = 0;
  81    vex_printf("\nCC_OP          0         1         2         3    "
  82               "     4         5         6\n");
  83    vex_printf(  "--------------------------------------------------"
  84               "--------------------------\n");
  85    for (j = 0; j < 16; j++) {
  86       vex_printf("%2d %s  ", j, nameCC[j]);
  87       for (i = 0; i < ARM64G_CC_OP_NUMBER; i++) {
  88          vex_printf("%9d ", tab_eval[i][j]);
  89          sum += tab_eval[i][j];
  90       }
  91       vex_printf("\n");
  92    }
  93    vex_printf("(In total %llu calls)\n", sum);
  94 }
  95
  96 #define NOTE_EVAL(_cc_op, _cond) \
  97    do { \
  98       if (!initted) initCounts(); \
  99       vassert( ((UInt)(_cc_op)) < ARM64G_CC_OP_NUMBER); \
 100       vassert( ((UInt)(_cond)) < 16); \
 101       tab_eval[(UInt)(_cc_op)][(UInt)(cond)]++;  \
 102       tot_evals++; \
 103       if (0 == (tot_evals & 0x7FFF)) \
 104         showCounts(); \
 105    } while (0)
 106
 107 #endif /* PROFILE_NZCV_FLAGS */
 108
 109
 110 /* Calculate the N flag from the supplied thunk components, in the
 111    least significant bit of the word.  Returned bits 63:1 are zero. */
 112 static
 113 ULong arm64g_calculate_flag_n ( ULong cc_op, ULong cc_dep1,
 114                                 ULong cc_dep2, ULong cc_dep3 )
 115 {
 116    switch (cc_op) {
 117       case ARM64G_CC_OP_COPY: {
 118          /* (nzcv:28x0, unused, unused) */
 119          ULong nf   = (cc_dep1 >> ARM64G_CC_SHIFT_N) & 1;
 120          return nf;
 121       }
 122       case ARM64G_CC_OP_ADD32: {
 123          /* (argL, argR, unused) */
 124          UInt  argL = (UInt)cc_dep1;
 125          UInt  argR = (UInt)cc_dep2;
 126          UInt  res  = argL + argR;
 127          ULong nf   = (ULong)(res >> 31);
 128          return nf;
 129       }
 130       case ARM64G_CC_OP_ADD64: {
 131          /* (argL, argR, unused) */
 132          ULong argL = cc_dep1;
 133          ULong argR = cc_dep2;
 134          ULong res  = argL + argR;
 135          ULong nf   = (ULong)(res >> 63);
 136          return nf;
 137       }
 138       case ARM64G_CC_OP_SUB32: {
 139          /* (argL, argR, unused) */
 140          UInt  argL = (UInt)cc_dep1;
 141          UInt  argR = (UInt)cc_dep2;
 142          UInt  res  = argL - argR;
 143          ULong nf   = (ULong)(res >> 31);
 144          return nf;
 145       }
 146       case ARM64G_CC_OP_SUB64: {
 147          /* (argL, argR, unused) */
 148          ULong argL = cc_dep1;
 149          ULong argR = cc_dep2;
 150          ULong res  = argL - argR;
 151          ULong nf   = res >> 63;
 152          return nf;
 153       }
 154       case ARM64G_CC_OP_ADC32: {
 155          /* (argL, argR, oldC) */
 156          UInt  argL = cc_dep1;
 157          UInt  argR = cc_dep2;
 158          UInt  oldC = cc_dep3;
 159          vassert((oldC & ~1) == 0);
 160          UInt  res  = argL + argR + oldC;
 161          ULong nf   = (ULong)(res >> 31);
 162          return nf;
 163       }
 164       case ARM64G_CC_OP_ADC64: {
 165          /* (argL, argR, oldC) */
 166          ULong argL = cc_dep1;
 167          ULong argR = cc_dep2;
 168          ULong oldC = cc_dep3;
 169          vassert((oldC & ~1) == 0);
 170          ULong res  = argL + argR + oldC;
 171          ULong nf   = res >> 63;
 172          return nf;
 173       }
 174       case ARM64G_CC_OP_SBC32: {
 175          /* (argL, argR, oldC) */
 176          UInt  argL = cc_dep1;
 177          UInt  argR = cc_dep2;
 178          UInt  oldC = cc_dep3;
 179          vassert((oldC & ~1) == 0);
 180          UInt  res  = argL - argR - (oldC ^ 1);
 181          ULong nf   = (ULong)(res >> 31);
 182          return nf;
 183       }
 184       case ARM64G_CC_OP_SBC64: {
 185          /* (argL, argR, oldC) */
 186          ULong argL = cc_dep1;
 187          ULong argR = cc_dep2;
 188          ULong oldC = cc_dep3;
 189          vassert((oldC & ~1) == 0);
 190          ULong res  = argL - argR - (oldC ^ 1);
 191          ULong nf   = res >> 63;
 192          return nf;
 193       }
 194       case ARM64G_CC_OP_LOGIC32: {
 195          /* (res, unused, unused) */
 196          UInt  res = (UInt)cc_dep1;
 197          ULong nf  = res >> 31;
 198          return nf;
 199       }
 200       case ARM64G_CC_OP_LOGIC64: {
 201          /* (res, unused, unused) */
 202          ULong res = cc_dep1;
 203          ULong nf  = res >> 63;
 204          return nf;
 205       }
 206 //ZZ       case ARMG_CC_OP_MUL: {
 207 //ZZ          /* (res, unused, oldC:oldV) */
 208 //ZZ          UInt res  = cc_dep1;
 209 //ZZ          UInt nf   = res >> 31;
 210 //ZZ          return nf;
 211 //ZZ       }
 212 //ZZ       case ARMG_CC_OP_MULL: {
 213 //ZZ          /* (resLo32, resHi32, oldC:oldV) */
 214 //ZZ          UInt resHi32 = cc_dep2;
 215 //ZZ          UInt nf      = resHi32 >> 31;
 216 //ZZ          return nf;
 217 //ZZ       }
 218       default:
 219          /* shouldn't really make these calls from generated code */
 220          vex_printf("arm64g_calculate_flag_n"
 221                     "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
 222                     cc_op, cc_dep1, cc_dep2, cc_dep3 );
 223          vpanic("arm64g_calculate_flag_n");
 224    }
 225 }
 226
 227
 228 /* Calculate the Z flag from the supplied thunk components, in the
 229    least significant bit of the word.  Returned bits 63:1 are zero. */
 230 static
 231 ULong arm64g_calculate_flag_z ( ULong cc_op, ULong cc_dep1,
 232                                 ULong cc_dep2, ULong cc_dep3 )
 233 {
 234    switch (cc_op) {
 235       case ARM64G_CC_OP_COPY: {
 236          /* (nzcv:28x0, unused, unused) */
 237          ULong zf   = (cc_dep1 >> ARM64G_CC_SHIFT_Z) & 1;
 238          return zf;
 239       }
 240       case ARM64G_CC_OP_ADD32: {
 241          /* (argL, argR, unused) */
 242          UInt  argL = (UInt)cc_dep1;
 243          UInt  argR = (UInt)cc_dep2;
 244          UInt  res  = argL + argR;
 245          ULong zf   = res == 0;
 246          return zf;
 247       }
 248       case ARM64G_CC_OP_ADD64: {
 249          /* (argL, argR, unused) */
 250          ULong argL = cc_dep1;
 251          ULong argR = cc_dep2;
 252          ULong res  = argL + argR;
 253          ULong zf   = res == 0;
 254          return zf;
 255       }
 256       case ARM64G_CC_OP_SUB32: {
 257          /* (argL, argR, unused) */
 258          UInt  argL = (UInt)cc_dep1;
 259          UInt  argR = (UInt)cc_dep2;
 260          UInt  res  = argL - argR;
 261          ULong zf   = res == 0;
 262          return zf;
 263       }
 264       case ARM64G_CC_OP_SUB64: {
 265          /* (argL, argR, unused) */
 266          ULong argL = cc_dep1;
 267          ULong argR = cc_dep2;
 268          ULong res  = argL - argR;
 269          ULong zf   = res == 0;
 270          return zf;
 271       }
 272       case ARM64G_CC_OP_ADC32: {
 273          /* (argL, argR, oldC) */
 274          UInt  argL = cc_dep1;
 275          UInt  argR = cc_dep2;
 276          UInt  oldC = cc_dep3;
 277          vassert((oldC & ~1) == 0);
 278          UInt  res  = argL + argR + oldC;
 279          ULong zf   = res == 0;
 280          return zf;
 281       }
 282       case ARM64G_CC_OP_ADC64: {
 283          /* (argL, argR, oldC) */
 284          ULong argL = cc_dep1;
 285          ULong argR = cc_dep2;
 286          ULong oldC = cc_dep3;
 287          vassert((oldC & ~1) == 0);
 288          ULong res  = argL + argR + oldC;
 289          ULong zf   = res == 0;
 290          return zf;
 291       }
 292       case ARM64G_CC_OP_SBC32: {
 293          /* (argL, argR, oldC) */
 294          UInt  argL = cc_dep1;
 295          UInt  argR = cc_dep2;
 296          UInt  oldC = cc_dep3;
 297          vassert((oldC & ~1) == 0);
 298          UInt  res  = argL - argR - (oldC ^ 1);
 299          ULong zf   = res == 0;
 300          return zf;
 301       }
 302       case ARM64G_CC_OP_SBC64: {
 303          /* (argL, argR, oldC) */
 304          ULong argL = cc_dep1;
 305          ULong argR = cc_dep2;
 306          ULong oldC = cc_dep3;
 307          vassert((oldC & ~1) == 0);
 308          ULong res  = argL - argR - (oldC ^ 1);
 309          ULong zf   = res == 0;
 310          return zf;
 311       }
 312       case ARM64G_CC_OP_LOGIC32: {
 313          /* (res, unused, unused) */
 314          UInt  res  = (UInt)cc_dep1;
 315          ULong zf   = res == 0;
 316          return zf;
 317       }
 318       case ARM64G_CC_OP_LOGIC64: {
 319          /* (res, unused, unused) */
 320          ULong res  = cc_dep1;
 321          ULong zf   = res == 0;
 322          return zf;
 323       }
 324 //ZZ       case ARMG_CC_OP_MUL: {
 325 //ZZ          /* (res, unused, oldC:oldV) */
 326 //ZZ          UInt res  = cc_dep1;
 327 //ZZ          UInt zf   = res == 0;
 328 //ZZ          return zf;
 329 //ZZ       }
 330 //ZZ       case ARMG_CC_OP_MULL: {
 331 //ZZ          /* (resLo32, resHi32, oldC:oldV) */
 332 //ZZ          UInt resLo32 = cc_dep1;
 333 //ZZ          UInt resHi32 = cc_dep2;
 334 //ZZ          UInt zf      = (resHi32|resLo32) == 0;
 335 //ZZ          return zf;
 336 //ZZ       }
 337       default:
 338          /* shouldn't really make these calls from generated code */
 339          vex_printf("arm64g_calculate_flag_z"
 340                     "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
 341                     cc_op, cc_dep1, cc_dep2, cc_dep3 );
 342          vpanic("arm64g_calculate_flag_z");
 343    }
 344 }
 345
 346
 347 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 348 /* Calculate the C flag from the supplied thunk components, in the
 349    least significant bit of the word.  Returned bits 63:1 are zero. */
 350 ULong arm64g_calculate_flag_c ( ULong cc_op, ULong cc_dep1,
 351                                 ULong cc_dep2, ULong cc_dep3 )
 352 {
 353    switch (cc_op) {
 354       case ARM64G_CC_OP_COPY: {
 355          /* (nzcv:28x0, unused, unused) */
 356          ULong cf = (cc_dep1 >> ARM64G_CC_SHIFT_C) & 1;
 357          return cf;
 358       }
 359       case ARM64G_CC_OP_ADD32: {
 360          /* (argL, argR, unused) */
 361          UInt  argL = (UInt)cc_dep1;
 362          UInt  argR = (UInt)cc_dep2;
 363          UInt  res  = argL + argR;
 364          ULong cf   = res < argL;
 365          return cf;
 366       }
 367       case ARM64G_CC_OP_ADD64: {
 368          /* (argL, argR, unused) */
 369          ULong argL = cc_dep1;
 370          ULong argR = cc_dep2;
 371          ULong res  = argL + argR;
 372          ULong cf   = res < argL;
 373          return cf;
 374       }
 375       case ARM64G_CC_OP_SUB32: {
 376          /* (argL, argR, unused) */
 377          UInt  argL = (UInt)cc_dep1;
 378          UInt  argR = (UInt)cc_dep2;
 379          ULong cf   = argL >= argR;
 380          return cf;
 381       }
 382       case ARM64G_CC_OP_SUB64: {
 383          /* (argL, argR, unused) */
 384          ULong argL = cc_dep1;
 385          ULong argR = cc_dep2;
 386          ULong cf   = argL >= argR;
 387          return cf;
 388       }
 389       case ARM64G_CC_OP_ADC32: {
 390          /* (argL, argR, oldC) */
 391          UInt  argL = cc_dep1;
 392          UInt  argR = cc_dep2;
 393          UInt  oldC = cc_dep3;
 394          vassert((oldC & ~1) == 0);
 395          UInt  res  = argL + argR + oldC;
 396          ULong cf   = oldC ? (res <= argL) : (res < argL);
 397          return cf;
 398       }
 399       case ARM64G_CC_OP_ADC64: {
 400          /* (argL, argR, oldC) */
 401          ULong argL = cc_dep1;
 402          ULong argR = cc_dep2;
 403          ULong oldC = cc_dep3;
 404          vassert((oldC & ~1) == 0);
 405          ULong res  = argL + argR + oldC;
 406          ULong cf   = oldC ? (res <= argL) : (res < argL);
 407          return cf;
 408       }
 409       case ARM64G_CC_OP_SBC32: {
 410          /* (argL, argR, oldC) */
 411          UInt  argL = cc_dep1;
 412          UInt  argR = cc_dep2;
 413          UInt  oldC = cc_dep3;
 414          vassert((oldC & ~1) == 0);
 415          ULong cf   = oldC ? (argL >= argR) : (argL > argR);
 416          return cf;
 417       }
 418       case ARM64G_CC_OP_SBC64: {
 419          /* (argL, argR, oldC) */
 420          ULong argL = cc_dep1;
 421          ULong argR = cc_dep2;
 422          ULong oldC = cc_dep3;
 423          vassert((oldC & ~1) == 0);
 424          ULong cf   = oldC ? (argL >= argR) : (argL > argR);
 425          return cf;
 426       }
 427       case ARM64G_CC_OP_LOGIC32:
 428       case ARM64G_CC_OP_LOGIC64: {
 429          /* (res, unused, unused) */
 430          return 0; // C after logic is zero on arm64
 431       }
 432 //ZZ       case ARMG_CC_OP_MUL: {
 433 //ZZ          /* (res, unused, oldC:oldV) */
 434 //ZZ          UInt oldC = (cc_dep3 >> 1) & 1;
 435 //ZZ          vassert((cc_dep3 & ~3) == 0);
 436 //ZZ          UInt cf   = oldC;
 437 //ZZ          return cf;
 438 //ZZ       }
 439 //ZZ       case ARMG_CC_OP_MULL: {
 440 //ZZ          /* (resLo32, resHi32, oldC:oldV) */
 441 //ZZ          UInt oldC    = (cc_dep3 >> 1) & 1;
 442 //ZZ          vassert((cc_dep3 & ~3) == 0);
 443 //ZZ          UInt cf      = oldC;
 444 //ZZ          return cf;
 445 //ZZ       }
 446       default:
 447          /* shouldn't really make these calls from generated code */
 448          vex_printf("arm64g_calculate_flag_c"
 449                     "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
 450                     cc_op, cc_dep1, cc_dep2, cc_dep3 );
 451          vpanic("arm64g_calculate_flag_c");
 452    }
 453 }
 454
 455
 456 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 457 /* Calculate the V flag from the supplied thunk components, in the
 458    least significant bit of the word.  Returned bits 63:1 are zero. */
 459 static
 460 ULong arm64g_calculate_flag_v ( ULong cc_op, ULong cc_dep1,
 461                                 ULong cc_dep2, ULong cc_dep3 )
 462 {
 463    switch (cc_op) {
 464       case ARM64G_CC_OP_COPY: {
 465          /* (nzcv:28x0, unused, unused) */
 466          ULong vf   = (cc_dep1 >> ARM64G_CC_SHIFT_V) & 1;
 467          return vf;
 468       }
 469       case ARM64G_CC_OP_ADD32: {
 470          /* (argL, argR, unused) */
 471          UInt  argL = (UInt)cc_dep1;
 472          UInt  argR = (UInt)cc_dep2;
 473          UInt  res  = argL + argR;
 474          ULong vf   = (ULong)(((res ^ argL) & (res ^ argR)) >> 31);
 475          return vf;
 476       }
 477       case ARM64G_CC_OP_ADD64: {
 478          /* (argL, argR, unused) */
 479          ULong argL = cc_dep1;
 480          ULong argR = cc_dep2;
 481          ULong res  = argL + argR;
 482          ULong vf   = ((res ^ argL) & (res ^ argR)) >> 63;
 483          return vf;
 484       }
 485       case ARM64G_CC_OP_SUB32: {
 486          /* (argL, argR, unused) */
 487          UInt  argL = (UInt)cc_dep1;
 488          UInt  argR = (UInt)cc_dep2;
 489          UInt  res  = argL - argR;
 490          ULong vf   = (ULong)(((argL ^ argR) & (argL ^ res)) >> 31);
 491          return vf;
 492       }
 493       case ARM64G_CC_OP_SUB64: {
 494          /* (argL, argR, unused) */
 495          ULong argL = cc_dep1;
 496          ULong argR = cc_dep2;
 497          ULong res  = argL - argR;
 498          ULong vf   = (((argL ^ argR) & (argL ^ res))) >> 63;
 499          return vf;
 500       }
 501       case ARM64G_CC_OP_ADC32: {
 502          /* (argL, argR, oldC) */
 503          UInt  argL = cc_dep1;
 504          UInt  argR = cc_dep2;
 505          UInt  oldC = cc_dep3;
 506          vassert((oldC & ~1) == 0);
 507          UInt  res  = argL + argR + oldC;
 508          ULong vf   = (ULong)(((res ^ argL) & (res ^ argR)) >> 31);
 509          return vf;
 510       }
 511       case ARM64G_CC_OP_ADC64: {
 512          /* (argL, argR, oldC) */
 513          ULong argL = cc_dep1;
 514          ULong argR = cc_dep2;
 515          ULong oldC = cc_dep3;
 516          vassert((oldC & ~1) == 0);
 517          ULong res  = argL + argR + oldC;
 518          ULong vf   = ((res ^ argL) & (res ^ argR)) >> 63;
 519          return vf;
 520       }
 521       case ARM64G_CC_OP_SBC32: {
 522          /* (argL, argR, oldC) */
 523          UInt  argL = cc_dep1;
 524          UInt  argR = cc_dep2;
 525          UInt  oldC = cc_dep3;
 526          vassert((oldC & ~1) == 0);
 527          UInt  res  = argL - argR - (oldC ^ 1);
 528          ULong vf   = (ULong)(((argL ^ argR) & (argL ^ res)) >> 31);
 529          return vf;
 530       }
 531       case ARM64G_CC_OP_SBC64: {
 532          /* (argL, argR, oldC) */
 533          ULong argL = cc_dep1;
 534          ULong argR = cc_dep2;
 535          ULong oldC = cc_dep3;
 536          vassert((oldC & ~1) == 0);
 537          ULong res  = argL - argR - (oldC ^ 1);
 538          ULong vf   = ((argL ^ argR) & (argL ^ res)) >> 63;
 539          return vf;
 540       }
 541       case ARM64G_CC_OP_LOGIC32:
 542       case ARM64G_CC_OP_LOGIC64: {
 543          /* (res, unused, unused) */
 544          return 0; // V after logic is zero on arm64
 545       }
 546 //ZZ       case ARMG_CC_OP_MUL: {
 547 //ZZ          /* (res, unused, oldC:oldV) */
 548 //ZZ          UInt oldV = (cc_dep3 >> 0) & 1;
 549 //ZZ          vassert((cc_dep3 & ~3) == 0);
 550 //ZZ          UInt vf   = oldV;
 551 //ZZ          return vf;
 552 //ZZ       }
 553 //ZZ       case ARMG_CC_OP_MULL: {
 554 //ZZ          /* (resLo32, resHi32, oldC:oldV) */
 555 //ZZ          UInt oldV    = (cc_dep3 >> 0) & 1;
 556 //ZZ          vassert((cc_dep3 & ~3) == 0);
 557 //ZZ          UInt vf      = oldV;
 558 //ZZ          return vf;
 559 //ZZ       }
 560       default:
 561          /* shouldn't really make these calls from generated code */
 562          vex_printf("arm64g_calculate_flag_v"
 563                     "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
 564                     cc_op, cc_dep1, cc_dep2, cc_dep3 );
 565          vpanic("arm64g_calculate_flag_v");
 566    }
 567 }
 568
 569
 570 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 571 /* Calculate NZCV from the supplied thunk components, in the positions
 572    they appear in the CPSR, viz bits 31:28 for N Z C V respectively.
 573    Returned bits 27:0 are zero. */
 574 ULong arm64g_calculate_flags_nzcv ( ULong cc_op, ULong cc_dep1,
 575                                     ULong cc_dep2, ULong cc_dep3 )
 576 {
 577    ULong f;
 578    ULong res = 0;
 579    f = 1 & arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
 580    res |= (f << ARM64G_CC_SHIFT_N);
 581    f = 1 & arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
 582    res |= (f << ARM64G_CC_SHIFT_Z);
 583    f = 1 & arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3);
 584    res |= (f << ARM64G_CC_SHIFT_C);
 585    f = 1 & arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
 586    res |= (f << ARM64G_CC_SHIFT_V);
 587    return res;
 588 }
 589
 590 void LibVEX_GuestARM64_put_nzcv_c ( ULong new_carry_flag,
 591                                   /*MOD*/VexGuestARM64State* vex_state )
 592 {
 593    ULong nzcv = arm64g_calculate_flags_nzcv(
 594       vex_state->guest_CC_OP,
 595       vex_state->guest_CC_DEP1,
 596       vex_state->guest_CC_DEP2,
 597       vex_state->guest_CC_NDEP
 598       );
 599    if (new_carry_flag & 1) {
 600       nzcv |= ARM64G_CC_MASK_C;
 601    } else {
 602       nzcv &= ~ARM64G_CC_MASK_C;
 603    }
 604    vex_state->guest_CC_OP   = ARM64G_CC_OP_COPY;
 605    vex_state->guest_CC_DEP1 = nzcv;
 606    vex_state->guest_CC_DEP2 = 0;
 607    vex_state->guest_CC_NDEP = 0;
 608 }
 609
 610 //ZZ
 611 //ZZ /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 612 //ZZ /* Calculate the QC flag from the arguments, in the lowest bit
 613 //ZZ    of the word (bit 0).  Urr, having this out of line is bizarre.
 614 //ZZ    Push back inline. */
 615 //ZZ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
 616 //ZZ                               UInt resR1, UInt resR2 )
 617 //ZZ {
 618 //ZZ    if (resL1 != resR1 || resL2 != resR2)
 619 //ZZ       return 1;
 620 //ZZ    else
 621 //ZZ       return 0;
 622 //ZZ }
 623
 624 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 625 /* Calculate the specified condition from the thunk components, in the
 626    lowest bit of the word (bit 0).  Returned bits 63:1 are zero. */
 627 ULong arm64g_calculate_condition ( /* ARM64Condcode << 4 | cc_op */
 628                                    ULong cond_n_op ,
 629                                    ULong cc_dep1,
 630                                    ULong cc_dep2, ULong cc_dep3 )
 631 {
 632    ULong cond  = cond_n_op >> 4;
 633    ULong cc_op = cond_n_op & 0xF;
 634    ULong inv   = cond & 1;
 635    ULong nf, zf, vf, cf;
 636
 637 #  if PROFILE_NZCV_FLAGS
 638    NOTE_EVAL(cc_op, cond);
 639 #  endif
 640
 641    //   vex_printf("XXXXXXXX %llx %llx %llx %llx\n",
 642    //              cond_n_op, cc_dep1, cc_dep2, cc_dep3);
 643
 644    switch (cond) {
 645       case ARM64CondEQ:    // Z=1         => z
 646       case ARM64CondNE:    // Z=0
 647          zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
 648          return inv ^ zf;
 649
 650       case ARM64CondCS:    // C=1         => c
 651       case ARM64CondCC:    // C=0
 652          cf = arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3);
 653          return inv ^ cf;
 654
 655       case ARM64CondMI:    // N=1         => n
 656       case ARM64CondPL:    // N=0
 657          nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
 658          return inv ^ nf;
 659
 660       case ARM64CondVS:    // V=1         => v
 661       case ARM64CondVC:    // V=0
 662          vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
 663          return inv ^ vf;
 664
 665       case ARM64CondHI:    // C=1 && Z=0   => c & ~z
 666       case ARM64CondLS:    // C=0 || Z=1
 667          cf = arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3);
 668          zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
 669          return inv ^ (1 & (cf & ~zf));
 670
 671       case ARM64CondGE:    // N=V          => ~(n^v)
 672       case ARM64CondLT:    // N!=V
 673          nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
 674          vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
 675          return inv ^ (1 & ~(nf ^ vf));
 676
 677       case ARM64CondGT:    // Z=0 && N=V   => ~z & ~(n^v)  =>  ~(z | (n^v))
 678       case ARM64CondLE:    // Z=1 || N!=V
 679          nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
 680          vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
 681          zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
 682          return inv ^ (1 & ~(zf | (nf ^ vf)));
 683
 684       case ARM64CondAL:    // 1
 685       case ARM64CondNV:    // 1
 686          return 1;
 687
 688       default:
 689          /* shouldn't really make these calls from generated code */
 690          vex_printf("arm64g_calculate_condition(ARM64)"
 691                     "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
 692                     cond, cc_op, cc_dep1, cc_dep2, cc_dep3 );
 693          vpanic("armg_calculate_condition(ARM64)");
 694    }
 695 }
 696
 697
 698 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 699 ULong arm64g_calc_crc32b ( ULong acc, ULong bits )
 700 {
 701    UInt  i;
 702    ULong crc = (bits & 0xFFULL) ^ acc;
 703    for (i = 0; i < 8; i++)
 704       crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
 705    return crc;
 706 }
 707
 708 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 709 ULong arm64g_calc_crc32h ( ULong acc, ULong bits )
 710 {
 711    UInt  i;
 712    ULong crc = (bits & 0xFFFFULL) ^ acc;
 713    for (i = 0; i < 16; i++)
 714       crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
 715    return crc;
 716 }
 717
 718 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 719 ULong arm64g_calc_crc32w ( ULong acc, ULong bits )
 720 {
 721    UInt  i;
 722    ULong crc = (bits & 0xFFFFFFFFULL) ^ acc;
 723    for (i = 0; i < 32; i++)
 724       crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
 725    return crc;
 726 }
 727
 728 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 729 ULong arm64g_calc_crc32x ( ULong acc, ULong bits )
 730 {
 731    UInt  i;
 732    ULong crc = bits ^ acc;
 733    for (i = 0; i < 64; i++)
 734       crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
 735    return crc;
 736
 737 }
 738
 739 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 740 ULong arm64g_calc_crc32cb ( ULong acc, ULong bits )
 741 {
 742    UInt  i;
 743    ULong crc = (bits & 0xFFULL) ^ acc;
 744    for (i = 0; i < 8; i++)
 745       crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
 746    return crc;
 747 }
 748
 749 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 750 ULong arm64g_calc_crc32ch ( ULong acc, ULong bits )
 751 {
 752    UInt  i;
 753    ULong crc = (bits & 0xFFFFULL) ^ acc;
 754    for (i = 0; i < 16; i++)
 755       crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
 756    return crc;
 757 }
 758
 759 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 760 ULong arm64g_calc_crc32cw ( ULong acc, ULong bits )
 761 {
 762    UInt  i;
 763    ULong crc = (bits & 0xFFFFFFFFULL) ^ acc;
 764    for (i = 0; i < 32; i++)
 765       crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
 766    return crc;
 767 }
 768
 769 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 770 ULong arm64g_calc_crc32cx ( ULong acc, ULong bits )
 771 {
 772    UInt  i;
 773    ULong crc = bits ^ acc;
 774    for (i = 0; i < 64; i++)
 775       crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
 776    return crc;
 777 }
 778
 779 /* CALLED FROM GENERATED CODE */
 780 /* DIRTY HELPER (non-referentially-transparent) */
 781 /* Horrible hack.  On non-arm64 platforms, return 0. */
 782 ULong arm64g_dirtyhelper_MRS_DCZID_EL0 ( void )
 783 {
 784 #  if defined(__aarch64__) && !defined(__arm__)
 785    ULong w = 0x5555555555555555ULL; /* overwritten */
 786    __asm__ __volatile__("mrs %0, dczid_el0" : "=r"(w));
 787    return w;
 788 #  else
 789    return 0ULL;
 790 #  endif
 791 }
 792
 793 /* CALLED FROM GENERATED CODE */
 794 /* DIRTY HELPER (non-referentially-transparent) */
 795 /* Horrible hack.  On non-arm64 platforms, return 0. */
 796 ULong arm64g_dirtyhelper_MRS_CNTVCT_EL0 ( void )
 797 {
 798 #  if defined(__aarch64__) && !defined(__arm__)
 799    ULong w = 0x5555555555555555ULL; /* overwritten */
 800    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(w));
 801    return w;
 802 #  else
 803    return 0ULL;
 804 #  endif
 805 }
 806
 807
 808 /* CALLED FROM GENERATED CODE */
 809 /* DIRTY HELPER (non-referentially-transparent) */
 810 /* Horrible hack.  On non-arm64 platforms, return 0. */
 811 ULong arm64g_dirtyhelper_MRS_CNTFRQ_EL0 ( void )
 812 {
 813 #  if defined(__aarch64__) && !defined(__arm__)
 814    ULong w = 0x5555555555555555ULL; /* overwritten */
 815    __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r"(w));
 816    return w;
 817 #  else
 818    return 0ULL;
 819 #  endif
 820 }
 821
 822 /* CALLED FROM GENERATED CODE */
 823 /* DIRTY HELPER (non-referentially-transparent) */
 824 /* Horrible hack.  On non-arm64 platforms, return 0. */
 825 ULong arm64g_dirtyhelper_MRS_MIDR_EL1 ( void )
 826 {
 827 #  if defined(__aarch64__) && !defined(__arm__)
 828    ULong w = 0x5555555555555555ULL; /* overwritten */
 829    __asm__ __volatile__("mrs %0, midr_el1" : "=r"(w));
 830    return w;
 831 #  else
 832    return 0ULL;
 833 #  endif
 834 }
 835
 836 /* CALLED FROM GENERATED CODE */
 837 /* DIRTY HELPER (non-referentially-transparent) */
 838 /* Horrible hack.  On non-arm64 platforms, return 0. */
 839 ULong arm64g_dirtyhelper_MRS_ID_AA64PFR0_EL1 ( void )
 840 {
 841 #  if defined(__aarch64__) && !defined(__arm__)
 842    ULong w = 0x5555555555555555ULL; /* overwritten */
 843    __asm__ __volatile__("mrs %0, id_aa64pfr0_el1" : "=r"(w));
 844
 845    // The control word uses the following nibbles (as seen on RPi)
 846    // unsupported unless indicated
 847    // 0 to 3 - EL0 to EL3 exception level handling
 848    // 4 - FP includes half-precision (partial support)
 849    // 5 - AdvSIMD also includes haf-precision
 850
 851    /* If half-precision fp is present we fall back to normal
 852       half precision implementation because of missing support in the emulation.
 853       If no AdvSIMD and FP are implemented, we preserve the value */
 854    w = (w >> 16);
 855    w &= 0xff;
 856    switch(w) {
 857      case 0x01:
 858        w = 0x0;
 859        break;
 860      case 0xff:
 861        w = (0xFF<<16);
 862        break;
 863      default:
 864        w = 0x0;
 865        break;
 866    }
 867
 868    return w;
 869 #  else
 870    return 0ULL;
 871 #  endif
 872 }
 873
 874 /* CALLED FROM GENERATED CODE */
 875 /* DIRTY HELPER (non-referentially-transparent) */
 876 /* Horrible hack.  On non-arm64 platforms, return 0. */
 877 ULong arm64g_dirtyhelper_MRS_ID_AA64MMFR0_EL1 ( void )
 878 {
 879 #  if defined(__aarch64__) && !defined(__arm__)
 880    ULong w = 0x5555555555555555ULL; /* overwritten */
 881    __asm__ __volatile__("mrs %0, id_aa64mmfr0_el1" : "=r"(w));
 882    return w;
 883 #  else
 884    return 0ULL;
 885 #  endif
 886 }
 887
 888 /* CALLED FROM GENERATED CODE */
 889 /* DIRTY HELPER (non-referentially-transparent) */
 890 /* Horrible hack.  On non-arm64 platforms, return 0. */
 891 ULong arm64g_dirtyhelper_MRS_ID_AA64MMFR1_EL1 ( void )
 892 {
 893 #  if defined(__aarch64__) && !defined(__arm__)
 894    ULong w = 0x5555555555555555ULL; /* overwritten */
 895    __asm__ __volatile__("mrs %0, id_aa64mmfr1_el1" : "=r"(w));
 896
 897    /* Clear VH and HAFDBS bits */
 898    w &= ~(0xF0F);
 899    return w;
 900 #  else
 901    return 0ULL;
 902 #  endif
 903 }
 904
 905 /* CALLED FROM GENERATED CODE */
 906 /* DIRTY HELPER (non-referentially-transparent) */
 907 /* Horrible hack.  On non-arm64 platforms, return 0. */
 908 ULong arm64g_dirtyhelper_MRS_ID_AA64ISAR0_EL1 ( void )
 909 {
 910 #  if defined(__aarch64__) && !defined(__arm__)
 911    ULong w = 0x5555555555555555ULL; /* overwritten */
 912    __asm__ __volatile__("mrs %0, id_aa64isar0_el1" : "=r"(w));
 913
 914    // In the mask below, nibbles are (higher nibbles all unsupported)
 915    // 0 - RES0
 916    // 1 - AES
 917    // 2 - SHA1
 918    // 3 - SHA2
 919    // 4 - CRC32
 920    // 5 - Atomic bits
 921    // 6 - TME (unsupported)
 922    // 7 - RDM
 923    // 8 - SHA3 (unsupported)
 924    // 9 - SM3 (unsupported)
 925    // 10 - SM4 (unsupported)
 926    // 11 - DP
 927
 928    //     10
 929    //     109876543210
 930    w &= 0xF000F0FFFFFF;
 931
 932    return w;
 933 #  else
 934    return 0ULL;
 935 #  endif
 936 }
 937
 938 /* CALLED FROM GENERATED CODE */
 939 /* DIRTY HELPER (non-referentially-transparent) */
 940 /* Horrible hack.  On non-arm64 platforms, return 0. */
 941 ULong arm64g_dirtyhelper_MRS_ID_AA64ISAR1_EL1 ( void )
 942 {
 943 #  if defined(__aarch64__) && !defined(__arm__)
 944    ULong w = 0x5555555555555555ULL; /* overwritten */
 945    __asm__ __volatile__("mrs %0, id_aa64isar1_el1" : "=r"(w));
 946
 947    // only nibble 0 DBP
 948    w &= 0xF;
 949
 950    return w;
 951 #  else
 952    return 0ULL;
 953 #  endif
 954 }
 955
 956 void arm64g_dirtyhelper_PMULLQ ( /*OUT*/V128* res, ULong arg1, ULong arg2 )
 957 {
 958    /* This doesn't need to be a dirty helper, except for the fact that
 959       a clean helper can't return a 128 bit value.  This is a pretty
 960       lame implementation of PMULLQ, but at least it doesn't contain any
 961       data dependent branches, and has lots of ILP.  I guess we could unroll
 962       the loop completely and offer extensive prayers to the gods of ILP
 963       if more performance is needed. */
 964    UInt i;
 965    ULong accHi = 0, accLo = 0;
 966    ULong op2Hi = 0, op2Lo = arg2;
 967    for (i = 0; i < 64; i++) {
 968       /* Make |mask| be all 0s or all 1s, a copy of arg1[i] */
 969       Long mask = arg1 << (63-i);
 970       mask >>= 63;
 971       accHi ^= (op2Hi & mask);
 972       accLo ^= (op2Lo & mask);
 973       /* do: op2Hi:op2Lo <<=u 1 */
 974       op2Hi <<= 1;
 975       op2Hi |= ((op2Lo >> 63) & 1);
 976       op2Lo <<= 1;
 977    }
 978    res->w64[1] = accHi;
 979    res->w64[0] = accLo;
 980 }
 981
 982
 983 /*---------------------------------------------------------------*/
 984 /*--- Crypto instruction helpers                              ---*/
 985 /*---------------------------------------------------------------*/
 986
 987 /* DIRTY HELPERS for doing AES support:
 988    * AESE (SubBytes, then ShiftRows)
 989    * AESD (InvShiftRows, then InvSubBytes)
 990    * AESMC (MixColumns)
 991    * AESIMC (InvMixColumns)
 992    These don't actually have to be dirty helpers -- they could be
 993    clean, but for the fact that they return a V128 and a clean helper
 994    can't do that.
 995
 996    The ARMv8 manual seems to imply that AESE first performs ShiftRows,
 997    then SubBytes.  This seems to contradict FIPS 197, so the
 998    implementation below is consistent with FIPS 197.  One can observe
 999    that the two transformations commute -- the order in which they
1000    happen makes no difference to the result.  So the ambiguity doesn't
1001    actually matter, but it is confusing.  The v8 manual looks correct
1002    about AESD, though.
1003
1004    The three functions rj_xtime, aesMixColumn and aesInvMixColumn only,
1005    are taken from "A byte-oriented AES-256 implementation" and are subject
1006    to the following usage terms:
1007
1008      Byte-oriented AES-256 implementation.
1009      All lookup tables replaced with 'on the fly' calculations.
1010
1011      Copyright (c) 2007-2011 Ilya O. Levin, http://www.literatecode.com
1012      Other contributors: Hal Finney
1013
1014      Permission to use, copy, modify, and distribute this software for any
1015      purpose with or without fee is hereby granted, provided that the above
1016      copyright notice and this permission notice appear in all copies.
1017
1018      THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
1019      WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1020      MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
1021      ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
1022      WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
1023      ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
1024      OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
1025 */
1026
1027 const UChar aesMapSubBytes[256]
1028    = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
1029        0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
1030        0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
1031        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
1032        0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
1033        0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
1034        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
1035        0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
1036        0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
1037        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
1038        0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
1039        0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
1040        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
1041        0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
1042        0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
1043        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
1044        0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
1045        0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
1046        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
1047        0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
1048        0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
1049        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
1050        0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
1051        0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
1052        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
1053        0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
1054        0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
1055        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
1056        0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
1057        0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
1058        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
1059        0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
1060      };
1061
1062 const UChar aesMapInvSubBytes[256]
1063    = { 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
1064        0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
1065        0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
1066        0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
1067        0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
1068        0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
1069        0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
1070        0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
1071        0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
1072        0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
1073        0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
1074        0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
1075        0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
1076        0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
1077        0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
1078        0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
1079        0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
1080        0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
1081        0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
1082        0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
1083        0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
1084        0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
1085        0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
1086        0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
1087        0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
1088        0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
1089        0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
1090        0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
1091        0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
1092        0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
1093        0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
1094        0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1095      };
1096
1097 static inline UChar rj_xtime ( UChar x )
1098 {
1099    UChar y = (UChar)(x << 1);
1100    return (x & 0x80) ? (y ^ 0x1b) : y;
1101 }
1102
1103 static void aesMixColumn ( /*MOD*/UChar* r )
1104 {
1105    UChar a = r[0];
1106    UChar b = r[1];
1107    UChar c = r[2];
1108    UChar d = r[3];
1109    UChar e = a ^ b ^ c ^ d;
1110    r[0] ^= e ^ rj_xtime(a ^ b);
1111    r[1] ^= e ^ rj_xtime(b ^ c);
1112    r[2] ^= e ^ rj_xtime(c ^ d);
1113    r[3] ^= e ^ rj_xtime(d ^ a);
1114 }
1115
1116 static void aesInvMixColumn ( /*MOD*/UChar* r )
1117 {
1118    UChar a = r[0];
1119    UChar b = r[1];
1120    UChar c = r[2];
1121    UChar d = r[3];
1122    UChar e = a ^ b ^ c ^ d;
1123    UChar z = rj_xtime(e);
1124    UChar x = e ^ rj_xtime(rj_xtime(z ^ a ^ c));
1125    UChar y = e ^ rj_xtime(rj_xtime(z ^ b ^ d));
1126    r[0] ^= x ^ rj_xtime(a ^ b);
1127    r[1] ^= y ^ rj_xtime(b ^ c);
1128    r[2] ^= x ^ rj_xtime(c ^ d);
1129    r[3] ^= y ^ rj_xtime(d ^ a);
1130 }
1131
1132
1133 /* CALLED FROM GENERATED CODE */
1134 void arm64g_dirtyhelper_AESE ( /*OUT*/V128* res, ULong argHi, ULong argLo )
1135 {
1136    res->w64[1] = argHi;
1137    res->w64[0] = argLo;
1138
1139    /* First do SubBytes on the State. */
1140    UInt i;
1141    for (i = 0; i < 16; i++) {
1142       res->w8[i] = aesMapSubBytes[res->w8[i] & 0xFF];
1143    }
1144
1145    /* Then do ShiftRows on the State. */
1146 #  define XX(_ix) res->w8[_ix]
1147    { UChar old1 = XX(1);
1148      XX(1) = XX(5); XX(5) = XX(9); XX(9) = XX(13); XX(13) = old1;
1149    }
1150    { UChar old2 = XX(2); UChar old6 = XX(6);
1151      XX(2) = XX(10); XX(6) = XX(14); XX(10) = old2; XX(14) = old6;
1152    }
1153    { UChar old15 = XX(15);
1154      XX(15) = XX(11); XX(11) = XX(7); XX(7) = XX(3); XX(3) = old15;
1155    }
1156 #  undef XX
1157 }
1158
1159
1160 /* CALLED FROM GENERATED CODE */
1161 void arm64g_dirtyhelper_AESD ( /*OUT*/V128* res, ULong argHi, ULong argLo )
1162 {
1163    res->w64[1] = argHi;
1164    res->w64[0] = argLo;
1165
1166    /* First do InvShiftRows on the State. */
1167 #  define XX(_ix) res->w8[_ix]
1168    { UChar old13 = XX(13);
1169      XX(13) = XX(9); XX(9) = XX(5); XX(5) = XX(1); XX(1) = old13;
1170    }
1171    { UChar old14 = XX(14); UChar old10 = XX(10);
1172      XX(14) = XX(6); XX(10) = XX(2); XX(6) = old14; XX(2) = old10;
1173    }
1174    { UChar old3 = XX(3);
1175      XX(3) = XX(7); XX(7) = XX(11); XX(11) = XX(15); XX(15) = old3;
1176    }
1177 #  undef XX
1178
1179 /* Then do InvSubBytes on the State. */
1180    UInt i;
1181    for (i = 0; i < 16; i++) {
1182       res->w8[i] = aesMapInvSubBytes[res->w8[i] & 0xFF];
1183    }
1184 }
1185
1186
1187 /* CALLED FROM GENERATED CODE */
1188 void arm64g_dirtyhelper_AESMC ( /*OUT*/V128* res, ULong argHi, ULong argLo )
1189 {
1190    res->w64[1] = argHi;
1191    res->w64[0] = argLo;
1192    aesMixColumn(&res->w8[0]);
1193    aesMixColumn(&res->w8[4]);
1194    aesMixColumn(&res->w8[8]);
1195    aesMixColumn(&res->w8[12]);
1196 }
1197
1198
1199 /* CALLED FROM GENERATED CODE */
1200 void arm64g_dirtyhelper_AESIMC ( /*OUT*/V128* res, ULong argHi, ULong argLo )
1201 {
1202    res->w64[1] = argHi;
1203    res->w64[0] = argLo;
1204    aesInvMixColumn(&res->w8[0]);
1205    aesInvMixColumn(&res->w8[4]);
1206    aesInvMixColumn(&res->w8[8]);
1207    aesInvMixColumn(&res->w8[12]);
1208 }
1209
1210
1211 /* DIRTY HELPERS for SHA instruction support.  As with the AES helpers
1212    above, these are actually pure functions and are only dirty because
1213    clean helpers can't return a V128. */
1214
1215 static inline UInt ROL32 ( UInt x, UInt sh ) {
1216    vassert(sh > 0 && sh < 32);
1217    return (x << sh) | (x >> (32 - sh));
1218 }
1219
1220 static inline UInt ROR32 ( UInt x, UInt sh ) {
1221    vassert(sh > 0 && sh < 32);
1222    return (x >> sh) | (x << (32 - sh));
1223 }
1224
1225 static inline ULong ROR64 ( ULong x, ULong sh ) {
1226    vassert(sh > 0 && sh < 64);
1227    return (x >> sh) | (x << (64 - sh));
1228 }
1229
1230 static inline UInt SHAchoose ( UInt x, UInt y, UInt z ) {
1231    return ((y ^ z) & x) ^ z;
1232 }
1233
1234 static inline UInt SHAmajority ( UInt x, UInt y, UInt z ) {
1235    return (x & y) | ((x | y) & z);
1236 }
1237
1238 static inline UInt SHAparity ( UInt x, UInt y, UInt z ) {
1239    return x ^ y ^ z;
1240 }
1241
1242 static inline UInt SHAhashSIGMA0 ( UInt x ) {
1243    return ROR32(x, 2) ^ ROR32(x, 13) ^ ROR32(x, 22);
1244 }
1245
1246 static inline UInt SHAhashSIGMA1 ( UInt x ) {
1247    return ROR32(x, 6) ^ ROR32(x, 11) ^ ROR32(x, 25);
1248 }
1249
1250 static void SHA256hash ( /*MOD*/V128* X, /*MOD*/V128* Y, const V128* W )
1251 {
1252    UInt e;
1253    for (e = 0; e <= 3; e++) {
1254       UInt chs = SHAchoose(Y->w32[0], Y->w32[1], Y->w32[2]);
1255       UInt maj = SHAmajority(X->w32[0], X->w32[1], X->w32[2]);
1256       UInt t   = Y->w32[3] + SHAhashSIGMA1(Y->w32[0]) + chs + W->w32[e];
1257       X->w32[3] = t + X->w32[3];
1258       Y->w32[3] = t + SHAhashSIGMA0(X->w32[0]) + maj;
1259       UInt ts = Y->w32[3];
1260       Y->w32[3] = Y->w32[2];
1261       Y->w32[2] = Y->w32[1];
1262       Y->w32[1] = Y->w32[0];
1263       Y->w32[0] = X->w32[3];
1264       X->w32[3] = X->w32[2];
1265       X->w32[2] = X->w32[1];
1266       X->w32[1] = X->w32[0];
1267       X->w32[0] = ts;
1268    }
1269 }
1270
1271 /* CALLED FROM GENERATED CODE */
1272 void arm64g_dirtyhelper_SHA1C ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1273                                 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1274 {
1275    vassert(nHi == 0);
1276    vassert((nLo >> 32) == 0);
1277    V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1278    UInt Y; Y = (UInt)nLo;
1279    V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1280    UInt e;
1281    for (e = 0; e <= 3; e++) {
1282       UInt t = SHAchoose(X.w32[1], X.w32[2], X.w32[3]);
1283       Y = Y + ROL32(X.w32[0], 5) + t + W.w32[e];
1284       X.w32[1] = ROL32(X.w32[1], 30);
1285       UInt oldY = Y;
1286       Y = X.w32[3];
1287       X.w32[3] = X.w32[2];
1288       X.w32[2] = X.w32[1];
1289       X.w32[1] = X.w32[0];
1290       X.w32[0] = oldY;
1291    }
1292    res->w64[1] = X.w64[1];
1293    res->w64[0] = X.w64[0];
1294 }
1295
1296 /* CALLED FROM GENERATED CODE */
1297 void arm64g_dirtyhelper_SHA1H ( /*OUT*/V128* res, ULong nHi, ULong nLo )
1298 {
1299    vassert(nHi == 0);
1300    vassert((nLo >> 32) == 0);
1301    res->w32[3] = res->w32[2] = res->w32[1] = 0;
1302    res->w32[0] = ROL32((UInt)nLo, 30);
1303 }
1304
1305 /* CALLED FROM GENERATED CODE */
1306 void arm64g_dirtyhelper_SHA1M ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1307                                 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1308 {
1309    vassert(nHi == 0);
1310    vassert((nLo >> 32) == 0);
1311    V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1312    UInt Y; Y = (UInt)nLo;
1313    V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1314    UInt e;
1315    for (e = 0; e <= 3; e++) {
1316       UInt t = SHAmajority(X.w32[1], X.w32[2], X.w32[3]);
1317       Y = Y + ROL32(X.w32[0], 5) + t + W.w32[e];
1318       X.w32[1] = ROL32(X.w32[1], 30);
1319       UInt oldY = Y;
1320       Y = X.w32[3];
1321       X.w32[3] = X.w32[2];
1322       X.w32[2] = X.w32[1];
1323       X.w32[1] = X.w32[0];
1324       X.w32[0] = oldY;
1325    }
1326    res->w64[1] = X.w64[1];
1327    res->w64[0] = X.w64[0];
1328 }
1329
1330 /* CALLED FROM GENERATED CODE */
1331 void arm64g_dirtyhelper_SHA1P ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1332                                 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1333 {
1334    vassert(nHi == 0);
1335    vassert((nLo >> 32) == 0);
1336    V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1337    UInt Y; Y = (UInt)nLo;
1338    V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1339    UInt e;
1340    for (e = 0; e <= 3; e++) {
1341       UInt t = SHAparity(X.w32[1], X.w32[2], X.w32[3]);
1342       Y = Y + ROL32(X.w32[0], 5) + t + W.w32[e];
1343       X.w32[1] = ROL32(X.w32[1], 30);
1344       UInt oldY = Y;
1345       Y = X.w32[3];
1346       X.w32[3] = X.w32[2];
1347       X.w32[2] = X.w32[1];
1348       X.w32[1] = X.w32[0];
1349       X.w32[0] = oldY;
1350    }
1351    res->w64[1] = X.w64[1];
1352    res->w64[0] = X.w64[0];
1353 }
1354
1355 /* CALLED FROM GENERATED CODE */
1356 void arm64g_dirtyhelper_SHA1SU0 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1357                                   ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1358 {
1359    res->w64[1] = nLo;
1360    res->w64[0] = dHi;
1361    res->w64[1] ^= dHi ^ mHi;
1362    res->w64[0] ^= dLo ^ mLo;
1363 }
1364
1365 /* CALLED FROM GENERATED CODE */
1366 void arm64g_dirtyhelper_SHA1SU1 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1367                                   ULong nHi, ULong nLo )
1368 {
1369    /* This computes "T = Vd ^ (Vn >>u 32)" */
1370    V128 T; T.w64[1] = nHi; T.w64[0] = nLo;
1371    T.w32[0] = T.w32[1];
1372    T.w32[1] = T.w32[2];
1373    T.w32[2] = T.w32[3];
1374    T.w32[3] = 0;
1375    T.w64[1] ^= dHi;
1376    T.w64[0] ^= dLo;
1377    /* */
1378    res->w32[0] = ROL32(T.w32[0], 1);
1379    res->w32[1] = ROL32(T.w32[1], 1);
1380    res->w32[2] = ROL32(T.w32[2], 1);
1381    res->w32[3] = ROL32(T.w32[3], 1) ^ ROL32(T.w32[0], 2);
1382 }
1383
1384 /* CALLED FROM GENERATED CODE */
1385 void arm64g_dirtyhelper_SHA256H2 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1386                                    ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1387 {
1388    V128 X; X.w64[1] = nHi; X.w64[0] = nLo;
1389    V128 Y; Y.w64[1] = dHi; Y.w64[0] = dLo;
1390    V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1391    SHA256hash(&X, &Y, &W);
1392    res->w64[1] = Y.w64[1];
1393    res->w64[0] = Y.w64[0];
1394 }
1395
1396 /* CALLED FROM GENERATED CODE */
1397 void arm64g_dirtyhelper_SHA256H ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1398                                   ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1399 {
1400    V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1401    V128 Y; Y.w64[1] = nHi; Y.w64[0] = nLo;
1402    V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1403    SHA256hash(&X, &Y, &W);
1404    res->w64[1] = X.w64[1];
1405    res->w64[0] = X.w64[0];
1406 }
1407
1408 /* CALLED FROM GENERATED CODE */
1409 void arm64g_dirtyhelper_SHA256SU0 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1410                                     ULong nHi, ULong nLo )
1411
1412 {
1413    res->w64[1] = res->w64[0] = 0;
1414    V128 op1; op1.w64[1] = dHi; op1.w64[0] = dLo;
1415    V128 op2; op2.w64[1] = nHi; op2.w64[0] = nLo;
1416    V128 T;
1417    T.w32[3] = op2.w32[0];
1418    T.w32[2] = op1.w32[3];
1419    T.w32[1] = op1.w32[2];
1420    T.w32[0] = op1.w32[1];
1421    UInt e;
1422    for (e = 0; e <= 3; e++) {
1423       UInt elt = T.w32[e];
1424       elt = ROR32(elt, 7) ^ ROR32(elt, 18) ^ (elt >> 3);
1425       res->w32[e] = elt + op1.w32[e];
1426    }
1427 }
1428
1429 /* CALLED FROM GENERATED CODE */
1430 void arm64g_dirtyhelper_SHA256SU1 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1431                                     ULong nHi, ULong nLo,
1432                                     ULong mHi, ULong mLo )
1433 {
1434    res->w64[0] = res->w64[1] = 0;
1435    V128 op1; op1.w64[1] = dHi; op1.w64[0] = dLo;
1436    V128 op2; op2.w64[1] = nHi; op2.w64[0] = nLo;
1437    V128 op3; op3.w64[1] = mHi; op3.w64[0] = mLo;
1438    V128 T0;
1439    T0.w32[3] = op3.w32[0];
1440    T0.w32[2] = op2.w32[3];
1441    T0.w32[1] = op2.w32[2];
1442    T0.w32[0] = op2.w32[1];
1443    UInt T1[2];
1444    UInt e;
1445    T1[1] = op3.w32[3];
1446    T1[0] = op3.w32[2];
1447    for (e = 0; e <= 1; e++) {
1448       UInt elt = T1[e];
1449       elt = ROR32(elt, 17) ^ ROR32(elt, 19) ^ (elt >> 10);
1450       elt = elt + op1.w32[e] + T0.w32[e];
1451       res->w32[e] = elt;
1452    }
1453    T1[1] = res->w32[1];
1454    T1[0] = res->w32[0];
1455    for (e = 2; e <= 3; e++) {
1456       UInt elt = T1[e-2];
1457       elt = ROR32(elt, 17) ^ ROR32(elt, 19) ^ (elt >> 10);
1458       elt = elt + op1.w32[e] + T0.w32[e];
1459       res->w32[e] = elt;
1460    }
1461 }
1462
1463 /* CALLED FROM GENERATED CODE */
1464 void arm64g_dirtyhelper_SHA512H2 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1465                                    ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1466 {
1467    vassert(nHi == 0);
1468    ULong X = nLo;
1469    V128 Y; Y.w64[1] = mHi; Y.w64[0] = mLo;
1470    V128 W; W.w64[1] = dHi; W.w64[0] = dLo;
1471    ULong NSigma0 = ROR64(Y.w64[0], 28) ^ ROR64(Y.w64[0], 34)
1472       ^ ROR64(Y.w64[0], 39);
1473    res->w64[1] = (X & Y.w64[1]) ^ (X & Y.w64[0]) ^ (Y.w64[1] & Y.w64[0]);
1474    res->w64[1] += NSigma0 + W.w64[1];
1475    NSigma0 = ROR64(res->w64[1], 28) ^ ROR64(res->w64[1], 34)
1476       ^ ROR64(res->w64[1], 39);
1477    res->w64[0] = (res->w64[1] & Y.w64[0]) ^ (res->w64[1] & Y.w64[1])
1478       ^ (Y.w64[0] & Y.w64[1]);
1479    res->w64[0] += NSigma0 + W.w64[0];
1480 }
1481
1482 /* CALLED FROM GENERATED CODE */
1483 void arm64g_dirtyhelper_SHA512H ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1484                                   ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1485 {
1486    V128 X; X.w64[1] = nHi; X.w64[0] = nLo;
1487    V128 Y; Y.w64[1] = mHi; Y.w64[0] = mLo;
1488    V128 W; W.w64[1] = dHi; W.w64[0] = dLo;
1489    ULong MSigma1 = ROR64(Y.w64[1], 14) ^ ROR64(Y.w64[1], 18)
1490       ^ ROR64(Y.w64[1], 41);
1491    res->w64[1] = (Y.w64[1] & X.w64[0]) ^ (~Y.w64[1] & X.w64[1]);
1492    res->w64[1] += MSigma1 + W.w64[1];
1493    ULong tmp = res->w64[1] + Y.w64[0];
1494    MSigma1 = ROR64(tmp, 14) ^ ROR64(tmp, 18) ^ ROR64(tmp, 41);
1495    res->w64[0] = (tmp & Y.w64[1]) ^ (~tmp & X.w64[0]);
1496    res->w64[0] += MSigma1 + W.w64[0];
1497 }
1498
1499 /* CALLED FROM GENERATED CODE */
1500 void arm64g_dirtyhelper_SHA512SU0 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1501                                     ULong nHi, ULong nLo )
1502
1503 {
1504    vassert(nHi == 0);
1505    ULong X = nLo;
1506    V128 W; W.w64[1] = dHi; W.w64[0] = dLo;
1507    ULong sig0 = ROR64(W.w64[1], 1) ^ ROR64(W.w64[1], 8) ^ (W.w64[1] >> 7);
1508    res->w64[0] = W.w64[0] + sig0;
1509    sig0 = ROR64(X, 1) ^ ROR64(X, 8) ^ (X >> 7);
1510    res->w64[1] = W.w64[1] + sig0;
1511 }
1512
1513 /* CALLED FROM GENERATED CODE */
1514 void arm64g_dirtyhelper_SHA512SU1 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1515                                     ULong nHi, ULong nLo,
1516                                     ULong mHi, ULong mLo )
1517 {
1518    V128 X; X.w64[1] = nHi; X.w64[0] = nLo;
1519    V128 Y; Y.w64[1] = mHi; Y.w64[0] = mLo;
1520    V128 W; W.w64[1] = dHi; W.w64[0] = dLo;
1521    ULong sig1 = ROR64(X.w64[1], 19) ^ ROR64(X.w64[1], 61) ^ (X.w64[1] >> 6);
1522    res->w64[1] = W.w64[1] + sig1 + Y.w64[1];
1523    sig1 = ROR64(X.w64[0], 19) ^ ROR64(X.w64[0], 61) ^ (X.w64[0] >> 6);
1524    res->w64[0] = W.w64[0] + sig1 + Y.w64[0];
1525 }
1526
1527
1528 /*---------------------------------------------------------------*/
1529 /*--- Flag-helpers translation-time function specialisers.    ---*/
1530 /*--- These help iropt specialise calls the above run-time    ---*/
1531 /*--- flags functions.                                        ---*/
1532 /*---------------------------------------------------------------*/
1533
1534 /* Used by the optimiser to try specialisations.  Returns an
1535    equivalent expression, or NULL if none. */
1536
1537 static Bool isU64 ( IRExpr* e, ULong n )
1538 {
1539    return
1540       toBool( e->tag == Iex_Const
1541               && e->Iex.Const.con->tag == Ico_U64
1542               && e->Iex.Const.con->Ico.U64 == n );
1543 }
1544
1545 IRExpr* guest_arm64_spechelper ( const HChar* function_name,
1546                                  IRExpr** args,
1547                                  IRStmt** precedingStmts,
1548                                  Int      n_precedingStmts )
1549 {
1550 #  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1551 #  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1552 #  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1553 #  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
1554 #  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))
1555
1556    Int i, arity = 0;
1557    for (i = 0; args[i]; i++)
1558       arity++;
1559 //ZZ #  if 0
1560 //ZZ    vex_printf("spec request:\n");
1561 //ZZ    vex_printf("   %s  ", function_name);
1562 //ZZ    for (i = 0; i < arity; i++) {
1563 //ZZ       vex_printf("  ");
1564 //ZZ       ppIRExpr(args[i]);
1565 //ZZ    }
1566 //ZZ    vex_printf("\n");
1567 //ZZ #  endif
1568
1569    /* --------- specialising "arm64g_calculate_condition" --------- */
1570
1571    if (vex_streq(function_name, "arm64g_calculate_condition")) {
1572
1573       /* specialise calls to the "arm64g_calculate_condition" function.
1574          Not sure whether this is strictly necessary, but: the
1575          replacement IR must produce only the values 0 or 1.  Bits
1576          63:1 are required to be zero. */
1577       IRExpr *cond_n_op, *cc_dep1, *cc_dep2  ; //, *cc_ndep;
1578       vassert(arity == 4);
1579       cond_n_op = args[0]; /* (ARM64Condcode << 4)  |  ARM64G_CC_OP_* */
1580       cc_dep1   = args[1];
1581       cc_dep2   = args[2];
1582       //cc_ndep   = args[3];
1583
1584       /*---------------- SUB64 ----------------*/
1585
1586       /* 0, 1 */
1587       if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_SUB64)) {
1588          /* EQ after SUB --> test argL == argR */
1589          return unop(Iop_1Uto64,
1590                      binop(Iop_CmpEQ64, cc_dep1, cc_dep2));
1591       }
1592       if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_SUB64)) {
1593          /* NE after SUB --> test argL != argR */
1594          return unop(Iop_1Uto64,
1595                      binop(Iop_CmpNE64, cc_dep1, cc_dep2));
1596       }
1597
1598       /* 2, 3 */
1599       if (isU64(cond_n_op, (ARM64CondCS << 4) | ARM64G_CC_OP_SUB64)) {
1600          /* CS after SUB --> test argL >=u argR
1601                          --> test argR <=u argL */
1602          return unop(Iop_1Uto64,
1603                      binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1604       }
1605       if (isU64(cond_n_op, (ARM64CondCC << 4) | ARM64G_CC_OP_SUB64)) {
1606          /* CC after SUB --> test argL <u argR */
1607          return unop(Iop_1Uto64,
1608                      binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1609       }
1610
1611       /* 8, 9 */
1612       if (isU64(cond_n_op, (ARM64CondLS << 4) | ARM64G_CC_OP_SUB64)) {
1613          /* LS after SUB --> test argL <=u argR */
1614          return unop(Iop_1Uto64,
1615                      binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1616       }
1617       if (isU64(cond_n_op, (ARM64CondHI << 4) | ARM64G_CC_OP_SUB64)) {
1618          /* HI after SUB --> test argL >u argR
1619                          --> test argR <u argL */
1620          return unop(Iop_1Uto64,
1621                      binop(Iop_CmpLT64U, cc_dep2, cc_dep1));
1622       }
1623
1624       /* 10, 11 */
1625       if (isU64(cond_n_op, (ARM64CondLT << 4) | ARM64G_CC_OP_SUB64)) {
1626          /* LT after SUB --> test argL <s argR */
1627          return unop(Iop_1Uto64,
1628                      binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1629       }
1630       if (isU64(cond_n_op, (ARM64CondGE << 4) | ARM64G_CC_OP_SUB64)) {
1631          /* GE after SUB --> test argL >=s argR
1632                          --> test argR <=s argL */
1633          return unop(Iop_1Uto64,
1634                      binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1635       }
1636
1637       /* 12, 13 */
1638       if (isU64(cond_n_op, (ARM64CondGT << 4) | ARM64G_CC_OP_SUB64)) {
1639          /* GT after SUB --> test argL >s argR
1640                          --> test argR <s argL */
1641          return unop(Iop_1Uto64,
1642                      binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1643       }
1644       if (isU64(cond_n_op, (ARM64CondLE << 4) | ARM64G_CC_OP_SUB64)) {
1645          /* LE after SUB --> test argL <=s argR */
1646          return unop(Iop_1Uto64,
1647                      binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1648       }
1649
1650       /*---------------- SUB32 ----------------*/
1651
1652       /* 0, 1 */
1653       if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_SUB32)) {
1654          /* EQ after SUB --> test argL == argR */
1655          return unop(Iop_1Uto64,
1656                      binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1657                                         unop(Iop_64to32, cc_dep2)));
1658       }
1659       if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_SUB32)) {
1660          /* NE after SUB --> test argL != argR */
1661          return unop(Iop_1Uto64,
1662                      binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1663                                         unop(Iop_64to32, cc_dep2)));
1664       }
1665
1666       /* 2, 3 */
1667       if (isU64(cond_n_op, (ARM64CondCS << 4) | ARM64G_CC_OP_SUB32)) {
1668          /* CS after SUB --> test argL >=u argR
1669                          --> test argR <=u argL */
1670          return unop(Iop_1Uto64,
1671                      binop(Iop_CmpLE32U, unop(Iop_64to32, cc_dep2),
1672                                          unop(Iop_64to32, cc_dep1)));
1673       }
1674       if (isU64(cond_n_op, (ARM64CondCC << 4) | ARM64G_CC_OP_SUB32)) {
1675          /* CC after SUB --> test argL <u argR */
1676          return unop(Iop_1Uto64,
1677                      binop(Iop_CmpLT32U, unop(Iop_64to32, cc_dep1),
1678                                          unop(Iop_64to32, cc_dep2)));
1679       }
1680
1681       /* 8, 9 */
1682       if (isU64(cond_n_op, (ARM64CondLS << 4) | ARM64G_CC_OP_SUB32)) {
1683          /* LS after SUB --> test argL <=u argR */
1684          return unop(Iop_1Uto64,
1685                      binop(Iop_CmpLE32U, unop(Iop_64to32, cc_dep1),
1686                                          unop(Iop_64to32, cc_dep2)));
1687       }
1688       if (isU64(cond_n_op, (ARM64CondHI << 4) | ARM64G_CC_OP_SUB32)) {
1689          /* HI after SUB --> test argL >u argR
1690                          --> test argR <u argL */
1691          return unop(Iop_1Uto64,
1692                      binop(Iop_CmpLT32U, unop(Iop_64to32, cc_dep2),
1693                                          unop(Iop_64to32, cc_dep1)));
1694       }
1695
1696       /* 10, 11 */
1697       if (isU64(cond_n_op, (ARM64CondLT << 4) | ARM64G_CC_OP_SUB32)) {
1698          /* LT after SUB --> test argL <s argR */
1699          return unop(Iop_1Uto64,
1700                      binop(Iop_CmpLT32S, unop(Iop_64to32, cc_dep1),
1701                                          unop(Iop_64to32, cc_dep2)));
1702       }
1703       if (isU64(cond_n_op, (ARM64CondGE << 4) | ARM64G_CC_OP_SUB32)) {
1704          /* GE after SUB --> test argL >=s argR
1705                          --> test argR <=s argL */
1706          return unop(Iop_1Uto64,
1707                      binop(Iop_CmpLE32S, unop(Iop_64to32, cc_dep2),
1708                                          unop(Iop_64to32, cc_dep1)));
1709       }
1710
1711       /* 12, 13 */
1712       if (isU64(cond_n_op, (ARM64CondGT << 4) | ARM64G_CC_OP_SUB32)) {
1713          /* GT after SUB --> test argL >s argR
1714                          --> test argR <s argL */
1715          return unop(Iop_1Uto64,
1716                      binop(Iop_CmpLT32S, unop(Iop_64to32, cc_dep2),
1717                                          unop(Iop_64to32, cc_dep1)));
1718       }
1719       if (isU64(cond_n_op, (ARM64CondLE << 4) | ARM64G_CC_OP_SUB32)) {
1720          /* LE after SUB --> test argL <=s argR */
1721          return unop(Iop_1Uto64,
1722                      binop(Iop_CmpLE32S, unop(Iop_64to32, cc_dep1),
1723                                          unop(Iop_64to32, cc_dep2)));
1724       }
1725
1726 //ZZ       /*---------------- SBB ----------------*/
1727 //ZZ
1728 //ZZ       if (isU32(cond_n_op, (ARMCondHS << 4) | ARMG_CC_OP_SBB)) {
1729 //ZZ          /* This seems to happen a lot in softfloat code, eg __divdf3+140 */
1730 //ZZ          /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
1731 //ZZ          /* HS after SBB (same as C after SBB below)
1732 //ZZ             --> oldC ? (argL >=u argR) : (argL >u argR)
1733 //ZZ             --> oldC ? (argR <=u argL) : (argR <u argL)
1734 //ZZ          */
1735 //ZZ          return
1736 //ZZ             IRExpr_ITE(
1737 //ZZ                binop(Iop_CmpNE32, cc_ndep, mkU32(0)),
1738 //ZZ                /* case oldC != 0 */
1739 //ZZ                unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1)),
1740 //ZZ                /* case oldC == 0 */
1741 //ZZ                unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1))
1742 //ZZ             );
1743 //ZZ       }
1744
1745       /*---------------- LOGIC32 ----------------*/
1746
1747       if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_LOGIC32)) {
1748          /* EQ after LOGIC32 --> test res[31:0] == 0 */
1749          return unop(Iop_1Uto64,
1750                      binop(Iop_CmpEQ32,
1751                            unop(Iop_64to32, cc_dep1), mkU32(0)));
1752       }
1753       if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_LOGIC32)) {
1754          /* NE after LOGIC32 --> test res[31:0] != 0 */
1755          return unop(Iop_1Uto64,
1756                      binop(Iop_CmpNE32,
1757                            unop(Iop_64to32, cc_dep1), mkU32(0)));
1758       }
1759
1760       /*---------------- LOGIC64 ----------------*/
1761
1762       if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_LOGIC64)) {
1763          /* EQ after LOGIC64 --> test res[63:0] == 0 */
1764          return unop(Iop_1Uto64,
1765                      binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
1766       }
1767       if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_LOGIC64)) {
1768          /* NE after LOGIC64 --> test res[63:0] != 0 */
1769          return unop(Iop_1Uto64,
1770                      binop(Iop_CmpNE64, cc_dep1, mkU64(0)));
1771       }
1772
1773 //ZZ       if (isU32(cond_n_op, (ARMCondNE << 4) | ARMG_CC_OP_LOGIC)) {
1774 //ZZ          /* NE after LOGIC --> test res != 0 */
1775 //ZZ          return unop(Iop_1Uto32,
1776 //ZZ                      binop(Iop_CmpNE32, cc_dep1, mkU32(0)));
1777 //ZZ       }
1778 //ZZ
1779 //ZZ       if (isU32(cond_n_op, (ARMCondPL << 4) | ARMG_CC_OP_LOGIC)) {
1780 //ZZ          /* PL after LOGIC --> test (res >> 31) == 0 */
1781 //ZZ          return unop(Iop_1Uto32,
1782 //ZZ                      binop(Iop_CmpEQ32,
1783 //ZZ                            binop(Iop_Shr32, cc_dep1, mkU8(31)),
1784 //ZZ                            mkU32(0)));
1785 //ZZ       }
1786 //ZZ       if (isU32(cond_n_op, (ARMCondMI << 4) | ARMG_CC_OP_LOGIC)) {
1787 //ZZ          /* MI after LOGIC --> test (res >> 31) == 1 */
1788 //ZZ          return unop(Iop_1Uto32,
1789 //ZZ                      binop(Iop_CmpEQ32,
1790 //ZZ                            binop(Iop_Shr32, cc_dep1, mkU8(31)),
1791 //ZZ                            mkU32(1)));
1792 //ZZ       }
1793
1794       /*---------------- COPY ----------------*/
1795
1796       if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_COPY)) {
1797          /* EQ after COPY --> (cc_dep1 >> ARM64G_CC_SHIFT_Z) & 1 */
1798          return binop(Iop_And64,
1799                       binop(Iop_Shr64, cc_dep1,
1800                                        mkU8(ARM64G_CC_SHIFT_Z)),
1801                       mkU64(1));
1802       }
1803       if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_COPY)) {
1804          /* NE after COPY --> ((cc_dep1 >> ARM64G_CC_SHIFT_Z) ^ 1) & 1 */
1805          return binop(Iop_And64,
1806                       binop(Iop_Xor64,
1807                             binop(Iop_Shr64, cc_dep1,
1808                                              mkU8(ARM64G_CC_SHIFT_Z)),
1809                             mkU64(1)),
1810                       mkU64(1));
1811       }
1812
1813 //ZZ       /*----------------- AL -----------------*/
1814 //ZZ
1815 //ZZ       /* A critically important case for Thumb code.
1816 //ZZ
1817 //ZZ          What we're trying to spot is the case where cond_n_op is an
1818 //ZZ          expression of the form Or32(..., 0xE0) since that means the
1819 //ZZ          caller is asking for CondAL and we can simply return 1
1820 //ZZ          without caring what the ... part is.  This is a potentially
1821 //ZZ          dodgy kludge in that it assumes that the ... part has zeroes
1822 //ZZ          in bits 7:4, so that the result of the Or32 is guaranteed to
1823 //ZZ          be 0xE in bits 7:4.  Given that the places where this first
1824 //ZZ          arg are constructed (in guest_arm_toIR.c) are very
1825 //ZZ          constrained, we can get away with this.  To make this
1826 //ZZ          guaranteed safe would require to have a new primop, Slice44
1827 //ZZ          or some such, thusly
1828 //ZZ
1829 //ZZ          Slice44(arg1, arg2) = 0--(24)--0 arg1[7:4] arg2[3:0]
1830 //ZZ
1831 //ZZ          and we would then look for Slice44(0xE0, ...)
1832 //ZZ          which would give the required safety property.
1833 //ZZ
1834 //ZZ          It would be infeasibly expensive to scan backwards through
1835 //ZZ          the entire block looking for an assignment to the temp, so
1836 //ZZ          just look at the previous 16 statements.  That should find it
1837 //ZZ          if it is an interesting case, as a result of how the
1838 //ZZ          boilerplate guff at the start of each Thumb insn translation
1839 //ZZ          is made.
1840 //ZZ       */
1841 //ZZ       if (cond_n_op->tag == Iex_RdTmp) {
1842 //ZZ          Int    j;
1843 //ZZ          IRTemp look_for = cond_n_op->Iex.RdTmp.tmp;
1844 //ZZ          Int    limit    = n_precedingStmts - 16;
1845 //ZZ          if (limit < 0) limit = 0;
1846 //ZZ          if (0) vex_printf("scanning %d .. %d\n", n_precedingStmts-1, limit);
1847 //ZZ          for (j = n_precedingStmts - 1; j >= limit; j--) {
1848 //ZZ             IRStmt* st = precedingStmts[j];
1849 //ZZ             if (st->tag == Ist_WrTmp
1850 //ZZ                 && st->Ist.WrTmp.tmp == look_for
1851 //ZZ                 && st->Ist.WrTmp.data->tag == Iex_Binop
1852 //ZZ                 && st->Ist.WrTmp.data->Iex.Binop.op == Iop_Or32
1853 //ZZ                 && isU32(st->Ist.WrTmp.data->Iex.Binop.arg2, (ARMCondAL << 4)))
1854 //ZZ                return mkU32(1);
1855 //ZZ          }
1856 //ZZ          /* Didn't find any useful binding to the first arg
1857 //ZZ             in the previous 16 stmts. */
1858 //ZZ       }
1859    }
1860
1861 //ZZ    /* --------- specialising "armg_calculate_flag_c" --------- */
1862 //ZZ
1863 //ZZ    else
1864 //ZZ    if (vex_streq(function_name, "armg_calculate_flag_c")) {
1865 //ZZ
1866 //ZZ       /* specialise calls to the "armg_calculate_flag_c" function.
1867 //ZZ          Note that the returned value must be either 0 or 1; nonzero
1868 //ZZ          bits 31:1 are not allowed.  In turn, incoming oldV and oldC
1869 //ZZ          values (from the thunk) are assumed to have bits 31:1
1870 //ZZ          clear. */
1871 //ZZ       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1872 //ZZ       vassert(arity == 4);
1873 //ZZ       cc_op   = args[0]; /* ARMG_CC_OP_* */
1874 //ZZ       cc_dep1 = args[1];
1875 //ZZ       cc_dep2 = args[2];
1876 //ZZ       cc_ndep = args[3];
1877 //ZZ
1878 //ZZ       if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
1879 //ZZ          /* Thunk args are (result, shco, oldV) */
1880 //ZZ          /* C after LOGIC --> shco */
1881 //ZZ          return cc_dep2;
1882 //ZZ       }
1883 //ZZ
1884 //ZZ       if (isU32(cc_op, ARMG_CC_OP_SUB)) {
1885 //ZZ          /* Thunk args are (argL, argR, unused) */
1886 //ZZ          /* C after SUB --> argL >=u argR
1887 //ZZ                         --> argR <=u argL */
1888 //ZZ          return unop(Iop_1Uto32,
1889 //ZZ                      binop(Iop_CmpLE32U, cc_dep2, cc_dep1));
1890 //ZZ       }
1891 //ZZ
1892 //ZZ       if (isU32(cc_op, ARMG_CC_OP_SBB)) {
1893 //ZZ          /* This happens occasionally in softfloat code, eg __divdf3+140 */
1894 //ZZ          /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
1895 //ZZ          /* C after SBB (same as HS after SBB above)
1896 //ZZ             --> oldC ? (argL >=u argR) : (argL >u argR)
1897 //ZZ             --> oldC ? (argR <=u argL) : (argR <u argL)
1898 //ZZ          */
1899 //ZZ          return
1900 //ZZ             IRExpr_ITE(
1901 //ZZ                binop(Iop_CmpNE32, cc_ndep, mkU32(0)),
1902 //ZZ                /* case oldC != 0 */
1903 //ZZ                unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1)),
1904 //ZZ                /* case oldC == 0 */
1905 //ZZ                unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1))
1906 //ZZ             );
1907 //ZZ       }
1908 //ZZ
1909 //ZZ    }
1910 //ZZ
1911 //ZZ    /* --------- specialising "armg_calculate_flag_v" --------- */
1912 //ZZ
1913 //ZZ    else
1914 //ZZ    if (vex_streq(function_name, "armg_calculate_flag_v")) {
1915 //ZZ
1916 //ZZ       /* specialise calls to the "armg_calculate_flag_v" function.
1917 //ZZ          Note that the returned value must be either 0 or 1; nonzero
1918 //ZZ          bits 31:1 are not allowed.  In turn, incoming oldV and oldC
1919 //ZZ          values (from the thunk) are assumed to have bits 31:1
1920 //ZZ          clear. */
1921 //ZZ       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
1922 //ZZ       vassert(arity == 4);
1923 //ZZ       cc_op   = args[0]; /* ARMG_CC_OP_* */
1924 //ZZ       cc_dep1 = args[1];
1925 //ZZ       cc_dep2 = args[2];
1926 //ZZ       cc_ndep = args[3];
1927 //ZZ
1928 //ZZ       if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
1929 //ZZ          /* Thunk args are (result, shco, oldV) */
1930 //ZZ          /* V after LOGIC --> oldV */
1931 //ZZ          return cc_ndep;
1932 //ZZ       }
1933 //ZZ
1934 //ZZ       if (isU32(cc_op, ARMG_CC_OP_SUB)) {
1935 //ZZ          /* Thunk args are (argL, argR, unused) */
1936 //ZZ          /* V after SUB
1937 //ZZ             --> let res = argL - argR
1938 //ZZ                 in ((argL ^ argR) & (argL ^ res)) >> 31
1939 //ZZ             --> ((argL ^ argR) & (argL ^ (argL - argR))) >> 31
1940 //ZZ          */
1941 //ZZ          IRExpr* argL = cc_dep1;
1942 //ZZ          IRExpr* argR = cc_dep2;
1943 //ZZ          return
1944 //ZZ             binop(Iop_Shr32,
1945 //ZZ                   binop(Iop_And32,
1946 //ZZ                         binop(Iop_Xor32, argL, argR),
1947 //ZZ                         binop(Iop_Xor32, argL, binop(Iop_Sub32, argL, argR))
1948 //ZZ                   ),
1949 //ZZ                   mkU8(31)
1950 //ZZ             );
1951 //ZZ       }
1952 //ZZ
1953 //ZZ       if (isU32(cc_op, ARMG_CC_OP_SBB)) {
1954 //ZZ          /* This happens occasionally in softfloat code, eg __divdf3+140 */
1955 //ZZ          /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
1956 //ZZ          /* V after SBB
1957 //ZZ             --> let res = argL - argR - (oldC ^ 1)
1958 //ZZ                 in  (argL ^ argR) & (argL ^ res) & 1
1959 //ZZ          */
1960 //ZZ          return
1961 //ZZ             binop(
1962 //ZZ                Iop_And32,
1963 //ZZ                binop(
1964 //ZZ                   Iop_And32,
1965 //ZZ                   // argL ^ argR
1966 //ZZ                   binop(Iop_Xor32, cc_dep1, cc_dep2),
1967 //ZZ                   // argL ^ (argL - argR - (oldC ^ 1))
1968 //ZZ                   binop(Iop_Xor32,
1969 //ZZ                         cc_dep1,
1970 //ZZ                         binop(Iop_Sub32,
1971 //ZZ                               binop(Iop_Sub32, cc_dep1, cc_dep2),
1972 //ZZ                               binop(Iop_Xor32, cc_ndep, mkU32(1)))
1973 //ZZ                   )
1974 //ZZ                ),
1975 //ZZ                mkU32(1)
1976 //ZZ             );
1977 //ZZ       }
1978 //ZZ
1979 //ZZ    }
1980
1981 #  undef unop
1982 #  undef binop
1983 #  undef mkU64
1984 #  undef mkU8
1985
1986    return NULL;
1987 }
1988
1989
1990 /*----------------------------------------------*/
1991 /*--- The exported fns ..                    ---*/
1992 /*----------------------------------------------*/
1993
1994 //ZZ /* VISIBLE TO LIBVEX CLIENT */
1995 //ZZ #if 0
1996 //ZZ void LibVEX_GuestARM_put_flags ( UInt flags_native,
1997 //ZZ                                  /*OUT*/VexGuestARMState* vex_state )
1998 //ZZ {
1999 //ZZ    vassert(0); // FIXME
2000 //ZZ
2001 //ZZ    /* Mask out everything except N Z V C. */
2002 //ZZ    flags_native
2003 //ZZ       &= (ARMG_CC_MASK_N | ARMG_CC_MASK_Z | ARMG_CC_MASK_V | ARMG_CC_MASK_C);
2004 //ZZ
2005 //ZZ    vex_state->guest_CC_OP   = ARMG_CC_OP_COPY;
2006 //ZZ    vex_state->guest_CC_DEP1 = flags_native;
2007 //ZZ    vex_state->guest_CC_DEP2 = 0;
2008 //ZZ    vex_state->guest_CC_NDEP = 0;
2009 //ZZ }
2010 //ZZ #endif
2011
2012 /* negative zero carry o-v-erflow flags */
2013 /* VISIBLE TO LIBVEX CLIENT */
2014 ULong LibVEX_GuestARM64_get_nzcv ( /*IN*/const VexGuestARM64State* vex_state )
2015 {
2016    ULong nzcv = 0;
2017    // NZCV
2018    nzcv |= arm64g_calculate_flags_nzcv(
2019                vex_state->guest_CC_OP,
2020                vex_state->guest_CC_DEP1,
2021                vex_state->guest_CC_DEP2,
2022                vex_state->guest_CC_NDEP
2023             );
2024    vassert(0 == (nzcv & 0xFFFFFFFF0FFFFFFFULL));
2025 //ZZ    // Q
2026 //ZZ    if (vex_state->guest_QFLAG32 > 0)
2027 //ZZ       cpsr |= (1 << 27);
2028 //ZZ    // GE
2029 //ZZ    if (vex_state->guest_GEFLAG0 > 0)
2030 //ZZ       cpsr |= (1 << 16);
2031 //ZZ    if (vex_state->guest_GEFLAG1 > 0)
2032 //ZZ       cpsr |= (1 << 17);
2033 //ZZ    if (vex_state->guest_GEFLAG2 > 0)
2034 //ZZ       cpsr |= (1 << 18);
2035 //ZZ    if (vex_state->guest_GEFLAG3 > 0)
2036 //ZZ       cpsr |= (1 << 19);
2037 //ZZ    // M
2038 //ZZ    cpsr |= (1 << 4); // 0b10000 means user-mode
2039 //ZZ    // J,T   J (bit 24) is zero by initialisation above
2040 //ZZ    // T  we copy from R15T[0]
2041 //ZZ    if (vex_state->guest_R15T & 1)
2042 //ZZ       cpsr |= (1 << 5);
2043 //ZZ    // ITSTATE we punt on for the time being.  Could compute it
2044 //ZZ    // if needed though.
2045 //ZZ    // E, endianness, 0 (littleendian) from initialisation above
2046 //ZZ    // A,I,F disable some async exceptions.  Not sure about these.
2047 //ZZ    // Leave as zero for the time being.
2048    return nzcv;
2049 }
2050
2051 /* floating point status resgister */
2052 /* VISIBLE TO LIBVEX CLIENT */
2053 ULong LibVEX_GuestARM64_get_fpsr ( const VexGuestARM64State* vex_state )
2054 {
2055    UInt w32 = vex_state->guest_QCFLAG[0] | vex_state->guest_QCFLAG[1]
2056               | vex_state->guest_QCFLAG[2] | vex_state->guest_QCFLAG[3];
2057    ULong fpsr = 0;
2058    // QC
2059    if (w32 != 0)
2060       fpsr |= (1 << 27);
2061    return fpsr;
2062 }
2063
2064 void LibVEX_GuestARM64_set_fpsr ( /*MOD*/VexGuestARM64State* vex_state,
2065                                   ULong fpsr )
2066 {
2067    // QC
2068    vex_state->guest_QCFLAG[0] = (UInt)((fpsr >> 27) & 1);
2069    vex_state->guest_QCFLAG[1] = 0;
2070    vex_state->guest_QCFLAG[2] = 0;
2071    vex_state->guest_QCFLAG[3] = 0;
2072 }
2073
2074 /* VISIBLE TO LIBVEX CLIENT */
2075 void LibVEX_GuestARM64_initialise ( /*OUT*/VexGuestARM64State* vex_state )
2076 {
2077    vex_bzero(vex_state, sizeof(*vex_state));
2078 //ZZ    vex_state->host_EvC_FAILADDR = 0;
2079 //ZZ    vex_state->host_EvC_COUNTER = 0;
2080 //ZZ
2081 //ZZ    vex_state->guest_R0  = 0;
2082 //ZZ    vex_state->guest_R1  = 0;
2083 //ZZ    vex_state->guest_R2  = 0;
2084 //ZZ    vex_state->guest_R3  = 0;
2085 //ZZ    vex_state->guest_R4  = 0;
2086 //ZZ    vex_state->guest_R5  = 0;
2087 //ZZ    vex_state->guest_R6  = 0;
2088 //ZZ    vex_state->guest_R7  = 0;
2089 //ZZ    vex_state->guest_R8  = 0;
2090 //ZZ    vex_state->guest_R9  = 0;
2091 //ZZ    vex_state->guest_R10 = 0;
2092 //ZZ    vex_state->guest_R11 = 0;
2093 //ZZ    vex_state->guest_R12 = 0;
2094 //ZZ    vex_state->guest_R13 = 0;
2095 //ZZ    vex_state->guest_R14 = 0;
2096 //ZZ    vex_state->guest_R15T = 0;  /* NB: implies ARM mode */
2097 //ZZ
2098    vex_state->guest_CC_OP   = ARM64G_CC_OP_COPY;
2099 //ZZ    vex_state->guest_CC_DEP1 = 0;
2100 //ZZ    vex_state->guest_CC_DEP2 = 0;
2101 //ZZ    vex_state->guest_CC_NDEP = 0;
2102 //ZZ    vex_state->guest_QFLAG32 = 0;
2103 //ZZ    vex_state->guest_GEFLAG0 = 0;
2104 //ZZ    vex_state->guest_GEFLAG1 = 0;
2105 //ZZ    vex_state->guest_GEFLAG2 = 0;
2106 //ZZ    vex_state->guest_GEFLAG3 = 0;
2107 //ZZ
2108 //ZZ    vex_state->guest_EMNOTE  = EmNote_NONE;
2109 //ZZ    vex_state->guest_CMSTART = 0;
2110 //ZZ    vex_state->guest_CMLEN   = 0;
2111 //ZZ    vex_state->guest_NRADDR  = 0;
2112 //ZZ    vex_state->guest_IP_AT_SYSCALL = 0;
2113 //ZZ
2114 //ZZ    vex_state->guest_D0  = 0;
2115 //ZZ    vex_state->guest_D1  = 0;
2116 //ZZ    vex_state->guest_D2  = 0;
2117 //ZZ    vex_state->guest_D3  = 0;
2118 //ZZ    vex_state->guest_D4  = 0;
2119 //ZZ    vex_state->guest_D5  = 0;
2120 //ZZ    vex_state->guest_D6  = 0;
2121 //ZZ    vex_state->guest_D7  = 0;
2122 //ZZ    vex_state->guest_D8  = 0;
2123 //ZZ    vex_state->guest_D9  = 0;
2124 //ZZ    vex_state->guest_D10 = 0;
2125 //ZZ    vex_state->guest_D11 = 0;
2126 //ZZ    vex_state->guest_D12 = 0;
2127 //ZZ    vex_state->guest_D13 = 0;
2128 //ZZ    vex_state->guest_D14 = 0;
2129 //ZZ    vex_state->guest_D15 = 0;
2130 //ZZ    vex_state->guest_D16 = 0;
2131 //ZZ    vex_state->guest_D17 = 0;
2132 //ZZ    vex_state->guest_D18 = 0;
2133 //ZZ    vex_state->guest_D19 = 0;
2134 //ZZ    vex_state->guest_D20 = 0;
2135 //ZZ    vex_state->guest_D21 = 0;
2136 //ZZ    vex_state->guest_D22 = 0;
2137 //ZZ    vex_state->guest_D23 = 0;
2138 //ZZ    vex_state->guest_D24 = 0;
2139 //ZZ    vex_state->guest_D25 = 0;
2140 //ZZ    vex_state->guest_D26 = 0;
2141 //ZZ    vex_state->guest_D27 = 0;
2142 //ZZ    vex_state->guest_D28 = 0;
2143 //ZZ    vex_state->guest_D29 = 0;
2144 //ZZ    vex_state->guest_D30 = 0;
2145 //ZZ    vex_state->guest_D31 = 0;
2146 //ZZ
2147 //ZZ    /* ARM encoded; zero is the default as it happens (result flags
2148 //ZZ       (NZCV) cleared, FZ disabled, round to nearest, non-vector mode,
2149 //ZZ       all exns masked, all exn sticky bits cleared). */
2150 //ZZ    vex_state->guest_FPSCR = 0;
2151 //ZZ
2152 //ZZ    vex_state->guest_TPIDRURO = 0;
2153 //ZZ
2154 //ZZ    /* Not in a Thumb IT block. */
2155 //ZZ    vex_state->guest_ITSTATE = 0;
2156 //ZZ
2157 //ZZ    vex_state->padding1 = 0;
2158 //ZZ    vex_state->padding2 = 0;
2159 //ZZ    vex_state->padding3 = 0;
2160 //ZZ    vex_state->padding4 = 0;
2161 //ZZ    vex_state->padding5 = 0;
2162 }
2163
2164
2165 /*-----------------------------------------------------------*/
2166 /*--- Describing the arm guest state, for the benefit     ---*/
2167 /*--- of iropt and instrumenters.                         ---*/
2168 /*-----------------------------------------------------------*/
2169
2170 /* Figure out if any part of the guest state contained in minoff
2171    .. maxoff requires precise memory exceptions.  If in doubt return
2172    True (but this generates significantly slower code).
2173
2174    We enforce precise exns for guest SP, PC, 29(FP), 30(LR).
2175    That might be overkill (for 29 and 30); I don't know.
2176 */
2177 Bool guest_arm64_state_requires_precise_mem_exns (
2178         Int minoff, Int maxoff, VexRegisterUpdates pxControl
2179      )
2180 {
2181    Int xsp_min = offsetof(VexGuestARM64State, guest_XSP);
2182    Int xsp_max = xsp_min + 8 - 1;
2183    Int pc_min  = offsetof(VexGuestARM64State, guest_PC);
2184    Int pc_max  = pc_min + 8 - 1;
2185
2186    if (maxoff < xsp_min || minoff > xsp_max) {
2187       /* no overlap with xsp */
2188       if (pxControl == VexRegUpdSpAtMemAccess)
2189          return False; // We only need to check stack pointer.
2190    } else {
2191       return True;
2192    }
2193
2194    if (maxoff < pc_min || minoff > pc_max) {
2195       /* no overlap with pc */
2196    } else {
2197       return True;
2198    }
2199
2200    /* Guessing that we need PX for FP, but I don't really know. */
2201    Int x29_min = offsetof(VexGuestARM64State, guest_X29);
2202    Int x29_max = x29_min + 8 - 1;
2203
2204    if (maxoff < x29_min || minoff > x29_max) {
2205       /* no overlap with x29 */
2206    } else {
2207       return True;
2208    }
2209
2210    /* Guessing that we need PX for LR, but I don't really know. */
2211    Int x30_min = offsetof(VexGuestARM64State, guest_X30);
2212    Int x30_max = x30_min + 8 - 1;
2213
2214    if (maxoff < x30_min || minoff > x30_max) {
2215       /* no overlap with r30 */
2216    } else {
2217       return True;
2218    }
2219
2220    return False;
2221 }
2222
2223
2224 #define ALWAYSDEFD(field)                             \
2225     { offsetof(VexGuestARM64State, field),            \
2226       (sizeof ((VexGuestARM64State*)0)->field) }
2227 VexGuestLayout
2228    arm64Guest_layout
2229       = {
2230           /* Total size of the guest state, in bytes. */
2231           .total_sizeB = sizeof(VexGuestARM64State),
2232
2233           /* Describe the stack pointer. */
2234           .offset_SP = offsetof(VexGuestARM64State,guest_XSP),
2235           .sizeof_SP = 8,
2236
2237           /* Describe the instruction pointer. */
2238           .offset_IP = offsetof(VexGuestARM64State,guest_PC),
2239           .sizeof_IP = 8,
2240
2241           /* Describe any sections to be regarded by Memcheck as
2242              'always-defined'. */
2243           .n_alwaysDefd = 9,
2244
2245           /* flags thunk: OP is always defd, whereas DEP1 and DEP2
2246              have to be tracked.  See detailed comment in gdefs.h on
2247              meaning of thunk fields. */
2248           .alwaysDefd
2249              = { /* 0 */ ALWAYSDEFD(guest_PC),
2250                  /* 1 */ ALWAYSDEFD(guest_CC_OP),
2251                  /* 2 */ ALWAYSDEFD(guest_CC_NDEP),
2252                  /* 3 */ ALWAYSDEFD(guest_EMNOTE),
2253                  /* 4 */ ALWAYSDEFD(guest_CMSTART),
2254                  /* 5 */ ALWAYSDEFD(guest_CMLEN),
2255                  /* 6 */ ALWAYSDEFD(guest_NRADDR),
2256                  /* 7 */ ALWAYSDEFD(guest_IP_AT_SYSCALL),
2257                  /* 8 */ ALWAYSDEFD(guest_TPIDR_EL0)
2258                }
2259         };
2260
2261
2262 /*---------------------------------------------------------------*/
2263 /*--- end                               guest_arm64_helpers.c ---*/
2264 /*---------------------------------------------------------------*/