2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
12 jseward@acm.org
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
64 bit-precision
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
71 ----
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
111 historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
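   For illustration (hypothetical client code, not part of this file or of
   Valgrind itself), a fragment that triggers case 1 (branch on an undefined
   value) and case 3 (undefined address in a load), both of which are
   reported via complainIfUndefined once the enclosing block is
   instrumented:

      int x;            // never written: its shadow V bits are all 1s
      if (x > 0)        // case 1: branch condition depends on undefined bits
         y = p[x];      // case 3: load address depends on undefined bits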
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
137 struct _MCEnv;
139 // See below for comments explaining what this is for.
140 typedef
141 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
142 HowUsed;
144 static IRType shadowTypeV ( IRType ty );
145 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
146 HowUsed hu/*use HuOth if unknown*/ );
147 static IRTemp findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
149 static IRExpr *i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
159 typedef
160 enum {
161 // Use the cheaper, less-exact variant.
162 DLcheap=4,
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
166 DLauto,
167 // Use the more expensive, more-exact variant.
168 DLexpensive
170 DetailLevel;
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
175 to be used. */
176 typedef
177 struct {
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32;
182 DetailLevel dl_Add64;
183 DetailLevel dl_Sub32;
184 DetailLevel dl_Sub64;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
186 // allowed.
187 DetailLevel dl_CmpEQ64_CmpNE64;
188 DetailLevel dl_CmpEQ32_CmpNE32;
189 DetailLevel dl_CmpEQ16_CmpNE16;
190 DetailLevel dl_CmpEQ8_CmpNE8;
192 DetailLevelByOp;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
195 DetailLevel dl )
197 dlbo->dl_Add32 = dl;
198 dlbo->dl_Add64 = dl;
199 dlbo->dl_Sub32 = dl;
200 dlbo->dl_Sub64 = dl;
201 dlbo->dl_CmpEQ64_CmpNE64 = dl;
202 dlbo->dl_CmpEQ32_CmpNE32 = dl;
203 dlbo->dl_CmpEQ16_CmpNE16 = dl;
204 dlbo->dl_CmpEQ8_CmpNE8 = dl;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
209 tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
210 tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
211 tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
212 tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
213 tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
214 || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
215 tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
216 || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
217 tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
218 || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
219 tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
220 || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
223 static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
224 DetailLevel dl )
226 UInt n = 0;
227 n += (dlbo->dl_Add32 == dl ? 1 : 0);
228 n += (dlbo->dl_Add64 == dl ? 1 : 0);
229 n += (dlbo->dl_Sub32 == dl ? 1 : 0);
230 n += (dlbo->dl_Sub64 == dl ? 1 : 0);
231 n += (dlbo->dl_CmpEQ64_CmpNE64 == dl ? 1 : 0);
232 n += (dlbo->dl_CmpEQ32_CmpNE32 == dl ? 1 : 0);
233 n += (dlbo->dl_CmpEQ16_CmpNE16 == dl ? 1 : 0);
234 n += (dlbo->dl_CmpEQ8_CmpNE8 == dl ? 1 : 0);
235 return n;
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
242 in MCEnv.sb->tyenv.
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
249 When .kind is VSh or BSh then the tmp holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
253 typedef
254 enum { Orig=1, VSh=2, BSh=3 }
255 TempKind;
257 typedef
258 struct {
259 TempKind kind;
260 IRTemp shadowV;
261 IRTemp shadowB;
263 TempMapEnt;
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
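   For example (an illustrative reading of the scheme, with hypothetical tmp
   numbers): if t7 is defined by an Add64 and the analysis records
   tmpHowUsed[t7] == HuPCa, then under DLauto the cheaper Add64 scheme is
   adequate for that instance, since only the all-defined vs not-all-defined
   status of t7 can ever matter.  Any other kind of use pushes t7 to HuOth,
   the safe end, for which the expensive scheme may be selected instead.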
282 /* DECLARED ABOVE:
283 typedef
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
285 HowUsed;
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed) == 1);
292 /* Carries around state during memcheck instrumentation. */
293 typedef
294 struct _MCEnv {
295 /* MODIFIED: the superblock being constructed. IRStmts are
296 added. */
297 IRSB* sb;
298 Bool trace;
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray* /* of TempMapEnt */ tmpMap;
316 /* READONLY: contains details of which ops should be expensively
317 instrumented. */
318 DetailLevelByOp dlbo;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
323 HowUsed* tmpHowUsed;
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout* layout;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
332 IRType hWordTy;
334 MCEnv;
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
341 initial IR optimisation, and we do not want to waste space in tables
342 tracking them.
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
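   To illustrate with hypothetical tmp numbers: suppose orig tmp t5 is
   currently shadowed by t17.  After the definedness of t17 has been tested
   (see complainIfUndefined below), the shadow must be forced to 'defined',
   but SSA forbids assigning to t17 again.  So a fresh tmp, say t23, is
   allocated via newShadowTmpV, tmpMap[t5].shadowV is updated to t23, and
   't23 = <all zeroes>' is emitted; subsequent findShadowTmpV(t5) calls then
   return t23.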
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
367 Word newIx;
368 TempMapEnt ent;
369 IRTemp tmp = newIRTemp(mce->sb->tyenv, ty);
370 ent.kind = kind;
371 ent.shadowV = IRTemp_INVALID;
372 ent.shadowB = IRTemp_INVALID;
373 newIx = VG_(addToXA)( mce->tmpMap, &ent );
374 tl_assert(newIx == (Word)tmp);
375 return tmp;
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
383 TempMapEnt* ent;
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
385 here. */
386 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
387 tl_assert(ent->kind == Orig);
388 if (ent->shadowV == IRTemp_INVALID) {
389 IRTemp tmpV
390 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
394 tl_assert(ent->kind == Orig);
395 tl_assert(ent->shadowV == IRTemp_INVALID);
396 ent->shadowV = tmpV;
398 return ent->shadowV;
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
410 regardless. */
411 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
413 TempMapEnt* ent;
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
415 here. */
416 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
417 tl_assert(ent->kind == Orig);
418 if (1) {
419 IRTemp tmpV
420 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
424 tl_assert(ent->kind == Orig);
425 ent->shadowV = tmpV;
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
439 temporary. */
441 typedef IRExpr IRAtom;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
447 if (a1->tag == Iex_Const)
448 return True;
449 if (a1->tag == Iex_RdTmp) {
450 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
451 return ent->kind == Orig;
453 return False;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
460 if (a1->tag == Iex_Const)
461 return True;
462 if (a1->tag == Iex_RdTmp) {
463 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
464 return ent->kind == VSh || ent->kind == BSh;
466 return False;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
473 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
474 return True;
475 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
476 return True;
477 return False;
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType shadowTypeV ( IRType ty )
492 switch (ty) {
493 case Ity_I1:
494 case Ity_I8:
495 case Ity_I16:
496 case Ity_I32:
497 case Ity_I64:
498 case Ity_I128: return ty;
499 case Ity_F16: return Ity_I16;
500 case Ity_F32: return Ity_I32;
501 case Ity_D32: return Ity_I32;
502 case Ity_F64: return Ity_I64;
503 case Ity_D64: return Ity_I64;
504 case Ity_F128: return Ity_I128;
505 case Ity_D128: return Ity_I128;
506 case Ity_V128: return Ity_V128;
507 case Ity_V256: return Ity_V256;
508 default: ppIRType(ty);
509 VG_(tool_panic)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
514 supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256).
515 static IRExpr* definedOfType ( IRType ty ) {
516 switch (ty) {
517 case Ity_I1: return IRExpr_Const(IRConst_U1(False));
518 case Ity_I8: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128: return i128_const_zero();
523 case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
536 if (mce->trace) {
537 VG_(printf)(" %c: ", cat);
538 ppIRStmt(st);
539 VG_(printf)("\n");
541 addStmtToIRSB(mce->sb, st);
544 /* assign value to tmp */
545 static inline
546 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
547 stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
565 an atom.
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
573 TempKind k;
574 IRTemp t;
575 IRType tyE = typeOfIRExpr(mce->sb->tyenv, e);
577 tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
578 switch (cat) {
579 case 'V': k = VSh; break;
580 case 'B': k = BSh; break;
581 case 'C': k = Orig; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t = newTemp(mce, ty, k);
587 assign(cat, mce, t, e);
588 return mkexpr(t);
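/* Illustrative sketch only (a hypothetical helper, not used anywhere in
   this file): how the expression macros and assignNew are typically
   combined.  Every sub-expression is bound to its own shadow tmp so the
   instrumented IR stays flat:

      static IRAtom* example_worst_then_pcast ( MCEnv* mce,
                                                IRAtom* vatom1, IRAtom* vatom2 )
      {
         // worst-of (UifU) of the two I32 shadows ...
         IRAtom* vres = assignNew('V', mce, Ity_I32,
                                  binop(Iop_Or32, vatom1, vatom2));
         // ... then smear to all-ones if any bit is undefined
         return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vres));
      }
*/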
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr *i128_const_zero(void)
598 IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128, z64, z64);
602 /* There are no I128 loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
604 expr2vbits_Load */
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
613 static IRAtom* mkDifD1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
614 tl_assert(isShadowAtom(mce,a1));
615 tl_assert(isShadowAtom(mce,a2));
616 return assignNew('V', mce, Ity_I1, binop(Iop_And1, a1, a2));
619 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
620 tl_assert(isShadowAtom(mce,a1));
621 tl_assert(isShadowAtom(mce,a2));
622 return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
625 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
626 tl_assert(isShadowAtom(mce,a1));
627 tl_assert(isShadowAtom(mce,a2));
628 return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
631 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
632 tl_assert(isShadowAtom(mce,a1));
633 tl_assert(isShadowAtom(mce,a2));
634 return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
637 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
638 tl_assert(isShadowAtom(mce,a1));
639 tl_assert(isShadowAtom(mce,a2));
640 return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
643 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
644 tl_assert(isShadowAtom(mce,a1));
645 tl_assert(isShadowAtom(mce,a2));
646 return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
649 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
650 tl_assert(isShadowAtom(mce,a1));
651 tl_assert(isShadowAtom(mce,a2));
652 return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
655 /* --------- Undefined-if-either-undefined --------- */
657 static IRAtom* mkUifU1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
658 tl_assert(isShadowAtom(mce,a1));
659 tl_assert(isShadowAtom(mce,a2));
660 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, a1, a2));
663 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
664 tl_assert(isShadowAtom(mce,a1));
665 tl_assert(isShadowAtom(mce,a2));
666 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
669 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
670 tl_assert(isShadowAtom(mce,a1));
671 tl_assert(isShadowAtom(mce,a2));
672 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
675 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
676 tl_assert(isShadowAtom(mce,a1));
677 tl_assert(isShadowAtom(mce,a2));
678 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
681 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
682 tl_assert(isShadowAtom(mce,a1));
683 tl_assert(isShadowAtom(mce,a2));
684 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
687 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
688 IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
689 tl_assert(isShadowAtom(mce,a1));
690 tl_assert(isShadowAtom(mce,a2));
691 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
692 tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
693 tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
694 tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
695 tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
696 tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
698 return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
701 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
702 tl_assert(isShadowAtom(mce,a1));
703 tl_assert(isShadowAtom(mce,a2));
704 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
707 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
708 tl_assert(isShadowAtom(mce,a1));
709 tl_assert(isShadowAtom(mce,a2));
710 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
713 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
714 switch (vty) {
715 case Ity_I8: return mkUifU8(mce, a1, a2);
716 case Ity_I16: return mkUifU16(mce, a1, a2);
717 case Ity_I32: return mkUifU32(mce, a1, a2);
718 case Ity_I64: return mkUifU64(mce, a1, a2);
719 case Ity_I128: return mkUifU128(mce, a1, a2);
720 case Ity_V128: return mkUifUV128(mce, a1, a2);
721 case Ity_V256: return mkUifUV256(mce, a1, a2);
722 default:
723 VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
724 VG_(tool_panic)("memcheck:mkUifU");
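/* Worked example of the two families above (V bits: 0 = defined,
   1 = undefined).  For 8-bit shadows a1# = 0x0F (low nibble undefined)
   and a2# = 0x30 (bits 4..5 undefined):

      UifU8(a1#, a2#) = a1# | a2# = 0x3F    -- undefined if either is
      DifD8(a1#, a2#) = a1# & a2# = 0x00    -- defined if either is
*/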
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
731 tl_assert(isShadowAtom(mce,a1));
732 return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
735 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
736 tl_assert(isShadowAtom(mce,a1));
737 return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
740 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
741 tl_assert(isShadowAtom(mce,a1));
742 return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
745 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
746 tl_assert(isShadowAtom(mce,a1));
747 return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
750 /* --------- The Right-family of operations. --------- */
752 /* Unfortunately these are a lot more expensive than their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
756 static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
758 for (Int i = 1; i <= 16; i *= 2) {
759 // a1 |= (a1 >>u i)
760 IRAtom* tmp
761 = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
762 a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
764 return a1;
767 static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
769 for (Int i = 1; i <= 32; i *= 2) {
770 // a1 |= (a1 >>u i)
771 IRAtom* tmp
772 = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
773 a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
775 return a1;
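/* Worked example of the Right-family smear (V bits: 1 = undefined).  For
   mkRight32 with a1 = 0x00040000, i.e. only bit 18 undefined:

      after i=1  : 0x00060000
      after i=2  : 0x00078000
      after i=4  : 0x0007F800
      after i=8  : 0x0007FFF8
      after i=16 : 0x0007FFFF

   that is, every bit at or below the highest undefined bit ends up
   undefined -- the mirror image of what the Left ops above compute. */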
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
783 static IRAtom* mkImproveAND1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
785 tl_assert(isOriginalAtom(mce, data));
786 tl_assert(isShadowAtom(mce, vbits));
787 tl_assert(sameKindedAtoms(data, vbits));
788 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, data, vbits));
791 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
793 tl_assert(isOriginalAtom(mce, data));
794 tl_assert(isShadowAtom(mce, vbits));
795 tl_assert(sameKindedAtoms(data, vbits));
796 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
799 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
801 tl_assert(isOriginalAtom(mce, data));
802 tl_assert(isShadowAtom(mce, vbits));
803 tl_assert(sameKindedAtoms(data, vbits));
804 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
807 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
809 tl_assert(isOriginalAtom(mce, data));
810 tl_assert(isShadowAtom(mce, vbits));
811 tl_assert(sameKindedAtoms(data, vbits));
812 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
815 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
817 tl_assert(isOriginalAtom(mce, data));
818 tl_assert(isShadowAtom(mce, vbits));
819 tl_assert(sameKindedAtoms(data, vbits));
820 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
823 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
825 tl_assert(isOriginalAtom(mce, data));
826 tl_assert(isShadowAtom(mce, vbits));
827 tl_assert(sameKindedAtoms(data, vbits));
828 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
831 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
833 tl_assert(isOriginalAtom(mce, data));
834 tl_assert(isShadowAtom(mce, vbits));
835 tl_assert(sameKindedAtoms(data, vbits));
836 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
842 static IRAtom* mkImproveOR1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
844 tl_assert(isOriginalAtom(mce, data));
845 tl_assert(isShadowAtom(mce, vbits));
846 tl_assert(sameKindedAtoms(data, vbits));
847 return assignNew(
848 'V', mce, Ity_I1,
849 binop(Iop_Or1,
850 assignNew('V', mce, Ity_I1, unop(Iop_Not1, data)),
851 vbits) );
854 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
856 tl_assert(isOriginalAtom(mce, data));
857 tl_assert(isShadowAtom(mce, vbits));
858 tl_assert(sameKindedAtoms(data, vbits));
859 return assignNew(
860 'V', mce, Ity_I8,
861 binop(Iop_Or8,
862 assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
863 vbits) );
866 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
868 tl_assert(isOriginalAtom(mce, data));
869 tl_assert(isShadowAtom(mce, vbits));
870 tl_assert(sameKindedAtoms(data, vbits));
871 return assignNew(
872 'V', mce, Ity_I16,
873 binop(Iop_Or16,
874 assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
875 vbits) );
878 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
880 tl_assert(isOriginalAtom(mce, data));
881 tl_assert(isShadowAtom(mce, vbits));
882 tl_assert(sameKindedAtoms(data, vbits));
883 return assignNew(
884 'V', mce, Ity_I32,
885 binop(Iop_Or32,
886 assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
887 vbits) );
890 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
892 tl_assert(isOriginalAtom(mce, data));
893 tl_assert(isShadowAtom(mce, vbits));
894 tl_assert(sameKindedAtoms(data, vbits));
895 return assignNew(
896 'V', mce, Ity_I64,
897 binop(Iop_Or64,
898 assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
899 vbits) );
902 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
904 tl_assert(isOriginalAtom(mce, data));
905 tl_assert(isShadowAtom(mce, vbits));
906 tl_assert(sameKindedAtoms(data, vbits));
907 return assignNew(
908 'V', mce, Ity_V128,
909 binop(Iop_OrV128,
910 assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
911 vbits) );
914 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
916 tl_assert(isOriginalAtom(mce, data));
917 tl_assert(isShadowAtom(mce, vbits));
918 tl_assert(sameKindedAtoms(data, vbits));
919 return assignNew(
920 'V', mce, Ity_V256,
921 binop(Iop_OrV256,
922 assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
923 vbits) );
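/* Worked example for the 'improvement' terms (V bits: 1 = undefined).
   Take 8-bit data = 0x0A with vbits = 0x0C, so bits 2..3 of data are
   undefined:

      ImproveAND8:  data | vbits = 0x0E    -- 0 exactly where data holds a
                                              defined 0, which forces the
                                              corresponding AND result bits
                                              to be defined
      ImproveOR8:  ~data | vbits = 0xFD    -- 0 exactly where data holds a
                                              defined 1, which forces the
                                              corresponding OR result bits
                                              to be defined

   When And/Or ops are instrumented (later in this file), these terms are
   combined via DifD with the usual UifU term for the two operands. */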
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
932 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
934 IRType src_ty;
935 IRAtom* tmp1;
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce,vbits));
939 src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
941 /* Fast-track some common cases */
942 if (src_ty == Ity_I32 && dst_ty == Ity_I32)
943 return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
945 if (src_ty == Ity_I64 && dst_ty == Ity_I64)
946 return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
948 if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
949 /* PCast the arg, then clone it. */
950 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
951 return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
954 if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
957 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
958 return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
961 if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
964 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
965 tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
966 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
969 if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
971 the top half. */
972 IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
973 return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
976 if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
980 the lower half. */
981 // Generates vbits[127:64] : vbits[127:64]
982 IRAtom* hi64hi64
983 = assignNew('V', mce, Ity_V128,
984 binop(Iop_InterleaveHI64x2, vbits, vbits));
985 // Generates
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
988 IRAtom* lohi64
989 = mkUifUV128(mce, hi64hi64, vbits);
990 // Generates UifU(vbits[127:64],vbits[63:0])
991 IRAtom* lo64
992 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
993 // Generates
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
996 IRAtom* res
997 = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
998 return res;
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1003 tmp1 = NULL;
1004 switch (src_ty) {
1005 case Ity_I1:
1006 tmp1 = vbits;
1007 break;
1008 case Ity_I8:
1009 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
1010 break;
1011 case Ity_I16:
1012 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
1013 break;
1014 case Ity_I32:
1015 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
1016 break;
1017 case Ity_I64:
1018 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
1019 break;
1020 case Ity_I128: {
1021 /* Gah. Chop it in half, OR the halves together, and compare
1022 that with zero. */
1023 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
1024 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
1025 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1026 tmp1 = assignNew('V', mce, Ity_I1,
1027 unop(Iop_CmpNEZ64, tmp4));
1028 break;
1030 case Ity_V128: {
1031 /* Chop it in half, OR the halves together, and compare that
1032 * with zero.
1034 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
1035 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
1036 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1037 tmp1 = assignNew('V', mce, Ity_I1,
1038 unop(Iop_CmpNEZ64, tmp4));
1039 break;
1041 default:
1042 ppIRType(src_ty);
1043 VG_(tool_panic)("mkPCastTo(1)");
1045 tl_assert(tmp1);
1046 /* Now widen up to the dst type. */
1047 switch (dst_ty) {
1048 case Ity_I1:
1049 return tmp1;
1050 case Ity_I8:
1051 return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
1052 case Ity_I16:
1053 return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
1054 case Ity_I32:
1055 return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
1056 case Ity_I64:
1057 return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1058 case Ity_V128:
1059 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1060 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
1061 return tmp1;
1062 case Ity_I128:
1063 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1064 tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
1065 return tmp1;
1066 case Ity_V256:
1067 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1068 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
1069 tmp1, tmp1));
1070 tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
1071 tmp1, tmp1));
1072 return tmp1;
1073 default:
1074 ppIRType(dst_ty);
1075 VG_(tool_panic)("mkPCastTo(2)");
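/* Worked example: with src_ty == dst_ty == Ity_I32 the fast-track case
   reduces mkPCastTo to a single Iop_CmpwNEZ32, so

      PCast32(0x00000000) = 0x00000000    -- every input bit defined
      PCast32(0x00000100) = 0xFFFFFFFF    -- one undefined bit poisons all

   In the widening cases the single collapsed bit is simply replicated;
   for example PCast-to-V128 of a nonzero I32 shadow yields 128 one-bits. */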
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
1083 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
1085 if (ty == Ity_V128) {
1086 /* --- Case for V128 --- */
1087 IRAtom* varg128 = varg;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1092 IRAtom* d63pc
1093 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
1094 // generates: Def--(64)--Def
1095 IRAtom* d64
1096 = definedOfType(Ity_I64);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1098 IRAtom* res
1099 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
1100 return res;
1102 if (ty == Ity_I64) {
1103 /* --- Case for I64 --- */
1104 // PCast to 64
1105 IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
1106 // Zero (Def) out the top 63 bits
1107 IRAtom* res
1108 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
1109 return res;
1111 /*NOTREACHED*/
1112 tl_assert(0);
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
1124 static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
1126 IROp opSUB, opSHR, opSAR;
1127 UInt sh;
1129 switch (ty) {
1130 case Ity_I64:
1131 opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
1132 break;
1133 case Ity_I32:
1134 opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
1135 break;
1136 case Ity_I16:
1137 opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
1138 break;
1139 case Ity_I8:
1140 opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
1141 break;
1142 default:
1143 ppIRType(ty);
1144 VG_(tool_panic)("mkOCastTo");
1147 IRAtom *shr1, *at;
1148 shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
1149 at = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
1150 at = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
1151 return at;
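/* Worked example of the straight-line OCast trick for Ity_I8, where the
   computation is (vbits - (vbits >>u 1)) >>s 7:

      vbits = 0xFF : 0xFF - 0x7F = 0x80 ; 0x80 >>s 7 = 0xFF  (no bit defined)
      vbits = 0xFE : 0xFE - 0x7F = 0x7F ; 0x7F >>s 7 = 0x00  (bit 0 defined)
      vbits = 0x00 : 0x00 - 0x00 = 0x00 ; 0x00 >>s 7 = 0x00  (all defined)

   Only the all-ones input keeps the sign bit set after the subtraction,
   so only that input produces the all-ones ("all undefined") result. */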
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_I1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1165 The result is:
1167 PCastTo<1> (
1168 -- naive version
1169 UifU<sz>(vxx, vyy)
1171 `DifD<sz>`
1173 -- improvement term
1174 OCast<sz>(vec) )
1177 where
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx,                        // 0 iff bit defined
1182               vyy,                        // 0 iff bit defined
1183               Not<sz>(Xor<sz>( xx, yy )) ) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1188 1...1.
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194 which you can think of as an "optimistic cast" (OCast), the opposite of
1195 the normal "pessimistic cast" (PCast) family. An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1209 a final version of:
1211 let naive = UifU<sz>(vxx, vyy)
1212 vec = Or<sz>(naive, Not<sz>(Xor<sz>(xx, yy)))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1217 in July 2017.
1219 static IRAtom* expensiveCmpEQorNE ( MCEnv* mce,
1220 IRType ty,
1221 IRAtom* vxx, IRAtom* vyy,
1222 IRAtom* xx, IRAtom* yy )
1224 IRAtom *naive, *vec, *improved, *final_cast;
1225 IROp opDIFD, opUIFU, opOR, opXOR, opNOT;
1227 tl_assert(isShadowAtom(mce,vxx));
1228 tl_assert(isShadowAtom(mce,vyy));
1229 tl_assert(isOriginalAtom(mce,xx));
1230 tl_assert(isOriginalAtom(mce,yy));
1231 tl_assert(sameKindedAtoms(vxx,xx));
1232 tl_assert(sameKindedAtoms(vyy,yy));
1234 switch (ty) {
1235 case Ity_I8:
1236 opDIFD = Iop_And8;
1237 opUIFU = Iop_Or8;
1238 opOR = Iop_Or8;
1239 opXOR = Iop_Xor8;
1240 opNOT = Iop_Not8;
1241 break;
1242 case Ity_I16:
1243 opDIFD = Iop_And16;
1244 opUIFU = Iop_Or16;
1245 opOR = Iop_Or16;
1246 opXOR = Iop_Xor16;
1247 opNOT = Iop_Not16;
1248 break;
1249 case Ity_I32:
1250 opDIFD = Iop_And32;
1251 opUIFU = Iop_Or32;
1252 opOR = Iop_Or32;
1253 opXOR = Iop_Xor32;
1254 opNOT = Iop_Not32;
1255 break;
1256 case Ity_I64:
1257 opDIFD = Iop_And64;
1258 opUIFU = Iop_Or64;
1259 opOR = Iop_Or64;
1260 opXOR = Iop_Xor64;
1261 opNOT = Iop_Not64;
1262 break;
1263 default:
1264 VG_(tool_panic)("expensiveCmpEQorNE");
1267 naive
1268 = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));
1270 vec
1271 = assignNew(
1272 'V', mce,ty,
1273 binop( opOR,
1274 naive,
1275 assignNew(
1276 'V', mce,ty,
1277 unop(opNOT,
1278 assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1280 improved
1281 = assignNew( 'V', mce,ty,
1282 binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));
1284 final_cast
1285 = mkPCastTo( mce, Ity_I1, improved );
1287 return final_cast;
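/* Worked example for Ity_I8 (V bits: 1 = undefined).  Let xx = 0x0F with
   vxx = 0xF0 (high nibble unknown) and yy = 0x00 with vyy = 0x00:

      naive          = vxx | vyy          = 0xF0
      vec            = naive | ~(xx ^ yy) = 0xF0
      OCast(vec)     = 0x00               (some bit of vec is 0)
      improved       = naive & OCast(vec) = 0x00
      PCastTo<1>(..) = defined

   Bit 0 is defined in both operands and differs (1 vs 0), so the EQ/NE
   outcome is knowable despite xx's undefined top nibble; the naive
   UifU-only scheme would have reported the comparison as undefined. */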
1290 /* Check if we can know, despite the uncertain bits, that xx is greater than yy.
1291 Notice that it's xx > yy and not the other way around. This is Intel syntax
1292 with destination first. It will appear reversed in gdb disassembly (AT&T
1293 syntax).
1295 static IRAtom* expensiveCmpGT ( MCEnv* mce,
1296 IROp opGT,
1297 IRAtom* vxx, IRAtom* vyy,
1298 IRAtom* xx, IRAtom* yy )
1300 IROp opAND, opOR, opXOR, opNOT, opSHL;
1301 IRType ty;
1302 unsigned int word_size;
1303 Bool is_signed;
1305 tl_assert(isShadowAtom(mce,vxx));
1306 tl_assert(isShadowAtom(mce,vyy));
1307 tl_assert(isOriginalAtom(mce,xx));
1308 tl_assert(isOriginalAtom(mce,yy));
1309 tl_assert(sameKindedAtoms(vxx,xx));
1310 tl_assert(sameKindedAtoms(vyy,yy));
1312 switch (opGT) {
1313 case Iop_CmpGT64Sx2:
1314 case Iop_CmpGT64Ux2:
1315 opSHL = Iop_ShlN64x2;
1316 word_size = 64;
1317 break;
1318 case Iop_CmpGT32Sx4:
1319 case Iop_CmpGT32Ux4:
1320 opSHL = Iop_ShlN32x4;
1321 word_size = 32;
1322 break;
1323 case Iop_CmpGT16Sx8:
1324 case Iop_CmpGT16Ux8:
1325 opSHL = Iop_ShlN16x8;
1326 word_size = 16;
1327 break;
1328 case Iop_CmpGT8Sx16:
1329 case Iop_CmpGT8Ux16:
1330 opSHL = Iop_ShlN8x16;
1331 word_size = 8;
1332 break;
1333 default:
1334 VG_(tool_panic)("expensiveCmpGT");
1337 switch (opGT) {
1338 case Iop_CmpGT64Sx2:
1339 case Iop_CmpGT32Sx4:
1340 case Iop_CmpGT16Sx8:
1341 case Iop_CmpGT8Sx16:
1342 is_signed = True;
1343 break;
1344 case Iop_CmpGT64Ux2:
1345 case Iop_CmpGT32Ux4:
1346 case Iop_CmpGT16Ux8:
1347 case Iop_CmpGT8Ux16:
1348 is_signed = False;
1349 break;
1350 default:
1351 VG_(tool_panic)("expensiveCmpGT");
1354 ty = Ity_V128;
1355 opAND = Iop_AndV128;
1356 opOR = Iop_OrV128;
1357 opXOR = Iop_XorV128;
1358 opNOT = Iop_NotV128;
1360 IRAtom *MSBs;
1361 if (is_signed) {
1362 // For unsigned it's easy to make the min and max: Just set the unknown
1363 // bits all to 0s or 1s. For signed it's harder because having a 1 in the
1364 // MSB makes a number smaller, not larger! We can work around this by
1365 // flipping the MSB before and after computing the min and max values.
1366 IRAtom *all_ones = mkV128(0xffff);
1367 MSBs = assignNew('V', mce, ty, binop(opSHL, all_ones, mkU8(word_size-1)));
1368 xx = assignNew('V', mce, ty, binop(opXOR, xx, MSBs));
1369 yy = assignNew('V', mce, ty, binop(opXOR, yy, MSBs));
1370 // From here on out, we're dealing with MSB-flipped integers.
1372 // We can combine xx and vxx to create two values: the largest that xx could
1373 // possibly be and the smallest that xx could possibly be. Likewise, we can
1374 // do the same for yy. We'll call those max_xx and min_xx and max_yy and
1375 // min_yy.
1376 IRAtom *not_vxx = assignNew('V', mce, ty, unop(opNOT, vxx));
1377 IRAtom *not_vyy = assignNew('V', mce, ty, unop(opNOT, vyy));
1378 IRAtom *max_xx = assignNew('V', mce, ty, binop(opOR, xx, vxx));
1379 IRAtom *min_xx = assignNew('V', mce, ty, binop(opAND, xx, not_vxx));
1380 IRAtom *max_yy = assignNew('V', mce, ty, binop(opOR, yy, vyy));
1381 IRAtom *min_yy = assignNew('V', mce, ty, binop(opAND, yy, not_vyy));
1382 if (is_signed) {
1383 // Unflip the MSBs.
1384 max_xx = assignNew('V', mce, ty, binop(opXOR, max_xx, MSBs));
1385 min_xx = assignNew('V', mce, ty, binop(opXOR, min_xx, MSBs));
1386 max_yy = assignNew('V', mce, ty, binop(opXOR, max_yy, MSBs));
1387 min_yy = assignNew('V', mce, ty, binop(opXOR, min_yy, MSBs));
1389 IRAtom *min_xx_gt_max_yy = assignNew('V', mce, ty, binop(opGT, min_xx, max_yy));
1390 IRAtom *max_xx_gt_min_yy = assignNew('V', mce, ty, binop(opGT, max_xx, min_yy));
1391 // If min_xx is greater than max_yy then xx is surely greater than yy so we know
1392 // our answer for sure. If max_xx is not greater than min_yy then xx can't
1393 possibly be greater than yy so again we know the answer for sure. For all
1394 // other cases, we can't know.
1396 // So the result is defined if:
1398 // min_xx_gt_max_yy | ~max_xx_gt_min_yy
1400 // Because defined in vbits is 0s and not 1s, we need to invert that:
1402 // ~(min_xx_gt_max_yy | ~max_xx_gt_min_yy)
1404 // We can use DeMorgan's Law to simplify the above:
1406 // ~min_xx_gt_max_yy & max_xx_gt_min_yy
1407 IRAtom *not_min_xx_gt_max_yy = assignNew('V', mce, ty, unop(opNOT, min_xx_gt_max_yy));
1408 return assignNew('V', mce, ty, binop(opAND, not_min_xx_gt_max_yy, max_xx_gt_min_yy));
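/* Worked example for one unsigned 8-bit lane (conceptually -- the code
   above operates on whole V128 values, lane by lane).  Let xx = 0x08 with
   vxx = 0x04 (bit 2 unknown, so xx is really 8 or 12), and yy = 0x03 with
   vyy = 0x00:

      min_xx = xx & ~vxx = 0x08       max_xx = xx | vxx = 0x0C
      min_yy = yy & ~vyy = 0x03       max_yy = yy | vyy = 0x03

   Since min_xx (8) > max_yy (3), xx > yy holds for every value the
   unknown bit could take, so the lane's result is marked defined.  Had
   the [min,max] ranges overlapped, neither test would be conclusive and
   the lane would be marked undefined. */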
1411 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1413 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1415 CmpORD32S(x,y) = 1<<3 if x <s y
1416 = 1<<2 if x >s y
1417 = 1<<1 if x == y
1419 and similarly the unsigned variant. The default interpretation is:
1421 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1422 & (7<<1)
1424 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1425 are zero and therefore defined (viz, zero).
1427 Also deal with a special case better:
1429 CmpORD32S(x,0)
1431 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1432 will be defined even if the rest of x isn't. In which case we do:
1434 CmpORD32S#(x,x#,0,{impliedly 0}#)
1435 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1436 | (x# >>u 31) << 3 -- LT# = x#[31]
1438 Analogous handling for CmpORD64{S,U}.
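   Worked example of the CmpORD32S(x,0) case: suppose x's low byte is
   undefined but all other bits, including the sign bit, are defined, so
   x# = 0x000000FF.  Then

      LT# = (x# >>u 31) << 3   = 0       -- bit 3 defined: x[31] is known
      GT# = PCast(x#) & (1<<2) = 1<<2    -- bit 2 undefined

   and the EQ bit (bit 1) is delegated to expensiveCmpEQorNE in doCmpORD
   below, so it comes out defined whenever some defined bit of x is
   nonzero.  This is the scheme sketched above, with the EQ# part made
   more precise.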
1440 static Bool isZeroU32 ( IRAtom* e )
1442 return
1443 toBool( e->tag == Iex_Const
1444 && e->Iex.Const.con->tag == Ico_U32
1445 && e->Iex.Const.con->Ico.U32 == 0 );
1448 static Bool isZeroU64 ( IRAtom* e )
1450 return
1451 toBool( e->tag == Iex_Const
1452 && e->Iex.Const.con->tag == Ico_U64
1453 && e->Iex.Const.con->Ico.U64 == 0 );
1456 static IRAtom* doCmpORD ( MCEnv* mce,
1457 IROp cmp_op,
1458 IRAtom* xxhash, IRAtom* yyhash,
1459 IRAtom* xx, IRAtom* yy )
1461 Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1462 Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1463 IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
1464 IROp opAND = m64 ? Iop_And64 : Iop_And32;
1465 IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
1466 IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
1467 IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32;
1468 IRType ty = m64 ? Ity_I64 : Ity_I32;
1469 Int width = m64 ? 64 : 32;
1471 Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1473 tl_assert(isShadowAtom(mce,xxhash));
1474 tl_assert(isShadowAtom(mce,yyhash));
1475 tl_assert(isOriginalAtom(mce,xx));
1476 tl_assert(isOriginalAtom(mce,yy));
1477 tl_assert(sameKindedAtoms(xxhash,xx));
1478 tl_assert(sameKindedAtoms(yyhash,yy));
1479 tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1480 || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1482 if (0) {
1483 ppIROp(cmp_op); VG_(printf)(" ");
1484 ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1487 if (syned && isZero(yy)) {
1488 /* fancy interpretation */
1489 /* if yy is zero, then it must be fully defined (zero#). */
1490 tl_assert(isZero(yyhash));
1491 // This is still inaccurate, but I don't think it matters, since
1492 // nobody writes code of the form
1493 // "is <partially-undefined-value> signedly greater than zero?".
1494 // We therefore simply declare "x >s 0" to be undefined if any bit in
1495 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1496 // the highest order bit is a defined 1 then x is negative so it
1497 // doesn't matter whether the remaining bits are defined or not.
1498 IRAtom* t_0_gt_0_0
1499 = assignNew(
1500 'V', mce,ty,
1501 binop(
1502 opAND,
1503 mkPCastTo(mce,ty, xxhash),
1504 m64 ? mkU64(1<<2) : mkU32(1<<2)
1506 // For "x <s 0", we can just copy the definedness of the top bit of x
1507 // and we have a precise result.
1508 IRAtom* t_lt_0_0_0
1509 = assignNew(
1510 'V', mce,ty,
1511 binop(
1512 opSHL,
1513 assignNew(
1514 'V', mce,ty,
1515 binop(opSHR, xxhash, mkU8(width-1))),
1516 mkU8(3)
1518 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1519 IRAtom* t_0_0_eq_0
1520 = assignNew(
1521 'V', mce,ty,
1522 binop(
1523 opSHL,
1524 assignNew('V', mce,ty,
1525 unop(
1526 op1UtoWS,
1527 expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
1529 mkU8(1)
1531 return
1532 binop(
1533 opOR,
1534 assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
1535 t_0_0_eq_0
1537 } else {
1538 /* standard interpretation */
1539 IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1540 return
1541 binop(
1542 opAND,
1543 mkPCastTo( mce,ty,
1544 mkUifU(mce,ty, xxhash,yyhash)),
1545 sevenLeft1
1551 /*------------------------------------------------------------*/
1552 /*--- Emit a test and complaint if something is undefined. ---*/
1553 /*------------------------------------------------------------*/
1555 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1558 /* Set the annotations on a dirty helper to indicate that the stack
1559 pointer and instruction pointers might be read. This is the
1560 behaviour of all 'emit-a-complaint' style functions we might
1561 call. */
1563 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1564 di->nFxState = 2;
1565 di->fxState[0].fx = Ifx_Read;
1566 di->fxState[0].offset = mce->layout->offset_SP;
1567 di->fxState[0].size = mce->layout->sizeof_SP;
1568 di->fxState[0].nRepeats = 0;
1569 di->fxState[0].repeatLen = 0;
1570 di->fxState[1].fx = Ifx_Read;
1571 di->fxState[1].offset = mce->layout->offset_IP;
1572 di->fxState[1].size = mce->layout->sizeof_IP;
1573 di->fxState[1].nRepeats = 0;
1574 di->fxState[1].repeatLen = 0;
1578 /* Check the supplied *original* |atom| for undefinedness, and emit a
1579 complaint if so. Once that happens, mark it as defined. This is
1580 possible because the atom is either a tmp or literal. If it's a
1581 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1582 be defined. In fact as mentioned above, we will have to allocate a
1583 new tmp to carry the new 'defined' shadow value, and update the
1584 original->tmp mapping accordingly; we cannot simply assign a new
1585 value to an existing shadow tmp as this breaks SSAness.
1587 The checks are performed, any resulting complaint emitted, and
1588 |atom|'s shadow temp set to 'defined', ONLY in the case that
1589 |guard| evaluates to True at run-time. If it evaluates to False
1590 then no action is performed. If |guard| is NULL (the usual case)
1591 then it is assumed to be always-true, and hence these actions are
1592 performed unconditionally.
1594 This routine does not generate code to check the definedness of
1595 |guard|. The caller is assumed to have taken care of that already.
1597 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1599 IRAtom* vatom;
1600 IRType ty;
1601 Int sz;
1602 IRDirty* di;
1603 IRAtom* cond;
1604 IRAtom* origin;
1605 void* fn;
1606 const HChar* nm;
1607 IRExpr** args;
1608 Int nargs;
1610 // Don't do V bit tests if we're not reporting undefined value errors.
1611 if (MC_(clo_mc_level) == 1)
1612 return;
1614 if (guard)
1615 tl_assert(isOriginalAtom(mce, guard));
1617 /* Since the original expression is atomic, there's no duplicated
1618 work generated by making multiple V-expressions for it. So we
1619 don't really care about the possibility that someone else may
1620 also create a V-interpretation for it. */
1621 tl_assert(isOriginalAtom(mce, atom));
1622 vatom = expr2vbits( mce, atom, HuOth );
1623 tl_assert(isShadowAtom(mce, vatom));
1624 tl_assert(sameKindedAtoms(atom, vatom));
1626 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1628 /* sz is only used for constructing the error message */
1629 sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1631 cond = mkPCastTo( mce, Ity_I1, vatom );
1632 /* cond will be 0 if all defined, and 1 if any not defined. */
1634 /* Get the origin info for the value we are about to check. At
1635 least, if we are doing origin tracking. If not, use a dummy
1636 zero origin. */
1637 if (MC_(clo_mc_level) == 3) {
1638 origin = schemeE( mce, atom );
1639 if (mce->hWordTy == Ity_I64) {
1640 origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1642 } else {
1643 origin = NULL;
1646 fn = NULL;
1647 nm = NULL;
1648 args = NULL;
1649 nargs = -1;
1651 switch (sz) {
1652 case 0:
1653 if (origin) {
1654 fn = &MC_(helperc_value_check0_fail_w_o);
1655 nm = "MC_(helperc_value_check0_fail_w_o)";
1656 args = mkIRExprVec_1(origin);
1657 nargs = 1;
1658 } else {
1659 fn = &MC_(helperc_value_check0_fail_no_o);
1660 nm = "MC_(helperc_value_check0_fail_no_o)";
1661 args = mkIRExprVec_0();
1662 nargs = 0;
1664 break;
1665 case 1:
1666 if (origin) {
1667 fn = &MC_(helperc_value_check1_fail_w_o);
1668 nm = "MC_(helperc_value_check1_fail_w_o)";
1669 args = mkIRExprVec_1(origin);
1670 nargs = 1;
1671 } else {
1672 fn = &MC_(helperc_value_check1_fail_no_o);
1673 nm = "MC_(helperc_value_check1_fail_no_o)";
1674 args = mkIRExprVec_0();
1675 nargs = 0;
1677 break;
1678 case 4:
1679 if (origin) {
1680 fn = &MC_(helperc_value_check4_fail_w_o);
1681 nm = "MC_(helperc_value_check4_fail_w_o)";
1682 args = mkIRExprVec_1(origin);
1683 nargs = 1;
1684 } else {
1685 fn = &MC_(helperc_value_check4_fail_no_o);
1686 nm = "MC_(helperc_value_check4_fail_no_o)";
1687 args = mkIRExprVec_0();
1688 nargs = 0;
1690 break;
1691 case 8:
1692 if (origin) {
1693 fn = &MC_(helperc_value_check8_fail_w_o);
1694 nm = "MC_(helperc_value_check8_fail_w_o)";
1695 args = mkIRExprVec_1(origin);
1696 nargs = 1;
1697 } else {
1698 fn = &MC_(helperc_value_check8_fail_no_o);
1699 nm = "MC_(helperc_value_check8_fail_no_o)";
1700 args = mkIRExprVec_0();
1701 nargs = 0;
1703 break;
1704 case 2:
1705 case 16:
1706 if (origin) {
1707 fn = &MC_(helperc_value_checkN_fail_w_o);
1708 nm = "MC_(helperc_value_checkN_fail_w_o)";
1709 args = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1710 nargs = 2;
1711 } else {
1712 fn = &MC_(helperc_value_checkN_fail_no_o);
1713 nm = "MC_(helperc_value_checkN_fail_no_o)";
1714 args = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1715 nargs = 1;
1717 break;
1718 default:
1719 VG_(tool_panic)("unexpected szB");
1722 tl_assert(fn);
1723 tl_assert(nm);
1724 tl_assert(args);
1725 tl_assert(nargs >= 0 && nargs <= 2);
1726 tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1727 || (MC_(clo_mc_level) == 2 && origin == NULL) );
1729 di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1730 VG_(fnptr_to_fnentry)( fn ), args );
1731 di->guard = cond; // and cond is PCast-to-1(atom#)
1733 /* If the complaint is to be issued under a guard condition, AND
1734 that into the guard condition for the helper call. */
1735 if (guard) {
1736 IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1737 IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1738 IRAtom *e = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1739 di->guard = assignNew('V', mce, Ity_I1, unop(Iop_32to1, e));
1742 setHelperAnns( mce, di );
1743 stmt( 'V', mce, IRStmt_Dirty(di));
1745 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1746 defined -- but only in the case where the guard evaluates to
1747 True at run-time. Do the update by setting the orig->shadow
1748 mapping for tmp to reflect the fact that this shadow is getting
1749 a new value. */
1750 tl_assert(isIRAtom(vatom));
1751 /* sameKindedAtoms ... */
1752 if (vatom->tag == Iex_RdTmp) {
1753 tl_assert(atom->tag == Iex_RdTmp);
1754 if (guard == NULL) {
1755 // guard is 'always True', hence update unconditionally
1756 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1757 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1758 definedOfType(ty));
1759 } else {
1760 // update the temp only conditionally. Do this by copying
1761 // its old value when the guard is False.
1762 // The old value ..
1763 IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1764 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1765 IRAtom* new_tmpV
1766 = assignNew('V', mce, shadowTypeV(ty),
1767 IRExpr_ITE(guard, definedOfType(ty),
1768 mkexpr(old_tmpV)));
1769 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1775 /*------------------------------------------------------------*/
1776 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1777 /*------------------------------------------------------------*/
1779 /* Examine the always-defined sections declared in layout to see if
1780 the (offset,size) section is within one. Note, it is an error to
1781 partially fall into such a region: (offset,size) should either be
1782 completely in such a region or completely not-in such a region.
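/* For example (offsets purely illustrative): if alwaysDefd contains
   the section (offset=168, size=8), covering bytes 168..175, then
   queries (168,8) and (172,4) yield True, (160,8) yields False
   (assuming no other section overlaps it), and (164,8), which
   straddles the boundary, causes a panic. */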
1784 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1786 Int minoffD, maxoffD, i;
1787 Int minoff = offset;
1788 Int maxoff = minoff + size - 1;
1789 tl_assert((minoff & ~0xFFFF) == 0);
1790 tl_assert((maxoff & ~0xFFFF) == 0);
1792 for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1793 minoffD = mce->layout->alwaysDefd[i].offset;
1794 maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1795 tl_assert((minoffD & ~0xFFFF) == 0);
1796 tl_assert((maxoffD & ~0xFFFF) == 0);
1798 if (maxoff < minoffD || maxoffD < minoff)
1799 continue; /* no overlap */
1800 if (minoff >= minoffD && maxoff <= maxoffD)
1801 return True; /* completely contained in an always-defd section */
1803 VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1805 return False; /* could not find any containing section */
1809 /* Generate into bb suitable actions to shadow this Put. If the state
1810 slice is marked 'always defined', do nothing. Otherwise, write the
1811 supplied V bits to the shadow state. We can pass in either an
1812 original atom or a V-atom, but not both. In the former case the
1813 relevant V-bits are then generated from the original.
1814 We assume here that the definedness of GUARD has already been checked.
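/* Sketch of the effect, assuming the slice is not always-defined:
   for a Put at guest offset O of value V, the instrumented block
   additionally gets

      PUT(O + layout->total_sizeB) = V#

   that is, the V bits are written at the same offset within the
   shadow area of the guest state. With a guard, the value written
   is ITE(guard, V#, <current shadow at that offset>). */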
1816 static
1817 void do_shadow_PUT ( MCEnv* mce, Int offset,
1818 IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1820 IRType ty;
1822 // Don't do shadow PUTs if we're not doing undefined value checking.
1823 // Their absence lets Vex's optimiser remove all the shadow computation
1824 // that they depend on, which includes GETs of the shadow registers.
1825 if (MC_(clo_mc_level) == 1)
1826 return;
1828 if (atom) {
1829 tl_assert(!vatom);
1830 tl_assert(isOriginalAtom(mce, atom));
1831 vatom = expr2vbits( mce, atom, HuOth );
1832 } else {
1833 tl_assert(vatom);
1834 tl_assert(isShadowAtom(mce, vatom));
1837 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1838 tl_assert(ty != Ity_I1);
1839 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1840 /* later: no ... */
1841 /* emit code to emit a complaint if any of the vbits are 1. */
1842 /* complainIfUndefined(mce, atom); */
1843 } else {
1844 /* Do a plain shadow Put. */
1845 if (guard) {
1846 /* If the guard expression evaluates to false we simply Put the value
1847 that is already stored in the guest state slot */
1848 IRAtom *cond, *iffalse;
1850 cond = assignNew('V', mce, Ity_I1, guard);
1851 iffalse = assignNew('V', mce, ty,
1852 IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1853 vatom = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1855 stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
1860 /* Generate into bb suitable actions to shadow this PutI (passed in
1861 in pieces as an IRPutI).
1863 static
1864 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1866 IRAtom* vatom;
1867 IRType ty, tyS;
1868 Int arrSize;
1869 IRRegArray* descr = puti->descr;
1870 IRAtom* ix = puti->ix;
1871 Int bias = puti->bias;
1872 IRAtom* atom = puti->data;
1874 // Don't do shadow PUTIs if we're not doing undefined value checking.
1875 // Their absence lets Vex's optimiser remove all the shadow computation
1876 // that they depend on, which includes GETIs of the shadow registers.
1877 if (MC_(clo_mc_level) == 1)
1878 return;
1880 tl_assert(isOriginalAtom(mce,atom));
1881 vatom = expr2vbits( mce, atom, HuOth );
1882 tl_assert(sameKindedAtoms(atom, vatom));
1883 ty = descr->elemTy;
1884 tyS = shadowTypeV(ty);
1885 arrSize = descr->nElems * sizeofIRType(ty);
1886 tl_assert(ty != Ity_I1);
1887 tl_assert(isOriginalAtom(mce,ix));
1888 complainIfUndefined(mce, ix, NULL);
1889 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1890 /* later: no ... */
1891 /* emit code to emit a complaint if any of the vbits are 1. */
1892 /* complainIfUndefined(mce, atom); */
1893 } else {
1894 /* Do a cloned version of the Put that refers to the shadow
1895 area. */
1896 IRRegArray* new_descr
1897 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1898 tyS, descr->nElems);
1899 stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1904 /* Return an expression which contains the V bits corresponding to the
1905 given GET (passed in in pieces).
1907 static
1908 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1910 IRType tyS = shadowTypeV(ty);
1911 tl_assert(ty != Ity_I1);
1912 tl_assert(ty != Ity_I128);
1913 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1914 /* Always defined, return all zeroes of the relevant type */
1915 return definedOfType(tyS);
1916 } else {
1917 /* return a cloned version of the Get that refers to the shadow
1918 area. */
1919 /* FIXME: this isn't an atom! */
1920 return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1925 /* Return an expression which contains the V bits corresponding to the
1926 given GETI (passed in in pieces).
1928 static
1929 IRExpr* shadow_GETI ( MCEnv* mce,
1930 IRRegArray* descr, IRAtom* ix, Int bias )
1932 IRType ty = descr->elemTy;
1933 IRType tyS = shadowTypeV(ty);
1934 Int arrSize = descr->nElems * sizeofIRType(ty);
1935 tl_assert(ty != Ity_I1);
1936 tl_assert(isOriginalAtom(mce,ix));
1937 complainIfUndefined(mce, ix, NULL);
1938 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1939 /* Always defined, return all zeroes of the relevant type */
1940 return definedOfType(tyS);
1941 } else {
1942 /* return a cloned version of the Get that refers to the shadow
1943 area. */
1944 IRRegArray* new_descr
1945 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1946 tyS, descr->nElems);
1947 return IRExpr_GetI( new_descr, ix, bias );
1952 /*------------------------------------------------------------*/
1953 /*--- Generating approximations for unknown operations, ---*/
1954 /*--- using lazy-propagate semantics ---*/
1955 /*------------------------------------------------------------*/
1957 /* Lazy propagation of undefinedness from two values, resulting in the
1958 specified shadow type.
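/* In outline (a sketch, not the exact IR emitted): the general
   recipe is

      result# = PCastTo(finalVty, UifU32(PCast32(va1), PCast32(va2)))

   i.e. squash each argument's definedness to "any bit undefined?",
   union the two, and smear the union across the final type. The
   special cases below avoid some of the PCasts when the types
   already line up. */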
1960 static
1961 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1963 IRAtom* at;
1964 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1965 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1966 tl_assert(isShadowAtom(mce,va1));
1967 tl_assert(isShadowAtom(mce,va2));
1969 /* The general case is inefficient because PCast is an expensive
1970 operation. Here are some special cases which use PCast only
1971 once rather than twice. */
1973 /* I64 x I64 -> I64 */
1974 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1975 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1976 at = mkUifU(mce, Ity_I64, va1, va2);
1977 at = mkPCastTo(mce, Ity_I64, at);
1978 return at;
1981 /* I64 x I64 -> I32 */
1982 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1983 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1984 at = mkUifU(mce, Ity_I64, va1, va2);
1985 at = mkPCastTo(mce, Ity_I32, at);
1986 return at;
1989 /* I32 x I32 -> I32 */
1990 if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1991 if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1992 at = mkUifU(mce, Ity_I32, va1, va2);
1993 at = mkPCastTo(mce, Ity_I32, at);
1994 return at;
1997 if (0) {
1998 VG_(printf)("mkLazy2 ");
1999 ppIRType(t1);
2000 VG_(printf)("_");
2001 ppIRType(t2);
2002 VG_(printf)("_");
2003 ppIRType(finalVty);
2004 VG_(printf)("\n");
2007 /* General case: force everything via 32-bit intermediaries. */
2008 at = mkPCastTo(mce, Ity_I32, va1);
2009 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2010 at = mkPCastTo(mce, finalVty, at);
2011 return at;
2015 /* 3-arg version of the above. */
2016 static
2017 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
2018 IRAtom* va1, IRAtom* va2, IRAtom* va3 )
2020 IRAtom* at;
2021 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2022 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2023 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2024 tl_assert(isShadowAtom(mce,va1));
2025 tl_assert(isShadowAtom(mce,va2));
2026 tl_assert(isShadowAtom(mce,va3));
2028 /* The general case is inefficient because PCast is an expensive
2029 operation. Here are some special cases which use PCast only
2030 twice rather than three times. */
2032 /* I32 x I64 x I64 -> I64 */
2033 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2034 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2035 && finalVty == Ity_I64) {
2036 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
2037 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2038 mode indication which is fully defined, this should get
2039 folded out later. */
2040 at = mkPCastTo(mce, Ity_I64, va1);
2041 /* Now fold in 2nd and 3rd args. */
2042 at = mkUifU(mce, Ity_I64, at, va2);
2043 at = mkUifU(mce, Ity_I64, at, va3);
2044 /* and PCast once again. */
2045 at = mkPCastTo(mce, Ity_I64, at);
2046 return at;
2049 /* I32 x I8 x I64 -> I64 */
2050 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
2051 && finalVty == Ity_I64) {
2052 if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
2053 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
2054 * rounding mode indication which is fully defined, this should
2055 * get folded out later.
2057 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2058 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2059 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2060 at = mkUifU(mce, Ity_I64, at, va3);
2061 /* and PCast once again. */
2062 at = mkPCastTo(mce, Ity_I64, at);
2063 return at;
2066 /* I32 x I64 x I64 -> I32 */
2067 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
2068 && finalVty == Ity_I32) {
2069 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
2070 at = mkPCastTo(mce, Ity_I64, va1);
2071 at = mkUifU(mce, Ity_I64, at, va2);
2072 at = mkUifU(mce, Ity_I64, at, va3);
2073 at = mkPCastTo(mce, Ity_I32, at);
2074 return at;
2077 /* I32 x I32 x I32 -> I32 */
2078 /* 32-bit FP idiom, as (eg) happens on ARM */
2079 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
2080 && finalVty == Ity_I32) {
2081 if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
2082 at = va1;
2083 at = mkUifU(mce, Ity_I32, at, va2);
2084 at = mkUifU(mce, Ity_I32, at, va3);
2085 at = mkPCastTo(mce, Ity_I32, at);
2086 return at;
2089 /* I32 x I16 x I16 -> I16 */
2090 /* 16-bit half-precision FP idiom, as (eg) happens on arm64 v8.2 onwards */
2091 if (t1 == Ity_I32 && t2 == Ity_I16 && t3 == Ity_I16
2092 && finalVty == Ity_I16) {
2093 if (0) VG_(printf)("mkLazy3: I32 x I16 x I16 -> I16\n");
2094 at = mkPCastTo(mce, Ity_I16, va1);
2095 at = mkUifU(mce, Ity_I16, at, va2);
2096 at = mkUifU(mce, Ity_I16, at, va3);
2097 at = mkPCastTo(mce, Ity_I16, at);
2098 return at;
2101 /* I32 x I128 x I128 -> I128 */
2102 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2103 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
2104 && finalVty == Ity_I128) {
2105 if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
2106 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2107 mode indication which is fully defined, this should get
2108 folded out later. */
2109 at = mkPCastTo(mce, Ity_I128, va1);
2110 /* Now fold in 2nd and 3rd args. */
2111 at = mkUifU(mce, Ity_I128, at, va2);
2112 at = mkUifU(mce, Ity_I128, at, va3);
2113 /* and PCast once again. */
2114 at = mkPCastTo(mce, Ity_I128, at);
2115 return at;
2118 /* I32 x I8 x I128 -> I128 */
2119 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2120 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
2121 && finalVty == Ity_I128) {
2122 if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
2123 /* Use I64 as an intermediate type, which means PCasting all 3
2124 args to I64 to start with. 1st arg is typically a rounding
2125 mode indication which is fully defined, so we hope that it
2126 will get folded out later. */
2127 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2128 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2129 IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
2130 /* Now UifU all three together. */
2131 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2132 at = mkUifU(mce, Ity_I64, at, at3); // ... `UifU` PCast(va3)
2133 /* and PCast once again. */
2134 at = mkPCastTo(mce, Ity_I128, at);
2135 return at;
2137 if (1) {
2138 VG_(printf)("mkLazy3: ");
2139 ppIRType(t1);
2140 VG_(printf)(" x ");
2141 ppIRType(t2);
2142 VG_(printf)(" x ");
2143 ppIRType(t3);
2144 VG_(printf)(" -> ");
2145 ppIRType(finalVty);
2146 VG_(printf)("\n");
2149 tl_assert(0);
2150 /* General case: force everything via 32-bit intermediaries. */
2152 at = mkPCastTo(mce, Ity_I32, va1);
2153 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2154 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2155 at = mkPCastTo(mce, finalVty, at);
2156 return at;
2161 /* 4-arg version of the above. */
2162 static
2163 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
2164 IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
2166 IRAtom* at;
2167 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2168 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2169 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2170 IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
2171 tl_assert(isShadowAtom(mce,va1));
2172 tl_assert(isShadowAtom(mce,va2));
2173 tl_assert(isShadowAtom(mce,va3));
2174 tl_assert(isShadowAtom(mce,va4));
2176 /* The general case is inefficient because PCast is an expensive
2177 operation. Here are some special cases which use fewer PCasts
2178 than the fully general scheme would. */
2180 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2182 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
2183 && finalVty == Ity_I128) {
2184 if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2185 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2186 mode indication which is fully defined, this should get
2187 folded out later. */
2188 at = mkPCastTo(mce, Ity_I128, va1);
2189 /* Now fold in 2nd, 3rd, 4th args. */
2190 at = mkUifU(mce, Ity_I128, at, va2);
2191 at = mkUifU(mce, Ity_I128, at, va3);
2192 at = mkUifU(mce, Ity_I128, at, va4);
2193 /* and PCast once again. */
2194 at = mkPCastTo(mce, Ity_I128, at);
2195 return at;
2198 /* I32 x I64 x I64 x I64 -> I64 */
2199 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
2200 && finalVty == Ity_I64) {
2201 if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2202 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2203 mode indication which is fully defined, this should get
2204 folded out later. */
2205 at = mkPCastTo(mce, Ity_I64, va1);
2206 /* Now fold in 2nd, 3rd, 4th args. */
2207 at = mkUifU(mce, Ity_I64, at, va2);
2208 at = mkUifU(mce, Ity_I64, at, va3);
2209 at = mkUifU(mce, Ity_I64, at, va4);
2210 /* and PCast once again. */
2211 at = mkPCastTo(mce, Ity_I64, at);
2212 return at;
2214 /* I32 x I32 x I32 x I32 -> I32 */
2215 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2216 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2217 && finalVty == Ity_I32) {
2218 if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2219 at = va1;
2220 /* Now fold in 2nd, 3rd, 4th args. */
2221 at = mkUifU(mce, Ity_I32, at, va2);
2222 at = mkUifU(mce, Ity_I32, at, va3);
2223 at = mkUifU(mce, Ity_I32, at, va4);
2224 at = mkPCastTo(mce, Ity_I32, at);
2225 return at;
2228 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2229 && finalVty == Ity_I32) {
2230 if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2231 at = mkPCastTo(mce, Ity_I8, va1);
2232 /* Now fold in 2nd, 3rd, 4th args. */
2233 at = mkUifU(mce, Ity_I8, at, va2);
2234 at = mkUifU(mce, Ity_I8, at, va3);
2235 at = mkUifU(mce, Ity_I8, at, va4);
2236 at = mkPCastTo(mce, Ity_I32, at);
2237 return at;
2240 if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2241 && finalVty == Ity_I64) {
2242 if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2243 at = mkPCastTo(mce, Ity_I8, va1);
2244 /* Now fold in 2nd, 3rd, 4th args. */
2245 at = mkUifU(mce, Ity_I8, at, va2);
2246 at = mkUifU(mce, Ity_I8, at, va3);
2247 at = mkUifU(mce, Ity_I8, at, va4);
2248 at = mkPCastTo(mce, Ity_I64, at);
2249 return at;
2252 if (1) {
2253 VG_(printf)("mkLazy4: ");
2254 ppIRType(t1);
2255 VG_(printf)(" x ");
2256 ppIRType(t2);
2257 VG_(printf)(" x ");
2258 ppIRType(t3);
2259 VG_(printf)(" x ");
2260 ppIRType(t4);
2261 VG_(printf)(" -> ");
2262 ppIRType(finalVty);
2263 VG_(printf)("\n");
2266 tl_assert(0);
2270 /* Do the lazy propagation game from a null-terminated vector of
2271 atoms. These are presumably the arguments to a helper call, so the
2272 IRCallee info is also supplied in order that we can know which
2273 arguments should be ignored (via the .mcx_mask field).
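/* For instance (purely illustrative): with a callee whose mcx_mask
   is 0x1, argument 0 is ignored entirely, and the result for the
   remaining args amounts to

      PCastTo(finalVtype, UifU(PCast(arg1#), PCast(arg2#), ...))

   where the merge type is I64 only if every considered arg is I64,
   and I32 otherwise. */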
2275 static
2276 IRAtom* mkLazyN ( MCEnv* mce,
2277 IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2279 Int i;
2280 IRAtom* here;
2281 IRAtom* curr;
2282 IRType mergeTy;
2283 Bool mergeTy64 = True;
2285 /* Decide on the type of the merge intermediary. If all relevant
2286 args are I64, then it's I64. In all other circumstances, use
2287 I32. */
2288 for (i = 0; exprvec[i]; i++) {
2289 tl_assert(i < 32);
2290 tl_assert(isOriginalAtom(mce, exprvec[i]));
2291 if (cee->mcx_mask & (1<<i))
2292 continue;
2293 if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2294 mergeTy64 = False;
2297 mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
2298 curr = definedOfType(mergeTy);
2300 for (i = 0; exprvec[i]; i++) {
2301 tl_assert(i < 32);
2302 tl_assert(isOriginalAtom(mce, exprvec[i]));
2303 /* Only take notice of this arg if the callee's mc-exclusion
2304 mask does not say it is to be excluded. */
2305 if (cee->mcx_mask & (1<<i)) {
2306 /* the arg is to be excluded from definedness checking. Do
2307 nothing. */
2308 if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2309 } else {
2310 /* calculate the arg's definedness, and pessimistically merge
2311 it in. */
2312 here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2313 curr = mergeTy64
2314 ? mkUifU64(mce, here, curr)
2315 : mkUifU32(mce, here, curr);
2318 return mkPCastTo(mce, finalVtype, curr );
2322 /*------------------------------------------------------------*/
2323 /*--- Generating expensive sequences for exact carry-chain ---*/
2324 /*--- propagation in add/sub and related operations. ---*/
2325 /*------------------------------------------------------------*/
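/* Informally, the idea behind expensiveAddSub below: a_min/a_max are
   the operand with its undefined bits forced to 0 and to 1
   respectively, i.e. its smallest and largest possible unsigned
   values. The two extreme results are XORed, and a result bit is
   flagged undefined if the extremes disagree there, or if that bit
   is undefined in either input (the qaa|qbb term). This is a sketch
   of the rationale, not a formal argument. */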
2327 static
2328 IRAtom* expensiveAddSub ( MCEnv* mce,
2329 Bool add,
2330 IRType ty,
2331 IRAtom* qaa, IRAtom* qbb,
2332 IRAtom* aa, IRAtom* bb )
2334 IRAtom *a_min, *b_min, *a_max, *b_max;
2335 IROp opAND, opOR, opXOR, opNOT, opADD, opSUB;
2337 tl_assert(isShadowAtom(mce,qaa));
2338 tl_assert(isShadowAtom(mce,qbb));
2339 tl_assert(isOriginalAtom(mce,aa));
2340 tl_assert(isOriginalAtom(mce,bb));
2341 tl_assert(sameKindedAtoms(qaa,aa));
2342 tl_assert(sameKindedAtoms(qbb,bb));
2344 switch (ty) {
2345 case Ity_I32:
2346 opAND = Iop_And32;
2347 opOR = Iop_Or32;
2348 opXOR = Iop_Xor32;
2349 opNOT = Iop_Not32;
2350 opADD = Iop_Add32;
2351 opSUB = Iop_Sub32;
2352 break;
2353 case Ity_I64:
2354 opAND = Iop_And64;
2355 opOR = Iop_Or64;
2356 opXOR = Iop_Xor64;
2357 opNOT = Iop_Not64;
2358 opADD = Iop_Add64;
2359 opSUB = Iop_Sub64;
2360 break;
2361 default:
2362 VG_(tool_panic)("expensiveAddSub");
2365 // a_min = aa & ~qaa
2366 a_min = assignNew('V', mce,ty,
2367 binop(opAND, aa,
2368 assignNew('V', mce,ty, unop(opNOT, qaa))));
2370 // b_min = bb & ~qbb
2371 b_min = assignNew('V', mce,ty,
2372 binop(opAND, bb,
2373 assignNew('V', mce,ty, unop(opNOT, qbb))));
2375 // a_max = aa | qaa
2376 a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2378 // b_max = bb | qbb
2379 b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2381 if (add) {
2382 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2383 return
2384 assignNew('V', mce,ty,
2385 binop( opOR,
2386 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2387 assignNew('V', mce,ty,
2388 binop( opXOR,
2389 assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2390 assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2395 } else {
2396 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2397 return
2398 assignNew('V', mce,ty,
2399 binop( opOR,
2400 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2401 assignNew('V', mce,ty,
2402 binop( opXOR,
2403 assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2404 assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
2414 static
2415 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2416 IRAtom* atom, IRAtom* vatom )
2418 IRType ty;
2419 IROp xorOp, subOp, andOp;
2420 IRExpr *one;
2421 IRAtom *improver, *improved;
2422 tl_assert(isShadowAtom(mce,vatom));
2423 tl_assert(isOriginalAtom(mce,atom));
2424 tl_assert(sameKindedAtoms(atom,vatom));
2426 switch (czop) {
2427 case Iop_Ctz32: case Iop_CtzNat32:
2428 ty = Ity_I32;
2429 xorOp = Iop_Xor32;
2430 subOp = Iop_Sub32;
2431 andOp = Iop_And32;
2432 one = mkU32(1);
2433 break;
2434 case Iop_Ctz64: case Iop_CtzNat64:
2435 ty = Ity_I64;
2436 xorOp = Iop_Xor64;
2437 subOp = Iop_Sub64;
2438 andOp = Iop_And64;
2439 one = mkU64(1);
2440 break;
2441 default:
2442 ppIROp(czop);
2443 VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2446 // improver = atom ^ (atom - 1)
2448 // That is, improver has its low ctz(atom)+1 bits equal to one;
2449 // higher bits (if any) equal to zero. So it's exactly the right
2450 // mask to use to remove the irrelevant undefined input bits.
2451 /* Here are some examples:
2452 atom = U...U 1 0...0
2453 atom-1 = U...U 0 1...1
2454 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2455 actually influence the result
2456 A boundary case
2457 atom = 0...0
2458 atom-1 = 1...1
2459 ^ed = 11111, also a correct mask for the input: all input bits
2460 are relevant
2461 Another boundary case
2462 atom = 1..1 1
2463 atom-1 = 1..1 0
2464 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2465 is relevant
2466 Now with misc U bits interspersed:
2467 atom = U...U 1 0 U...U 0 1 0...0
2468 atom-1 = U...U 1 0 U...U 0 0 1...1
2469 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2470 (Per re-check/analysis of 14 Nov 2018)
2472 improver = assignNew('V', mce,ty,
2473 binop(xorOp,
2474 atom,
2475 assignNew('V', mce, ty,
2476 binop(subOp, atom, one))));
2478 // improved = vatom & improver
2480 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2481 // bits as "defined".
2482 improved = assignNew('V', mce, ty,
2483 binop(andOp, vatom, improver));
2485 // Return pessimizing cast of improved.
2486 return mkPCastTo(mce, ty, improved);
2489 static
2490 IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
2491 IRAtom* atom, IRAtom* vatom )
2493 IRType ty;
2494 IROp shrOp, notOp, andOp;
2495 IRAtom* (*mkRight)(MCEnv*, IRAtom*);
2496 IRAtom *improver, *improved;
2497 tl_assert(isShadowAtom(mce,vatom));
2498 tl_assert(isOriginalAtom(mce,atom));
2499 tl_assert(sameKindedAtoms(atom,vatom));
2501 switch (czop) {
2502 case Iop_Clz32: case Iop_ClzNat32:
2503 ty = Ity_I32;
2504 shrOp = Iop_Shr32;
2505 notOp = Iop_Not32;
2506 andOp = Iop_And32;
2507 mkRight = mkRight32;
2508 break;
2509 case Iop_Clz64: case Iop_ClzNat64:
2510 ty = Ity_I64;
2511 shrOp = Iop_Shr64;
2512 notOp = Iop_Not64;
2513 andOp = Iop_And64;
2514 mkRight = mkRight64;
2515 break;
2516 default:
2517 ppIROp(czop);
2518 VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
2521 // This is in principle very similar to how expensiveCountTrailingZeroes
2522 // works. That function computed an "improver", which it used to mask
2523 // off all but the rightmost 1-bit and the zeroes to the right of it,
2524 // hence removing irrelevant bits from the input. Here, we play the
2525 // exact same game but with the left-vs-right roles interchanged.
2526 // Unfortunately calculation of the improver in this case is
2527 // significantly more expensive.
2529 // improver = ~(RIGHT(atom) >>u 1)
2531 // That is, improver has its upper clz(atom)+1 bits equal to one;
2532 // lower bits (if any) equal to zero. So it's exactly the right
2533 // mask to use to remove the irrelevant undefined input bits.
2534 /* Here are some examples:
2535 atom = 0...0 1 U...U
2536 R(atom) = 0...0 1 1...1
2537 R(atom) >>u 1 = 0...0 0 1...1
2538 ~(R(atom) >>u 1) = 1...1 1 0...0
2539 which correctly describes which bits of |atom|
2540 actually influence the result
2541 A boundary case
2542 atom = 0...0
2543 R(atom) = 0...0
2544 R(atom) >>u 1 = 0...0
2545 ~(R(atom) >>u 1) = 1...1
2546 also a correct mask for the input: all input bits
2547 are relevant
2548 Another boundary case
2549 atom = 1 1..1
2550 R(atom) = 1 1..1
2551 R(atom) >>u 1 = 0 1..1
2552 ~(R(atom) >>u 1) = 1 0..0
2553 also a correct mask: only the leftmost input bit
2554 is relevant
2555 Now with misc U bits interspersed:
2556 atom = 0...0 1 U...U 0 1 U...U
2557 R(atom) = 0...0 1 1...1 1 1 1...1
2558 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2559 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2560 (Per initial implementation of 15 Nov 2018)
2562 improver = mkRight(mce, atom);
2563 improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
2564 improver = assignNew('V', mce, ty, unop(notOp, improver));
2566 // improved = vatom & improver
2568 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2569 // bits as "defined".
2570 improved = assignNew('V', mce, ty,
2571 binop(andOp, vatom, improver));
2573 // Return pessimizing cast of improved.
2574 return mkPCastTo(mce, ty, improved);
2578 /*------------------------------------------------------------*/
2579 /*--- Scalar shifts. ---*/
2580 /*------------------------------------------------------------*/
2582 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2583 idea is to shift the definedness bits by the original shift amount.
2584 This introduces 0s ("defined") in new positions for left shifts and
2585 unsigned right shifts, and copies the top definedness bit for
2586 signed right shifts. So, conveniently, applying the original shift
2587 operator to the definedness bits for the left arg is exactly the
2588 right thing to do:
2590 (qaa << bb)
2592 However if the shift amount is undefined then the whole result
2593 is undefined. Hence need:
2595 (qaa << bb) `UifU` PCast(qbb)
2597 If the shift amount bb is a literal then qbb will say 'all defined'
2598 and the UifU and PCast will get folded out by post-instrumentation
2599 optimisation.
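/* Concretely (a sketch): for Iop_Shl32 the shadow result is

      UifU32( Shl32(qaa, bb), PCastTo-Ity_I32(qbb) )

   -- shift the V bits by the actual shift amount, then smear in
   "all undefined" if the shift amount itself is undefined. */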
2601 static IRAtom* scalarShift ( MCEnv* mce,
2602 IRType ty,
2603 IROp original_op,
2604 IRAtom* qaa, IRAtom* qbb,
2605 IRAtom* aa, IRAtom* bb )
2607 tl_assert(isShadowAtom(mce,qaa));
2608 tl_assert(isShadowAtom(mce,qbb));
2609 tl_assert(isOriginalAtom(mce,aa));
2610 tl_assert(isOriginalAtom(mce,bb));
2611 tl_assert(sameKindedAtoms(qaa,aa));
2612 tl_assert(sameKindedAtoms(qbb,bb));
2613 return
2614 assignNew(
2615 'V', mce, ty,
2616 mkUifU( mce, ty,
2617 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2618 mkPCastTo(mce, ty, qbb)
2624 /*------------------------------------------------------------*/
2625 /*--- Helpers for dealing with vector primops. ---*/
2626 /*------------------------------------------------------------*/
2628 /* Vector pessimisation -- pessimise within each lane individually. */
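/* For example, mkPCast8x16 maps each 8-bit lane of a V128 shadow
   value to 0x00 if the lane is fully defined and to 0xFF if any bit
   in it is undefined (via Iop_CmpNEZ8x16); the other helpers do the
   same at their respective lane widths and vector sizes. */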
2630 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2632 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2635 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2637 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2640 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2642 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2645 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2647 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2650 static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
2652 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
2655 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2657 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2660 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2662 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2665 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2667 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2670 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2672 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2675 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2677 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2680 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2682 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2685 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2687 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2690 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2692 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2695 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2697 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2701 /* Here's a simple scheme capable of handling ops derived from SSE1
2702 code and while only generating ops that can be efficiently
2703 implemented in SSE1. */
2705 /* All-lanes versions are straightforward:
2707 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2709 unary32Fx4(x) ==> PCast32x4(x#)
2711 Lowest-lane-only versions are more complex:
2713 binary32F0x4(x,y) ==> SetV128lo32(
2714 x#,
2715 PCast32(V128to32(UifUV128(x#,y#)))
2718 This is perhaps not so obvious. In particular, it's faster to
2719 do a V128-bit UifU and then take the bottom 32 bits than the more
2720 obvious scheme of taking the bottom 32 bits of each operand
2721 and doing a 32-bit UifU. Basically since UifU is fast and
2722 chopping lanes off vector values is slow.
2724 Finally:
2726 unary32F0x4(x) ==> SetV128lo32(
2727 x#,
2728 PCast32(V128to32(x#))
2731 Where:
2733 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2734 PCast32x4(v#) = CmpNEZ32x4(v#)
2737 static
2738 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2740 IRAtom* at;
2741 tl_assert(isShadowAtom(mce, vatomX));
2742 tl_assert(isShadowAtom(mce, vatomY));
2743 at = mkUifUV128(mce, vatomX, vatomY);
2744 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2745 return at;
2748 static
2749 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2751 IRAtom* at;
2752 tl_assert(isShadowAtom(mce, vatomX));
2753 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2754 return at;
2757 static
2758 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2760 IRAtom* at;
2761 tl_assert(isShadowAtom(mce, vatomX));
2762 tl_assert(isShadowAtom(mce, vatomY));
2763 at = mkUifUV128(mce, vatomX, vatomY);
2764 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2765 at = mkPCastTo(mce, Ity_I32, at);
2766 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2767 return at;
2770 static
2771 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2773 IRAtom* at;
2774 tl_assert(isShadowAtom(mce, vatomX));
2775 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2776 at = mkPCastTo(mce, Ity_I32, at);
2777 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2778 return at;
2781 /* --- ... and ... 64Fx2 versions of the same ... --- */
2783 static
2784 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2786 IRAtom* at;
2787 tl_assert(isShadowAtom(mce, vatomX));
2788 tl_assert(isShadowAtom(mce, vatomY));
2789 at = mkUifUV128(mce, vatomX, vatomY);
2790 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2791 return at;
2794 static
2795 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2797 IRAtom* at;
2798 tl_assert(isShadowAtom(mce, vatomX));
2799 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2800 return at;
2803 static
2804 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2806 IRAtom* at;
2807 tl_assert(isShadowAtom(mce, vatomX));
2808 tl_assert(isShadowAtom(mce, vatomY));
2809 at = mkUifUV128(mce, vatomX, vatomY);
2810 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2811 at = mkPCastTo(mce, Ity_I64, at);
2812 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2813 return at;
2816 static
2817 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2819 IRAtom* at;
2820 tl_assert(isShadowAtom(mce, vatomX));
2821 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2822 at = mkPCastTo(mce, Ity_I64, at);
2823 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2824 return at;
2827 /* --- --- ... and ... 16Fx8 versions of the same --- --- */
2829 static
2830 IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2832 IRAtom* at;
2833 tl_assert(isShadowAtom(mce, vatomX));
2834 tl_assert(isShadowAtom(mce, vatomY));
2835 at = mkUifUV128(mce, vatomX, vatomY);
2836 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at));
2837 return at;
2840 static
2841 IRAtom* unary16Fx8 ( MCEnv* mce, IRAtom* vatomX )
2843 IRAtom* at;
2844 tl_assert(isShadowAtom(mce, vatomX));
2845 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, vatomX));
2846 return at;
2849 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2850 implemented.
2853 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2855 static
2856 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2858 IRAtom* at;
2859 tl_assert(isShadowAtom(mce, vatomX));
2860 tl_assert(isShadowAtom(mce, vatomY));
2861 at = mkUifU64(mce, vatomX, vatomY);
2862 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2863 return at;
2866 static
2867 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2869 IRAtom* at;
2870 tl_assert(isShadowAtom(mce, vatomX));
2871 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2872 return at;
2875 /* --- ... and ... 64Fx4 versions of the same ... --- */
2877 static
2878 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2880 IRAtom* at;
2881 tl_assert(isShadowAtom(mce, vatomX));
2882 tl_assert(isShadowAtom(mce, vatomY));
2883 at = mkUifUV256(mce, vatomX, vatomY);
2884 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2885 return at;
2888 static
2889 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2891 IRAtom* at;
2892 tl_assert(isShadowAtom(mce, vatomX));
2893 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2894 return at;
2897 /* --- ... and ... 32Fx8 versions of the same ... --- */
2899 static
2900 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2902 IRAtom* at;
2903 tl_assert(isShadowAtom(mce, vatomX));
2904 tl_assert(isShadowAtom(mce, vatomY));
2905 at = mkUifUV256(mce, vatomX, vatomY);
2906 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2907 return at;
2910 static
2911 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2913 IRAtom* at;
2914 tl_assert(isShadowAtom(mce, vatomX));
2915 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2916 return at;
2919 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2921 static
2922 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2923 IRAtom* vatomX, IRAtom* vatomY )
2925 /* This is the same as binary64Fx2, except that we subsequently
2926 pessimise vRM (definedness of the rounding mode), widen to 128
2927 bits and UifU it into the result. As with the scalar cases, if
2928 the RM is a constant then it is defined and so this extra bit
2929 will get constant-folded out later. */
2930 // "do" the vector args
2931 IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2932 // PCast the RM, and widen it to 128 bits
2933 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2934 // Roll it into the result
2935 t1 = mkUifUV128(mce, t1, t2);
2936 return t1;
2939 /* --- ... and ... 32Fx4 versions of the same --- */
2941 static
2942 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2943 IRAtom* vatomX, IRAtom* vatomY )
2945 IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2946 // PCast the RM, and widen it to 128 bits
2947 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2948 // Roll it into the result
2949 t1 = mkUifUV128(mce, t1, t2);
2950 return t1;
2953 /* --- ... and ... 64Fx4 versions of the same --- */
2955 static
2956 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2957 IRAtom* vatomX, IRAtom* vatomY )
2959 IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2960 // PCast the RM, and widen it to 256 bits
2961 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2962 // Roll it into the result
2963 t1 = mkUifUV256(mce, t1, t2);
2964 return t1;
2967 /* --- ... and ... 16Fx8 versions of the same --- */
2969 static
2970 IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2971 IRAtom* vatomX, IRAtom* vatomY )
2973 IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY);
2974 // PCast the RM, and widen it to 128 bits
2975 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2976 // Roll it into the result
2977 t1 = mkUifUV128(mce, t1, t2);
2978 return t1;
2981 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2982 implemented.
2985 /* --- ... and ... 32Fx8 versions of the same --- */
2987 static
2988 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2989 IRAtom* vatomX, IRAtom* vatomY )
2991 IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2992 // PCast the RM, and widen it to 256 bits
2993 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2994 // Roll it into the result
2995 t1 = mkUifUV256(mce, t1, t2);
2996 return t1;
2999 /* --- 64Fx2 unary FP ops, with rounding mode --- */
3001 static
3002 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3004 /* Same scheme as binary64Fx2_w_rm. */
3005 // "do" the vector arg
3006 IRAtom* t1 = unary64Fx2(mce, vatomX);
3007 // PCast the RM, and widen it to 128 bits
3008 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3009 // Roll it into the result
3010 t1 = mkUifUV128(mce, t1, t2);
3011 return t1;
3014 /* --- ... and ... 32Fx4 versions of the same --- */
3016 static
3017 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3019 /* Same scheme as unary64Fx2_w_rm. */
3020 IRAtom* t1 = unary32Fx4(mce, vatomX);
3021 // PCast the RM, and widen it to 128 bits
3022 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3023 // Roll it into the result
3024 t1 = mkUifUV128(mce, t1, t2);
3025 return t1;
3028 /* --- ... and ... 16Fx8 versions of the same --- */
3030 static
3031 IRAtom* unary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3033 /* Same scheme as unary64Fx2_w_rm. */
3034 IRAtom* t1 = unary16Fx8(mce, vatomX);
3035 // PCast the RM, and widen it to 128 bits
3036 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
3037 // Roll it into the result
3038 t1 = mkUifUV128(mce, t1, t2);
3039 return t1;
3042 /* --- ... and ... 32Fx8 versions of the same --- */
3044 static
3045 IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
3047 /* Same scheme as unary32Fx4_w_rm, but widened to 256 bits. */
3048 IRAtom* t1 = unary32Fx8(mce, vatomX);
3049 // PCast the RM, and widen it to 256 bits
3050 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
3051 // Roll it into the result
3052 t1 = mkUifUV256(mce, t1, t2);
3053 return t1;
3057 /* --- --- Vector saturated narrowing --- --- */
3059 /* We used to do something very clever here, but on closer inspection
3060 (2011-Jun-15), and in particular bug #279698, it turns out to be
3061 wrong. Part of the problem came from the fact that for a long
3062 time, the IR primops to do with saturated narrowing were
3063 underspecified and managed to confuse multiple cases which needed
3064 to be separate: the op names had a signedness qualifier, but in
3065 fact the source and destination signednesses needed to be specified
3066 independently, so the op names really need two independent
3067 signedness specifiers.
3069 As of 2011-Jun-15 (ish) the underspecification was sorted out
3070 properly. The incorrect instrumentation remained, though. That
3071 has now (2011-Oct-22) been fixed.
3073 What we now do is simple:
3075 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
3076 number of lanes, X is the source lane width and signedness, and Y
3077 is the destination lane width and signedness. In all cases the
3078 destination lane width is half the source lane width, so the names
3079 have a bit of redundancy, but are at least easy to read.
3081 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
3082 to unsigned 16s.
3084 Let Vanilla(OP) be a function that takes OP, one of these
3085 saturating narrowing ops, and produces the same "shaped" narrowing
3086 op which is not saturating, but merely dumps the most significant
3087 bits. "same shape" means that the lane numbers and widths are the
3088 same as with OP.
3090 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
3091 = Iop_NarrowBin32to16x8,
3092 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
3093 dumping the top half of each lane.
3095 So, with that in place, the scheme is simple, and it is simple to
3096 pessimise each lane individually and then apply Vanilla(OP) so as
3097 to get the result in the right "shape". If the original OP is
3098 QNarrowBinXtoYxZ then we produce
3100 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
3102 or for the case when OP is unary (Iop_QNarrowUn*)
3104 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
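/* A concrete instance (illustrative): for Iop_QNarrowBin32Sto16Ux8
   the code below computes

      NarrowBin32to16x8( CmpNEZ32x4(vatom1), CmpNEZ32x4(vatom2) )

   i.e. each 32-bit source lane is pessimised to all-0s or all-1s and
   the vanilla narrowing then delivers correctly shaped 16-bit shadow
   lanes. */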
3106 static
3107 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
3109 switch (qnarrowOp) {
3110 /* Binary: (128, 128) -> 128 */
3111 case Iop_QNarrowBin16Sto8Ux16:
3112 case Iop_QNarrowBin16Sto8Sx16:
3113 case Iop_QNarrowBin16Uto8Ux16:
3114 case Iop_QNarrowBin64Sto32Sx4:
3115 case Iop_QNarrowBin64Uto32Ux4:
3116 return Iop_NarrowBin16to8x16;
3117 case Iop_QNarrowBin32Sto16Ux8:
3118 case Iop_QNarrowBin32Sto16Sx8:
3119 case Iop_QNarrowBin32Uto16Ux8:
3120 return Iop_NarrowBin32to16x8;
3121 /* Binary: (64, 64) -> 64 */
3122 case Iop_QNarrowBin32Sto16Sx4:
3123 return Iop_NarrowBin32to16x4;
3124 case Iop_QNarrowBin16Sto8Ux8:
3125 case Iop_QNarrowBin16Sto8Sx8:
3126 return Iop_NarrowBin16to8x8;
3127 /* Unary: 128 -> 64 */
3128 case Iop_QNarrowUn64Uto32Ux2:
3129 case Iop_QNarrowUn64Sto32Sx2:
3130 case Iop_QNarrowUn64Sto32Ux2:
3131 return Iop_NarrowUn64to32x2;
3132 case Iop_QNarrowUn32Uto16Ux4:
3133 case Iop_QNarrowUn32Sto16Sx4:
3134 case Iop_QNarrowUn32Sto16Ux4:
3135 case Iop_F32toF16x4_DEP:
3136 return Iop_NarrowUn32to16x4;
3137 case Iop_QNarrowUn16Uto8Ux8:
3138 case Iop_QNarrowUn16Sto8Sx8:
3139 case Iop_QNarrowUn16Sto8Ux8:
3140 return Iop_NarrowUn16to8x8;
3141 default:
3142 ppIROp(qnarrowOp);
3143 VG_(tool_panic)("vanillaNarrowOpOfShape");
3147 static
3148 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
3149 IRAtom* vatom1, IRAtom* vatom2)
3151 IRAtom *at1, *at2, *at3;
3152 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3153 switch (narrow_op) {
3154 case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
3155 case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
3156 case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
3157 case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
3158 case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
3159 case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
3160 case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
3161 case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
3162 default: VG_(tool_panic)("vectorNarrowBinV128");
3164 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3165 tl_assert(isShadowAtom(mce,vatom1));
3166 tl_assert(isShadowAtom(mce,vatom2));
3167 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3168 at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
3169 at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
3170 return at3;
3173 static
3174 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
3175 IRAtom* vatom1, IRAtom* vatom2)
3177 IRAtom *at1, *at2, *at3;
3178 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3179 switch (narrow_op) {
3180 case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
3181 case Iop_QNarrowBin16Sto8Sx8: pcast = mkPCast16x4; break;
3182 case Iop_QNarrowBin16Sto8Ux8: pcast = mkPCast16x4; break;
3183 default: VG_(tool_panic)("vectorNarrowBin64");
3185 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3186 tl_assert(isShadowAtom(mce,vatom1));
3187 tl_assert(isShadowAtom(mce,vatom2));
3188 at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
3189 at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
3190 at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
3191 return at3;
3194 static
3195 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
3196 IRAtom* vatom1)
3198 IRAtom *at1, *at2;
3199 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3200 tl_assert(isShadowAtom(mce,vatom1));
3201 /* For vanilla narrowing (non-saturating), we can just apply
3202 the op directly to the V bits. */
3203 switch (narrow_op) {
3204 case Iop_NarrowUn16to8x8:
3205 case Iop_NarrowUn32to16x4:
3206 case Iop_NarrowUn64to32x2:
3207 case Iop_F32toF16x4_DEP:
3208 at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
3209 return at1;
3210 default:
3211 break; /* Do Plan B */
3213 /* Plan B: for ops that involve a saturation operation on the args,
3214 we must PCast before the vanilla narrow. */
3215 switch (narrow_op) {
3216 case Iop_QNarrowUn16Sto8Sx8: pcast = mkPCast16x8; break;
3217 case Iop_QNarrowUn16Sto8Ux8: pcast = mkPCast16x8; break;
3218 case Iop_QNarrowUn16Uto8Ux8: pcast = mkPCast16x8; break;
3219 case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
3220 case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
3221 case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
3222 case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
3223 case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
3224 case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
3225 default: VG_(tool_panic)("vectorNarrowUnV128");
3227 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3228 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3229 at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
3230 return at2;
3233 static
3234 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
3235 IRAtom* vatom1)
3237 IRAtom *at1, *at2;
3238 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3239 switch (longen_op) {
3240 case Iop_Widen8Uto16x8: pcast = mkPCast16x8; break;
3241 case Iop_Widen8Sto16x8: pcast = mkPCast16x8; break;
3242 case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
3243 case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
3244 case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
3245 case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
3246 case Iop_F16toF32x4: pcast = mkPCast32x4; break;
3247 default: VG_(tool_panic)("vectorWidenI64");
3249 tl_assert(isShadowAtom(mce,vatom1));
3250 at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
3251 at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
3252 return at2;
3256 /* --- --- Vector integer arithmetic --- --- */
3258 /* Simple ... UifU the args and per-lane pessimise the results. */
3260 /* --- V256-bit versions --- */
3262 static
3263 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3265 IRAtom* at;
3266 at = mkUifUV256(mce, vatom1, vatom2);
3267 at = mkPCast8x32(mce, at);
3268 return at;
3271 static
3272 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3274 IRAtom* at;
3275 at = mkUifUV256(mce, vatom1, vatom2);
3276 at = mkPCast16x16(mce, at);
3277 return at;
3280 static
3281 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3283 IRAtom* at;
3284 at = mkUifUV256(mce, vatom1, vatom2);
3285 at = mkPCast32x8(mce, at);
3286 return at;
3289 static
3290 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3292 IRAtom* at;
3293 at = mkUifUV256(mce, vatom1, vatom2);
3294 at = mkPCast64x4(mce, at);
3295 return at;
3298 /* --- V128-bit versions --- */
3300 static
3301 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3303 IRAtom* at;
3304 at = mkUifUV128(mce, vatom1, vatom2);
3305 at = mkPCast8x16(mce, at);
3306 return at;
3309 static
3310 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3312 IRAtom* at;
3313 at = mkUifUV128(mce, vatom1, vatom2);
3314 at = mkPCast16x8(mce, at);
3315 return at;
3318 static
3319 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3321 IRAtom* at;
3322 at = mkUifUV128(mce, vatom1, vatom2);
3323 at = mkPCast32x4(mce, at);
3324 return at;
3327 static
3328 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3330 IRAtom* at;
3331 at = mkUifUV128(mce, vatom1, vatom2);
3332 at = mkPCast64x2(mce, at);
3333 return at;
3336 static
3337 IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3339 IRAtom* at;
3340 at = mkUifUV128(mce, vatom1, vatom2);
3341 at = mkPCast128x1(mce, at);
3342 return at;
3345 /* --- 64-bit versions --- */
3347 static
3348 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3350 IRAtom* at;
3351 at = mkUifU64(mce, vatom1, vatom2);
3352 at = mkPCast8x8(mce, at);
3353 return at;
3356 static
3357 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3359 IRAtom* at;
3360 at = mkUifU64(mce, vatom1, vatom2);
3361 at = mkPCast16x4(mce, at);
3362 return at;
3365 static
3366 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3368 IRAtom* at;
3369 at = mkUifU64(mce, vatom1, vatom2);
3370 at = mkPCast32x2(mce, at);
3371 return at;
3374 static
3375 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3377 IRAtom* at;
3378 at = mkUifU64(mce, vatom1, vatom2);
3379 at = mkPCastTo(mce, Ity_I64, at);
3380 return at;
3383 /* --- 32-bit versions --- */
3385 static
3386 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3388 IRAtom* at;
3389 at = mkUifU32(mce, vatom1, vatom2);
3390 at = mkPCast8x4(mce, at);
3391 return at;
3394 static
3395 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3397 IRAtom* at;
3398 at = mkUifU32(mce, vatom1, vatom2);
3399 at = mkPCast16x2(mce, at);
3400 return at;
3404 /*------------------------------------------------------------*/
3405 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3406 /*------------------------------------------------------------*/
3408 static
3409 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3410 IROp op,
3411 IRAtom* atom1, IRAtom* atom2,
3412 IRAtom* atom3, IRAtom* atom4 )
3414 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3415 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3416 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3417 IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3419 tl_assert(isOriginalAtom(mce,atom1));
3420 tl_assert(isOriginalAtom(mce,atom2));
3421 tl_assert(isOriginalAtom(mce,atom3));
3422 tl_assert(isOriginalAtom(mce,atom4));
3423 tl_assert(isShadowAtom(mce,vatom1));
3424 tl_assert(isShadowAtom(mce,vatom2));
3425 tl_assert(isShadowAtom(mce,vatom3));
3426 tl_assert(isShadowAtom(mce,vatom4));
3427 tl_assert(sameKindedAtoms(atom1,vatom1));
3428 tl_assert(sameKindedAtoms(atom2,vatom2));
3429 tl_assert(sameKindedAtoms(atom3,vatom3));
3430 tl_assert(sameKindedAtoms(atom4,vatom4));
3431 switch (op) {
3432 case Iop_MAddF64:
3433 case Iop_MAddF64r32:
3434 case Iop_MSubF64:
3435 case Iop_MSubF64r32:
3436 /* I32(rm) x F64 x F64 x F64 -> F64 */
3437 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3439 case Iop_MAddF32:
3440 case Iop_MSubF32:
3441 /* I32(rm) x F32 x F32 x F32 -> F32 */
3442 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3444 case Iop_MAddF128:
3445 case Iop_MSubF128:
3446 case Iop_NegMAddF128:
3447 case Iop_NegMSubF128:
3448 /* I32(rm) x F128 x F128 x F128 -> F128 */
3449 return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3451 /* V256-bit data-steering */
3452 case Iop_64x4toV256:
3453 return assignNew('V', mce, Ity_V256,
3454 IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3456 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3457 case Iop_Rotx32:
3458 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3459 case Iop_Rotx64:
3460 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3461 default:
3462 ppIROp(op);
3463 VG_(tool_panic)("memcheck:expr2vbits_Qop");
3468 static
3469 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3470 IROp op,
3471 IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3473 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3474 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3475 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3477 tl_assert(isOriginalAtom(mce,atom1));
3478 tl_assert(isOriginalAtom(mce,atom2));
3479 tl_assert(isOriginalAtom(mce,atom3));
3480 tl_assert(isShadowAtom(mce,vatom1));
3481 tl_assert(isShadowAtom(mce,vatom2));
3482 tl_assert(isShadowAtom(mce,vatom3));
3483 tl_assert(sameKindedAtoms(atom1,vatom1));
3484 tl_assert(sameKindedAtoms(atom2,vatom2));
3485 tl_assert(sameKindedAtoms(atom3,vatom3));
3486 switch (op) {
3487 case Iop_AddF128:
3488 case Iop_SubF128:
3489 case Iop_MulF128:
3490 case Iop_DivF128:
3491 case Iop_AddD128:
3492 case Iop_SubD128:
3493 case Iop_MulD128:
3494 case Iop_DivD128:
3495 case Iop_QuantizeD128:
3496 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3497 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3498 case Iop_AddF64:
3499 case Iop_AddD64:
3500 case Iop_AddF64r32:
3501 case Iop_SubF64:
3502 case Iop_SubD64:
3503 case Iop_SubF64r32:
3504 case Iop_MulF64:
3505 case Iop_MulD64:
3506 case Iop_MulF64r32:
3507 case Iop_DivF64:
3508 case Iop_DivD64:
3509 case Iop_DivF64r32:
3510 case Iop_ScaleF64:
3511 case Iop_Yl2xF64:
3512 case Iop_Yl2xp1F64:
3513 case Iop_AtanF64:
3514 case Iop_PRemF64:
3515 case Iop_PRem1F64:
3516 case Iop_QuantizeD64:
3517 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3518 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3519 case Iop_PRemC3210F64:
3520 case Iop_PRem1C3210F64:
3521 /* I32(rm) x F64 x F64 -> I32 */
3522 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3523 case Iop_AddF32:
3524 case Iop_SubF32:
3525 case Iop_MulF32:
3526 case Iop_DivF32:
3527 /* I32(rm) x F32 x F32 -> F32 */
3528 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3529 case Iop_AddF16:
3530 case Iop_SubF16:
3531 /* I32(rm) x F16 x F16 -> F16 */
3532 return mkLazy3(mce, Ity_I16, vatom1, vatom2, vatom3);
3533 case Iop_SignificanceRoundD64:
3534 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3535 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3536 case Iop_SignificanceRoundD128:
3537 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3538 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3539 case Iop_SliceV128:
3540 /* (V128, V128, I8) -> V128 */
3541 complainIfUndefined(mce, atom3, NULL);
3542 return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3543 case Iop_Slice64:
3544 /* (I64, I64, I8) -> I64 */
3545 complainIfUndefined(mce, atom3, NULL);
3546 return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3547 case Iop_SetElem8x8:
3548 case Iop_SetElem16x4:
3549 case Iop_SetElem32x2:
3550 complainIfUndefined(mce, atom2, NULL);
3551 return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3553 case Iop_SetElem8x16:
3554 case Iop_SetElem16x8:
3555 case Iop_SetElem32x4:
3556 case Iop_SetElem64x2:
3557 complainIfUndefined(mce, atom2, NULL);
3558 return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
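/* Note (descriptive comment only): for the Slice/SetElem cases above,
   and likewise for the GetElem and immediate-shift cases elsewhere in
   this function, the index/amount operand is checked eagerly with
   complainIfUndefined and then the *original* atom, not its shadow, is
   passed to the rebuilt operation.  Having already reported any
   undefinedness in that operand, the instrumentation treats it as
   defined from then on. */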
3560 /* Int 128-bit Integer three arg */
3561 case Iop_2xMultU64Add128CarryOut:
3562 case Iop_Perm8x16x2:
3563 /* (V128, V128, V128) -> V128 */
3564 complainIfUndefined(mce, atom3, NULL);
3565 return mkUifUV128(
3566 mce,
3567 assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3568 mkPCast8x16(mce, vatom3)
3571 /* Vector FP with rounding mode as the first arg */
3572 case Iop_Add64Fx2:
3573 case Iop_Sub64Fx2:
3574 case Iop_Mul64Fx2:
3575 case Iop_Div64Fx2:
3576 case Iop_Scale2_64Fx2:
3577 return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3579 case Iop_Add32Fx4:
3580 case Iop_Sub32Fx4:
3581 case Iop_Mul32Fx4:
3582 case Iop_Div32Fx4:
3583 case Iop_Scale2_32Fx4:
3584 return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3586 case Iop_Add64Fx4:
3587 case Iop_Sub64Fx4:
3588 case Iop_Mul64Fx4:
3589 case Iop_Div64Fx4:
3590 return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3592 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision
3593 IR is implemented.
3595 case Iop_Add16Fx8:
3596 case Iop_Sub16Fx8:
3597 return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3599 case Iop_Add32Fx8:
3600 case Iop_Sub32Fx8:
3601 case Iop_Mul32Fx8:
3602 case Iop_Div32Fx8:
3603 return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3605 case Iop_F32x4_2toQ16x8:
3606 return assignNew('V', mce, Ity_V128,
3607 binop(Iop_PackEvenLanes16x8,
3608 unary32Fx4_w_rm(mce, vatom1, vatom2),
3609 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3610 case Iop_F64x2_2toQ32x4:
3611 return assignNew('V', mce, Ity_V128,
3612 binop(Iop_PackEvenLanes32x4,
3613 unary64Fx2_w_rm(mce, vatom1, vatom2),
3614 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3616 default:
3617 ppIROp(op);
3618 VG_(tool_panic)("memcheck:expr2vbits_Triop");
3623 static
3624 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3625 IROp op,
3626 IRAtom* atom1, IRAtom* atom2,
3627 HowUsed hu/*use HuOth if unknown*/ )
3629 IRType and_or_ty = Ity_INVALID;
3630 IRAtom* (*uifu) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3631 IRAtom* (*difd) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3632 IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3634 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3635 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3637 tl_assert(isOriginalAtom(mce,atom1));
3638 tl_assert(isOriginalAtom(mce,atom2));
3639 tl_assert(isShadowAtom(mce,vatom1));
3640 tl_assert(isShadowAtom(mce,vatom2));
3641 tl_assert(sameKindedAtoms(atom1,vatom1));
3642 tl_assert(sameKindedAtoms(atom2,vatom2));
3643 switch (op) {
3645 /* 32-bit SIMD */
3647 case Iop_Add16x2:
3648 case Iop_HAdd16Ux2:
3649 case Iop_HAdd16Sx2:
3650 case Iop_Sub16x2:
3651 case Iop_HSub16Ux2:
3652 case Iop_HSub16Sx2:
3653 case Iop_QAdd16Sx2:
3654 case Iop_QSub16Sx2:
3655 case Iop_QSub16Ux2:
3656 case Iop_QAdd16Ux2:
3657 return binary16Ix2(mce, vatom1, vatom2);
3659 case Iop_Add8x4:
3660 case Iop_HAdd8Ux4:
3661 case Iop_HAdd8Sx4:
3662 case Iop_Sub8x4:
3663 case Iop_HSub8Ux4:
3664 case Iop_HSub8Sx4:
3665 case Iop_QSub8Ux4:
3666 case Iop_QAdd8Ux4:
3667 case Iop_QSub8Sx4:
3668 case Iop_QAdd8Sx4:
3669 return binary8Ix4(mce, vatom1, vatom2);
3671 /* 64-bit SIMD */
3673 case Iop_ShrN8x8:
3674 case Iop_ShrN16x4:
3675 case Iop_ShrN32x2:
3676 case Iop_SarN8x8:
3677 case Iop_SarN16x4:
3678 case Iop_SarN32x2:
3679 case Iop_ShlN16x4:
3680 case Iop_ShlN32x2:
3681 case Iop_ShlN8x8:
3682 /* Same scheme as with all other shifts. */
3683 complainIfUndefined(mce, atom2, NULL);
3684 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3686 case Iop_QNarrowBin32Sto16Sx4:
3687 case Iop_QNarrowBin16Sto8Sx8:
3688 case Iop_QNarrowBin16Sto8Ux8:
3689 return vectorNarrowBin64(mce, op, vatom1, vatom2);
3691 case Iop_Min8Ux8:
3692 case Iop_Min8Sx8:
3693 case Iop_Max8Ux8:
3694 case Iop_Max8Sx8:
3695 case Iop_Avg8Ux8:
3696 case Iop_QSub8Sx8:
3697 case Iop_QSub8Ux8:
3698 case Iop_Sub8x8:
3699 case Iop_CmpGT8Sx8:
3700 case Iop_CmpGT8Ux8:
3701 case Iop_CmpEQ8x8:
3702 case Iop_QAdd8Sx8:
3703 case Iop_QAdd8Ux8:
3704 case Iop_QSal8x8:
3705 case Iop_QShl8x8:
3706 case Iop_Add8x8:
3707 case Iop_Mul8x8:
3708 case Iop_PolynomialMul8x8:
3709 return binary8Ix8(mce, vatom1, vatom2);
3711 case Iop_Min16Sx4:
3712 case Iop_Min16Ux4:
3713 case Iop_Max16Sx4:
3714 case Iop_Max16Ux4:
3715 case Iop_Avg16Ux4:
3716 case Iop_QSub16Ux4:
3717 case Iop_QSub16Sx4:
3718 case Iop_Sub16x4:
3719 case Iop_Mul16x4:
3720 case Iop_MulHi16Sx4:
3721 case Iop_MulHi16Ux4:
3722 case Iop_CmpGT16Sx4:
3723 case Iop_CmpGT16Ux4:
3724 case Iop_CmpEQ16x4:
3725 case Iop_QAdd16Sx4:
3726 case Iop_QAdd16Ux4:
3727 case Iop_QSal16x4:
3728 case Iop_QShl16x4:
3729 case Iop_Add16x4:
3730 case Iop_QDMulHi16Sx4:
3731 case Iop_QRDMulHi16Sx4:
3732 return binary16Ix4(mce, vatom1, vatom2);
3734 case Iop_Sub32x2:
3735 case Iop_Mul32x2:
3736 case Iop_Max32Sx2:
3737 case Iop_Max32Ux2:
3738 case Iop_Min32Sx2:
3739 case Iop_Min32Ux2:
3740 case Iop_CmpGT32Sx2:
3741 case Iop_CmpGT32Ux2:
3742 case Iop_CmpEQ32x2:
3743 case Iop_Add32x2:
3744 case Iop_QAdd32Ux2:
3745 case Iop_QAdd32Sx2:
3746 case Iop_QSub32Ux2:
3747 case Iop_QSub32Sx2:
3748 case Iop_QSal32x2:
3749 case Iop_QShl32x2:
3750 case Iop_QDMulHi32Sx2:
3751 case Iop_QRDMulHi32Sx2:
3752 return binary32Ix2(mce, vatom1, vatom2);
3754 case Iop_QSub64Ux1:
3755 case Iop_QSub64Sx1:
3756 case Iop_QAdd64Ux1:
3757 case Iop_QAdd64Sx1:
3758 case Iop_QSal64x1:
3759 case Iop_QShl64x1:
3760 case Iop_Sal64x1:
3761 return binary64Ix1(mce, vatom1, vatom2);
3763 case Iop_QShlNsatSU8x8:
3764 case Iop_QShlNsatUU8x8:
3765 case Iop_QShlNsatSS8x8:
3766 complainIfUndefined(mce, atom2, NULL);
3767 return mkPCast8x8(mce, vatom1);
3769 case Iop_QShlNsatSU16x4:
3770 case Iop_QShlNsatUU16x4:
3771 case Iop_QShlNsatSS16x4:
3772 complainIfUndefined(mce, atom2, NULL);
3773 return mkPCast16x4(mce, vatom1);
3775 case Iop_QShlNsatSU32x2:
3776 case Iop_QShlNsatUU32x2:
3777 case Iop_QShlNsatSS32x2:
3778 complainIfUndefined(mce, atom2, NULL);
3779 return mkPCast32x2(mce, vatom1);
3781 case Iop_QShlNsatSU64x1:
3782 case Iop_QShlNsatUU64x1:
3783 case Iop_QShlNsatSS64x1:
3784 complainIfUndefined(mce, atom2, NULL);
3785 return mkPCast32x2(mce, vatom1);
3787 case Iop_PwMax32Sx2:
3788 case Iop_PwMax32Ux2:
3789 case Iop_PwMin32Sx2:
3790 case Iop_PwMin32Ux2:
3791 case Iop_PwMax32Fx2:
3792 case Iop_PwMin32Fx2:
3793 return assignNew('V', mce, Ity_I64,
3794 binop(Iop_PwMax32Ux2,
3795 mkPCast32x2(mce, vatom1),
3796 mkPCast32x2(mce, vatom2)));
3798 case Iop_PwMax16Sx4:
3799 case Iop_PwMax16Ux4:
3800 case Iop_PwMin16Sx4:
3801 case Iop_PwMin16Ux4:
3802 return assignNew('V', mce, Ity_I64,
3803 binop(Iop_PwMax16Ux4,
3804 mkPCast16x4(mce, vatom1),
3805 mkPCast16x4(mce, vatom2)));
3807 case Iop_PwMax8Sx8:
3808 case Iop_PwMax8Ux8:
3809 case Iop_PwMin8Sx8:
3810 case Iop_PwMin8Ux8:
3811 return assignNew('V', mce, Ity_I64,
3812 binop(Iop_PwMax8Ux8,
3813 mkPCast8x8(mce, vatom1),
3814 mkPCast8x8(mce, vatom2)));
3816 case Iop_PwAdd32x2:
3817 case Iop_PwAdd32Fx2:
3818 return mkPCast32x2(mce,
3819 assignNew('V', mce, Ity_I64,
3820 binop(Iop_PwAdd32x2,
3821 mkPCast32x2(mce, vatom1),
3822 mkPCast32x2(mce, vatom2))));
3824 case Iop_PwAdd16x4:
3825 return mkPCast16x4(mce,
3826 assignNew('V', mce, Ity_I64,
3827 binop(op, mkPCast16x4(mce, vatom1),
3828 mkPCast16x4(mce, vatom2))));
3830 case Iop_PwAdd8x8:
3831 return mkPCast8x8(mce,
3832 assignNew('V', mce, Ity_I64,
3833 binop(op, mkPCast8x8(mce, vatom1),
3834 mkPCast8x8(mce, vatom2))));
3836 case Iop_Shl8x8:
3837 case Iop_Shr8x8:
3838 case Iop_Sar8x8:
3839 case Iop_Sal8x8:
3840 return mkUifU64(mce,
3841 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3842 mkPCast8x8(mce,vatom2)
3845 case Iop_Shl16x4:
3846 case Iop_Shr16x4:
3847 case Iop_Sar16x4:
3848 case Iop_Sal16x4:
3849 return mkUifU64(mce,
3850 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3851 mkPCast16x4(mce,vatom2)
3854 case Iop_Shl32x2:
3855 case Iop_Shr32x2:
3856 case Iop_Sar32x2:
3857 case Iop_Sal32x2:
3858 return mkUifU64(mce,
3859 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3860 mkPCast32x2(mce,vatom2)
3863 /* 64-bit data-steering */
3864 case Iop_InterleaveLO32x2:
3865 case Iop_InterleaveLO16x4:
3866 case Iop_InterleaveLO8x8:
3867 case Iop_InterleaveHI32x2:
3868 case Iop_InterleaveHI16x4:
3869 case Iop_InterleaveHI8x8:
3870 case Iop_CatOddLanes8x8:
3871 case Iop_CatEvenLanes8x8:
3872 case Iop_CatOddLanes16x4:
3873 case Iop_CatEvenLanes16x4:
3874 case Iop_InterleaveOddLanes8x8:
3875 case Iop_InterleaveEvenLanes8x8:
3876 case Iop_InterleaveOddLanes16x4:
3877 case Iop_InterleaveEvenLanes16x4:
3878 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3880 case Iop_GetElem8x8:
3881 complainIfUndefined(mce, atom2, NULL);
3882 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3883 case Iop_GetElem16x4:
3884 complainIfUndefined(mce, atom2, NULL);
3885 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3886 case Iop_GetElem32x2:
3887 complainIfUndefined(mce, atom2, NULL);
3888 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3890 /* Perm8x8: rearrange values in left arg using steering values from
3891 right arg. So rearrange the vbits in the same way but pessimise wrt
3892 steering values. We assume that unused bits in the steering value
3893 are defined zeros, so we can safely PCast within each lane of the
3894 steering value without having to take precautions to avoid a
3895 dependency on those unused bits.
3897 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3898 each lane, if bit 7 of the steering value is zero, then we'll steer
3899 the shadow value exactly as per Perm8x8. If that bit is one, then
3900 the operation will set the resulting (concrete) value to zero. That
3901 means it is defined, and should have a shadow value of zero. Hence
3902 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3903 as Perm8x8) and then pessimise against the steering values. */
3904 case Iop_Perm8x8:
3905 case Iop_PermOrZero8x8:
3906 return mkUifU64(
3907 mce,
3908 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3909 mkPCast8x8(mce, vatom2)
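/* Worked example for the Perm8x8/PermOrZero8x8 scheme above
   (illustration only; the lane chosen is hypothetical): suppose lane 3
   of the steering operand has an undefined bit, and everything else is
   defined.  Then
       binop(op, vatom1, atom2)  permutes the data shadows exactly as
                                 the concrete bytes are permuted, and
       mkPCast8x8(mce, vatom2)   is 0x00000000FF000000 (lane 3 all ones),
   so the UifU marks all of result lane 3 as undefined -- we cannot know
   which source byte (if any) it selected -- while every other lane keeps
   the shadow of whichever source byte was steered into it. */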
3912 /* V128-bit SIMD */
3914 case Iop_I32StoF32x4:
3915 case Iop_F32toI32Sx4:
3916 case Iop_Sqrt16Fx8:
3917 return unary16Fx8_w_rm(mce, vatom1, vatom2);
3918 case Iop_Sqrt32Fx4:
3919 return unary32Fx4_w_rm(mce, vatom1, vatom2);
3920 case Iop_Sqrt64Fx2:
3921 return unary64Fx2_w_rm(mce, vatom1, vatom2);
3923 case Iop_ShrN8x16:
3924 case Iop_ShrN16x8:
3925 case Iop_ShrN32x4:
3926 case Iop_ShrN64x2:
3927 case Iop_SarN8x16:
3928 case Iop_SarN16x8:
3929 case Iop_SarN32x4:
3930 case Iop_SarN64x2:
3931 case Iop_ShlN8x16:
3932 case Iop_ShlN16x8:
3933 case Iop_ShlN32x4:
3934 case Iop_ShlN64x2:
3935 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3936 this is wrong now, scalar shifts are done properly lazily.
3937 Vector shifts should be fixed too. */
3938 complainIfUndefined(mce, atom2, NULL);
3939 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
3941 /* V x V shifts/rotates are done using the standard lazy scheme. */
3942 /* For the non-rounding variants of bi-di vector x vector
3943 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3944 But note that this is overly pessimistic, because in fact only
3945 the bottom 8 bits of each lane of the second argument are taken
3946 into account when shifting. So really we ought to ignore
3947 undefinedness in bits 8 and above of each lane in the
3948 second argument. */
3949 case Iop_Shl8x16:
3950 case Iop_Shr8x16:
3951 case Iop_Sar8x16:
3952 case Iop_Sal8x16:
3953 case Iop_Rol8x16:
3954 case Iop_Sh8Sx16:
3955 case Iop_Sh8Ux16:
3956 return mkUifUV128(mce,
3957 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3958 mkPCast8x16(mce,vatom2)
3961 case Iop_Shl16x8:
3962 case Iop_Shr16x8:
3963 case Iop_Sar16x8:
3964 case Iop_Sal16x8:
3965 case Iop_Rol16x8:
3966 case Iop_Sh16Sx8:
3967 case Iop_Sh16Ux8:
3968 return mkUifUV128(mce,
3969 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3970 mkPCast16x8(mce,vatom2)
3973 case Iop_Shl32x4:
3974 case Iop_Shr32x4:
3975 case Iop_Sar32x4:
3976 case Iop_Sal32x4:
3977 case Iop_Rol32x4:
3978 case Iop_Sh32Sx4:
3979 case Iop_Sh32Ux4:
3980 return mkUifUV128(mce,
3981 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3982 mkPCast32x4(mce,vatom2)
3985 case Iop_Shl64x2:
3986 case Iop_Shr64x2:
3987 case Iop_Sar64x2:
3988 case Iop_Sal64x2:
3989 case Iop_Rol64x2:
3990 case Iop_Sh64Sx2:
3991 case Iop_Sh64Ux2:
3992 return mkUifUV128(mce,
3993 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3994 mkPCast64x2(mce,vatom2)
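/* Illustration of the pessimism described above (example values only):
   for Iop_Sh32Sx4, suppose one lane of the shift-amount shadow vatom2
   is 0x80000000 -- an undefined sign bit but a fully defined low byte.
   mkPCast32x4 still turns that whole lane into all ones, so the
   corresponding result lane is reported as undefined even though the
   bits that actually determine the shift amount (the low 8 bits of
   that lane) were fully defined. */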
3997 /* For the rounding variants of bi-di vector x vector shifts, the
3998 rounding adjustment can cause undefinedness to propagate through
3999 the entire lane, in the worst case. Too complex to handle
4000 properly .. just UifU the arguments and then PCast them.
4001 Suboptimal but safe. */
4002 case Iop_Rsh8Sx16:
4003 case Iop_Rsh8Ux16:
4004 return binary8Ix16(mce, vatom1, vatom2);
4005 case Iop_Rsh16Sx8:
4006 case Iop_Rsh16Ux8:
4007 return binary16Ix8(mce, vatom1, vatom2);
4008 case Iop_Rsh32Sx4:
4009 case Iop_Rsh32Ux4:
4010 return binary32Ix4(mce, vatom1, vatom2);
4011 case Iop_Rsh64Sx2:
4012 case Iop_Rsh64Ux2:
4013 return binary64Ix2(mce, vatom1, vatom2);
4015 case Iop_F32ToFixed32Ux4_RZ:
4016 case Iop_F32ToFixed32Sx4_RZ:
4017 case Iop_Fixed32UToF32x4_RN:
4018 case Iop_Fixed32SToF32x4_RN:
4019 complainIfUndefined(mce, atom2, NULL);
4020 return mkPCast32x4(mce, vatom1);
4022 case Iop_F32ToFixed32Ux2_RZ:
4023 case Iop_F32ToFixed32Sx2_RZ:
4024 case Iop_Fixed32UToF32x2_RN:
4025 case Iop_Fixed32SToF32x2_RN:
4026 complainIfUndefined(mce, atom2, NULL);
4027 return mkPCast32x2(mce, vatom1);
4029 case Iop_QSub8Ux16:
4030 case Iop_QSub8Sx16:
4031 case Iop_Sub8x16:
4032 case Iop_Min8Ux16:
4033 case Iop_Min8Sx16:
4034 case Iop_Max8Ux16:
4035 case Iop_Max8Sx16:
4036 case Iop_CmpEQ8x16:
4037 case Iop_Avg8Ux16:
4038 case Iop_Avg8Sx16:
4039 case Iop_QAdd8Ux16:
4040 case Iop_QAdd8Sx16:
4041 case Iop_QAddExtUSsatSS8x16:
4042 case Iop_QAddExtSUsatUU8x16:
4043 case Iop_QSal8x16:
4044 case Iop_QShl8x16:
4045 case Iop_Add8x16:
4046 case Iop_Mul8x16:
4047 case Iop_MulHi8Sx16:
4048 case Iop_MulHi8Ux16:
4049 case Iop_PolynomialMul8x16:
4050 case Iop_PolynomialMulAdd8x16:
4051 return binary8Ix16(mce, vatom1, vatom2);
4053 case Iop_QSub16Ux8:
4054 case Iop_QSub16Sx8:
4055 case Iop_Sub16x8:
4056 case Iop_Mul16x8:
4057 case Iop_MulHi16Sx8:
4058 case Iop_MulHi16Ux8:
4059 case Iop_Min16Sx8:
4060 case Iop_Min16Ux8:
4061 case Iop_Max16Sx8:
4062 case Iop_Max16Ux8:
4063 case Iop_CmpEQ16x8:
4064 case Iop_Avg16Ux8:
4065 case Iop_Avg16Sx8:
4066 case Iop_QAdd16Ux8:
4067 case Iop_QAdd16Sx8:
4068 case Iop_QAddExtUSsatSS16x8:
4069 case Iop_QAddExtSUsatUU16x8:
4070 case Iop_QSal16x8:
4071 case Iop_QShl16x8:
4072 case Iop_Add16x8:
4073 case Iop_QDMulHi16Sx8:
4074 case Iop_QRDMulHi16Sx8:
4075 case Iop_PolynomialMulAdd16x8:
4076 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
4077 16-bit chunk of the output is formed from corresponding 16-bit chunks
4078 of the input args, so we can treat it like any other binary 16x8
4079 operation. That's despite it having '8x16' in its name. */
4080 case Iop_PwExtUSMulQAdd8x16:
4081 return binary16Ix8(mce, vatom1, vatom2);
4083 case Iop_CmpGT64Sx2:
4084 case Iop_CmpGT64Ux2:
4085 case Iop_CmpGT32Sx4:
4086 case Iop_CmpGT32Ux4:
4087 case Iop_CmpGT16Sx8:
4088 case Iop_CmpGT16Ux8:
4089 case Iop_CmpGT8Sx16:
4090 case Iop_CmpGT8Ux16:
4091 return expensiveCmpGT(mce, op,
4092 vatom1, vatom2, atom1, atom2);
4093 case Iop_Sub32x4:
4094 case Iop_CmpEQ32x4:
4095 case Iop_QAdd32Sx4:
4096 case Iop_QAdd32Ux4:
4097 case Iop_QSub32Sx4:
4098 case Iop_QSub32Ux4:
4099 case Iop_QAddExtUSsatSS32x4:
4100 case Iop_QAddExtSUsatUU32x4:
4101 case Iop_QSal32x4:
4102 case Iop_QShl32x4:
4103 case Iop_Avg32Ux4:
4104 case Iop_Avg32Sx4:
4105 case Iop_Add32x4:
4106 case Iop_Max32Ux4:
4107 case Iop_Max32Sx4:
4108 case Iop_Min32Ux4:
4109 case Iop_Min32Sx4:
4110 case Iop_Mul32x4:
4111 case Iop_MulHi32Sx4:
4112 case Iop_MulHi32Ux4:
4113 case Iop_QDMulHi32Sx4:
4114 case Iop_QRDMulHi32Sx4:
4115 case Iop_PolynomialMulAdd32x4:
4116 return binary32Ix4(mce, vatom1, vatom2);
4118 case Iop_Sub64x2:
4119 case Iop_Add64x2:
4120 case Iop_Avg64Ux2:
4121 case Iop_Avg64Sx2:
4122 case Iop_Max64Sx2:
4123 case Iop_Max64Ux2:
4124 case Iop_Min64Sx2:
4125 case Iop_Min64Ux2:
4126 case Iop_CmpEQ64x2:
4127 case Iop_QSal64x2:
4128 case Iop_QShl64x2:
4129 case Iop_QAdd64Ux2:
4130 case Iop_QAdd64Sx2:
4131 case Iop_QSub64Ux2:
4132 case Iop_QSub64Sx2:
4133 case Iop_QAddExtUSsatSS64x2:
4134 case Iop_QAddExtSUsatUU64x2:
4135 case Iop_PolynomialMulAdd64x2:
4136 case Iop_CipherV128:
4137 case Iop_CipherLV128:
4138 case Iop_NCipherV128:
4139 case Iop_NCipherLV128:
4140 case Iop_MulI128by10E:
4141 case Iop_MulI128by10ECarry:
4142 return binary64Ix2(mce, vatom1, vatom2);
4144 case Iop_Add128x1:
4145 case Iop_Sub128x1:
4146 case Iop_CmpNEZ128x1:
4147 return binary128Ix1(mce, vatom1, vatom2);
4149 case Iop_DivU128:
4150 case Iop_DivS128:
4151 case Iop_DivU128E:
4152 case Iop_DivS128E:
4153 case Iop_ModU128:
4154 case Iop_ModS128:
4155 /* I128 x I128 -> I128 */
4156 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4158 case Iop_QNarrowBin64Sto32Sx4:
4159 case Iop_QNarrowBin64Uto32Ux4:
4160 case Iop_QNarrowBin32Sto16Sx8:
4161 case Iop_QNarrowBin32Uto16Ux8:
4162 case Iop_QNarrowBin32Sto16Ux8:
4163 case Iop_QNarrowBin16Sto8Sx16:
4164 case Iop_QNarrowBin16Uto8Ux16:
4165 case Iop_QNarrowBin16Sto8Ux16:
4166 return vectorNarrowBinV128(mce, op, vatom1, vatom2);
4168 case Iop_Min64Fx2:
4169 case Iop_Max64Fx2:
4170 case Iop_CmpLT64Fx2:
4171 case Iop_CmpLE64Fx2:
4172 case Iop_CmpEQ64Fx2:
4173 case Iop_CmpUN64Fx2:
4174 case Iop_RecipStep64Fx2:
4175 case Iop_RSqrtStep64Fx2:
4176 return binary64Fx2(mce, vatom1, vatom2);
4178 case Iop_CmpLT16Fx8:
4179 case Iop_CmpLE16Fx8:
4180 case Iop_CmpEQ16Fx8:
4181 return binary16Fx8(mce, vatom1, vatom2);
4183 case Iop_Sub64F0x2:
4184 case Iop_Mul64F0x2:
4185 case Iop_Min64F0x2:
4186 case Iop_Max64F0x2:
4187 case Iop_Div64F0x2:
4188 case Iop_CmpLT64F0x2:
4189 case Iop_CmpLE64F0x2:
4190 case Iop_CmpEQ64F0x2:
4191 case Iop_CmpUN64F0x2:
4192 case Iop_Add64F0x2:
4193 return binary64F0x2(mce, vatom1, vatom2);
4195 case Iop_Min32Fx4:
4196 case Iop_Max32Fx4:
4197 case Iop_CmpLT32Fx4:
4198 case Iop_CmpLE32Fx4:
4199 case Iop_CmpEQ32Fx4:
4200 case Iop_CmpUN32Fx4:
4201 case Iop_CmpGT32Fx4:
4202 case Iop_CmpGE32Fx4:
4203 case Iop_RecipStep32Fx4:
4204 case Iop_RSqrtStep32Fx4:
4205 return binary32Fx4(mce, vatom1, vatom2);
4207 case Iop_Sub32Fx2:
4208 case Iop_Mul32Fx2:
4209 case Iop_Min32Fx2:
4210 case Iop_Max32Fx2:
4211 case Iop_CmpEQ32Fx2:
4212 case Iop_CmpGT32Fx2:
4213 case Iop_CmpGE32Fx2:
4214 case Iop_Add32Fx2:
4215 case Iop_RecipStep32Fx2:
4216 case Iop_RSqrtStep32Fx2:
4217 return binary32Fx2(mce, vatom1, vatom2);
4219 case Iop_Sub32F0x4:
4220 case Iop_Mul32F0x4:
4221 case Iop_Min32F0x4:
4222 case Iop_Max32F0x4:
4223 case Iop_Div32F0x4:
4224 case Iop_CmpLT32F0x4:
4225 case Iop_CmpLE32F0x4:
4226 case Iop_CmpEQ32F0x4:
4227 case Iop_CmpUN32F0x4:
4228 case Iop_Add32F0x4:
4229 return binary32F0x4(mce, vatom1, vatom2);
4231 case Iop_QShlNsatSU8x16:
4232 case Iop_QShlNsatUU8x16:
4233 case Iop_QShlNsatSS8x16:
4234 complainIfUndefined(mce, atom2, NULL);
4235 return mkPCast8x16(mce, vatom1);
4237 case Iop_QShlNsatSU16x8:
4238 case Iop_QShlNsatUU16x8:
4239 case Iop_QShlNsatSS16x8:
4240 complainIfUndefined(mce, atom2, NULL);
4241 return mkPCast16x8(mce, vatom1);
4243 case Iop_QShlNsatSU32x4:
4244 case Iop_QShlNsatUU32x4:
4245 case Iop_QShlNsatSS32x4:
4246 complainIfUndefined(mce, atom2, NULL);
4247 return mkPCast32x4(mce, vatom1);
4249 case Iop_QShlNsatSU64x2:
4250 case Iop_QShlNsatUU64x2:
4251 case Iop_QShlNsatSS64x2:
4252 complainIfUndefined(mce, atom2, NULL);
4253 return mkPCast32x4(mce, vatom1);
4255 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4256 To make this simpler, do the following:
4257 * complain if the shift amount (the I8) is undefined
4258 * pcast each lane at the wide width
4259 * truncate each lane to half width
4260 * pcast the resulting 64-bit value to a single bit and use
4261 that as the least significant bit of the upper half of the
4262 result. */
4263 case Iop_QandQShrNnarrow64Uto32Ux2:
4264 case Iop_QandQSarNnarrow64Sto32Sx2:
4265 case Iop_QandQSarNnarrow64Sto32Ux2:
4266 case Iop_QandQRShrNnarrow64Uto32Ux2:
4267 case Iop_QandQRSarNnarrow64Sto32Sx2:
4268 case Iop_QandQRSarNnarrow64Sto32Ux2:
4269 case Iop_QandQShrNnarrow32Uto16Ux4:
4270 case Iop_QandQSarNnarrow32Sto16Sx4:
4271 case Iop_QandQSarNnarrow32Sto16Ux4:
4272 case Iop_QandQRShrNnarrow32Uto16Ux4:
4273 case Iop_QandQRSarNnarrow32Sto16Sx4:
4274 case Iop_QandQRSarNnarrow32Sto16Ux4:
4275 case Iop_QandQShrNnarrow16Uto8Ux8:
4276 case Iop_QandQSarNnarrow16Sto8Sx8:
4277 case Iop_QandQSarNnarrow16Sto8Ux8:
4278 case Iop_QandQRShrNnarrow16Uto8Ux8:
4279 case Iop_QandQRSarNnarrow16Sto8Sx8:
4280 case Iop_QandQRSarNnarrow16Sto8Ux8:
4282 IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
4283 IROp opNarrow = Iop_INVALID;
4284 switch (op) {
4285 case Iop_QandQShrNnarrow64Uto32Ux2:
4286 case Iop_QandQSarNnarrow64Sto32Sx2:
4287 case Iop_QandQSarNnarrow64Sto32Ux2:
4288 case Iop_QandQRShrNnarrow64Uto32Ux2:
4289 case Iop_QandQRSarNnarrow64Sto32Sx2:
4290 case Iop_QandQRSarNnarrow64Sto32Ux2:
4291 fnPessim = mkPCast64x2;
4292 opNarrow = Iop_NarrowUn64to32x2;
4293 break;
4294 case Iop_QandQShrNnarrow32Uto16Ux4:
4295 case Iop_QandQSarNnarrow32Sto16Sx4:
4296 case Iop_QandQSarNnarrow32Sto16Ux4:
4297 case Iop_QandQRShrNnarrow32Uto16Ux4:
4298 case Iop_QandQRSarNnarrow32Sto16Sx4:
4299 case Iop_QandQRSarNnarrow32Sto16Ux4:
4300 fnPessim = mkPCast32x4;
4301 opNarrow = Iop_NarrowUn32to16x4;
4302 break;
4303 case Iop_QandQShrNnarrow16Uto8Ux8:
4304 case Iop_QandQSarNnarrow16Sto8Sx8:
4305 case Iop_QandQSarNnarrow16Sto8Ux8:
4306 case Iop_QandQRShrNnarrow16Uto8Ux8:
4307 case Iop_QandQRSarNnarrow16Sto8Sx8:
4308 case Iop_QandQRSarNnarrow16Sto8Ux8:
4309 fnPessim = mkPCast16x8;
4310 opNarrow = Iop_NarrowUn16to8x8;
4311 break;
4312 default:
4313 tl_assert(0);
4315 complainIfUndefined(mce, atom2, NULL);
4316 // Pessimised shift result
4317 IRAtom* shV
4318 = fnPessim(mce, vatom1);
4319 // Narrowed, pessimised shift result
4320 IRAtom* shVnarrowed
4321 = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
4322 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4323 IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
4324 // and assemble the result
4325 return assignNew('V', mce, Ity_V128,
4326 binop(Iop_64HLtoV128, qV, shVnarrowed));
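/* Concrete instance of the recipe above (illustration only), for
   Iop_QandQShrNnarrow32Uto16Ux4:
     fnPessim   = mkPCast32x4, so shV pessimises vatom1 per 32-bit lane;
     opNarrow   = Iop_NarrowUn32to16x4, so shVnarrowed is those lanes
                  truncated to 16 bits and packed into an I64;
     qV         = 63 defined zeroes above PCast-to-I1(shVnarrowed).
   The final V128 shadow is 64HLtoV128(qV, shVnarrowed): the "Q"
   (saturation) half in the upper 64 bits, the narrowed shift result in
   the lower 64 bits. */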
4329 case Iop_Mull32Sx2:
4330 case Iop_Mull32Ux2:
4331 case Iop_QDMull32Sx2:
4332 return vectorWidenI64(mce, Iop_Widen32Sto64x2,
4333 mkUifU64(mce, vatom1, vatom2));
4335 case Iop_Mull16Sx4:
4336 case Iop_Mull16Ux4:
4337 case Iop_QDMull16Sx4:
4338 return vectorWidenI64(mce, Iop_Widen16Sto32x4,
4339 mkUifU64(mce, vatom1, vatom2));
4341 case Iop_Mull8Sx8:
4342 case Iop_Mull8Ux8:
4343 case Iop_PolynomialMull8x8:
4344 return vectorWidenI64(mce, Iop_Widen8Sto16x8,
4345 mkUifU64(mce, vatom1, vatom2));
4347 case Iop_PwAdd32x4:
4348 return mkPCast32x4(mce,
4349 assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
4350 mkPCast32x4(mce, vatom2))));
4352 case Iop_PwAdd16x8:
4353 return mkPCast16x8(mce,
4354 assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
4355 mkPCast16x8(mce, vatom2))));
4357 case Iop_PwAdd8x16:
4358 return mkPCast8x16(mce,
4359 assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
4360 mkPCast8x16(mce, vatom2))));
4362 /* V128-bit data-steering */
4363 case Iop_SetV128lo32:
4364 case Iop_SetV128lo64:
4365 case Iop_64HLtoV128:
4366 case Iop_InterleaveLO64x2:
4367 case Iop_InterleaveLO32x4:
4368 case Iop_InterleaveLO16x8:
4369 case Iop_InterleaveLO8x16:
4370 case Iop_InterleaveHI64x2:
4371 case Iop_InterleaveHI32x4:
4372 case Iop_InterleaveHI16x8:
4373 case Iop_InterleaveHI8x16:
4374 case Iop_CatOddLanes8x16:
4375 case Iop_CatOddLanes16x8:
4376 case Iop_CatOddLanes32x4:
4377 case Iop_CatEvenLanes8x16:
4378 case Iop_CatEvenLanes16x8:
4379 case Iop_CatEvenLanes32x4:
4380 case Iop_InterleaveOddLanes8x16:
4381 case Iop_InterleaveOddLanes16x8:
4382 case Iop_InterleaveOddLanes32x4:
4383 case Iop_InterleaveEvenLanes8x16:
4384 case Iop_InterleaveEvenLanes16x8:
4385 case Iop_InterleaveEvenLanes32x4:
4386 case Iop_PackOddLanes8x16:
4387 case Iop_PackOddLanes16x8:
4388 case Iop_PackOddLanes32x4:
4389 case Iop_PackEvenLanes8x16:
4390 case Iop_PackEvenLanes16x8:
4391 case Iop_PackEvenLanes32x4:
4392 return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
4394 case Iop_GetElem8x16:
4395 complainIfUndefined(mce, atom2, NULL);
4396 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
4397 case Iop_GetElem16x8:
4398 complainIfUndefined(mce, atom2, NULL);
4399 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
4400 case Iop_GetElem32x4:
4401 complainIfUndefined(mce, atom2, NULL);
4402 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
4403 case Iop_GetElem64x2:
4404 complainIfUndefined(mce, atom2, NULL);
4405 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
4407 /* Perm8x16: rearrange values in left arg using steering values
4408 from right arg. So rearrange the vbits in the same way but
4409 pessimise wrt steering values. Perm32x4 ditto. */
4410 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4411 case Iop_Perm8x16:
4412 case Iop_PermOrZero8x16:
4413 return mkUifUV128(
4414 mce,
4415 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4416 mkPCast8x16(mce, vatom2)
4418 case Iop_Perm32x4:
4419 return mkUifUV128(
4420 mce,
4421 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4422 mkPCast32x4(mce, vatom2)
4425 /* These two take the lower half of each 16-bit lane, sign/zero
4426 extend it to 32, and multiply together, producing a 32x4
4427 result (and implicitly ignoring half the operand bits). So
4428 treat it as a bunch of independent 16x8 operations, but then
4429 do 32-bit shifts left-right to copy the lower half results
4430 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4431 into the upper half of each result lane. */
4432 case Iop_MullEven16Ux8:
4433 case Iop_MullEven16Sx8: {
4434 IRAtom* at;
4435 at = binary16Ix8(mce,vatom1,vatom2);
4436 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
4437 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
4438 return at;
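/* Illustration of the shift trick above (descriptive comment only):
   binary16Ix8 leaves each 16-bit lane either all zeroes (defined) or
   all ones (undefined).  ShlN32x4 by 16 discards the irrelevant
   upper-half verdict and parks the lower-half verdict in the top 16
   bits of each 32-bit lane; SarN32x4 by 16 then sign-extends it back
   down, so every 32-bit result lane carries the lower-half verdict
   replicated across all 32 bits -- matching the fact that the MullEven
   ops only read the lower halves. */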
4441 /* Same deal as Iop_MullEven16{S,U}x8 */
4442 case Iop_MullEven8Ux16:
4443 case Iop_MullEven8Sx16: {
4444 IRAtom* at;
4445 at = binary8Ix16(mce,vatom1,vatom2);
4446 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
4447 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
4448 return at;
4451 /* Same deal as Iop_MullEven16{S,U}x8 */
4452 case Iop_MullEven32Ux4:
4453 case Iop_MullEven32Sx4: {
4454 IRAtom* at;
4455 at = binary32Ix4(mce,vatom1,vatom2);
4456 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
4457 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
4458 return at;
4461 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4462 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4463 Simply apply the same op to the V bits, since this is really no more
4464 than a data steering operation. */
4465 case Iop_NarrowBin32to16x8:
4466 case Iop_NarrowBin16to8x16:
4467 case Iop_NarrowBin64to32x4:
4468 return assignNew('V', mce, Ity_V128,
4469 binop(op, vatom1, vatom2));
4471 case Iop_ShrV128:
4472 case Iop_SarV128:
4473 case Iop_ShlV128:
4474 case Iop_I128StoBCD128:
4475 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4476 this is wrong now, scalar shifts are done properly lazily.
4477 Vector shifts should be fixed too. */
4478 complainIfUndefined(mce, atom2, NULL);
4479 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4481 case Iop_I128UtoF128: /* I128 -> F128 */
4482 case Iop_I128StoF128: /* I128 -> F128 */
4483 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4485 case Iop_BCDAdd:
4486 case Iop_BCDSub:
4487 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4489 /* SHA Iops */
4490 case Iop_SHA256:
4491 case Iop_SHA512:
4492 complainIfUndefined(mce, atom2, NULL);
4493 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4495 /* I128-bit data-steering */
4496 case Iop_64HLto128:
4497 return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4499 /* V256-bit SIMD */
4501 case Iop_Max64Fx4:
4502 case Iop_Min64Fx4:
4503 return binary64Fx4(mce, vatom1, vatom2);
4505 case Iop_Max32Fx8:
4506 case Iop_Min32Fx8:
4507 return binary32Fx8(mce, vatom1, vatom2);
4509 /* V256-bit data-steering */
4510 case Iop_V128HLtoV256:
4511 return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4513 /* Scalar floating point */
4515 case Iop_F32toI64S:
4516 case Iop_F32toI64U:
4517 /* I32(rm) x F32 -> I64 */
4518 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4520 case Iop_I64StoF32:
4521 /* I32(rm) x I64 -> F32 */
4522 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4524 case Iop_RoundF64toInt:
4525 case Iop_RoundF64toF32:
4526 case Iop_F64toI64S:
4527 case Iop_F64toI64U:
4528 case Iop_I64StoF64:
4529 case Iop_I64UtoF64:
4530 case Iop_SinF64:
4531 case Iop_CosF64:
4532 case Iop_TanF64:
4533 case Iop_2xm1F64:
4534 case Iop_SqrtF64:
4535 case Iop_RecpExpF64:
4536 /* I32(rm) x I64/F64 -> I64/F64 */
4537 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4539 case Iop_ShlD64:
4540 case Iop_ShrD64:
4541 case Iop_RoundD64toInt:
4542 /* I32(rm) x D64 -> D64 */
4543 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4545 case Iop_ShlD128:
4546 case Iop_ShrD128:
4547 case Iop_RoundD128toInt:
4548 /* I32(rm) x D128 -> D128 */
4549 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4551 case Iop_RoundF128toInt:
4552 /* I32(rm) x F128 -> F128 */
4553 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4555 case Iop_D64toI64S:
4556 case Iop_D64toI64U:
4557 case Iop_I64StoD64:
4558 case Iop_I64UtoD64:
4559 /* I32(rm) x I64/D64 -> D64/I64 */
4560 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4562 case Iop_F32toD32:
4563 case Iop_F64toD32:
4564 case Iop_F128toD32:
4565 case Iop_D32toF32:
4566 case Iop_D64toF32:
4567 case Iop_D128toF32:
4568 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4569 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4571 case Iop_F32toD64:
4572 case Iop_F64toD64:
4573 case Iop_F128toD64:
4574 case Iop_D32toF64:
4575 case Iop_D64toF64:
4576 case Iop_D128toF64:
4577 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4578 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4580 case Iop_F32toD128:
4581 case Iop_F64toD128:
4582 case Iop_F128toD128:
4583 case Iop_D32toF128:
4584 case Iop_D64toF128:
4585 case Iop_D128toF128:
4586 case Iop_I128StoD128:
4587 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4588 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4590 case Iop_SqrtF16:
4591 /* I32(rm) x F16 -> F16 */
4592 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4594 case Iop_RoundF32toInt:
4595 case Iop_SqrtF32:
4596 case Iop_RecpExpF32:
4597 /* I32(rm) x I32/F32 -> I32/F32 */
4598 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4600 case Iop_SqrtF128:
4601 /* I32(rm) x F128 -> F128 */
4602 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4604 case Iop_I32StoF32:
4605 case Iop_I32UtoF32:
4606 case Iop_F32toI32S:
4607 case Iop_F32toI32U:
4608 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4609 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4611 case Iop_F64toF16:
4612 case Iop_F32toF16:
4613 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4614 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4616 case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32 */
4617 case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4618 case Iop_F128toF32: /* IRRoundingMode(I32) x F128 -> F32 */
4619 case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32 */
4620 case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4621 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4623 case Iop_F128toI128S: /* IRRoundingMode(I32) x F128 -> signed I128 */
4624 case Iop_RndF128: /* IRRoundingMode(I32) x F128 -> F128 */
4625 case Iop_D128toI128S: /* IRRoundingMode(I32) x D128 -> signed I128 */
4626 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4628 case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64 */
4629 case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4630 case Iop_F128toF64: /* IRRoundingMode(I32) x F128 -> F64 */
4631 case Iop_D128toD64: /* IRRoundingMode(I32) x D128 -> D64 */
4632 case Iop_D128toI64S: /* IRRoundingMode(I32) x D128 -> signed I64 */
4633 case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4634 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4636 case Iop_F64HLtoF128:
4637 case Iop_D64HLtoD128:
4638 return assignNew('V', mce, Ity_I128,
4639 binop(Iop_64HLto128, vatom1, vatom2));
4641 case Iop_F64toI32U:
4642 case Iop_F64toI32S:
4643 case Iop_F64toF32:
4644 case Iop_I64UtoF32:
4645 case Iop_D64toI32U:
4646 case Iop_D64toI32S:
4647 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4648 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4650 case Iop_D64toD32:
4651 /* First arg is I32 (rounding mode), second is D64 (data). */
4652 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4654 case Iop_F64toI16S:
4655 /* First arg is I32 (rounding mode), second is F64 (data). */
4656 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4658 case Iop_InsertExpD64:
4659 /* I64 x I64 -> D64 */
4660 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4662 case Iop_InsertExpD128:
4663 /* I64 x D128 -> D128 */
4664 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4666 case Iop_CmpF16:
4667 case Iop_CmpF32:
4668 case Iop_CmpF64:
4669 case Iop_CmpF128:
4670 case Iop_CmpD64:
4671 case Iop_CmpD128:
4672 case Iop_CmpExpD64:
4673 case Iop_CmpExpD128:
4674 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4676 case Iop_MaxNumF32:
4677 case Iop_MinNumF32:
4678 /* F32 x F32 -> F32 */
4679 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4681 case Iop_MaxNumF64:
4682 case Iop_MinNumF64:
4683 /* F64 x F64 -> F64 */
4684 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4686 /* non-FP after here */
4688 case Iop_DivModU64to32:
4689 case Iop_DivModS64to32:
4690 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4692 case Iop_DivModU128to64:
4693 case Iop_DivModS128to64:
4694 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4696 case Iop_8HLto16:
4697 return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4698 case Iop_16HLto32:
4699 return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4700 case Iop_32HLto64:
4701 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4703 case Iop_DivModU64to64:
4704 case Iop_DivModS64to64: {
4705 IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4706 return assignNew('V', mce, Ity_I128,
4707 binop(Iop_64HLto128, vTmp64, vTmp64));
4710 case Iop_MullS64:
4711 case Iop_MullU64: {
4712 IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4713 IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4714 return assignNew('V', mce, Ity_I128,
4715 binop(Iop_64HLto128, vHi64, vLo64));
4718 case Iop_DivModU32to32:
4719 case Iop_DivModS32to32: {
4720 IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4721 return assignNew('V', mce, Ity_I64,
4722 binop(Iop_32HLto64, vTmp32, vTmp32));
4725 case Iop_MullS32:
4726 case Iop_MullU32: {
4727 IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4728 IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4729 return assignNew('V', mce, Ity_I64,
4730 binop(Iop_32HLto64, vHi32, vLo32));
4733 case Iop_MullS16:
4734 case Iop_MullU16: {
4735 IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4736 IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4737 return assignNew('V', mce, Ity_I32,
4738 binop(Iop_16HLto32, vHi16, vLo16));
4741 case Iop_MullS8:
4742 case Iop_MullU8: {
4743 IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4744 IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4745 return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4748 case Iop_Sad8Ux4: /* maybe we could do better? ftm, do mkLazy2. */
4749 case Iop_DivS32:
4750 case Iop_DivU32:
4751 case Iop_DivU32E:
4752 case Iop_DivS32E:
4753 case Iop_QAdd32S: /* could probably do better */
4754 case Iop_QSub32S: /* could probably do better */
4755 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4757 case Iop_DivS64:
4758 case Iop_DivU64:
4759 case Iop_DivS64E:
4760 case Iop_DivU64E:
4761 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4763 case Iop_Add32:
4764 if (mce->dlbo.dl_Add32 == DLexpensive
4765 || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4766 return expensiveAddSub(mce,True,Ity_I32,
4767 vatom1,vatom2, atom1,atom2);
4768 } else {
4769 goto cheap_AddSub32;
4771 case Iop_Sub32:
4772 if (mce->dlbo.dl_Sub32 == DLexpensive
4773 || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4774 return expensiveAddSub(mce,False,Ity_I32,
4775 vatom1,vatom2, atom1,atom2);
4776 } else {
4777 goto cheap_AddSub32;
4780 cheap_AddSub32:
4781 case Iop_Mul32:
4782 return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4784 case Iop_CmpORD32S:
4785 case Iop_CmpORD32U:
4786 case Iop_CmpORD64S:
4787 case Iop_CmpORD64U:
4788 return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4790 case Iop_Add64:
4791 if (mce->dlbo.dl_Add64 == DLexpensive
4792 || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4793 return expensiveAddSub(mce,True,Ity_I64,
4794 vatom1,vatom2, atom1,atom2);
4795 } else {
4796 goto cheap_AddSub64;
4798 case Iop_Sub64:
4799 if (mce->dlbo.dl_Sub64 == DLexpensive
4800 || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4801 return expensiveAddSub(mce,False,Ity_I64,
4802 vatom1,vatom2, atom1,atom2);
4803 } else {
4804 goto cheap_AddSub64;
4807 cheap_AddSub64:
4808 case Iop_Mul64:
4809 return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4811 case Iop_Mul16:
4812 case Iop_Add16:
4813 case Iop_Sub16:
4814 return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4816 case Iop_Mul8:
4817 case Iop_Sub8:
4818 case Iop_Add8:
4819 return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4821 ////---- CmpXX64
4822 case Iop_CmpEQ64: case Iop_CmpNE64:
4823 if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4824 goto expensive_cmp64;
4825 else
4826 goto cheap_cmp64;
4828 expensive_cmp64:
4829 case Iop_ExpCmpNE64:
4830 return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4832 cheap_cmp64:
4833 case Iop_CmpLE64S: case Iop_CmpLE64U:
4834 case Iop_CmpLT64U: case Iop_CmpLT64S:
4835 return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4837 ////---- CmpXX32
4838 case Iop_CmpEQ32: case Iop_CmpNE32:
4839 if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4840 goto expensive_cmp32;
4841 else
4842 goto cheap_cmp32;
4844 expensive_cmp32:
4845 case Iop_ExpCmpNE32:
4846 return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4848 cheap_cmp32:
4849 case Iop_CmpLE32S: case Iop_CmpLE32U:
4850 case Iop_CmpLT32U: case Iop_CmpLT32S:
4851 return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4853 ////---- CmpXX16
4854 case Iop_CmpEQ16: case Iop_CmpNE16:
4855 if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4856 goto expensive_cmp16;
4857 else
4858 goto cheap_cmp16;
4860 expensive_cmp16:
4861 case Iop_ExpCmpNE16:
4862 return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4864 cheap_cmp16:
4865 return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4867 ////---- CmpXX8
4868 case Iop_CmpEQ8: case Iop_CmpNE8:
4869 if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4870 goto expensive_cmp8;
4871 else
4872 goto cheap_cmp8;
4874 expensive_cmp8:
4875 return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4877 cheap_cmp8:
4878 return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4880 ////---- end CmpXX{64,32,16,8}
4882 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
4883 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4884 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4885 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4886 /* Just say these all produce a defined result, regardless
4887 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4888 return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4890 case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4891 return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4893 case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4894 return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4896 case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4897 return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4899 case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4900 return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4902 case Iop_AndV256:
4903 uifu = mkUifUV256; difd = mkDifDV256;
4904 and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4905 case Iop_AndV128:
4906 uifu = mkUifUV128; difd = mkDifDV128;
4907 and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4908 case Iop_And64:
4909 uifu = mkUifU64; difd = mkDifD64;
4910 and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4911 case Iop_And32:
4912 uifu = mkUifU32; difd = mkDifD32;
4913 and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4914 case Iop_And16:
4915 uifu = mkUifU16; difd = mkDifD16;
4916 and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4917 case Iop_And8:
4918 uifu = mkUifU8; difd = mkDifD8;
4919 and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4920 case Iop_And1:
4921 uifu = mkUifU1; difd = mkDifD1;
4922 and_or_ty = Ity_I1; improve = mkImproveAND1; goto do_And_Or;
4924 case Iop_OrV256:
4925 uifu = mkUifUV256; difd = mkDifDV256;
4926 and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4927 case Iop_OrV128:
4928 uifu = mkUifUV128; difd = mkDifDV128;
4929 and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4930 case Iop_Or64:
4931 uifu = mkUifU64; difd = mkDifD64;
4932 and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4933 case Iop_Or32:
4934 uifu = mkUifU32; difd = mkDifD32;
4935 and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4936 case Iop_Or16:
4937 uifu = mkUifU16; difd = mkDifD16;
4938 and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4939 case Iop_Or8:
4940 uifu = mkUifU8; difd = mkDifD8;
4941 and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4942 case Iop_Or1:
4943 uifu = mkUifU1; difd = mkDifD1;
4944 and_or_ty = Ity_I1; improve = mkImproveOR1; goto do_And_Or;
4946 do_And_Or:
4947 return assignNew('V', mce, and_or_ty,
4948 difd(mce, uifu(mce, vatom1, vatom2),
4949 difd(mce, improve(mce, atom1, vatom1),
4950 improve(mce, atom2, vatom2) ) ) );
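/* Worked example for the And/Or scheme above (illustration only; the
   values are hypothetical), for Iop_And8 with 0 = defined, 1 = undefined
   in the shadows:
       atom1 = 0x0F, vatom1 = 0xF0   (low nibble defined ones)
       atom2 = 0x00, vatom2 = 0x0F   (high nibble defined zeroes)
   then
       uifu(v1,v2)              = 0xFF
       improve(a1,v1) = a1 | v1 = 0xFF
       improve(a2,v2) = a2 | v2 = 0x0F
       difd of the three        = 0x0F
   i.e. the high nibble of the result is defined, because atom2 supplies
   defined zeroes there and (x & 0) is 0 whatever x is; only the low
   nibble, where atom2's undefined bits actually matter, stays undefined.
   For the Or cases the improvement term is (~a | v) instead, so a
   defined one forces a defined one in the result. */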
4952 case Iop_Xor8:
4953 return mkUifU8(mce, vatom1, vatom2);
4954 case Iop_Xor16:
4955 return mkUifU16(mce, vatom1, vatom2);
4956 case Iop_Xor32:
4957 return mkUifU32(mce, vatom1, vatom2);
4958 case Iop_Xor64:
4959 return mkUifU64(mce, vatom1, vatom2);
4960 case Iop_XorV128:
4961 return mkUifUV128(mce, vatom1, vatom2);
4962 case Iop_XorV256:
4963 return mkUifUV256(mce, vatom1, vatom2);
4965 /* V256-bit SIMD */
4967 case Iop_ShrN16x16:
4968 case Iop_ShrN32x8:
4969 case Iop_ShrN64x4:
4970 case Iop_SarN16x16:
4971 case Iop_SarN32x8:
4972 case Iop_ShlN16x16:
4973 case Iop_ShlN32x8:
4974 case Iop_ShlN64x4:
4975 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4976 this is wrong now, scalar shifts are done properly lazily.
4977 Vector shifts should be fixed too. */
4978 complainIfUndefined(mce, atom2, NULL);
4979 return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4981 case Iop_QSub8Ux32:
4982 case Iop_QSub8Sx32:
4983 case Iop_Sub8x32:
4984 case Iop_Min8Ux32:
4985 case Iop_Min8Sx32:
4986 case Iop_Max8Ux32:
4987 case Iop_Max8Sx32:
4988 case Iop_CmpGT8Sx32:
4989 case Iop_CmpEQ8x32:
4990 case Iop_Avg8Ux32:
4991 case Iop_QAdd8Ux32:
4992 case Iop_QAdd8Sx32:
4993 case Iop_Add8x32:
4994 return binary8Ix32(mce, vatom1, vatom2);
4996 case Iop_QSub16Ux16:
4997 case Iop_QSub16Sx16:
4998 case Iop_Sub16x16:
4999 case Iop_Mul16x16:
5000 case Iop_MulHi16Sx16:
5001 case Iop_MulHi16Ux16:
5002 case Iop_Min16Sx16:
5003 case Iop_Min16Ux16:
5004 case Iop_Max16Sx16:
5005 case Iop_Max16Ux16:
5006 case Iop_CmpGT16Sx16:
5007 case Iop_CmpEQ16x16:
5008 case Iop_Avg16Ux16:
5009 case Iop_QAdd16Ux16:
5010 case Iop_QAdd16Sx16:
5011 case Iop_Add16x16:
5012 return binary16Ix16(mce, vatom1, vatom2);
5014 case Iop_Sub32x8:
5015 case Iop_CmpGT32Sx8:
5016 case Iop_CmpEQ32x8:
5017 case Iop_Add32x8:
5018 case Iop_Max32Ux8:
5019 case Iop_Max32Sx8:
5020 case Iop_Min32Ux8:
5021 case Iop_Min32Sx8:
5022 case Iop_Mul32x8:
5023 return binary32Ix8(mce, vatom1, vatom2);
5025 case Iop_Sub64x4:
5026 case Iop_Add64x4:
5027 case Iop_CmpEQ64x4:
5028 case Iop_CmpGT64Sx4:
5029 return binary64Ix4(mce, vatom1, vatom2);
5031 case Iop_I32StoF32x8:
5032 case Iop_F32toI32Sx8:
5033 return unary32Fx8_w_rm(mce, vatom1, vatom2);
5035 /* Perm32x8: rearrange values in left arg using steering values
5036 from right arg. So rearrange the vbits in the same way but
5037 pessimise wrt steering values. */
5038 case Iop_Perm32x8:
5039 return mkUifUV256(
5040 mce,
5041 assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
5042 mkPCast32x8(mce, vatom2)
5045 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
5046 Handle the shifted results in the same way that other
5047 binary Q ops are handled, eg QSub: UifU the two args,
5048 then pessimise -- which is binaryNIxM. But for the upper
5049 V128, we need to generate just 1 bit, which is the
5050 pessimised shift result, with 127 defined zeroes above it.
5052 Note that this is overly pessimistic in that in fact only the
5053 bottom 8 bits of each lane of the second arg determine the shift
5054 amount. Really we ought to ignore any undefinedness in the
5055 rest of the lanes of the second arg. */
5056 case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
5057 case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
5058 case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
5059 case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
5060 case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
5061 case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
5062 case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
5063 case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
5065 // The function to generate the pessimised shift result
5066 IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
5067 switch (op) {
5068 case Iop_QandSQsh64x2:
5069 case Iop_QandUQsh64x2:
5070 case Iop_QandSQRsh64x2:
5071 case Iop_QandUQRsh64x2:
5072 binaryNIxM = binary64Ix2;
5073 break;
5074 case Iop_QandSQsh32x4:
5075 case Iop_QandUQsh32x4:
5076 case Iop_QandSQRsh32x4:
5077 case Iop_QandUQRsh32x4:
5078 binaryNIxM = binary32Ix4;
5079 break;
5080 case Iop_QandSQsh16x8:
5081 case Iop_QandUQsh16x8:
5082 case Iop_QandSQRsh16x8:
5083 case Iop_QandUQRsh16x8:
5084 binaryNIxM = binary16Ix8;
5085 break;
5086 case Iop_QandSQsh8x16:
5087 case Iop_QandUQsh8x16:
5088 case Iop_QandSQRsh8x16:
5089 case Iop_QandUQRsh8x16:
5090 binaryNIxM = binary8Ix16;
5091 break;
5092 default:
5093 tl_assert(0);
5095 tl_assert(binaryNIxM);
5096 // Pessimised shift result, shV[127:0]
5097 IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
5098 // Generates: Def--(127)--Def PCast-to-I1(shV)
5099 IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
5100 // and assemble the result
5101 return assignNew('V', mce, Ity_V256,
5102 binop(Iop_V128HLtoV256, qV, shV));
5105 case Iop_F32toF16x4: {
5106 // First, PCast the input vector, retaining the 32x4 format.
5107 IRAtom* pcasted = mkPCast32x4(mce, vatom2); // :: 32x4
5108 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
5109 // the input, we're not going to lose any information.
5110 IRAtom* pcHI64
5111 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, pcasted));//32x2
5112 IRAtom* pcLO64
5113 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, pcasted)); // 32x2
5114 IRAtom* narrowed
5115 = assignNew('V', mce, Ity_I64, binop(Iop_NarrowBin32to16x4,
5116 pcHI64, pcLO64)); // 16x4
5117 // Finally, roll in any badness from the rounding mode.
5118 IRAtom* rmPCasted = mkPCastTo(mce, Ity_I64, vatom1);
5119 return mkUifU64(mce, narrowed, rmPCasted);
5122 case Iop_F32toF16x8: {
5123 // Same scheme as for Iop_F32toF16x4.
5124 IRAtom* pcasted = mkPCast32x8(mce, vatom2); // :: 32x8
5125 IRAtom* pcHI128
5126 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_1,
5127 pcasted)); // 32x4
5128 IRAtom* pcLO128
5129 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_0,
5130 pcasted)); // 32x4
5131 IRAtom* narrowed
5132 = assignNew('V', mce, Ity_V128, binop(Iop_NarrowBin32to16x8,
5133 pcHI128, pcLO128)); // 16x8
5134 // Finally, roll in any badness from the rounding mode.
5135 IRAtom* rmPCasted = mkPCastTo(mce, Ity_V128, vatom1);
5136 return mkUifUV128(mce, narrowed, rmPCasted);
5139 default:
5140 ppIROp(op);
5141 VG_(tool_panic)("memcheck:expr2vbits_Binop");
5146 static
5147 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
5149 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
5150 selection of shadow operation implicitly duplicates the logic in
5151 do_shadow_LoadG and should be kept in sync (in the very unlikely
5152 event that the interpretation of such widening ops changes in
5153 future). See comment in do_shadow_LoadG. */
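/* Example of the self-shadowing treatment of widening ops (descriptive
   comment only): for Iop_8Uto32 the shadow is just Iop_8Uto32 applied
   to the operand's shadow, so the 24 freshly created high bits get
   shadow 0, i.e. they are reported as defined -- which matches the
   concrete semantics, since zero-extension really does produce known
   zero bits.  For the signed widenings (e.g. Iop_8Sto32) the replicated
   sign bits inherit the shadow of the original sign bit, again matching
   how the concrete value was produced. */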
5154 IRAtom* vatom = expr2vbits( mce, atom, HuOth );
5155 tl_assert(isOriginalAtom(mce,atom));
5156 switch (op) {
5158 case Iop_Abs64Fx2:
5159 case Iop_Neg64Fx2:
5160 case Iop_RSqrtEst64Fx2:
5161 case Iop_RecipEst64Fx2:
5162 case Iop_Log2_64Fx2:
5163 return unary64Fx2(mce, vatom);
5165 case Iop_Sqrt64F0x2:
5166 return unary64F0x2(mce, vatom);
5168 case Iop_Sqrt32Fx8:
5169 case Iop_RSqrtEst32Fx8:
5170 case Iop_RecipEst32Fx8:
5171 return unary32Fx8(mce, vatom);
5173 case Iop_Sqrt64Fx4:
5174 return unary64Fx4(mce, vatom);
5176 case Iop_RecipEst32Fx4:
5177 case Iop_I32UtoF32x4_DEP:
5178 case Iop_I32StoF32x4_DEP:
5179 case Iop_QF32toI32Ux4_RZ:
5180 case Iop_QF32toI32Sx4_RZ:
5181 case Iop_RoundF32x4_RM:
5182 case Iop_RoundF32x4_RP:
5183 case Iop_RoundF32x4_RN:
5184 case Iop_RoundF32x4_RZ:
5185 case Iop_RecipEst32Ux4:
5186 case Iop_Abs32Fx4:
5187 case Iop_Neg32Fx4:
5188 case Iop_RSqrtEst32Fx4:
5189 case Iop_Log2_32Fx4:
5190 case Iop_Exp2_32Fx4:
5191 return unary32Fx4(mce, vatom);
5193 case Iop_I32UtoF32x2_DEP:
5194 case Iop_I32StoF32x2_DEP:
5195 case Iop_RecipEst32Fx2:
5196 case Iop_RecipEst32Ux2:
5197 case Iop_Abs32Fx2:
5198 case Iop_Neg32Fx2:
5199 case Iop_RSqrtEst32Fx2:
5200 return unary32Fx2(mce, vatom);
5202 case Iop_Sqrt32F0x4:
5203 case Iop_RSqrtEst32F0x4:
5204 case Iop_RecipEst32F0x4:
5205 return unary32F0x4(mce, vatom);
5207 case Iop_Abs16Fx8:
5208 case Iop_Neg16Fx8:
5209 return unary16Fx8(mce, vatom);
5211 // These are self-shadowing.
5212 case Iop_32UtoV128:
5213 case Iop_64UtoV128:
5214 case Iop_Dup8x16:
5215 case Iop_Dup16x8:
5216 case Iop_Dup32x4:
5217 case Iop_Reverse1sIn8_x16:
5218 case Iop_Reverse8sIn16_x8:
5219 case Iop_Reverse8sIn32_x4:
5220 case Iop_Reverse16sIn32_x4:
5221 case Iop_Reverse8sIn64_x2:
5222 case Iop_Reverse16sIn64_x2:
5223 case Iop_Reverse32sIn64_x2:
5224 case Iop_V256toV128_1: case Iop_V256toV128_0:
5225 case Iop_ZeroHI64ofV128:
5226 case Iop_ZeroHI96ofV128:
5227 case Iop_ZeroHI112ofV128:
5228 case Iop_ZeroHI120ofV128:
5229 case Iop_ReinterpI128asV128: /* I128 -> V128 */
5230 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5232 case Iop_F128HItoF64: /* F128 -> high half of F128 */
5233 case Iop_D128HItoD64: /* D128 -> high half of D128 */
5234 return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
5236 case Iop_F128LOtoF64: /* F128 -> low half of F128 */
5237 case Iop_D128LOtoD64: /* D128 -> low half of D128 */
5238 return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
5240 case Iop_NegF128:
5241 case Iop_AbsF128:
5242 case Iop_RndF128:
5243 case Iop_TruncF128toI128S: /* F128 -> I128S */
5244 case Iop_TruncF128toI128U: /* F128 -> I128U */
5245 case Iop_ReinterpV128asI128: /* V128 -> I128 */
5246 case Iop_ReinterpI128asF128:
5247 case Iop_ReinterpF128asI128:
5248 return mkPCastTo(mce, Ity_I128, vatom);
5250 case Iop_BCD128toI128S:
5251 case Iop_MulI128by10:
5252 case Iop_MulI128by10Carry:
5253 case Iop_F16toF64x2:
5254 case Iop_F64toF16x2_DEP:
5255 // FIXME JRS 2018-Nov-15. This is surely not correct!
5256 return vatom;
5258 case Iop_ReinterpI32asF32:
5259 case Iop_ReinterpF32asI32:
5260 return assignNew('V', mce, Ity_I32, vatom);
5262 case Iop_ReinterpF64asI64:
5263 case Iop_ReinterpI64asF64:
5264 case Iop_ReinterpI64asD64:
5265 case Iop_ReinterpD64asI64:
5266 return assignNew('V', mce, Ity_I64, vatom);
5268 case Iop_I32StoF128: /* signed I32 -> F128 */
5269 case Iop_I64StoF128: /* signed I64 -> F128 */
5270 case Iop_I32UtoF128: /* unsigned I32 -> F128 */
5271 case Iop_I64UtoF128: /* unsigned I64 -> F128 */
5272 case Iop_F32toF128: /* F32 -> F128 */
5273 case Iop_F64toF128: /* F64 -> F128 */
5274 case Iop_I32StoD128: /* signed I32 -> D128 */
5275 case Iop_I64StoD128: /* signed I64 -> D128 */
5276 case Iop_I32UtoD128: /* unsigned I32 -> D128 */
5277 case Iop_I64UtoD128: /* unsigned I64 -> D128 */
5278 return mkPCastTo(mce, Ity_I128, vatom);
5280 case Iop_F16toF64:
5281 case Iop_F32toF64:
5282 case Iop_I32StoF64:
5283 case Iop_I32UtoF64:
5284 case Iop_NegF64:
5285 case Iop_AbsF64:
5286 case Iop_RSqrtEst5GoodF64:
5287 case Iop_RoundF64toF64_NEAREST:
5288 case Iop_RoundF64toF64_NegINF:
5289 case Iop_RoundF64toF64_PosINF:
5290 case Iop_RoundF64toF64_ZERO:
5291 case Iop_D32toD64:
5292 case Iop_I32StoD64:
5293 case Iop_I32UtoD64:
5294 case Iop_ExtractExpD64: /* D64 -> I64 */
5295 case Iop_ExtractExpD128: /* D128 -> I64 */
5296 case Iop_ExtractSigD64: /* D64 -> I64 */
5297 case Iop_ExtractSigD128: /* D128 -> I64 */
5298 case Iop_DPBtoBCD:
5299 case Iop_BCDtoDPB:
5300 return mkPCastTo(mce, Ity_I64, vatom);
5302 case Iop_D64toD128:
5303 return mkPCastTo(mce, Ity_I128, vatom);
5305 case Iop_TruncF64asF32:
5306 case Iop_NegF32:
5307 case Iop_AbsF32:
5308 case Iop_F16toF32:
5309 return mkPCastTo(mce, Ity_I32, vatom);
5311 case Iop_AbsF16:
5312 case Iop_NegF16:
5313 return mkPCastTo(mce, Ity_I16, vatom);
5315 case Iop_Ctz32: case Iop_CtzNat32:
5316 case Iop_Ctz64: case Iop_CtzNat64:
5317 return expensiveCountTrailingZeroes(mce, op, atom, vatom);
5319 case Iop_Clz32: case Iop_ClzNat32:
5320 case Iop_Clz64: case Iop_ClzNat64:
5321 return expensiveCountLeadingZeroes(mce, op, atom, vatom);
5323 // PopCount32: this is slightly pessimistic. It is true that the
5324 // result depends on all input bits, so that aspect of the PCast is
5325 // correct. However, regardless of the input, only the lowest 5 bits
5326 // out of the output can ever be undefined. So we could actually
5327 // "improve" the results here by marking the top 27 bits of output as
5328 // defined. A similar comment applies for PopCount64.
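      // A minimal sketch of the improvement suggested above (not enabled
      // here; the 0x1F mask merely mirrors the "lowest 5 bits" claim in
      // this comment) would AND the PCast result with a low-bits mask,
      // forcing the upper V bits to "defined":
      //    assignNew('V', mce, Ity_I32,
      //              binop(Iop_And32,
      //                    mkPCastTo(mce, Ity_I32, vatom), mkU32(0x1F)));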
5329 case Iop_PopCount32:
5330 return mkPCastTo(mce, Ity_I32, vatom);
5331 case Iop_PopCount64:
5332 return mkPCastTo(mce, Ity_I64, vatom);
5334 // These are self-shadowing.
5335 case Iop_1Uto64:
5336 case Iop_1Sto64:
5337 case Iop_8Uto64:
5338 case Iop_8Sto64:
5339 case Iop_16Uto64:
5340 case Iop_16Sto64:
5341 case Iop_32Sto64:
5342 case Iop_32Uto64:
5343 case Iop_V128to64:
5344 case Iop_V128HIto64:
5345 case Iop_128HIto64:
5346 case Iop_128to64:
5347 case Iop_Dup8x8:
5348 case Iop_Dup16x4:
5349 case Iop_Dup32x2:
5350 case Iop_Reverse8sIn16_x4:
5351 case Iop_Reverse8sIn32_x2:
5352 case Iop_Reverse16sIn32_x2:
5353 case Iop_Reverse8sIn64_x1:
5354 case Iop_Reverse16sIn64_x1:
5355 case Iop_Reverse32sIn64_x1:
5356 case Iop_V256to64_0: case Iop_V256to64_1:
5357 case Iop_V256to64_2: case Iop_V256to64_3:
5358 return assignNew('V', mce, Ity_I64, unop(op, vatom));
5360 // These are self-shadowing.
5361 case Iop_64to32:
5362 case Iop_64HIto32:
5363 case Iop_1Uto32:
5364 case Iop_1Sto32:
5365 case Iop_8Uto32:
5366 case Iop_16Uto32:
5367 case Iop_16Sto32:
5368 case Iop_8Sto32:
5369 case Iop_V128to32:
5370 case Iop_Reverse8sIn32_x1:
5371 return assignNew('V', mce, Ity_I32, unop(op, vatom));
5373 // These are self-shadowing.
5374 case Iop_1Sto16:
5375 case Iop_8Sto16:
5376 case Iop_8Uto16:
5377 case Iop_32to16:
5378 case Iop_32HIto16:
5379 case Iop_64to16:
5380 case Iop_GetMSBs8x16:
5381 return assignNew('V', mce, Ity_I16, unop(op, vatom));
5383 // These are self-shadowing.
5384 case Iop_1Uto8:
5385 case Iop_1Sto8:
5386 case Iop_16to8:
5387 case Iop_16HIto8:
5388 case Iop_32to8:
5389 case Iop_64to8:
5390 case Iop_GetMSBs8x8:
5391 return assignNew('V', mce, Ity_I8, unop(op, vatom));
5393 case Iop_32to1:
5394 return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
5396 case Iop_64to1:
5397 return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
5399 case Iop_NotV256:
5400 case Iop_NotV128:
5401 case Iop_Not64:
5402 case Iop_Not32:
5403 case Iop_Not16:
5404 case Iop_Not8:
5405 case Iop_Not1:
5406 // FIXME JRS 2018-Nov-15. This is surely not correct!
5407 return vatom;
5409 case Iop_CmpNEZ8x8:
5410 case Iop_Cnt8x8:
5411 case Iop_Clz8x8:
5412 case Iop_Cls8x8:
5413 case Iop_Abs8x8:
5414 return mkPCast8x8(mce, vatom);
5416 case Iop_CmpNEZ8x16:
5417 case Iop_Cnt8x16:
5418 case Iop_Clz8x16:
5419 case Iop_Cls8x16:
5420 case Iop_Abs8x16:
5421 case Iop_Ctz8x16:
5422 return mkPCast8x16(mce, vatom);
5424 case Iop_CmpNEZ16x4:
5425 case Iop_Clz16x4:
5426 case Iop_Cls16x4:
5427 case Iop_Abs16x4:
5428 return mkPCast16x4(mce, vatom);
5430 case Iop_CmpNEZ16x8:
5431 case Iop_Clz16x8:
5432 case Iop_Cls16x8:
5433 case Iop_Abs16x8:
5434 case Iop_Ctz16x8:
5435 return mkPCast16x8(mce, vatom);
5437 case Iop_CmpNEZ32x2:
5438 case Iop_Clz32x2:
5439 case Iop_Cls32x2:
5440 case Iop_F32toI32Ux2_RZ:
5441 case Iop_F32toI32Sx2_RZ:
5442 case Iop_Abs32x2:
5443 return mkPCast32x2(mce, vatom);
5445 case Iop_CmpNEZ32x4:
5446 case Iop_Clz32x4:
5447 case Iop_Cls32x4:
5448 case Iop_F32toI32Ux4_RZ:
5449 case Iop_F32toI32Sx4_RZ:
5450 case Iop_Abs32x4:
5451 case Iop_RSqrtEst32Ux4:
5452 case Iop_Ctz32x4:
5453 return mkPCast32x4(mce, vatom);
5455 case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
5456 case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
5457 case Iop_CmpwNEZ32:
5458 return mkPCastTo(mce, Ity_I32, vatom);
5460 case Iop_TruncF128toI64S: /* F128 -> I64S */
5461 case Iop_TruncF128toI64U: /* F128 -> I64U */
5462 case Iop_CmpwNEZ64:
5463 return mkPCastTo(mce, Ity_I64, vatom);
5465 case Iop_CmpNEZ64x2:
5466 case Iop_CipherSV128:
5467 case Iop_Clz64x2:
5468 case Iop_Abs64x2:
5469 case Iop_Ctz64x2:
5470 return mkPCast64x2(mce, vatom);
5472 // This is self-shadowing.
5473 case Iop_PwBitMtxXpose64x2:
5474 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5476 case Iop_NarrowUn16to8x8:
5477 case Iop_NarrowUn32to16x4:
5478 case Iop_NarrowUn64to32x2:
5479 case Iop_QNarrowUn16Sto8Sx8:
5480 case Iop_QNarrowUn16Sto8Ux8:
5481 case Iop_QNarrowUn16Uto8Ux8:
5482 case Iop_QNarrowUn32Sto16Sx4:
5483 case Iop_QNarrowUn32Sto16Ux4:
5484 case Iop_QNarrowUn32Uto16Ux4:
5485 case Iop_QNarrowUn64Sto32Sx2:
5486 case Iop_QNarrowUn64Sto32Ux2:
5487 case Iop_QNarrowUn64Uto32Ux2:
5488 return vectorNarrowUnV128(mce, op, vatom);
5490 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5491 // right.
5492 case Iop_F32toF16x4_DEP:
5493 return vectorNarrowUnV128(mce, op, vatom);
5495 case Iop_Widen8Sto16x8:
5496 case Iop_Widen8Uto16x8:
5497 case Iop_Widen16Sto32x4:
5498 case Iop_Widen16Uto32x4:
5499 case Iop_Widen32Sto64x2:
5500 case Iop_Widen32Uto64x2:
5501 return vectorWidenI64(mce, op, vatom);
5503 case Iop_F16toF32x4:
5504 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5505 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5506 // will generate an output 32 bits with at least one 1 bit
5507 // set if there's one or more 1 bits set in the input 16 bits. More
5508 // correct code for this is just below, but commented out, so as to
5509 // avoid short-term backend failures on targets that can't do
5510 // Iop_Interleave{LO,HI}16x4.
5511 return vectorWidenI64(mce, op, vatom);
5513 case Iop_F16toF32x8: {
5514 // PCast the input at 16x8. This makes each lane hold either all
5515 // zeroes or all ones.
5516 IRAtom* pcasted = mkPCast16x8(mce, vatom); // :: I16x8
5517 // Now double the width of each lane to 32 bits. Because the lanes are
5518 // all zeroes or all ones, we can just copy each lane twice into
5519 // the result. Here's the low half:
5520 IRAtom* widenedLO // :: I32x4
5521 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveLO16x8,
5522 pcasted, pcasted));
5523 // And the high half:
5524 IRAtom* widenedHI // :: I32x4
5525 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveHI16x8,
5526 pcasted, pcasted));
5527 // Glue them back together:
5528 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
5529 widenedHI, widenedLO));
5532 // See comment just above, for Iop_F16toF32x4
5533 //case Iop_F16toF32x4: {
5534 // // Same scheme as F16toF32x4
5535 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5536 // IRAtom* widenedLO // :: I32x2
5537 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5538 // pcasted, pcasted));
5539 // IRAtom* widenedHI // :: I32x4
5540 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5541 // pcasted, pcasted));
5542 // // Glue them back together:
5543 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5544 // widenedHI, widenedLO));
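      // For the pairwise add-long (PwAddL) cases below, the scheme appears
      // to be: pessimise each input lane (PCast at the input lane width),
      // run the real op over that, then pessimise again at the output lane
      // width, so any undefined bit in either input lane taints the whole
      // corresponding output lane.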
5547 case Iop_PwAddL32Ux2:
5548 case Iop_PwAddL32Sx2:
5549 return mkPCastTo(mce, Ity_I64,
5550 assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
5552 case Iop_PwAddL16Ux4:
5553 case Iop_PwAddL16Sx4:
5554 return mkPCast32x2(mce,
5555 assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
5557 case Iop_PwAddL8Ux8:
5558 case Iop_PwAddL8Sx8:
5559 return mkPCast16x4(mce,
5560 assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
5562 case Iop_PwAddL32Ux4:
5563 case Iop_PwAddL32Sx4:
5564 return mkPCast64x2(mce,
5565 assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
5567 case Iop_PwAddL64Ux2:
5568 return mkPCast128x1(mce,
5569 assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom))));
5571 case Iop_PwAddL16Ux8:
5572 case Iop_PwAddL16Sx8:
5573 return mkPCast32x4(mce,
5574 assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
5576 case Iop_PwAddL8Ux16:
5577 case Iop_PwAddL8Sx16:
5578 return mkPCast16x8(mce,
5579 assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
5581 case Iop_I64UtoF32:
5582 default:
5583 ppIROp(op);
5584 VG_(tool_panic)("memcheck:expr2vbits_Unop");
5589 /* Worker function -- do not call directly. See comments on
5590 expr2vbits_Load for the meaning of |guard|.
5592 Generates IR to (1) perform a definedness test of |addr|, (2)
5593 perform a validity test of |addr|, and (3) return the Vbits for the
5594 location indicated by |addr|. All of this only happens when
5595 |guard| is NULL or |guard| evaluates to True at run time.
5597 If |guard| evaluates to False at run time, the returned value is
5598 the IR-mandated 0x55..55 value, and no checks nor shadow loads are
5599 performed.
5601 The definedness of |guard| itself is not checked. That is assumed
5602 to have been done before this point, by the caller. */
5603 static
5604 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
5605 IREndness end, IRType ty,
5606 IRAtom* addr, UInt bias, IRAtom* guard )
5608 tl_assert(isOriginalAtom(mce,addr));
5609 tl_assert(end == Iend_LE || end == Iend_BE);
5611 /* First, emit a definedness test for the address. This also sets
5612 the address (shadow) to 'defined' following the test. */
5613 complainIfUndefined( mce, addr, guard );
5615 /* Now cook up a call to the relevant helper function, to read the data V
5616 bits from shadow memory. Note that I128 loads are done by pretending
5617 we're doing a V128 load, and then converting the resulting V128 vbits
5618 word to an I128, right at the end of this function -- see `castedToI128`
5619 below. (It's only a minor hack :-) This pertains to bug 444399. */
5620 ty = shadowTypeV(ty);
5622 void* helper = NULL;
5623 const HChar* hname = NULL;
5624 Bool ret_via_outparam = False;
5626 if (end == Iend_LE) {
5627 switch (ty) {
5628 case Ity_V256: helper = &MC_(helperc_LOADV256le);
5629 hname = "MC_(helperc_LOADV256le)";
5630 ret_via_outparam = True;
5631 break;
5632 case Ity_I128: // fallthrough. See comment above.
5633 case Ity_V128: helper = &MC_(helperc_LOADV128le);
5634 hname = "MC_(helperc_LOADV128le)";
5635 ret_via_outparam = True;
5636 break;
5637 case Ity_I64: helper = &MC_(helperc_LOADV64le);
5638 hname = "MC_(helperc_LOADV64le)";
5639 break;
5640 case Ity_I32: helper = &MC_(helperc_LOADV32le);
5641 hname = "MC_(helperc_LOADV32le)";
5642 break;
5643 case Ity_I16: helper = &MC_(helperc_LOADV16le);
5644 hname = "MC_(helperc_LOADV16le)";
5645 break;
5646 case Ity_I8: helper = &MC_(helperc_LOADV8);
5647 hname = "MC_(helperc_LOADV8)";
5648 break;
5649 default: ppIRType(ty);
5650 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5652 } else {
5653 switch (ty) {
5654 case Ity_V256: helper = &MC_(helperc_LOADV256be);
5655 hname = "MC_(helperc_LOADV256be)";
5656 ret_via_outparam = True;
5657 break;
5658 case Ity_V128: helper = &MC_(helperc_LOADV128be);
5659 hname = "MC_(helperc_LOADV128be)";
5660 ret_via_outparam = True;
5661 break;
5662 case Ity_I64: helper = &MC_(helperc_LOADV64be);
5663 hname = "MC_(helperc_LOADV64be)";
5664 break;
5665 case Ity_I32: helper = &MC_(helperc_LOADV32be);
5666 hname = "MC_(helperc_LOADV32be)";
5667 break;
5668 case Ity_I16: helper = &MC_(helperc_LOADV16be);
5669 hname = "MC_(helperc_LOADV16be)";
5670 break;
5671 case Ity_I8: helper = &MC_(helperc_LOADV8);
5672 hname = "MC_(helperc_LOADV8)";
5673 break;
5674 default: ppIRType(ty);
5675 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5679 tl_assert(helper);
5680 tl_assert(hname);
5682 /* Generate the actual address into addrAct. */
5683 IRAtom* addrAct;
5684 if (bias == 0) {
5685 addrAct = addr;
5686 } else {
5687 IROp mkAdd;
5688 IRAtom* eBias;
5689 IRType tyAddr = mce->hWordTy;
5690 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5691 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5692 eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5693 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5696 /* We need to have a place to park the V bits we're just about to
5697 read. */
5698 IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
5700 /* Here's the call. */
5701 IRDirty* di;
5702 if (ret_via_outparam) {
5703 di = unsafeIRDirty_1_N( datavbits,
5704 2/*regparms*/,
5705 hname, VG_(fnptr_to_fnentry)( helper ),
5706 mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5707 } else {
5708 di = unsafeIRDirty_1_N( datavbits,
5709 1/*regparms*/,
5710 hname, VG_(fnptr_to_fnentry)( helper ),
5711 mkIRExprVec_1( addrAct ) );
5714 setHelperAnns( mce, di );
5715 if (guard) {
5716 di->guard = guard;
5717 /* Ideally the didn't-happen return value here would be all-ones
5718 (all-undefined), so it'd be obvious if it got used
5719 inadvertently. We can get by with the IR-mandated default
5720 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5721 undefined if it ever leaks out. */
5723 stmt( 'V', mce, IRStmt_Dirty(di) );
5725 if (ty == Ity_I128) {
5726 IRAtom* castedToI128
5727 = assignNew('V', mce, Ity_I128,
5728 unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
5729 return castedToI128;
5730 } else {
5731 return mkexpr(datavbits);
5736 /* Generate IR to do a shadow load. The helper is expected to check
5737 the validity of the address and return the V bits for that address.
5738 This can optionally be controlled by a guard, which is assumed to
5739 be True if NULL. In the case where the guard is False at runtime,
5740 the helper will return the didn't-do-the-call value of 0x55..55.
5741 Since that means "completely undefined result", the caller of
5742 this function will need to fix up the result somehow in that
5743 case.
5745 Caller of this function is also expected to have checked the
5746 definedness of |guard| before this point.
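      (expr2vbits_Load_guarded_General, below, is one such caller: when the
      guard is False it replaces the loaded V bits with |valt| via an ITE.)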
5748 static
5749 IRAtom* expr2vbits_Load ( MCEnv* mce,
5750 IREndness end, IRType ty,
5751 IRAtom* addr, UInt bias,
5752 IRAtom* guard )
5754 tl_assert(end == Iend_LE || end == Iend_BE);
5755 switch (shadowTypeV(ty)) {
5756 case Ity_I8:
5757 case Ity_I16:
5758 case Ity_I32:
5759 case Ity_I64:
5760 case Ity_I128:
5761 case Ity_V128:
5762 case Ity_V256:
5763 return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5764 default:
5765 VG_(tool_panic)("expr2vbits_Load");
5770 /* The most general handler for guarded loads. Assumes the
5771 definedness of GUARD has already been checked by the caller. A
5772 GUARD of NULL is assumed to mean "always True". Generates code to
5773 check the definedness and validity of ADDR.
5775 Generate IR to do a shadow load from ADDR and return the V bits.
5776 The loaded type is TY. The loaded data is then (shadow) widened by
5777 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5778 evaluates to False at run time then the returned Vbits are simply
5779 VALT instead. Note therefore that the argument type of VWIDEN must
5780 be TY and the result type of VWIDEN must equal the type of VALT.
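      For example, an 8-bit guarded load widened with Iop_8Uto32 has
      TY = Ity_I8, a widened type of Ity_I32, and therefore VALT must be an
      I32-typed shadow value.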
5782 static
5783 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5784 IREndness end, IRType ty,
5785 IRAtom* addr, UInt bias,
5786 IRAtom* guard,
5787 IROp vwiden, IRAtom* valt )
5789 /* Sanity check the conversion operation, and also set TYWIDE. */
5790 IRType tyWide = Ity_INVALID;
5791 switch (vwiden) {
5792 case Iop_INVALID:
5793 tyWide = ty;
5794 break;
5795 case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5796 tyWide = Ity_I32;
5797 break;
5798 default:
5799 VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5802 /* If the guard evaluates to True, this will hold the loaded V bits
5803 at TY. If the guard evaluates to False, this will be all
5804 ones, meaning "all undefined", in which case we will have to
5805 replace it using an ITE below. */
5806 IRAtom* iftrue1
5807 = assignNew('V', mce, ty,
5808 expr2vbits_Load(mce, end, ty, addr, bias, guard));
5809 /* Now (shadow-) widen the loaded V bits to the desired width. In
5810 the guard-is-False case, the allowable widening operators will
5811 in the worst case (unsigned widening) at least leave the
5812 pre-widened part as being marked all-undefined, and in the best
5813 case (signed widening) mark the whole widened result as
5814 undefined. Anyway, it doesn't matter really, since in this case
5815 we will replace said value with the default value |valt| using an
5816 ITE. */
5817 IRAtom* iftrue2
5818 = vwiden == Iop_INVALID
5819 ? iftrue1
5820 : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5821 /* These are the V bits we will return if the load doesn't take
5822 place. */
5823 IRAtom* iffalse
5824 = valt;
5825 /* Prepare the cond for the ITE. Convert a NULL cond into
5826 something that iropt knows how to fold out later. */
5827 IRAtom* cond
5828 = guard == NULL ? mkU1(1) : guard;
5829 /* And assemble the final result. */
5830 return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
5834 /* A simpler handler for guarded loads, in which there is no
5835 conversion operation, and the default V bit return (when the guard
5836 evaluates to False at runtime) is "all defined". If there is no
5837 guard expression or the guard is always TRUE this function behaves
5838 like expr2vbits_Load. It is assumed that definedness of GUARD has
5839 already been checked at the call site. */
5840 static
5841 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5842 IREndness end, IRType ty,
5843 IRAtom* addr, UInt bias,
5844 IRAtom *guard )
5846 return expr2vbits_Load_guarded_General(
5847 mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5852 static
5853 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5854 IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5856 IRAtom *vbitsC, *vbits0, *vbits1;
5857 IRType ty;
5858 /* Given ITE(cond, iftrue, iffalse), generate
5859 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5860 That is, steer the V bits like the originals, but trash the
5861 result if the steering value is undefined. This gives
5862 lazy propagation. */
5863 tl_assert(isOriginalAtom(mce, cond));
5864 tl_assert(isOriginalAtom(mce, iftrue));
5865 tl_assert(isOriginalAtom(mce, iffalse));
5867 vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5868 vbits1 = expr2vbits(mce, iftrue, HuOth);
5869 vbits0 = expr2vbits(mce, iffalse, HuOth);
5870 ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5872 return
5873 mkUifU(mce, ty, assignNew('V', mce, ty,
5874 IRExpr_ITE(cond, vbits1, vbits0)),
5875 mkPCastTo(mce, ty, vbitsC) );
5878 /* --------- This is the main expression-handling function. --------- */
5880 static
5881 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5882 HowUsed hu/*use HuOth if unknown*/ )
5884 switch (e->tag) {
5886 case Iex_Get:
5887 return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5889 case Iex_GetI:
5890 return shadow_GETI( mce, e->Iex.GetI.descr,
5891 e->Iex.GetI.ix, e->Iex.GetI.bias );
5893 case Iex_RdTmp:
5894 return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5896 case Iex_Const:
5897 return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5899 case Iex_Qop:
5900 return expr2vbits_Qop(
5901 mce,
5902 e->Iex.Qop.details->op,
5903 e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5904 e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5907 case Iex_Triop:
5908 return expr2vbits_Triop(
5909 mce,
5910 e->Iex.Triop.details->op,
5911 e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5912 e->Iex.Triop.details->arg3
5915 case Iex_Binop:
5916 return expr2vbits_Binop(
5917 mce,
5918 e->Iex.Binop.op,
5919 e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5923 case Iex_Unop:
5924 return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5926 case Iex_Load:
5927 return expr2vbits_Load( mce, e->Iex.Load.end,
5928 e->Iex.Load.ty,
5929 e->Iex.Load.addr, 0/*addr bias*/,
5930 NULL/* guard == "always True"*/ );
5932 case Iex_CCall:
5933 return mkLazyN( mce, e->Iex.CCall.args,
5934 e->Iex.CCall.retty,
5935 e->Iex.CCall.cee );
5937 case Iex_ITE:
5938 return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5939 e->Iex.ITE.iffalse);
5941 default:
5942 VG_(printf)("\n");
5943 ppIRExpr(e);
5944 VG_(printf)("\n");
5945 VG_(tool_panic)("memcheck: expr2vbits");
5950 /*------------------------------------------------------------*/
5951 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5952 /*------------------------------------------------------------*/
5954 /* Widen a value to the host word size. */
5956 static
5957 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5959 IRType ty, tyH;
5961 /* vatom is vbits-value and as such can only have a shadow type. */
5962 tl_assert(isShadowAtom(mce,vatom));
5964 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
5965 tyH = mce->hWordTy;
5967 if (tyH == Ity_I32) {
5968 switch (ty) {
5969 case Ity_I32:
5970 return vatom;
5971 case Ity_I16:
5972 return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5973 case Ity_I8:
5974 return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5975 default:
5976 goto unhandled;
5978 } else
5979 if (tyH == Ity_I64) {
5980 switch (ty) {
5981 case Ity_I32:
5982 return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5983 case Ity_I16:
5984 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5985 assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5986 case Ity_I8:
5987 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5988 assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5989 default:
5990 goto unhandled;
5992 } else {
5993 goto unhandled;
5995 unhandled:
5996 VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
5997 VG_(tool_panic)("zwidenToHostWord");
6001 /* Generate a shadow store. |addr| is always the original address
6002 atom. You can pass in either originals or V-bits for the data
6003 atom, but obviously not both. This function generates a check for
6004 the definedness and (indirectly) the validity of |addr|, but only
6005 when |guard| evaluates to True at run time (or is NULL).
6007 |guard| :: Ity_I1 controls whether the store really happens; NULL
6008 means it unconditionally does. Note that |guard| itself is not
6009 checked for definedness; the caller of this function must do that
6010 if necessary.
6012 static
6013 void do_shadow_Store ( MCEnv* mce,
6014 IREndness end,
6015 IRAtom* addr, UInt bias,
6016 IRAtom* data, IRAtom* vdata,
6017 IRAtom* guard )
6019 IROp mkAdd;
6020 IRType ty, tyAddr;
6021 void* helper = NULL;
6022 const HChar* hname = NULL;
6023 IRConst* c;
6025 tyAddr = mce->hWordTy;
6026 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
6027 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
6028 tl_assert( end == Iend_LE || end == Iend_BE );
6030 if (data) {
6031 tl_assert(!vdata);
6032 tl_assert(isOriginalAtom(mce, data));
6033 tl_assert(bias == 0);
6034 vdata = expr2vbits( mce, data, HuOth );
6035 } else {
6036 tl_assert(vdata);
6039 tl_assert(isOriginalAtom(mce,addr));
6040 tl_assert(isShadowAtom(mce,vdata));
6042 if (guard) {
6043 tl_assert(isOriginalAtom(mce, guard));
6044 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
6047 ty = typeOfIRExpr(mce->sb->tyenv, vdata);
6049 // If we're not doing undefined value checking, pretend that this value
6050 // is "all valid". That lets Vex's optimiser remove some of the V bit
6051 // shadow computation ops that precede it.
6052 if (MC_(clo_mc_level) == 1) {
6053 switch (ty) {
6054 case Ity_V256: // V256 weirdness -- used four times
6055 c = IRConst_V256(V_BITS32_DEFINED); break;
6056 case Ity_V128: // V128 weirdness -- used twice
6057 c = IRConst_V128(V_BITS16_DEFINED); break;
6058 case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
6059 case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break;
6060 case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break;
6061 case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break;
6062 case Ity_I8: c = IRConst_U8 (V_BITS8_DEFINED); break;
6063 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6065 vdata = IRExpr_Const( c );
6068 /* First, emit a definedness test for the address. This also sets
6069 the address (shadow) to 'defined' following the test. Both of
6070 those actions are gated on |guard|. */
6071 complainIfUndefined( mce, addr, guard );
6073 /* Now decide which helper function to call to write the data V
6074 bits into shadow memory. */
6075 if (end == Iend_LE) {
6076 switch (ty) {
6077 case Ity_V256: /* we'll use the helper four times */
6078 case Ity_V128: /* we'll use the helper twice */
6079 case Ity_I128: /* we'll use the helper twice */
6080 case Ity_I64: helper = &MC_(helperc_STOREV64le);
6081 hname = "MC_(helperc_STOREV64le)";
6082 break;
6083 case Ity_I32: helper = &MC_(helperc_STOREV32le);
6084 hname = "MC_(helperc_STOREV32le)";
6085 break;
6086 case Ity_I16: helper = &MC_(helperc_STOREV16le);
6087 hname = "MC_(helperc_STOREV16le)";
6088 break;
6089 case Ity_I8: helper = &MC_(helperc_STOREV8);
6090 hname = "MC_(helperc_STOREV8)";
6091 break;
6092 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
6094 } else {
6095 switch (ty) {
6096 case Ity_V128: /* we'll use the helper twice */
6097 case Ity_I64: helper = &MC_(helperc_STOREV64be);
6098 hname = "MC_(helperc_STOREV64be)";
6099 break;
6100 case Ity_I32: helper = &MC_(helperc_STOREV32be);
6101 hname = "MC_(helperc_STOREV32be)";
6102 break;
6103 case Ity_I16: helper = &MC_(helperc_STOREV16be);
6104 hname = "MC_(helperc_STOREV16be)";
6105 break;
6106 case Ity_I8: helper = &MC_(helperc_STOREV8);
6107 hname = "MC_(helperc_STOREV8)";
6108 break;
6109 /* Note, no V256 case here, because no big-endian target that
6110 we support has 256-bit vectors. */
6111 default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
6115 if (UNLIKELY(ty == Ity_V256)) {
6117 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
6118 Q3 being the most significant lane. */
6119 /* These are the offsets of the Qs in memory. */
6120 Int offQ0, offQ1, offQ2, offQ3;
6122 /* Various bits for constructing the 4 lane helper calls */
6123 IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
6124 IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3;
6125 IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
6126 IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
6128 if (end == Iend_LE) {
6129 offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
6130 } else {
6131 offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
6134 eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
6135 addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
6136 vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
6137 diQ0 = unsafeIRDirty_0_N(
6138 1/*regparms*/,
6139 hname, VG_(fnptr_to_fnentry)( helper ),
6140 mkIRExprVec_2( addrQ0, vdataQ0 )
6143 eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
6144 addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
6145 vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
6146 diQ1 = unsafeIRDirty_0_N(
6147 1/*regparms*/,
6148 hname, VG_(fnptr_to_fnentry)( helper ),
6149 mkIRExprVec_2( addrQ1, vdataQ1 )
6152 eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
6153 addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
6154 vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
6155 diQ2 = unsafeIRDirty_0_N(
6156 1/*regparms*/,
6157 hname, VG_(fnptr_to_fnentry)( helper ),
6158 mkIRExprVec_2( addrQ2, vdataQ2 )
6161 eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
6162 addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
6163 vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
6164 diQ3 = unsafeIRDirty_0_N(
6165 1/*regparms*/,
6166 hname, VG_(fnptr_to_fnentry)( helper ),
6167 mkIRExprVec_2( addrQ3, vdataQ3 )
6170 if (guard)
6171 diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
6173 setHelperAnns( mce, diQ0 );
6174 setHelperAnns( mce, diQ1 );
6175 setHelperAnns( mce, diQ2 );
6176 setHelperAnns( mce, diQ3 );
6177 stmt( 'V', mce, IRStmt_Dirty(diQ0) );
6178 stmt( 'V', mce, IRStmt_Dirty(diQ1) );
6179 stmt( 'V', mce, IRStmt_Dirty(diQ2) );
6180 stmt( 'V', mce, IRStmt_Dirty(diQ3) );
6183 else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
6185 /* V128/I128-bit case */
6186 /* See comment in next clause re 64-bit regparms */
6187 /* also, need to be careful about endianness */
6189 Int offLo64, offHi64;
6190 IRDirty *diLo64, *diHi64;
6191 IRAtom *addrLo64, *addrHi64;
6192 IRAtom *vdataLo64, *vdataHi64;
6193 IRAtom *eBiasLo64, *eBiasHi64;
6194 IROp opGetLO64, opGetHI64;
6196 if (end == Iend_LE) {
6197 offLo64 = 0;
6198 offHi64 = 8;
6199 } else {
6200 offLo64 = 8;
6201 offHi64 = 0;
6204 if (ty == Ity_V128) {
6205 opGetLO64 = Iop_V128to64;
6206 opGetHI64 = Iop_V128HIto64;
6207 } else {
6208 opGetLO64 = Iop_128to64;
6209 opGetHI64 = Iop_128HIto64;
6212 eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
6213 addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
6214 vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
6215 diLo64 = unsafeIRDirty_0_N(
6216 1/*regparms*/,
6217 hname, VG_(fnptr_to_fnentry)( helper ),
6218 mkIRExprVec_2( addrLo64, vdataLo64 )
6220 eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
6221 addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
6222 vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
6223 diHi64 = unsafeIRDirty_0_N(
6224 1/*regparms*/,
6225 hname, VG_(fnptr_to_fnentry)( helper ),
6226 mkIRExprVec_2( addrHi64, vdataHi64 )
6228 if (guard) diLo64->guard = guard;
6229 if (guard) diHi64->guard = guard;
6230 setHelperAnns( mce, diLo64 );
6231 setHelperAnns( mce, diHi64 );
6232 stmt( 'V', mce, IRStmt_Dirty(diLo64) );
6233 stmt( 'V', mce, IRStmt_Dirty(diHi64) );
6235 } else {
6237 IRDirty *di;
6238 IRAtom *addrAct;
6240 /* 8/16/32/64-bit cases */
6241 /* Generate the actual address into addrAct. */
6242 if (bias == 0) {
6243 addrAct = addr;
6244 } else {
6245 IRAtom* eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
6246 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
6249 if (ty == Ity_I64) {
6250 /* We can't do this with regparm 2 on 32-bit platforms, since
6251 the back ends aren't clever enough to handle 64-bit
6252 regparm args. Therefore be different. */
6253 di = unsafeIRDirty_0_N(
6254 1/*regparms*/,
6255 hname, VG_(fnptr_to_fnentry)( helper ),
6256 mkIRExprVec_2( addrAct, vdata )
6258 } else {
6259 di = unsafeIRDirty_0_N(
6260 2/*regparms*/,
6261 hname, VG_(fnptr_to_fnentry)( helper ),
6262 mkIRExprVec_2( addrAct,
6263 zwidenToHostWord( mce, vdata ))
6266 if (guard) di->guard = guard;
6267 setHelperAnns( mce, di );
6268 stmt( 'V', mce, IRStmt_Dirty(di) );
6274 /* Do lazy pessimistic propagation through a dirty helper call, by
6275 looking at the annotations on it. This is the most complex part of
6276 Memcheck. */
6278 static IRType szToITy ( Int n )
6280 switch (n) {
6281 case 1: return Ity_I8;
6282 case 2: return Ity_I16;
6283 case 4: return Ity_I32;
6284 case 8: return Ity_I64;
6285 default: VG_(tool_panic)("szToITy(memcheck)");
6289 static
6290 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
6292 Int i, k, n, toDo, gSz, gOff;
6293 IRAtom *src, *here, *curr;
6294 IRType tySrc, tyDst;
6295 IRTemp dst;
6296 IREndness end;
6298 /* What's the native endianness? We need to know this. */
6299 # if defined(VG_BIGENDIAN)
6300 end = Iend_BE;
6301 # elif defined(VG_LITTLEENDIAN)
6302 end = Iend_LE;
6303 # else
6304 # error "Unknown endianness"
6305 # endif
6307 /* First check the guard. */
6308 complainIfUndefined(mce, d->guard, NULL);
6310 /* Now round up all inputs and PCast over them. */
6311 curr = definedOfType(Ity_I32);
6313 /* Inputs: unmasked args
6314 Note: arguments are evaluated REGARDLESS of the guard expression */
6315 for (i = 0; d->args[i]; i++) {
6316 IRAtom* arg = d->args[i];
6317 if ( (d->cee->mcx_mask & (1<<i))
6318 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6319 /* ignore this arg */
6320 } else {
6321 here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
6322 curr = mkUifU32(mce, here, curr);
6326 /* Inputs: guest state that we read. */
6327 for (i = 0; i < d->nFxState; i++) {
6328 tl_assert(d->fxState[i].fx != Ifx_None);
6329 if (d->fxState[i].fx == Ifx_Write)
6330 continue;
6332 /* Enumerate the described state segments */
6333 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6334 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6335 gSz = d->fxState[i].size;
6337 /* Ignore any sections marked as 'always defined'. */
6338 if (isAlwaysDefd(mce, gOff, gSz)) {
6339 if (0)
6340 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6341 gOff, gSz);
6342 continue;
6345 /* This state element is read or modified. So we need to
6346 consider it. If larger than 8 bytes, deal with it in
6347 8-byte chunks. */
6348 while (True) {
6349 tl_assert(gSz >= 0);
6350 if (gSz == 0) break;
6351 n = gSz <= 8 ? gSz : 8;
6352 /* update 'curr' with UifU of the state slice
6353 gOff .. gOff+n-1 */
6354 tySrc = szToITy( n );
6356 /* Observe the guard expression. If it is false use an
6357 all-bits-defined bit pattern */
6358 IRAtom *cond, *iffalse, *iftrue;
6360 cond = assignNew('V', mce, Ity_I1, d->guard);
6361 iftrue = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
6362 iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
6363 src = assignNew('V', mce, tySrc,
6364 IRExpr_ITE(cond, iftrue, iffalse));
6366 here = mkPCastTo( mce, Ity_I32, src );
6367 curr = mkUifU32(mce, here, curr);
6368 gSz -= n;
6369 gOff += n;
6374 /* Inputs: memory. First set up some info needed regardless of
6375 whether we're doing reads or writes. */
6377 if (d->mFx != Ifx_None) {
6378 /* Because we may do multiple shadow loads/stores from the same
6379 base address, it's best to do a single test of its
6380 definedness right now. Post-instrumentation optimisation
6381 should remove all but this test. */
6382 IRType tyAddr;
6383 tl_assert(d->mAddr);
6384 complainIfUndefined(mce, d->mAddr, d->guard);
6386 tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
6387 tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
6388 tl_assert(tyAddr == mce->hWordTy); /* not really right */
6391 /* Deal with memory inputs (reads or modifies) */
6392 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6393 toDo = d->mSize;
6394 /* chew off 32-bit chunks. We don't care about the endianness
6395 since it's all going to be condensed down to a single bit,
6396 but nevertheless choose an endianness which is hopefully
6397 native to the platform. */
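      /* Note: |toDo| counts down from d->mSize, so (d->mSize - toDo) is the
         offset of the next unprocessed chunk. */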
6398 while (toDo >= 4) {
6399 here = mkPCastTo(
6400 mce, Ity_I32,
6401 expr2vbits_Load_guarded_Simple(
6402 mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
6404 curr = mkUifU32(mce, here, curr);
6405 toDo -= 4;
6407 /* chew off 16-bit chunks */
6408 while (toDo >= 2) {
6409 here = mkPCastTo(
6410 mce, Ity_I32,
6411 expr2vbits_Load_guarded_Simple(
6412 mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
6414 curr = mkUifU32(mce, here, curr);
6415 toDo -= 2;
6417 /* chew off the remaining 8-bit chunk, if any */
6418 if (toDo == 1) {
6419 here = mkPCastTo(
6420 mce, Ity_I32,
6421 expr2vbits_Load_guarded_Simple(
6422 mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
6424 curr = mkUifU32(mce, here, curr);
6425 toDo -= 1;
6427 tl_assert(toDo == 0);
6430 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6431 all the inputs to the helper. Now we need to re-distribute the
6432 results to all destinations. */
6434 /* Outputs: the destination temporary, if there is one. */
6435 if (d->tmp != IRTemp_INVALID) {
6436 dst = findShadowTmpV(mce, d->tmp);
6437 tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
6438 assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
6441 /* Outputs: guest state that we write or modify. */
6442 for (i = 0; i < d->nFxState; i++) {
6443 tl_assert(d->fxState[i].fx != Ifx_None);
6444 if (d->fxState[i].fx == Ifx_Read)
6445 continue;
6447 /* Enumerate the described state segments */
6448 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6449 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6450 gSz = d->fxState[i].size;
6452 /* Ignore any sections marked as 'always defined'. */
6453 if (isAlwaysDefd(mce, gOff, gSz))
6454 continue;
6456 /* This state element is written or modified. So we need to
6457 consider it. If larger than 8 bytes, deal with it in
6458 8-byte chunks. */
6459 while (True) {
6460 tl_assert(gSz >= 0);
6461 if (gSz == 0) break;
6462 n = gSz <= 8 ? gSz : 8;
6463 /* Write suitably-casted 'curr' to the state slice
6464 gOff .. gOff+n-1 */
6465 tyDst = szToITy( n );
6466 do_shadow_PUT( mce, gOff,
6467 NULL, /* original atom */
6468 mkPCastTo( mce, tyDst, curr ), d->guard );
6469 gSz -= n;
6470 gOff += n;
6475 /* Outputs: memory that we write or modify. Same comments about
6476 endianness as above apply. */
6477 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
6478 toDo = d->mSize;
6479 /* chew off 32-bit chunks */
6480 while (toDo >= 4) {
6481 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6482 NULL, /* original data */
6483 mkPCastTo( mce, Ity_I32, curr ),
6484 d->guard );
6485 toDo -= 4;
6487 /* chew off 16-bit chunks */
6488 while (toDo >= 2) {
6489 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6490 NULL, /* original data */
6491 mkPCastTo( mce, Ity_I16, curr ),
6492 d->guard );
6493 toDo -= 2;
6495 /* chew off the remaining 8-bit chunk, if any */
6496 if (toDo == 1) {
6497 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6498 NULL, /* original data */
6499 mkPCastTo( mce, Ity_I8, curr ),
6500 d->guard );
6501 toDo -= 1;
6503 tl_assert(toDo == 0);
6509 /* We have an ABI hint telling us that [base .. base+len-1] is to
6510 become undefined ("writable"). Generate code to call a helper to
6511 notify the A/V bit machinery of this fact.
6513 We call
6514 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6515 Addr nia );
6517 static
6518 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
6520 IRDirty* di;
6522 if (MC_(clo_mc_level) == 3) {
6523 di = unsafeIRDirty_0_N(
6524 3/*regparms*/,
6525 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6526 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
6527 mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
6529 } else {
6530 /* We ignore the supplied nia, since it is irrelevant. */
6531 tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
6532 /* Special-case the len==128 case, since that is for amd64-ELF,
6533 which is a very common target. */
6534 if (len == 128) {
6535 di = unsafeIRDirty_0_N(
6536 1/*regparms*/,
6537 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6538 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
6539 mkIRExprVec_1( base )
6541 } else {
6542 di = unsafeIRDirty_0_N(
6543 2/*regparms*/,
6544 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6545 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
6546 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
6551 stmt( 'V', mce, IRStmt_Dirty(di) );
6555 /* ------ Dealing with IRCAS (big and complex) ------ */
6557 /* FWDS */
6558 static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
6559 IRAtom* baseaddr, Int offset );
6560 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
6561 static void gen_store_b ( MCEnv* mce, Int szB,
6562 IRAtom* baseaddr, Int offset, IRAtom* dataB,
6563 IRAtom* guard );
6565 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
6566 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
6569 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6570 IRExpr.Consts, else this asserts. If they are both Consts, it
6571 doesn't do anything. So that just leaves the RdTmp case.
6573 In which case: this assigns the shadow value SHADOW to the IR
6574 shadow temporary associated with ORIG. That is, ORIG, being an
6575 original temporary, will have a shadow temporary associated with
6576 it. However, in the case envisaged here, there will so far have
6577 been no IR emitted to actually write a shadow value into that
6578 temporary. What this routine does is to (emit IR to) copy the
6579 value in SHADOW into said temporary, so that after this call,
6580 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6581 value in SHADOW.
6583 Point is to allow callers to compute "by hand" a shadow value for
6584 ORIG, and force it to be associated with ORIG.
6586 How do we know that the shadow associated with ORIG has not so far
6587 been assigned to? Well, we don't per se know that, but supposing
6588 it had. Then this routine would create a second assignment to it,
6589 and later the IR sanity checker would barf. But that never
6590 happens. QED.
6592 static void bind_shadow_tmp_to_orig ( UChar how,
6593 MCEnv* mce,
6594 IRAtom* orig, IRAtom* shadow )
6596 tl_assert(isOriginalAtom(mce, orig));
6597 tl_assert(isShadowAtom(mce, shadow));
6598 switch (orig->tag) {
6599 case Iex_Const:
6600 tl_assert(shadow->tag == Iex_Const);
6601 break;
6602 case Iex_RdTmp:
6603 tl_assert(shadow->tag == Iex_RdTmp);
6604 if (how == 'V') {
6605 assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
6606 shadow);
6607 } else {
6608 tl_assert(how == 'B');
6609 assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
6610 shadow);
6612 break;
6613 default:
6614 tl_assert(0);
6619 static
6620 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
6622 /* Scheme is (both single- and double- cases):
6624 1. fetch data#,dataB (the proposed new value)
6626 2. fetch expd#,expdB (what we expect to see at the address)
6628 3. check definedness of address
6630 4. load old#,oldB from shadow memory; this also checks
6631 addressability of the address
6633 5. the CAS itself
6635 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6637 7. if "expected == old" (as computed by (6))
6638 store data#,dataB to shadow memory
6640 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6641 'data' but 7 stores 'data#'. Hence it is possible for the
6642 shadow data to be incorrectly checked and/or updated:
6644 * 7 is at least gated correctly, since the 'expected == old'
6645 condition is derived from outputs of 5. However, the shadow
6646 write could happen too late: imagine after 5 we are
6647 descheduled, a different thread runs, writes a different
6648 (shadow) value at the address, and then we resume, hence
6649 overwriting the shadow value written by the other thread.
6651 Because the original memory access is atomic, there's no way to
6652 make both the original and shadow accesses into a single atomic
6653 thing, hence this is unavoidable.
6655 At least as Valgrind stands, I don't think it's a problem, since
6656 we're single threaded *and* we guarantee that there are no
6657 context switches during the execution of any specific superblock
6658 -- context switches can only happen at superblock boundaries.
6660 If Valgrind ever becomes MT in the future, then it might be more
6661 of a problem. A possible kludge would be to artificially
6662 associate with the location, a lock, which we must acquire and
6663 release around the transaction as a whole. Hmm, that probably
6664 wouldn't work properly since it only guards us against other
6665 threads doing CASs on the same location, not against other
6666 threads doing normal reads and writes.
6668 ------------------------------------------------------------
6670 COMMENT_ON_CasCmpEQ:
6672 Note two things. Firstly, in the sequence above, we compute
6673 "expected == old", but we don't check definedness of it. Why
6674 not? Also, the x86 and amd64 front ends use
6675 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6676 determination (expected == old ?) for themselves, and we also
6677 don't check definedness for those primops; we just say that the
6678 result is defined. Why? Details follow.
6680 x86/amd64 contains various forms of locked insns:
6681 * lock prefix before all basic arithmetic insns;
6682 eg lock xorl %reg1,(%reg2)
6683 * atomic exchange reg-mem
6684 * compare-and-swaps
6686 Rather than attempt to represent them all, which would be a
6687 royal PITA, I used a result from Maurice Herlihy
6688 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6689 demonstrates that compare-and-swap is a primitive more general
6690 than the other two, and so can be used to represent all of them.
6691 So the translation scheme for (eg) lock incl (%reg) is as
6692 follows:
6694 again:
6695 old = * %reg
6696 new = old + 1
6697 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6699 The "atomically" is the CAS bit. The scheme is always the same:
6700 get old value from memory, compute new value, atomically stuff
6701 new value back in memory iff the old value has not changed (iow,
6702 no other thread modified it in the meantime). If it has changed
6703 then we've been out-raced and we have to start over.
6705 Now that's all very neat, but it has the bad side effect of
6706 introducing an explicit equality test into the translation.
6707 Consider the behaviour of said code on a memory location which
6708 is uninitialised. We will wind up doing a comparison on
6709 uninitialised data, and mc duly complains.
6711 What's difficult about this is, the common case is that the
6712 location is uncontended, and so we're usually comparing the same
6713 value (* %reg) with itself. So we shouldn't complain even if it
6714 is undefined. But mc doesn't know that.
6716 My solution is to mark the == in the IR specially, so as to tell
6717 mc that it almost certainly compares a value with itself, and we
6718 should just regard the result as always defined. Rather than
6719 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6720 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6722 So there's always the question of, can this give a false
6723 negative? eg, imagine that initially, * %reg is defined; and we
6724 read that; but then in the gap between the read and the CAS, a
6725 different thread writes an undefined (and different) value at
6726 the location. Then the CAS in this thread will fail and we will
6727 go back to "again:", but without knowing that the trip back
6728 there was based on an undefined comparison. No matter; at least
6729 the other thread won the race and the location is correctly
6730 marked as undefined. What if it wrote an uninitialised version
6731 of the same value that was there originally, though?
6733 etc etc. Seems like there's a small corner case in which we
6734 might lose the fact that something's defined -- we're out-raced
6735 in between the "old = * reg" and the "atomically {", _and_ the
6736 other thread is writing in an undefined version of what's
6737 already there. Well, that seems pretty unlikely.
6741 If we ever need to reinstate it .. code which generates a
6742 definedness test for "expected == old" was removed at r10432 of
6743 this file.
6745 if (cas->oldHi == IRTemp_INVALID) {
6746 do_shadow_CAS_single( mce, cas );
6747 } else {
6748 do_shadow_CAS_double( mce, cas );
6753 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6755 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6756 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6757 IRAtom *voldLo = NULL, *boldLo = NULL;
6758 IRAtom *expd_eq_old = NULL;
6759 IROp opCasCmpEQ;
6760 Int elemSzB;
6761 IRType elemTy;
6762 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6764 /* single CAS */
6765 tl_assert(cas->oldHi == IRTemp_INVALID);
6766 tl_assert(cas->expdHi == NULL);
6767 tl_assert(cas->dataHi == NULL);
6769 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6770 switch (elemTy) {
6771 case Ity_I8: elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8; break;
6772 case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6773 case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6774 case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6775 default: tl_assert(0); /* IR defn disallows any other types */
6778 /* 1. fetch data# (the proposed new value) */
6779 tl_assert(isOriginalAtom(mce, cas->dataLo));
6780 vdataLo
6781 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6782 tl_assert(isShadowAtom(mce, vdataLo));
6783 if (otrak) {
6784 bdataLo
6785 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6786 tl_assert(isShadowAtom(mce, bdataLo));
6789 /* 2. fetch expected# (what we expect to see at the address) */
6790 tl_assert(isOriginalAtom(mce, cas->expdLo));
6791 vexpdLo
6792 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6793 tl_assert(isShadowAtom(mce, vexpdLo));
6794 if (otrak) {
6795 bexpdLo
6796 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6797 tl_assert(isShadowAtom(mce, bexpdLo));
6800 /* 3. check definedness of address */
6801 /* 4. fetch old# from shadow memory; this also checks
6802 addressability of the address */
6803 voldLo
6804 = assignNew(
6805 'V', mce, elemTy,
6806 expr2vbits_Load(
6807 mce,
6808 cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6809 NULL/*always happens*/
6811 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6812 if (otrak) {
6813 boldLo
6814 = assignNew('B', mce, Ity_I32,
6815 gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6816 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6819 /* 5. the CAS itself */
6820 stmt( 'C', mce, IRStmt_CAS(cas) );
6822 /* 6. compute "expected == old" */
6823 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6824 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6825 tree, but it's not copied from the input block. */
6826 expd_eq_old
6827 = assignNew('C', mce, Ity_I1,
6828 binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6830 /* 7. if "expected == old"
6831 store data# to shadow memory */
6832 do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6833 NULL/*data*/, vdataLo/*vdata*/,
6834 expd_eq_old/*guard for store*/ );
6835 if (otrak) {
6836 gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6837 bdataLo/*bdata*/,
6838 expd_eq_old/*guard for store*/ );
6843 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6845 IRAtom *vdataHi = NULL, *bdataHi = NULL;
6846 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6847 IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6848 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6849 IRAtom *voldHi = NULL, *boldHi = NULL;
6850 IRAtom *voldLo = NULL, *boldLo = NULL;
6851 IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6852 IRAtom *expd_eq_old = NULL, *zero = NULL;
6853 IROp opCasCmpEQ, opOr, opXor;
6854 Int elemSzB, memOffsLo, memOffsHi;
6855 IRType elemTy;
6856 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6858 /* double CAS */
6859 tl_assert(cas->oldHi != IRTemp_INVALID);
6860 tl_assert(cas->expdHi != NULL);
6861 tl_assert(cas->dataHi != NULL);
6863 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6864 switch (elemTy) {
6865 case Ity_I8:
6866 opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6867 elemSzB = 1; zero = mkU8(0);
6868 break;
6869 case Ity_I16:
6870 opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6871 elemSzB = 2; zero = mkU16(0);
6872 break;
6873 case Ity_I32:
6874 opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6875 elemSzB = 4; zero = mkU32(0);
6876 break;
6877 case Ity_I64:
6878 opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6879 elemSzB = 8; zero = mkU64(0);
6880 break;
6881 default:
6882 tl_assert(0); /* IR defn disallows any other types */
6885 /* 1. fetch data# (the proposed new value) */
6886 tl_assert(isOriginalAtom(mce, cas->dataHi));
6887 tl_assert(isOriginalAtom(mce, cas->dataLo));
6888 vdataHi
6889 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6890 vdataLo
6891 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6892 tl_assert(isShadowAtom(mce, vdataHi));
6893 tl_assert(isShadowAtom(mce, vdataLo));
6894 if (otrak) {
6895 bdataHi
6896 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6897 bdataLo
6898 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6899 tl_assert(isShadowAtom(mce, bdataHi));
6900 tl_assert(isShadowAtom(mce, bdataLo));
6903 /* 2. fetch expected# (what we expect to see at the address) */
6904 tl_assert(isOriginalAtom(mce, cas->expdHi));
6905 tl_assert(isOriginalAtom(mce, cas->expdLo));
6906 vexpdHi
6907 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6908 vexpdLo
6909 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6910 tl_assert(isShadowAtom(mce, vexpdHi));
6911 tl_assert(isShadowAtom(mce, vexpdLo));
6912 if (otrak) {
6913 bexpdHi
6914 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6915 bexpdLo
6916 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6917 tl_assert(isShadowAtom(mce, bexpdHi));
6918 tl_assert(isShadowAtom(mce, bexpdLo));
6921 /* 3. check definedness of address */
6922 /* 4. fetch old# from shadow memory; this also checks
6923 addressability of the address */
6924 if (cas->end == Iend_LE) {
6925 memOffsLo = 0;
6926 memOffsHi = elemSzB;
6927 } else {
6928 tl_assert(cas->end == Iend_BE);
6929 memOffsLo = elemSzB;
6930 memOffsHi = 0;
6932 voldHi
6933 = assignNew(
6934 'V', mce, elemTy,
6935 expr2vbits_Load(
6936 mce,
6937 cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6938 NULL/*always happens*/
6940 voldLo
6941 = assignNew(
6942 'V', mce, elemTy,
6943 expr2vbits_Load(
6944 mce,
6945 cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6946 NULL/*always happens*/
6948 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6949 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6950 if (otrak) {
6951 boldHi
6952 = assignNew('B', mce, Ity_I32,
6953 gen_load_b(mce, elemSzB, cas->addr,
6954 memOffsHi/*addr bias*/));
6955 boldLo
6956 = assignNew('B', mce, Ity_I32,
6957 gen_load_b(mce, elemSzB, cas->addr,
6958 memOffsLo/*addr bias*/));
6959 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6960 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6963 /* 5. the CAS itself */
6964 stmt( 'C', mce, IRStmt_CAS(cas) );
6966 /* 6. compute "expected == old" */
6967 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6968 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6969 tree, but it's not copied from the input block. */
6971 /* xHi = oldHi ^ expdHi;
6972    xLo = oldLo ^ expdLo;
6973    xHL = xHi | xLo;
6974    expd_eq_old = xHL == 0;  */
6976 xHi = assignNew('C', mce, elemTy,
6977 binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6978 xLo = assignNew('C', mce, elemTy,
6979 binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6980 xHL = assignNew('C', mce, elemTy,
6981 binop(opOr, xHi, xLo));
6982 expd_eq_old
6983 = assignNew('C', mce, Ity_I1,
6984 binop(opCasCmpEQ, xHL, zero));
6986 /* 7. if "expected == old"
6987 store data# to shadow memory */
6988 do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6989 NULL/*data*/, vdataHi/*vdata*/,
6990 expd_eq_old/*guard for store*/ );
6991 do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6992 NULL/*data*/, vdataLo/*vdata*/,
6993 expd_eq_old/*guard for store*/ );
6994 if (otrak) {
6995 gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
6996 bdataHi/*bdata*/,
6997 expd_eq_old/*guard for store*/ );
6998 gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
6999 bdataLo/*bdata*/,
7000 expd_eq_old/*guard for store*/ );
7005 /* ------ Dealing with LL/SC (not difficult) ------ */
7007 static void do_shadow_LLSC ( MCEnv* mce,
7008 IREndness stEnd,
7009 IRTemp stResult,
7010 IRExpr* stAddr,
7011 IRExpr* stStoredata )
7013 /* In short: treat a load-linked like a normal load followed by an
7014 assignment of the loaded (shadow) data to the result temporary.
7015 Treat a store-conditional like a normal store, and mark the
7016 result temporary as defined. */
7017 IRType resTy = typeOfIRTemp(mce->sb->tyenv, stResult);
7018 IRTemp resTmp = findShadowTmpV(mce, stResult);
7020 tl_assert(isIRAtom(stAddr));
7021 if (stStoredata)
7022 tl_assert(isIRAtom(stStoredata));
7024 if (stStoredata == NULL) {
7025 /* Load Linked */
7026 /* Just treat this as a normal load, followed by an assignment of
7027 the value to .result. */
7028 /* Stay sane */
7029 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7030 || resTy == Ity_I16 || resTy == Ity_I8);
7031 assign( 'V', mce, resTmp,
7032 expr2vbits_Load(
7033 mce, stEnd, resTy, stAddr, 0/*addr bias*/,
7034 NULL/*always happens*/) );
7035 } else {
7036 /* Store Conditional */
7037 /* Stay sane */
7038 IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
7039 stStoredata);
7040 tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
7041 || dataTy == Ity_I16 || dataTy == Ity_I8);
7042 do_shadow_Store( mce, stEnd,
7043 stAddr, 0/* addr bias */,
7044 stStoredata,
7045 NULL /* shadow data */,
7046 NULL/*guard*/ );
7047 /* This is a store conditional, so it writes to .result a value
7048 indicating whether or not the store succeeded. Just claim
7049 this value is always defined. In the PowerPC interpretation
7050 of store-conditional, definedness of the success indication
7051 depends on whether the address of the store matches the
7052 reservation address. But we can't tell that here (and
7053 anyway, we're not being PowerPC-specific). At least we are
7054 guaranteed that the definedness of the store address, and its
7055 addressability, will be checked as per normal. So it seems
7056 pretty safe to just say that the success indication is always
7057 defined.
7059 In schemeS, for origin tracking, we must correspondingly set
7060 a no-origin value for the origin shadow of .result.
7062 tl_assert(resTy == Ity_I1);
7063 assign( 'V', mce, resTmp, definedOfType(resTy) );
7068 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7070 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
7072 complainIfUndefined(mce, sg->guard, NULL);
7073 /* do_shadow_Store will generate code to check the definedness and
7074 validity of sg->addr, in the case where sg->guard evaluates to
7075 True at run-time. */
7076 do_shadow_Store( mce, sg->end,
7077 sg->addr, 0/* addr bias */,
7078 sg->data,
7079 NULL /* shadow data */,
7080 sg->guard );
7083 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
7085 complainIfUndefined(mce, lg->guard, NULL);
7086 /* expr2vbits_Load_guarded_General will generate code to check the
7087 definedness and validity of lg->addr, in the case where
7088 lg->guard evaluates to True at run-time. */
7090 /* Look at the LoadG's built-in conversion operation, to determine
7091 the source (actual loaded data) type, and the equivalent IROp.
7092 NOTE that implicitly we are taking a widening operation to be
7093 applied to original atoms and producing one that applies to V
7094 bits. Since signed and unsigned widening are self-shadowing,
7095 this is a straight copy of the op (modulo swapping from the
7096 IRLoadGOp form to the IROp form). Note also therefore that this
7097 implicitly duplicates the logic to do with said widening ops in
7098 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
7099 IROp vwiden = Iop_INVALID;
7100 IRType loadedTy = Ity_INVALID;
7101 switch (lg->cvt) {
7102 case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
7103 case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
7104 case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
7105 case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
7106 case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
7107 case ILGop_8Uto32: loadedTy = Ity_I8; vwiden = Iop_8Uto32; break;
7108 case ILGop_8Sto32: loadedTy = Ity_I8; vwiden = Iop_8Sto32; break;
7109 default: VG_(tool_panic)("do_shadow_LoadG");
7112 IRAtom* vbits_alt
7113 = expr2vbits( mce, lg->alt, HuOth );
7114 IRAtom* vbits_final
7115 = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
7116 lg->addr, 0/*addr bias*/,
7117 lg->guard, vwiden, vbits_alt );
7118 /* And finally, bind the V bits to the destination temporary. */
7119 assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
7123 /*------------------------------------------------------------*/
7124 /*--- Origin tracking stuff ---*/
7125 /*------------------------------------------------------------*/
7127 /* Almost identical to findShadowTmpV. */
7128 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
7130 TempMapEnt* ent;
7131 /* VG_(indexXA) range-checks 'orig', hence no need to check
7132 here. */
7133 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7134 tl_assert(ent->kind == Orig);
7135 if (ent->shadowB == IRTemp_INVALID) {
7136 IRTemp tmpB
7137 = newTemp( mce, Ity_I32, BSh );
7138 /* newTemp may cause mce->tmpMap to resize, hence previous results
7139 from VG_(indexXA) are invalid. */
7140 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7141 tl_assert(ent->kind == Orig);
7142 tl_assert(ent->shadowB == IRTemp_INVALID);
7143 ent->shadowB = tmpB;
7145 return ent->shadowB;
7148 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
7150 return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
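/* Added note (not from the original source): origin tags are merged
   with an unsigned max, so any real (nonzero) otag dominates the zero
   value that means "unknown origin". */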
7154 /* Make a guarded origin load, with no special handling in the
7155 didn't-happen case. A GUARD of NULL is assumed to mean "always
7156 True".
7158 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
7159 return the otag. The loaded size is SZB. If GUARD evaluates to
7160 False at run time then the returned otag is zero.
7162 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
7163 IRAtom* baseaddr,
7164 Int offset, IRExpr* guard )
7166 void* hFun;
7167 const HChar* hName;
7168 IRTemp bTmp;
7169 IRDirty* di;
7170 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7171 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7172 IRAtom* ea = baseaddr;
7173 if (offset != 0) {
7174 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7175 : mkU64( (Long)(Int)offset );
7176 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7178 bTmp = newTemp(mce, mce->hWordTy, BSh);
7180 switch (szB) {
7181 case 1: hFun = (void*)&MC_(helperc_b_load1);
7182 hName = "MC_(helperc_b_load1)";
7183 break;
7184 case 2: hFun = (void*)&MC_(helperc_b_load2);
7185 hName = "MC_(helperc_b_load2)";
7186 break;
7187 case 4: hFun = (void*)&MC_(helperc_b_load4);
7188 hName = "MC_(helperc_b_load4)";
7189 break;
7190 case 8: hFun = (void*)&MC_(helperc_b_load8);
7191 hName = "MC_(helperc_b_load8)";
7192 break;
7193 case 16: hFun = (void*)&MC_(helperc_b_load16);
7194 hName = "MC_(helperc_b_load16)";
7195 break;
7196 case 32: hFun = (void*)&MC_(helperc_b_load32);
7197 hName = "MC_(helperc_b_load32)";
7198 break;
7199 default:
7200 VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
7201 tl_assert(0);
7203 di = unsafeIRDirty_1_N(
7204 bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
7205 mkIRExprVec_1( ea )
7207 if (guard) {
7208 di->guard = guard;
7209 /* Ideally the didn't-happen return value here would be
7210 all-zeroes (unknown-origin), so it'd be harmless if it got
7211 used inadvertently. We slum it out with the IR-mandated
7212 default value (0b01 repeating, 0x55 etc) as that'll probably
7213 trump all legitimate otags via Max32, and it's pretty
7214 obviously bogus. */
7216 /* no need to mess with any annotations. This call accesses
7217 neither guest state nor guest memory. */
7218 stmt( 'B', mce, IRStmt_Dirty(di) );
7219 if (mce->hWordTy == Ity_I64) {
7220 /* 64-bit host */
7221 IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
7222 assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
7223 return mkexpr(bTmp32);
7224 } else {
7225 /* 32-bit host */
7226 return mkexpr(bTmp);
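/* Illustrative sketch (added comment, not from the original source):
   for szB == 4, offset == 8 and a 64-bit host, the code above emits
   roughly

      tEA      = Add64(baseaddr, 0x8:I64)
      tB:I64   = DIRTY guard ::: MC_(helperc_b_load4)(tEA)
      tB32:I32 = 64to32(tB)

   and returns tB32 as the otag. */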
7231 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
7232 loaded size is SZB. The load is regarded as unconditional (always
7233 happens).
7235 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
7236 Int offset )
7238 return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
7242 /* The most general handler for guarded origin loads. A GUARD of NULL
7243 is assumed to mean "always True".
7245 Generate IR to do a shadow origin load from ADDR+BIAS and return
7246 the B bits. The loaded type is TY. If GUARD evaluates to False at
7247 run time then the returned B bits are simply BALT instead.
7249 static
7250 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
7251 IRType ty,
7252 IRAtom* addr, UInt bias,
7253 IRAtom* guard, IRAtom* balt )
7255 /* If the guard evaluates to True, this will hold the loaded
7256 origin. If the guard evaluates to False, this will be zero,
7257 meaning "unknown origin", in which case we will have to replace
7258 it using an ITE below. */
7259 IRAtom* iftrue
7260 = assignNew('B', mce, Ity_I32,
7261 gen_guarded_load_b(mce, sizeofIRType(ty),
7262 addr, bias, guard));
7263 /* These are the bits we will return if the load doesn't take
7264 place. */
7265 IRAtom* iffalse
7266 = balt;
7267 /* Prepare the cond for the ITE. Convert a NULL cond into
7268 something that iropt knows how to fold out later. */
7269 IRAtom* cond
7270 = guard == NULL ? mkU1(1) : guard;
7271 /* And assemble the final result. */
7272 return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
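/* Illustrative sketch (added comment, not from the original source):
   for TY == Ity_I64 and guard g, this generates roughly

      tLd  = <guarded MC_(helperc_b_load8) call>
      tRes = ITE(g, tLd, balt)

   so whatever the dirty call returned in the g == False case is
   discarded in favour of BALT. */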
7276 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7277 the store really happens; NULL means it unconditionally does. */
7278 static void gen_store_b ( MCEnv* mce, Int szB,
7279 IRAtom* baseaddr, Int offset, IRAtom* dataB,
7280 IRAtom* guard )
7282 void* hFun;
7283 const HChar* hName;
7284 IRDirty* di;
7285 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7286 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7287 IRAtom* ea = baseaddr;
7288 if (guard) {
7289 tl_assert(isOriginalAtom(mce, guard));
7290 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
7292 if (offset != 0) {
7293 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7294 : mkU64( (Long)(Int)offset );
7295 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7297 if (mce->hWordTy == Ity_I64)
7298 dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
7300 switch (szB) {
7301 case 1: hFun = (void*)&MC_(helperc_b_store1);
7302 hName = "MC_(helperc_b_store1)";
7303 break;
7304 case 2: hFun = (void*)&MC_(helperc_b_store2);
7305 hName = "MC_(helperc_b_store2)";
7306 break;
7307 case 4: hFun = (void*)&MC_(helperc_b_store4);
7308 hName = "MC_(helperc_b_store4)";
7309 break;
7310 case 8: hFun = (void*)&MC_(helperc_b_store8);
7311 hName = "MC_(helperc_b_store8)";
7312 break;
7313 case 16: hFun = (void*)&MC_(helperc_b_store16);
7314 hName = "MC_(helperc_b_store16)";
7315 break;
7316 case 32: hFun = (void*)&MC_(helperc_b_store32);
7317 hName = "MC_(helperc_b_store32)";
7318 break;
7319 default:
7320 tl_assert(0);
7322 di = unsafeIRDirty_0_N( 2/*regparms*/,
7323 hName, VG_(fnptr_to_fnentry)( hFun ),
7324 mkIRExprVec_2( ea, dataB )
7326 /* no need to mess with any annotations. This call accesses
7327 neither guest state nor guest memory. */
7328 if (guard) di->guard = guard;
7329 stmt( 'B', mce, IRStmt_Dirty(di) );
7332 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
7333 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7334 if (eTy == Ity_I64)
7335 return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
7336 if (eTy == Ity_I32)
7337 return e;
7338 tl_assert(0);
7341 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
7342 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7343 tl_assert(eTy == Ity_I32);
7344 if (dstTy == Ity_I64)
7345 return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
7346 tl_assert(0);
7350 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
7352 tl_assert(MC_(clo_mc_level) == 3);
7354 switch (e->tag) {
7356 case Iex_GetI: {
7357 IRRegArray* descr_b;
7358 IRAtom *t1, *t2, *t3, *t4;
7359 IRRegArray* descr = e->Iex.GetI.descr;
7360 IRType equivIntTy
7361 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7362 /* If this array is unshadowable for whatever reason, use the
7363 usual approximation. */
7364 if (equivIntTy == Ity_INVALID)
7365 return mkU32(0);
7366 tl_assert(sizeofIRType(equivIntTy) >= 4);
7367 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7368 descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7369 equivIntTy, descr->nElems );
7370 /* Do a shadow indexed get of the same size, giving t1. Take
7371 the bottom 32 bits of it, giving t2. Compute into t3 the
7372 origin for the index (almost certainly zero, but there's
7373 no harm in being completely general here, since iropt will
7374 remove any useless code), and fold it in, giving a final
7375 value t4. */
7376 t1 = assignNew( 'B', mce, equivIntTy,
7377 IRExpr_GetI( descr_b, e->Iex.GetI.ix,
7378 e->Iex.GetI.bias ));
7379 t2 = narrowTo32( mce, t1 );
7380 t3 = schemeE( mce, e->Iex.GetI.ix );
7381 t4 = gen_maxU32( mce, t2, t3 );
7382 return t4;
7384 case Iex_CCall: {
7385 Int i;
7386 IRAtom* here;
7387 IRExpr** args = e->Iex.CCall.args;
7388 IRAtom* curr = mkU32(0);
7389 for (i = 0; args[i]; i++) {
7390 tl_assert(i < 32);
7391 tl_assert(isOriginalAtom(mce, args[i]));
7392 /* Only take notice of this arg if the callee's
7393 mc-exclusion mask does not say it is to be excluded. */
7394 if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
7395 /* the arg is to be excluded from definedness checking.
7396 Do nothing. */
7397 if (0) VG_(printf)("excluding %s(%d)\n",
7398 e->Iex.CCall.cee->name, i);
7399 } else {
7400 /* calculate the arg's origin, and pessimistically
7401 merge it in. */
7402 here = schemeE( mce, args[i] );
7403 curr = gen_maxU32( mce, curr, here );
7406 return curr;
7408 case Iex_Load: {
7409 Int dszB;
7410 dszB = sizeofIRType(e->Iex.Load.ty);
7411 /* assert that the B value for the address is already
7412 available (somewhere) */
7413 tl_assert(isIRAtom(e->Iex.Load.addr));
7414 tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
7415 return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
7417 case Iex_ITE: {
7418 IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
7419 IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
7420 IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
7421 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7423 case Iex_Qop: {
7424 IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7425 IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7426 IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7427 IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7428 return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7429 gen_maxU32( mce, b3, b4 ) );
7431 case Iex_Triop: {
7432 IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7433 IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7434 IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7435 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7437 case Iex_Binop: {
7438 switch (e->Iex.Binop.op) {
7439 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
7440 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7441 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7442 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7443 /* Just say these all produce a defined result,
7444 regardless of their arguments. See
7445 COMMENT_ON_CasCmpEQ in this file. */
7446 return mkU32(0);
7447 default: {
7448 IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7449 IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7450 return gen_maxU32( mce, b1, b2 );
7453 tl_assert(0);
7454 /*NOTREACHED*/
7456 case Iex_Unop: {
7457 IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7458 return b1;
7460 case Iex_Const:
7461 return mkU32(0);
7462 case Iex_RdTmp:
7463 return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7464 case Iex_Get: {
7465 Int b_offset = MC_(get_otrack_shadow_offset)(
7466 e->Iex.Get.offset,
7467 sizeofIRType(e->Iex.Get.ty)
7469 tl_assert(b_offset >= -1
7470 && b_offset <= mce->layout->total_sizeB -4);
7471 if (b_offset >= 0) {
7472 /* FIXME: this isn't an atom! */
7473 return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7474 Ity_I32 );
7476 return mkU32(0);
7478 default:
7479 VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7480 ppIRExpr(e);
7481 VG_(tool_panic)("memcheck:schemeE");
7486 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7488 // This is a hacked version of do_shadow_Dirty
7489 Int i, k, n, toDo, gSz, gOff;
7490 IRAtom *here, *curr;
7491 IRTemp dst;
7493 /* First check the guard. */
7494 curr = schemeE( mce, d->guard );
7496 /* Now round up all inputs and maxU32 over them. */
7498 /* Inputs: unmasked args
7499 Note: arguments are evaluated REGARDLESS of the guard expression */
7500 for (i = 0; d->args[i]; i++) {
7501 IRAtom* arg = d->args[i];
7502 if ( (d->cee->mcx_mask & (1<<i))
7503 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
7504 /* ignore this arg */
7505 } else {
7506 here = schemeE( mce, arg );
7507 curr = gen_maxU32( mce, curr, here );
7511 /* Inputs: guest state that we read. */
7512 for (i = 0; i < d->nFxState; i++) {
7513 tl_assert(d->fxState[i].fx != Ifx_None);
7514 if (d->fxState[i].fx == Ifx_Write)
7515 continue;
7517 /* Enumerate the described state segments */
7518 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7519 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7520 gSz = d->fxState[i].size;
7522 /* Ignore any sections marked as 'always defined'. */
7523 if (isAlwaysDefd(mce, gOff, gSz)) {
7524 if (0)
7525 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7526 gOff, gSz);
7527 continue;
7530 /* This state element is read or modified. So we need to
7531 consider it. If larger than 4 bytes, deal with it in
7532 4-byte chunks. */
7533 while (True) {
7534 Int b_offset;
7535 tl_assert(gSz >= 0);
7536 if (gSz == 0) break;
7537 n = gSz <= 4 ? gSz : 4;
7538 /* update 'curr' with maxU32 of the state slice
7539 gOff .. gOff+n-1 */
7540 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7541 if (b_offset != -1) {
7542 /* Observe the guard expression. If it is false use 0, i.e.
7543 nothing is known about the origin */
7544 IRAtom *cond, *iffalse, *iftrue;
7546 cond = assignNew( 'B', mce, Ity_I1, d->guard);
7547 iffalse = mkU32(0);
7548 iftrue = assignNew( 'B', mce, Ity_I32,
7549 IRExpr_Get(b_offset
7550 + 2*mce->layout->total_sizeB,
7551 Ity_I32));
7552 here = assignNew( 'B', mce, Ity_I32,
7553 IRExpr_ITE(cond, iftrue, iffalse));
7554 curr = gen_maxU32( mce, curr, here );
7556 gSz -= n;
7557 gOff += n;
7562 /* Inputs: memory */
7564 if (d->mFx != Ifx_None) {
7565 /* Because we may do multiple shadow loads/stores from the same
7566 base address, it's best to do a single test of its
7567 definedness right now. Post-instrumentation optimisation
7568 should remove all but this test. */
7569 tl_assert(d->mAddr);
7570 here = schemeE( mce, d->mAddr );
7571 curr = gen_maxU32( mce, curr, here );
7574 /* Deal with memory inputs (reads or modifies) */
7575 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7576 toDo = d->mSize;
7577 /* chew off 32-bit chunks. We don't care about the endianness
7578 since it's all going to be condensed down to a single bit,
7579 but nevertheless choose an endianness which is hopefully
7580 native to the platform. */
7581 while (toDo >= 4) {
7582 here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7583 d->guard );
7584 curr = gen_maxU32( mce, curr, here );
7585 toDo -= 4;
7587 /* handle possible 16-bit excess */
7588 while (toDo >= 2) {
7589 here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7590 d->guard );
7591 curr = gen_maxU32( mce, curr, here );
7592 toDo -= 2;
7594 /* chew off the remaining 8-bit chunk, if any */
7595 if (toDo == 1) {
7596 here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7597 d->guard );
7598 curr = gen_maxU32( mce, curr, here );
7599 toDo -= 1;
7601 tl_assert(toDo == 0);
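/* Worked example (added comment): for d->mSize == 7, the loops above
   issue a 4-byte origin load at offset 0, a 2-byte load at offset 4
   and a 1-byte load at offset 6, maxU32-ing each result into 'curr'. */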
7604 /* Whew! So curr is a 32-bit B-value which should give an origin
7605 of some use if any of the inputs to the helper are undefined.
7606 Now we need to re-distribute the results to all destinations. */
7608 /* Outputs: the destination temporary, if there is one. */
7609 if (d->tmp != IRTemp_INVALID) {
7610 dst = findShadowTmpB(mce, d->tmp);
7611 assign( 'V', mce, dst, curr );
7614 /* Outputs: guest state that we write or modify. */
7615 for (i = 0; i < d->nFxState; i++) {
7616 tl_assert(d->fxState[i].fx != Ifx_None);
7617 if (d->fxState[i].fx == Ifx_Read)
7618 continue;
7620 /* Enumerate the described state segments */
7621 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7622 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7623 gSz = d->fxState[i].size;
7625 /* Ignore any sections marked as 'always defined'. */
7626 if (isAlwaysDefd(mce, gOff, gSz))
7627 continue;
7629 /* This state element is written or modified. So we need to
7630 consider it. If larger than 4 bytes, deal with it in
7631 4-byte chunks. */
7632 while (True) {
7633 Int b_offset;
7634 tl_assert(gSz >= 0);
7635 if (gSz == 0) break;
7636 n = gSz <= 4 ? gSz : 4;
7637 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7638 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7639 if (b_offset != -1) {
7641 /* If the guard expression evaluates to false we simply Put
7642 the value that is already stored in the guest state slot */
7643 IRAtom *cond, *iffalse;
7645 cond = assignNew('B', mce, Ity_I1,
7646 d->guard);
7647 iffalse = assignNew('B', mce, Ity_I32,
7648 IRExpr_Get(b_offset +
7649 2*mce->layout->total_sizeB,
7650 Ity_I32));
7651 curr = assignNew('V', mce, Ity_I32,
7652 IRExpr_ITE(cond, curr, iffalse));
7654 stmt( 'B', mce, IRStmt_Put(b_offset
7655 + 2*mce->layout->total_sizeB,
7656 curr ));
7658 gSz -= n;
7659 gOff += n;
7664 /* Outputs: memory that we write or modify. Same comments about
7665 endianness as above apply. */
7666 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7667 toDo = d->mSize;
7668 /* chew off 32-bit chunks */
7669 while (toDo >= 4) {
7670 gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7671 d->guard );
7672 toDo -= 4;
7674 /* handle possible 16-bit excess */
7675 while (toDo >= 2) {
7676 gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7677 d->guard );
7678 toDo -= 2;
7680 /* chew off the remaining 8-bit chunk, if any */
7681 if (toDo == 1) {
7682 gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7683 d->guard );
7684 toDo -= 1;
7686 tl_assert(toDo == 0);
7691 /* Generate IR for origin shadowing for a general guarded store. */
7692 static void do_origins_Store_guarded ( MCEnv* mce,
7693 IREndness stEnd,
7694 IRExpr* stAddr,
7695 IRExpr* stData,
7696 IRExpr* guard )
7698 Int dszB;
7699 IRAtom* dataB;
7700 /* assert that the B value for the address is already available
7701 (somewhere), since the call to schemeE will want to see it.
7702 XXXX how does this actually ensure that?? */
7703 tl_assert(isIRAtom(stAddr));
7704 tl_assert(isIRAtom(stData));
7705 dszB = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7706 dataB = schemeE( mce, stData );
7707 gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7711 /* Generate IR for origin shadowing for a plain store. */
7712 static void do_origins_Store_plain ( MCEnv* mce,
7713 IREndness stEnd,
7714 IRExpr* stAddr,
7715 IRExpr* stData )
7717 do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7718 NULL/*guard*/ );
7722 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7724 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7726 do_origins_Store_guarded( mce, sg->end, sg->addr,
7727 sg->data, sg->guard );
7730 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7732 IRType loadedTy = Ity_INVALID;
7733 switch (lg->cvt) {
7734 case ILGop_IdentV128: loadedTy = Ity_V128; break;
7735 case ILGop_Ident64: loadedTy = Ity_I64; break;
7736 case ILGop_Ident32: loadedTy = Ity_I32; break;
7737 case ILGop_16Uto32: loadedTy = Ity_I16; break;
7738 case ILGop_16Sto32: loadedTy = Ity_I16; break;
7739 case ILGop_8Uto32: loadedTy = Ity_I8; break;
7740 case ILGop_8Sto32: loadedTy = Ity_I8; break;
7741 default: VG_(tool_panic)("schemeS.IRLoadG");
7743 IRAtom* ori_alt
7744 = schemeE( mce,lg->alt );
7745 IRAtom* ori_final
7746 = expr2ori_Load_guarded_General(mce, loadedTy,
7747 lg->addr, 0/*addr bias*/,
7748 lg->guard, ori_alt );
7749 /* And finally, bind the origin to the destination temporary. */
7750 assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7754 static void schemeS ( MCEnv* mce, IRStmt* st )
7756 tl_assert(MC_(clo_mc_level) == 3);
7758 switch (st->tag) {
7760 case Ist_AbiHint:
7761 /* The value-check instrumenter handles this - by arranging
7762 to pass the address of the next instruction to
7763 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7764 happen for origin tracking w.r.t. AbiHints. So there is
7765 nothing to do here. */
7766 break;
7768 case Ist_PutI: {
7769 IRPutI *puti = st->Ist.PutI.details;
7770 IRRegArray* descr_b;
7771 IRAtom *t1, *t2, *t3, *t4;
7772 IRRegArray* descr = puti->descr;
7773 IRType equivIntTy
7774 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7775 /* If this array is unshadowable for whatever reason,
7776 generate no code. */
7777 if (equivIntTy == Ity_INVALID)
7778 break;
7779 tl_assert(sizeofIRType(equivIntTy) >= 4);
7780 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7781 descr_b
7782 = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7783 equivIntTy, descr->nElems );
7784 /* Compute a value to Put - the conjoinment of the origin for
7785 the data to be Put-ted (obviously) and of the index value
7786 (not so obviously). */
7787 t1 = schemeE( mce, puti->data );
7788 t2 = schemeE( mce, puti->ix );
7789 t3 = gen_maxU32( mce, t1, t2 );
7790 t4 = zWidenFrom32( mce, equivIntTy, t3 );
7791 stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7792 puti->bias, t4) ));
7793 break;
7796 case Ist_Dirty:
7797 do_origins_Dirty( mce, st->Ist.Dirty.details );
7798 break;
7800 case Ist_Store:
7801 do_origins_Store_plain( mce, st->Ist.Store.end,
7802 st->Ist.Store.addr,
7803 st->Ist.Store.data );
7804 break;
7806 case Ist_StoreG:
7807 do_origins_StoreG( mce, st->Ist.StoreG.details );
7808 break;
7810 case Ist_LoadG:
7811 do_origins_LoadG( mce, st->Ist.LoadG.details );
7812 break;
7814 case Ist_LLSC: {
7815 /* In short: treat a load-linked like a normal load followed
7816 by an assignment of the loaded (shadow) data to the result
7817 temporary. Treat a store-conditional like a normal store,
7818 and mark the result temporary as defined. */
7819 if (st->Ist.LLSC.storedata == NULL) {
7820 /* Load Linked */
7821 IRType resTy
7822 = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7823 IRExpr* vanillaLoad
7824 = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7825 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7826 || resTy == Ity_I16 || resTy == Ity_I8);
7827 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7828 schemeE(mce, vanillaLoad));
7829 } else {
7830 /* Store conditional */
7831 do_origins_Store_plain( mce, st->Ist.LLSC.end,
7832 st->Ist.LLSC.addr,
7833 st->Ist.LLSC.storedata );
7834 /* For the rationale behind this, see comments at the
7835 place where the V-shadow for .result is constructed, in
7836 do_shadow_LLSC. In short, we regard .result as
7837 always-defined. */
7838 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7839 mkU32(0) );
7841 break;
7844 case Ist_Put: {
7845 Int b_offset
7846 = MC_(get_otrack_shadow_offset)(
7847 st->Ist.Put.offset,
7848 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7850 if (b_offset >= 0) {
7851 /* FIXME: this isn't an atom! */
7852 stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7853 schemeE( mce, st->Ist.Put.data )) );
7855 break;
7858 case Ist_WrTmp:
7859 assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7860 schemeE(mce, st->Ist.WrTmp.data) );
7861 break;
7863 case Ist_MBE:
7864 case Ist_NoOp:
7865 case Ist_Exit:
7866 case Ist_IMark:
7867 break;
7869 default:
7870 VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7871 ppIRStmt(st);
7872 VG_(tool_panic)("memcheck:schemeS");
7877 /*------------------------------------------------------------*/
7878 /*--- Post-tree-build final tidying ---*/
7879 /*------------------------------------------------------------*/
7881 /* This exploits the observation that Memcheck often produces
7882 repeated conditional calls of the form
7884 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7886 with the same guard expression G guarding the same helper call.
7887 The second and subsequent calls are redundant. This usually
7888 results from instrumentation of guest code containing multiple
7889 memory references at different constant offsets from the same base
7890 register. After optimisation of the instrumentation, you get a
7891 test for the definedness of the base register for each memory
7892 reference, which is kinda pointless. MC_(final_tidy) therefore
7893 looks for such repeated calls and removes all but the first. */
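/* Illustrative example (added comment, not from the original source):
   after instrumenting two loads at different constant offsets from the
   same base register, the block may contain two statements of the form

      DIRTY tGuard ::: MC_(helperc_value_check8_fail_no_o)()

   with the same guard tGuard (the definedness test of the base
   register).  MC_(final_tidy) spots the repeated (helper, guard) pair
   and turns the second call into a NoOp. */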
7896 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7897 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7898 get almost all the benefits of this transformation whilst causing
7899 the slide-back case to happen just often enough to be verifiably
7900 correct. For posterity, the numbers are:
7902 bz2-32
7904 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7905 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7906 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7907 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7908 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7909 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7910 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7911 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7912 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7913 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7914 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7915 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7916 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7918 bz2-64
7920 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7921 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7922 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7923 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7924 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7925 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7926 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7927 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7928 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7929 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7930 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7931 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7932 inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7935 /* Structs for recording which (helper, guard) pairs we have already
7936 seen. */
7938 #define N_TIDYING_PAIRS 16
7940 typedef
7941 struct { void* entry; IRExpr* guard; }
7942 Pair;
7944 typedef
7945 struct {
7946 Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7947 UInt pairsUsed;
7949 Pairs;
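/* Added note: the extra +1 slot is never used for real entries.
   MC_(final_tidy) fills it with sentinel values before scanning and
   asserts afterwards that they are untouched, as a cheap check
   against overrunning the array. */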
7952 /* Return True if e1 and e2 definitely denote the same value (used to
7953 compare guards). Return False if unknown; False is the safe
7954 answer. Since guest registers and guest memory do not have the
7955 SSA property we must return False if any Gets or Loads appear in
7956 the expression. This implicitly assumes that e1 and e2 have the
7957 same IR type, which is always true here -- the type is Ity_I1. */
7959 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7961 if (e1->tag != e2->tag)
7962 return False;
7963 switch (e1->tag) {
7964 case Iex_Const:
7965 return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7966 case Iex_Binop:
7967 return e1->Iex.Binop.op == e2->Iex.Binop.op
7968 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7969 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7970 case Iex_Unop:
7971 return e1->Iex.Unop.op == e2->Iex.Unop.op
7972 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7973 case Iex_RdTmp:
7974 return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7975 case Iex_ITE:
7976 return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7977 && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
7978 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7979 case Iex_Qop:
7980 case Iex_Triop:
7981 case Iex_CCall:
7982 /* be lazy. Could define equality for these, but they never
7983 appear to be used. */
7984 return False;
7985 case Iex_Get:
7986 case Iex_GetI:
7987 case Iex_Load:
7988 /* be conservative - these may not give the same value each
7989 time */
7990 return False;
7991 case Iex_Binder:
7992 /* should never see this */
7993 /* fallthrough */
7994 default:
7995 VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
7996 ppIRExpr(e1);
7997 VG_(tool_panic)("memcheck:sameIRValue");
7998 return False;
8002 /* See if 'pairs' already has an entry for (entry, guard). Return
8003 True if so. If not, add an entry. */
8005 static
8006 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
8008 UInt i, n = tidyingEnv->pairsUsed;
8009 tl_assert(n <= N_TIDYING_PAIRS);
8010 for (i = 0; i < n; i++) {
8011 if (tidyingEnv->pairs[i].entry == entry
8012 && sameIRValue(tidyingEnv->pairs[i].guard, guard))
8013 return True;
8015 /* (guard, entry) wasn't found in the array. Add it at the end.
8016 If the array is already full, slide the entries one slot
8017 backwards. This means we will lose the ability to detect
8018 duplicates from the pair in slot zero, but that happens so
8019 rarely that it's unlikely to have much effect on overall code
8020 quality. Also, this strategy loses the check for the oldest
8021 tracked exit (memory reference, basically) and so that is (I'd
8022 guess) least likely to be re-used after this point. */
8023 tl_assert(i == n);
8024 if (n == N_TIDYING_PAIRS) {
8025 for (i = 1; i < N_TIDYING_PAIRS; i++) {
8026 tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
8028 tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
8029 tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
8030 } else {
8031 tl_assert(n < N_TIDYING_PAIRS);
8032 tidyingEnv->pairs[n].entry = entry;
8033 tidyingEnv->pairs[n].guard = guard;
8034 n++;
8035 tidyingEnv->pairsUsed = n;
8037 return False;
8040 static Bool is_helperc_value_checkN_fail ( const HChar* name )
8042 /* This is expensive because it happens a lot. We are checking to
8043 see whether |name| is one of the following 8 strings:
8045 MC_(helperc_value_check8_fail_no_o)
8046 MC_(helperc_value_check4_fail_no_o)
8047 MC_(helperc_value_check0_fail_no_o)
8048 MC_(helperc_value_check1_fail_no_o)
8049 MC_(helperc_value_check8_fail_w_o)
8050 MC_(helperc_value_check0_fail_w_o)
8051 MC_(helperc_value_check1_fail_w_o)
8052 MC_(helperc_value_check4_fail_w_o)
8054 To speed it up, check the common prefix just once, rather than
8055 all 8 times.
8057 const HChar* prefix = "MC_(helperc_value_check";
8059 HChar n, p;
8060 while (True) {
8061 n = *name;
8062 p = *prefix;
8063 if (p == 0) break; /* ran off the end of the prefix */
8064 /* We still have some prefix to use */
8065 if (n == 0) return False; /* have prefix, but name ran out */
8066 if (n != p) return False; /* have both pfx and name, but no match */
8067 name++;
8068 prefix++;
8071 /* Check the part after the prefix. */
8072 tl_assert(*prefix == 0 && *name != 0);
8073 return 0==VG_(strcmp)(name, "8_fail_no_o)")
8074 || 0==VG_(strcmp)(name, "4_fail_no_o)")
8075 || 0==VG_(strcmp)(name, "0_fail_no_o)")
8076 || 0==VG_(strcmp)(name, "1_fail_no_o)")
8077 || 0==VG_(strcmp)(name, "8_fail_w_o)")
8078 || 0==VG_(strcmp)(name, "4_fail_w_o)")
8079 || 0==VG_(strcmp)(name, "0_fail_w_o)")
8080 || 0==VG_(strcmp)(name, "1_fail_w_o)");
8083 IRSB* MC_(final_tidy) ( IRSB* sb_in )
8085 Int i;
8086 IRStmt* st;
8087 IRDirty* di;
8088 IRExpr* guard;
8089 IRCallee* cee;
8090 Bool alreadyPresent;
8091 Pairs pairs;
8093 pairs.pairsUsed = 0;
8095 pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
8096 pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
8098 /* Scan forwards through the statements. Each time a call to one
8099 of the relevant helpers is seen, check if we have made a
8100 previous call to the same helper using the same guard
8101 expression, and if so, delete the call. */
8102 for (i = 0; i < sb_in->stmts_used; i++) {
8103 st = sb_in->stmts[i];
8104 tl_assert(st);
8105 if (st->tag != Ist_Dirty)
8106 continue;
8107 di = st->Ist.Dirty.details;
8108 guard = di->guard;
8109 tl_assert(guard);
8110 if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
8111 cee = di->cee;
8112 if (!is_helperc_value_checkN_fail( cee->name ))
8113 continue;
8114 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
8115 guard 'guard'. Check if we have already seen a call to this
8116 function with the same guard. If so, delete it. If not,
8117 add it to the set of calls we do know about. */
8118 alreadyPresent = check_or_add( &pairs, guard, cee->addr );
8119 if (alreadyPresent) {
8120 sb_in->stmts[i] = IRStmt_NoOp();
8121 if (0) VG_(printf)("XX\n");
8125 tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
8126 tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
8128 return sb_in;
8131 #undef N_TIDYING_PAIRS
8134 /*------------------------------------------------------------*/
8135 /*--- Startup assertion checking ---*/
8136 /*------------------------------------------------------------*/
8138 void MC_(do_instrumentation_startup_checks)( void )
8140 /* Make a best-effort check to see that is_helperc_value_checkN_fail
8141 is working as we expect. */
8143 # define CHECK(_expected, _string) \
8144 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
8146 /* It should identify these 8, and no others, as targets. */
8147 CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
8148 CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
8149 CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
8150 CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
8151 CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
8152 CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
8153 CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
8154 CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
8156 /* Ad-hoc selection of other strings gathered via a quick test. */
8157 CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
8158 CHECK(False, "amd64g_dirtyhelper_RDTSC");
8159 CHECK(False, "MC_(helperc_b_load1)");
8160 CHECK(False, "MC_(helperc_b_load2)");
8161 CHECK(False, "MC_(helperc_b_load4)");
8162 CHECK(False, "MC_(helperc_b_load8)");
8163 CHECK(False, "MC_(helperc_b_load16)");
8164 CHECK(False, "MC_(helperc_b_load32)");
8165 CHECK(False, "MC_(helperc_b_store1)");
8166 CHECK(False, "MC_(helperc_b_store2)");
8167 CHECK(False, "MC_(helperc_b_store4)");
8168 CHECK(False, "MC_(helperc_b_store8)");
8169 CHECK(False, "MC_(helperc_b_store16)");
8170 CHECK(False, "MC_(helperc_b_store32)");
8171 CHECK(False, "MC_(helperc_LOADV8)");
8172 CHECK(False, "MC_(helperc_LOADV16le)");
8173 CHECK(False, "MC_(helperc_LOADV32le)");
8174 CHECK(False, "MC_(helperc_LOADV64le)");
8175 CHECK(False, "MC_(helperc_LOADV128le)");
8176 CHECK(False, "MC_(helperc_LOADV256le)");
8177 CHECK(False, "MC_(helperc_STOREV16le)");
8178 CHECK(False, "MC_(helperc_STOREV32le)");
8179 CHECK(False, "MC_(helperc_STOREV64le)");
8180 CHECK(False, "MC_(helperc_STOREV8)");
8181 CHECK(False, "track_die_mem_stack_8");
8182 CHECK(False, "track_new_mem_stack_8_w_ECU");
8183 CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
8184 CHECK(False, "VG_(unknown_SP_update_w_ECU)");
8186 # undef CHECK
8190 /*------------------------------------------------------------*/
8191 /*--- Memcheck main ---*/
8192 /*------------------------------------------------------------*/
8194 static Bool isBogusAtom ( IRAtom* at )
8196 if (at->tag == Iex_RdTmp)
8197 return False;
8198 tl_assert(at->tag == Iex_Const);
8200 ULong n = 0;
8201 IRConst* con = at->Iex.Const.con;
8202 switch (con->tag) {
8203 case Ico_U1: return False;
8204 case Ico_U8: n = (ULong)con->Ico.U8; break;
8205 case Ico_U16: n = (ULong)con->Ico.U16; break;
8206 case Ico_U32: n = (ULong)con->Ico.U32; break;
8207 case Ico_U64: n = (ULong)con->Ico.U64; break;
8208 case Ico_F32: return False;
8209 case Ico_F64: return False;
8210 case Ico_F32i: return False;
8211 case Ico_F64i: return False;
8212 case Ico_V128: return False;
8213 case Ico_V256: return False;
8214 default: ppIRExpr(at); tl_assert(0);
8216 /* VG_(printf)("%llx\n", n); */
8217 /* Shortcuts */
8218 if (LIKELY(n <= 0x0000000000001000ULL)) return False;
8219 if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
8220 /* The list of bogus atoms is: */
8221 return (/*32*/ n == 0xFEFEFEFFULL
8222 /*32*/ || n == 0x80808080ULL
8223 /*32*/ || n == 0x7F7F7F7FULL
8224 /*32*/ || n == 0x7EFEFEFFULL
8225 /*32*/ || n == 0x81010100ULL
8226 /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
8227 /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
8228 /*64*/ || n == 0x0000000000008080ULL
8229 /*64*/ || n == 0x8080808080808080ULL
8230 /*64*/ || n == 0x0101010101010101ULL
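/* Added note (an interpretation, not from the original source): these
   constants look like the magic values used by word-at-a-time string
   scanning code (e.g. optimised strlen/strchr), which deliberately
   operates on partially defined words; seeing them is what triggers
   the switch to expensive Add/Sub instrumentation later on. */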
8235 /* Does 'st' mention any of the literals identified/listed in
8236 isBogusAtom()? */
8237 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
8239 Int i;
8240 IRExpr* e;
8241 IRDirty* d;
8242 IRCAS* cas;
8243 switch (st->tag) {
8244 case Ist_WrTmp:
8245 e = st->Ist.WrTmp.data;
8246 switch (e->tag) {
8247 case Iex_Get:
8248 case Iex_RdTmp:
8249 return False;
8250 case Iex_Const:
8251 return isBogusAtom(e);
8252 case Iex_Unop:
8253 return isBogusAtom(e->Iex.Unop.arg)
8254 || e->Iex.Unop.op == Iop_GetMSBs8x16;
8255 case Iex_GetI:
8256 return isBogusAtom(e->Iex.GetI.ix);
8257 case Iex_Binop:
8258 return isBogusAtom(e->Iex.Binop.arg1)
8259 || isBogusAtom(e->Iex.Binop.arg2);
8260 case Iex_Triop:
8261 return isBogusAtom(e->Iex.Triop.details->arg1)
8262 || isBogusAtom(e->Iex.Triop.details->arg2)
8263 || isBogusAtom(e->Iex.Triop.details->arg3);
8264 case Iex_Qop:
8265 return isBogusAtom(e->Iex.Qop.details->arg1)
8266 || isBogusAtom(e->Iex.Qop.details->arg2)
8267 || isBogusAtom(e->Iex.Qop.details->arg3)
8268 || isBogusAtom(e->Iex.Qop.details->arg4);
8269 case Iex_ITE:
8270 return isBogusAtom(e->Iex.ITE.cond)
8271 || isBogusAtom(e->Iex.ITE.iftrue)
8272 || isBogusAtom(e->Iex.ITE.iffalse);
8273 case Iex_Load:
8274 return isBogusAtom(e->Iex.Load.addr);
8275 case Iex_CCall:
8276 for (i = 0; e->Iex.CCall.args[i]; i++)
8277 if (isBogusAtom(e->Iex.CCall.args[i]))
8278 return True;
8279 return False;
8280 default:
8281 goto unhandled;
8283 case Ist_Dirty:
8284 d = st->Ist.Dirty.details;
8285 for (i = 0; d->args[i]; i++) {
8286 IRAtom* atom = d->args[i];
8287 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
8288 if (isBogusAtom(atom))
8289 return True;
8292 if (isBogusAtom(d->guard))
8293 return True;
8294 if (d->mAddr && isBogusAtom(d->mAddr))
8295 return True;
8296 return False;
8297 case Ist_Put:
8298 return isBogusAtom(st->Ist.Put.data);
8299 case Ist_PutI:
8300 return isBogusAtom(st->Ist.PutI.details->ix)
8301 || isBogusAtom(st->Ist.PutI.details->data);
8302 case Ist_Store:
8303 return isBogusAtom(st->Ist.Store.addr)
8304 || isBogusAtom(st->Ist.Store.data);
8305 case Ist_StoreG: {
8306 IRStoreG* sg = st->Ist.StoreG.details;
8307 return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
8308 || isBogusAtom(sg->guard);
8310 case Ist_LoadG: {
8311 IRLoadG* lg = st->Ist.LoadG.details;
8312 return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
8313 || isBogusAtom(lg->guard);
8315 case Ist_Exit:
8316 return isBogusAtom(st->Ist.Exit.guard);
8317 case Ist_AbiHint:
8318 return isBogusAtom(st->Ist.AbiHint.base)
8319 || isBogusAtom(st->Ist.AbiHint.nia);
8320 case Ist_NoOp:
8321 case Ist_IMark:
8322 case Ist_MBE:
8323 return False;
8324 case Ist_CAS:
8325 cas = st->Ist.CAS.details;
8326 return isBogusAtom(cas->addr)
8327 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
8328 || isBogusAtom(cas->expdLo)
8329 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
8330 || isBogusAtom(cas->dataLo);
8331 case Ist_LLSC:
8332 return isBogusAtom(st->Ist.LLSC.addr)
8333 || (st->Ist.LLSC.storedata
8334 ? isBogusAtom(st->Ist.LLSC.storedata)
8335 : False);
8336 default:
8337 unhandled:
8338 ppIRStmt(st);
8339 VG_(tool_panic)("hasBogusLiterals");
8344 /* This is the pre-instrumentation analysis. It does a backwards pass over
8345 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8346 the block.
8348 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8349 as a positive result from that is a strong indication that we need to
8350 expensively instrument add/sub in the block. We do both analyses in one
8351 pass, even though they are independent, so as to avoid the overhead of
8352 having to traverse the whole block twice.
8354 The usage pass proceeds as follows. Let max= be the max operation in the
8355 HowUsed lattice, hence
8357 X max= Y means X = max(X, Y)
8359 then
8361 for t in original tmps . useEnv[t] = HuUnU
8363 for t used in the block's .next field
8364 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8366 for st iterating *backwards* in the block
8368 match st
8370 case "t1 = load(t2)" // case 1
8371 useEnv[t2] max= HuPCa
8373 case "t1 = add(t2, t3)" // case 2
8374 useEnv[t2] max= useEnv[t1]
8375 useEnv[t3] max= useEnv[t1]
8377 other
8378 for t in st.usedTmps // case 3
8379 useEnv[t] max= HuOth
8380 // same as useEnv[t] = HuOth
8382 The general idea is that we accumulate, in useEnv[], information about
8383 how each tmp is used. That can be updated as we work further back
8384 through the block and find more uses of it, but its HowUsed value can
8385 only ascend the lattice, not descend.
8387 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8388 be used as a memory address, then its use is at least HuPCa. The point
8389 is that for a memory address we will add instrumentation to check if any
8390 bit of the address is undefined, which means that we won't need expensive
8391 V-bit propagation through an add expression that computed the address --
8392 cheap add instrumentation will be equivalent.
8394 Note in case (1) that if we have previously seen a non-memory-address use
8395 of the tmp, then its use will already be HuOth and will be unchanged by
8396 the max= operation. And if it turns out that the source of the tmp was
8397 an add, then we'll have to expensively instrument the add, because we
8398 can't prove that, for the previous non-memory-address use of the tmp,
8399 cheap and expensive instrumentation will be equivalent.
8401 In case 2, we propagate the usage-mode of the result of an add back
8402 through to its operands. Again, we use max= so as to take account of the
8403 fact that t2 or t3 might later in the block (viz, earlier in the
8404 iteration) have been used in a way that requires expensive add
8405 instrumentation.
8407 In case 3, we deal with all other tmp uses. We assume that we'll need a
8408 result that is as accurate as possible, so we max= HuOth into its use
8409 mode. Since HuOth is the top of the lattice, that's equivalent to just
8410 setting its use to HuOth.
8412 The net result of all this is that:
8414 tmps that are used either
8415 - only as a memory address, or
8416 - only as part of a tree of adds that computes a memory address,
8417 and has no other use
8418 are marked as HuPCa, and so we can instrument their generating Add
8419 nodes cheaply, which is the whole point of this analysis
8421 tmps that are used any other way at all are marked as HuOth
8423 tmps that are unused are marked as HuUnU. We don't expect to see any
8424 since we expect that the incoming IR has had all dead assignments
8425 removed by previous optimisation passes. Nevertheless the analysis is
8426 correct even in the presence of dead tmps.
8428 A final comment on dead tmps. In case 1 and case 2, we could actually
8429 conditionalise the updates thusly:
8431 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8433 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8434 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8436 In other words, if the assigned-to tmp |t1| is never used, then there's
8437 no point in propagating any use through to its operands. That won't
8438 change the final HuPCa-vs-HuOth results, which is what we care about.
8439 Given that we expect to get dead-code-free inputs, there's no point in
8440 adding this extra refinement.
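/* Worked example (added comment, not from the original source):

      t3 = Add64(t1,t2)
      t4 = LDle:I64(t3)
      PUT(16) = t4

   Scanning backwards: the PUT marks t4 as HuOth (case 3); the load
   marks t3 as HuPCa (case 1); the Add64 then propagates t3's use to
   t1 and t2 (case 2).  So t1, t2 and t3 all end up as HuPCa, and the
   Add64 that defines t3 qualifies for the cheap instrumentation. */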
8443 /* Helper for |preInstrumentationAnalysis|. */
8444 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
8445 UInt tyenvUsed,
8446 HowUsed newUse, IRAtom* at )
8448 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8449 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8450 use info. */
8451 switch (at->tag) {
8452 case Iex_GSPTR:
8453 case Iex_VECRET:
8454 case Iex_Const:
8455 return;
8456 case Iex_RdTmp: {
8457 IRTemp t = at->Iex.RdTmp.tmp;
8458 tl_assert(t < tyenvUsed); // "is an original tmp"
8459 // The "max" operation in the lattice
8460 if (newUse > useEnv[t]) useEnv[t] = newUse;
8461 return;
8463 default:
8464 // We should never get here -- it implies non-flat IR
8465 ppIRExpr(at);
8466 VG_(tool_panic)("noteTmpUsesIn");
8468 /*NOTREACHED*/
8469 tl_assert(0);
8473 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
8474 /*OUT*/Bool* hasBogusLiteralsP,
8475 const IRSB* sb_in )
8477 const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
8479 // We've seen no bogus literals so far.
8480 Bool bogus = False;
8482 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8483 HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
8484 nOrigTmps, sizeof(HowUsed));
8486 // Firstly, roll in contributions from the final dst address.
8487 bogus = isBogusAtom(sb_in->next);
8488 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
8490 // Now work backwards through the stmts.
8491 for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
8492 IRStmt* st = sb_in->stmts[i];
8494 // Deal with literals.
8495 if (LIKELY(!bogus)) {
8496 bogus = containsBogusLiterals(st);
8499 // Deal with tmp uses.
8500 switch (st->tag) {
8501 case Ist_WrTmp: {
8502 IRTemp dst = st->Ist.WrTmp.tmp;
8503 IRExpr* rhs = st->Ist.WrTmp.data;
8504 // This is the one place where we have to consider all possible
8505 // tags for |rhs|, and can't just assume it is a tmp or a const.
8506 switch (rhs->tag) {
8507 case Iex_RdTmp:
8508 // just propagate demand for |dst| into this tmp use.
8509 noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
8510 break;
8511 case Iex_Unop:
8512 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
8513 break;
8514 case Iex_Binop:
8515 if (rhs->Iex.Binop.op == Iop_Add64
8516 || rhs->Iex.Binop.op == Iop_Add32) {
8517 // propagate demand for |dst| through to the operands.
8518 noteTmpUsesIn(useEnv, nOrigTmps,
8519 useEnv[dst], rhs->Iex.Binop.arg1);
8520 noteTmpUsesIn(useEnv, nOrigTmps,
8521 useEnv[dst], rhs->Iex.Binop.arg2);
8522 } else {
8523 // just say that the operands are used in some unknown way.
8524 noteTmpUsesIn(useEnv, nOrigTmps,
8525 HuOth, rhs->Iex.Binop.arg1);
8526 noteTmpUsesIn(useEnv, nOrigTmps,
8527 HuOth, rhs->Iex.Binop.arg2);
8529 break;
8530 case Iex_Triop: {
8531 // All operands are used in some unknown way.
8532 IRTriop* tri = rhs->Iex.Triop.details;
8533 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
8534 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
8535 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
8536 break;
8538 case Iex_Qop: {
8539 // All operands are used in some unknown way.
8540 IRQop* qop = rhs->Iex.Qop.details;
8541 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
8542 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
8543 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
8544 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
8545 break;
8547 case Iex_Load:
8548 // The address will be checked (== PCasted).
8549 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
8550 break;
8551 case Iex_ITE:
8552 // The condition is PCasted, the then- and else-values
8553 // aren't.
8554 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
8555 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
8556 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
8557 break;
8558 case Iex_CCall:
8559 // The args are used in unknown ways.
8560 for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
8561 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8563 break;
8564 case Iex_GetI: {
8565 // The index will be checked/PCasted (see do_shadow_GETI)
8566 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
8567 break;
8569 case Iex_Const:
8570 case Iex_Get:
8571 break;
8572 default:
8573 ppIRExpr(rhs);
8574 VG_(tool_panic)("preInstrumentationAnalysis:"
8575 " unhandled IRExpr");
8577 break;
8579 case Ist_Store:
8580 // The address will be checked (== PCasted). The data will be
8581 // used in some unknown way.
8582 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
8583 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
8584 break;
8585 case Ist_Exit:
8586 // The guard will be checked (== PCasted)
8587 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
8588 break;
8589 case Ist_Put:
8590 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
8591 break;
8592 case Ist_PutI: {
8593 IRPutI* putI = st->Ist.PutI.details;
8594 // The index will be checked/PCasted (see do_shadow_PUTI). The
8595 // data will be used in an unknown way.
8596 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
8597 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
8598 break;
8600 case Ist_Dirty: {
8601 IRDirty* d = st->Ist.Dirty.details;
8602 // The guard will be checked (== PCasted)
8603 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
8604 // The args will be used in unknown ways.
8605 for (IRExpr** args = d->args; *args; args++) {
8606 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8608 break;
8610 case Ist_CAS: {
8611 IRCAS* cas = st->Ist.CAS.details;
8612 // Address will be pcasted, everything else used as unknown
8613 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
8614 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
8615 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
8616 if (cas->expdHi)
8617 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
8618 if (cas->dataHi)
8619 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
8620 break;
8622 case Ist_AbiHint:
8623 // Both exprs are used in unknown ways. TODO: can we safely
8624 // just ignore AbiHints?
8625 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
8626 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
8627 break;
8628 case Ist_StoreG: {
8629 // We might be able to do better, and use HuPCa for the addr.
8630 // It's not immediately obvious that we can, because the address
8631 // is regarded as "used" only when the guard is true.
8632 IRStoreG* sg = st->Ist.StoreG.details;
8633 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
8634 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
8635 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
8636 break;
8638 case Ist_LoadG: {
8639 // Per similar comments to Ist_StoreG .. not sure whether this
8640 // is really optimal.
8641 IRLoadG* lg = st->Ist.LoadG.details;
8642 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8643 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8644 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8645 break;
8647 case Ist_LLSC: {
8648 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8649 if (st->Ist.LLSC.storedata)
8650 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8651 break;
8653 case Ist_MBE:
8654 case Ist_IMark:
8655 case Ist_NoOp:
8656 break;
8657 default: {
8658 ppIRStmt(st);
8659 VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8662 } // Now work backwards through the stmts.
8664 // Return the computed use env and the bogus-atom flag.
8665 tl_assert(*useEnvP == NULL);
8666 *useEnvP = useEnv;
8668 tl_assert(*hasBogusLiteralsP == False);
8669 *hasBogusLiteralsP = bogus;
8673 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8674 IRSB* sb_in,
8675 const VexGuestLayout* layout,
8676 const VexGuestExtents* vge,
8677 const VexArchInfo* archinfo_host,
8678 IRType gWordTy, IRType hWordTy )
8680 Bool verboze = 0||False;
8681 Int i, j, first_stmt;
8682 IRStmt* st;
8683 MCEnv mce;
8684 IRSB* sb_out;
8686 if (gWordTy != hWordTy) {
8687 /* We don't currently support this case. */
8688 VG_(tool_panic)("host/guest word size mismatch");
8691 /* Check we're not completely nuts */
8692 tl_assert(sizeof(UWord) == sizeof(void*));
8693 tl_assert(sizeof(Word) == sizeof(void*));
8694 tl_assert(sizeof(Addr) == sizeof(void*));
8695 tl_assert(sizeof(ULong) == 8);
8696 tl_assert(sizeof(Long) == 8);
8697 tl_assert(sizeof(UInt) == 4);
8698 tl_assert(sizeof(Int) == 4);
8700 tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8702 /* Set up SB */
8703 sb_out = deepCopyIRSBExceptStmts(sb_in);
8705 /* Set up the running environment. Both .sb and .tmpMap are
8706 modified as we go along. Note that tmps are added to both
8707 .sb->tyenv and .tmpMap together, so the valid index-set for
8708 those two arrays should always be identical. */
8709 VG_(memset)(&mce, 0, sizeof(mce));
8710 mce.sb = sb_out;
8711 mce.trace = verboze;
8712 mce.layout = layout;
8713 mce.hWordTy = hWordTy;
8714 mce.tmpHowUsed = NULL;
8716 /* BEGIN decide on expense levels for instrumentation. */
8718 /* Initially, select the cheap version of everything for which we have an
8719 option. */
8720 DetailLevelByOp__set_all( &mce.dlbo, DLcheap );
8722 /* Take account of the --expensive-definedness-checks= flag. */
8723 if (MC_(clo_expensive_definedness_checks) == EdcNO) {
8724 /* We just selected 'cheap for everything', so we don't need to do
8725 anything here. mce.tmpHowUsed remains NULL. */
8727 else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
8728 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8729 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8731 else {
8732 tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
      /* We'll make our own selection, based on known per-target constraints
         and also on analysis of the block to be instrumented.  First, set
         up default values for detail levels.

         On x86 and amd64, we'll routinely encounter code optimised by LLVM
         5 and above.  Enable accurate interpretation of the following.
         LLVM uses adds for some bitfield inserts, and we get a lot of false
         errors if the cheap interpretation is used, alas.  Could solve this
         much better if we knew which of such adds came from x86/amd64 LEA
         instructions, since these are the only ones really needing the
         expensive interpretation, but that would require some way to tag
         them in the _toIR.c front ends, which is a lot of faffing around.
         So for now we use preInstrumentationAnalysis() to detect adds which
         are used only to construct memory addresses, which is an
         approximation to the above, and is self-contained. */
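
      /* Illustrative example (hypothetical IR, not taken from any real
         block): with dl_Add64 == DLauto, an add whose result is used only
         as a memory address, e.g.

            t3 = Add64(t1,0x38:I64)
            t4 = LDle:I64(t3)

         is classified HuPCa by preInstrumentationAnalysis() below and so
         gets the cheap shadow computation, whereas an add whose result is
         used as ordinary data stays on the exact (expensive)
         interpretation. */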
#     if defined(VGA_x86)
      mce.dlbo.dl_Add32           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     elif defined(VGA_amd64)
      mce.dlbo.dl_Add32           = DLexpensive;
      mce.dlbo.dl_Add64           = DLauto;
      mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_ppc64le)
      // Needed by (at least) set_AV_CR6() in the front end.
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm64)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
      mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
#     elif defined(VGA_arm)
      mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
#     endif
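
      /* Any target not listed above simply keeps the DLcheap defaults
         selected earlier (still subject to the bogus-literals override
         below). */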

      /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
         fill it in. */
      Bool hasBogusLiterals = False;
      preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );
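
      /* (Roughly: "bogus literals" are suspicious magic constants of the
         kind typically produced by hand-optimised, word-at-a-time string
         and memory routines.  Their presence suggests the cheap
         interpretations would generate false undefined-value errors, hence
         the switch to DLexpensive below.) */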

      if (hasBogusLiterals) {
         /* This happens very rarely.  In this case just select expensive
            for everything, and throw away the tmp-use analysis results. */
         DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
         VG_(free)( mce.tmpHowUsed );
         mce.tmpHowUsed = NULL;
      } else {
         /* Nothing.  mce.tmpHowUsed contains tmp-use analysis results,
            which will be used for some subset of Iop_{Add,Sub}{32,64},
            based on which ones are set to DLauto for this target. */
      }
   }

   DetailLevelByOp__check_sanity( &mce.dlbo );

   if (0) {
      // Debug printing: which tmps have been identified as PCast-only use
      if (mce.tmpHowUsed) {
         VG_(printf)("Cheapies: ");
         for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
            if (mce.tmpHowUsed[q] == HuPCa) {
               VG_(printf)("t%u ", q);
            }
         }
         VG_(printf)("\n");
      }

      // Debug printing: number of ops by detail level
      UChar nCheap     = DetailLevelByOp__count( &mce.dlbo, DLcheap );
      UChar nAuto      = DetailLevelByOp__count( &mce.dlbo, DLauto );
      UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
      tl_assert(nCheap + nAuto + nExpensive == 8);

      VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
   }

   /* END decide on expense levels for instrumentation. */

   /* Initialise the running tmp environment. */

   mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
                            sizeof(TempMapEnt));
   VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
   for (i = 0; i < sb_in->tyenv->types_used; i++) {
      TempMapEnt ent;
      ent.kind    = Orig;
      ent.shadowV = IRTemp_INVALID;
      ent.shadowB = IRTemp_INVALID;
      VG_(addToXA)( mce.tmpMap, &ent );
   }
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
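
   /* Each original tmp starts with no shadows; findShadowTmpV and
      findShadowTmpB create the V (definedness) and B (origin) shadow tmps
      on first use and record them in this map, alongside matching entries
      in sb_out->tyenv. */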

   /* Finally, begin instrumentation. */
   /* Copy verbatim any IR preamble preceding the first IMark */

   tl_assert(mce.sb == sb_out);
   tl_assert(mce.sb != sb_in);

   i = 0;
   while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {

      st = sb_in->stmts[i];
      tl_assert(st);
      tl_assert(isFlatIRStmt(st));

      stmt( 'C', &mce, sb_in->stmts[i] );
      i++;
   }
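
   /* (The 'C' tag marks statements copied unchanged from sb_in; 'V' and 'B'
      tag generated V-bit and origin-tracking shadow statements
      respectively.  The tags only affect trace output when mce.trace is
      set.) */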

   /* Nasty problem.  IR optimisation of the pre-instrumented IR may
      cause the IR following the preamble to contain references to IR
      temporaries defined in the preamble.  Because the preamble isn't
      instrumented, these temporaries don't have any shadows.
      Nevertheless uses of them following the preamble will cause
      memcheck to generate references to their shadows.  End effect is
      to cause IR sanity check failures, due to references to
      non-existent shadows.  This is only evident for the complex
      preambles used for function wrapping on TOC-afflicted platforms
      (ppc64-linux).

      The following loop therefore scans the preamble looking for
      assignments to temporaries.  For each one found it creates an
      assignment to the corresponding (V) shadow temp, marking it as
      'defined'.  This is the same resulting IR as if the main
      instrumentation loop below had been applied to the statement
      'tmp = CONSTANT'.

      Similarly, if origin tracking is enabled, we must generate an
      assignment for the corresponding origin (B) shadow, claiming
      no-origin, as appropriate for a defined value.
   */
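
   /* Illustrative example (hypothetical tmp number): if the preamble
      assigns to t5, the loop below gives t5's V shadow an all-zeroes
      (all-defined) constant of the same type and, when
      MC_(clo_mc_level) == 3, sets its B (origin) shadow to 0:I32 (no
      origin) -- exactly what instrumenting 't5 = CONSTANT' would
      produce. */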
   for (j = 0; j < i; j++) {
      if (sb_in->stmts[j]->tag == Ist_WrTmp) {
         /* findShadowTmpV checks its arg is an original tmp;
            no need to assert that here. */
         IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
         IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
         IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
         assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
         if (MC_(clo_mc_level) == 3) {
            IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
            tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
            assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
         }
         if (0) {
            VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
            ppIRType( ty_v );
            VG_(printf)("\n");
         }
      }
   }

   /* Iterate over the remaining stmts to generate instrumentation. */

   tl_assert(sb_in->stmts_used > 0);
   tl_assert(i >= 0);
   tl_assert(i < sb_in->stmts_used);
   tl_assert(sb_in->stmts[i]->tag == Ist_IMark);

   for (/* use current i*/; i < sb_in->stmts_used; i++) {

      st = sb_in->stmts[i];
      first_stmt = sb_out->stmts_used;

      if (verboze) {
         VG_(printf)("\n");
         ppIRStmt(st);
         VG_(printf)("\n");
      }

      if (MC_(clo_mc_level) == 3) {
         /* See comments on case Ist_CAS below. */
         if (st->tag != Ist_CAS)
            schemeS( &mce, st );
      }
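
      /* (schemeS is the origin-tracking counterpart of the V-bit handling
         in the switch below: it emits the B-shadow updates for this
         statement when --track-origins=yes is in force.) */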

      /* Generate instrumentation code for each stmt ... */

      switch (st->tag) {

         case Ist_WrTmp: {
            IRTemp dst = st->Ist.WrTmp.tmp;
            tl_assert(dst < (UInt)sb_in->tyenv->types_used);
            HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
                                        : HuOth/*we don't know, so play safe*/;
            assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
                    expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
            break;
         }
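
         /* (Roughly: the HowUsed value matters only for operations whose
            detail level is DLauto.  HuPCa -- the result is only ever used
            in PCast-style whole-value checks, e.g. as an address -- selects
            the cheap shadow computation; HuOth selects the expensive
            one.) */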

         case Ist_Put:
            do_shadow_PUT( &mce,
                           st->Ist.Put.offset,
                           st->Ist.Put.data,
                           NULL /* shadow atom */, NULL /* guard */ );
            break;

         case Ist_PutI:
            do_shadow_PUTI( &mce, st->Ist.PutI.details);
            break;

         case Ist_Store:
            do_shadow_Store( &mce, st->Ist.Store.end,
                             st->Ist.Store.addr, 0/* addr bias */,
                             st->Ist.Store.data,
                             NULL /* shadow data */,
                             NULL/*guard*/ );
            break;

         case Ist_StoreG:
            do_shadow_StoreG( &mce, st->Ist.StoreG.details );
            break;

         case Ist_LoadG:
            do_shadow_LoadG( &mce, st->Ist.LoadG.details );
            break;

         case Ist_Exit:
            complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
            break;

         case Ist_IMark:
            break;

         case Ist_NoOp:
         case Ist_MBE:
            break;

         case Ist_Dirty:
            do_shadow_Dirty( &mce, st->Ist.Dirty.details );
            break;

         case Ist_AbiHint:
            do_AbiHint( &mce, st->Ist.AbiHint.base,
                        st->Ist.AbiHint.len,
                        st->Ist.AbiHint.nia );
            break;

         case Ist_CAS:
            do_shadow_CAS( &mce, st->Ist.CAS.details );
            /* Note, do_shadow_CAS copies the CAS itself to the output
               block, because it needs to add instrumentation both
               before and after it.  Hence skip the copy below.  Also
               skip the origin-tracking stuff (call to schemeS) above,
               since that's all tangled up with it too; do_shadow_CAS
               does it all. */
            break;

         case Ist_LLSC:
            do_shadow_LLSC( &mce,
                            st->Ist.LLSC.end,
                            st->Ist.LLSC.result,
                            st->Ist.LLSC.addr,
                            st->Ist.LLSC.storedata );
            break;

         default:
            VG_(printf)("\n");
            ppIRStmt(st);
            VG_(printf)("\n");
            VG_(tool_panic)("memcheck: unhandled IRStmt");

      } /* switch (st->tag) */

      if (0 && verboze) {
         for (j = first_stmt; j < sb_out->stmts_used; j++) {
            VG_(printf)(" ");
            ppIRStmt(sb_out->stmts[j]);
            VG_(printf)("\n");
         }
         VG_(printf)("\n");
      }

      /* ... and finally copy the stmt itself to the output.  Except,
         skip the copy of IRCASs; see comments on case Ist_CAS
         above. */
      if (st->tag != Ist_CAS)
         stmt('C', &mce, st);
   }

   /* Now we need to complain if the jump target is undefined. */
   first_stmt = sb_out->stmts_used;

   if (verboze) {
      VG_(printf)("sb_in->next = ");
      ppIRExpr(sb_in->next);
      VG_(printf)("\n\n");
   }

   complainIfUndefined( &mce, sb_in->next, NULL );

   if (0 && verboze) {
      for (j = first_stmt; j < sb_out->stmts_used; j++) {
         VG_(printf)(" ");
         ppIRStmt(sb_out->stmts[j]);
         VG_(printf)("\n");
      }
      VG_(printf)("\n");
   }

   /* If this fails, there's been some serious snafu with tmp management,
      that should be investigated. */
   tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   VG_(deleteXA)( mce.tmpMap );

   if (mce.tmpHowUsed) {
      VG_(free)( mce.tmpHowUsed );
   }

   tl_assert(mce.sb == sb_out);
   return sb_out;
}


/*--------------------------------------------------------------------*/
/*--- end                                             mc_translate.c ---*/
/*--------------------------------------------------------------------*/