2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
7 /*
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
12 jseward@acm.org
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are either overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
64 bit-precision
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
71 ----
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
 111     historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
137 struct _MCEnv;
139 // See below for comments explaining what this is for.
140 typedef
141 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
142 HowUsed;
144 static IRType shadowTypeV ( IRType ty );
145 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e,
146 HowUsed hu/*use HuOth if unknown*/ );
147 static IRTemp findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
149 static IRExpr *i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
159 typedef
160 enum {
161 // Use the cheaper, less-exact variant.
162 DLcheap=4,
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
166 DLauto,
167 // Use the more expensive, more-exact variant.
168 DLexpensive
170 DetailLevel;
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
175 to be used. */
176 typedef
177 struct {
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32;
182 DetailLevel dl_Add64;
183 DetailLevel dl_Sub32;
184 DetailLevel dl_Sub64;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
186 // allowed.
187 DetailLevel dl_CmpEQ64_CmpNE64;
188 DetailLevel dl_CmpEQ32_CmpNE32;
189 DetailLevel dl_CmpEQ16_CmpNE16;
190 DetailLevel dl_CmpEQ8_CmpNE8;
192 DetailLevelByOp;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp* dlbo,
195 DetailLevel dl )
197 dlbo->dl_Add32 = dl;
198 dlbo->dl_Add64 = dl;
199 dlbo->dl_Sub32 = dl;
200 dlbo->dl_Sub64 = dl;
201 dlbo->dl_CmpEQ64_CmpNE64 = dl;
202 dlbo->dl_CmpEQ32_CmpNE32 = dl;
203 dlbo->dl_CmpEQ16_CmpNE16 = dl;
204 dlbo->dl_CmpEQ8_CmpNE8 = dl;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp* dlbo )
209 tl_assert(dlbo->dl_Add32 >= DLcheap && dlbo->dl_Add32 <= DLexpensive);
210 tl_assert(dlbo->dl_Add64 >= DLcheap && dlbo->dl_Add64 <= DLexpensive);
211 tl_assert(dlbo->dl_Sub32 >= DLcheap && dlbo->dl_Sub32 <= DLexpensive);
212 tl_assert(dlbo->dl_Sub64 >= DLcheap && dlbo->dl_Sub64 <= DLexpensive);
213 tl_assert(dlbo->dl_CmpEQ64_CmpNE64 == DLcheap
214 || dlbo->dl_CmpEQ64_CmpNE64 == DLexpensive);
215 tl_assert(dlbo->dl_CmpEQ32_CmpNE32 == DLcheap
216 || dlbo->dl_CmpEQ32_CmpNE32 == DLexpensive);
217 tl_assert(dlbo->dl_CmpEQ16_CmpNE16 == DLcheap
218 || dlbo->dl_CmpEQ16_CmpNE16 == DLexpensive);
219 tl_assert(dlbo->dl_CmpEQ8_CmpNE8 == DLcheap
220 || dlbo->dl_CmpEQ8_CmpNE8 == DLexpensive);
223 static UInt DetailLevelByOp__count ( const DetailLevelByOp* dlbo,
224 DetailLevel dl )
226 UInt n = 0;
227 n += (dlbo->dl_Add32 == dl ? 1 : 0);
228 n += (dlbo->dl_Add64 == dl ? 1 : 0);
229 n += (dlbo->dl_Sub32 == dl ? 1 : 0);
230 n += (dlbo->dl_Sub64 == dl ? 1 : 0);
231 n += (dlbo->dl_CmpEQ64_CmpNE64 == dl ? 1 : 0);
232 n += (dlbo->dl_CmpEQ32_CmpNE32 == dl ? 1 : 0);
233 n += (dlbo->dl_CmpEQ16_CmpNE16 == dl ? 1 : 0);
234 n += (dlbo->dl_CmpEQ8_CmpNE8 == dl ? 1 : 0);
235 return n;
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
242 in MCEnv.sb->tyenv.
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
 249    When .kind is VSh or BSh then the tmp holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
253 typedef
254 enum { Orig=1, VSh=2, BSh=3 }
255 TempKind;
257 typedef
258 struct {
259 TempKind kind;
260 IRTemp shadowV;
261 IRTemp shadowB;
263 TempMapEnt;
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 | we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
282 /* DECLARED ABOVE:
283 typedef
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
285 HowUsed;
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed) == 1);
292 /* Carries around state during memcheck instrumentation. */
293 typedef
294 struct _MCEnv {
295 /* MODIFIED: the superblock being constructed. IRStmts are
296 added. */
297 IRSB* sb;
298 Bool trace;
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray* /* of TempMapEnt */ tmpMap;
316 /* READONLY: contains details of which ops should be expensively
317 instrumented. */
318 DetailLevelByOp dlbo;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
323 HowUsed* tmpHowUsed;
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout* layout;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
332 IRType hWordTy;
334 MCEnv;
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
 341    initial IR optimisation, and we do not want to waste space in tables
342 tracking them.
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
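// Illustrative sketch of the remapping (hypothetical tmp numbers): suppose
// original tmp t5 is currently shadowed by t17.  After emitting a
// definedness test on t17 we want t5's shadow to become 'defined', but SSA
// forbids assigning to t17 again.  So a fresh shadow tmp, say t23, is
// allocated, the mapping is updated to t5 -> t23, and code is emitted to
// write the 'defined' constant into t23.  Later reads of t5's shadow see t23.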
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
367 Word newIx;
368 TempMapEnt ent;
369 IRTemp tmp = newIRTemp(mce->sb->tyenv, ty);
370 ent.kind = kind;
371 ent.shadowV = IRTemp_INVALID;
372 ent.shadowB = IRTemp_INVALID;
373 newIx = VG_(addToXA)( mce->tmpMap, &ent );
374 tl_assert(newIx == (Word)tmp);
375 return tmp;
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
383 TempMapEnt* ent;
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
385 here. */
386 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
387 tl_assert(ent->kind == Orig);
388 if (ent->shadowV == IRTemp_INVALID) {
389 IRTemp tmpV
390 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
394 tl_assert(ent->kind == Orig);
395 tl_assert(ent->shadowV == IRTemp_INVALID);
396 ent->shadowV = tmpV;
398 return ent->shadowV;
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
410 regardless. */
411 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
413 TempMapEnt* ent;
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
415 here. */
416 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
417 tl_assert(ent->kind == Orig);
418 if (1) {
419 IRTemp tmpV
420 = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
424 tl_assert(ent->kind == Orig);
425 ent->shadowV = tmpV;
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
439 temporary. */
441 typedef IRExpr IRAtom;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
447 if (a1->tag == Iex_Const)
448 return True;
449 if (a1->tag == Iex_RdTmp) {
450 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
451 return ent->kind == Orig;
453 return False;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
460 if (a1->tag == Iex_Const)
461 return True;
462 if (a1->tag == Iex_RdTmp) {
463 TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
464 return ent->kind == VSh || ent->kind == BSh;
466 return False;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
473 if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
474 return True;
475 if (a1->tag == Iex_Const && a2->tag == Iex_Const)
476 return True;
477 return False;
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType shadowTypeV ( IRType ty )
492 switch (ty) {
493 case Ity_I1:
494 case Ity_I8:
495 case Ity_I16:
496 case Ity_I32:
497 case Ity_I64:
498 case Ity_I128: return ty;
499 case Ity_F16: return Ity_I16;
500 case Ity_F32: return Ity_I32;
501 case Ity_D32: return Ity_I32;
502 case Ity_F64: return Ity_I64;
503 case Ity_D64: return Ity_I64;
504 case Ity_F128: return Ity_I128;
505 case Ity_D128: return Ity_I128;
506 case Ity_V128: return Ity_V128;
507 case Ity_V256: return Ity_V256;
508 default: ppIRType(ty);
509 VG_(tool_panic)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
 514    supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256). */
515 static IRExpr* definedOfType ( IRType ty ) {
516 switch (ty) {
517 case Ity_I1: return IRExpr_Const(IRConst_U1(False));
518 case Ity_I8: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128: return i128_const_zero();
523 case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
536 if (mce->trace) {
537 VG_(printf)(" %c: ", cat);
538 ppIRStmt(st);
539 VG_(printf)("\n");
541 addStmtToIRSB(mce->sb, st);
544 /* assign value to tmp */
545 static inline
546 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
547 stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
565 an atom.
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
573 TempKind k;
574 IRTemp t;
575 IRType tyE = typeOfIRExpr(mce->sb->tyenv, e);
577 tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
578 switch (cat) {
579 case 'V': k = VSh; break;
580 case 'B': k = BSh; break;
581 case 'C': k = Orig; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t = newTemp(mce, ty, k);
587 assign(cat, mce, t, e);
588 return mkexpr(t);
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr *i128_const_zero(void)
598 IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128, z64, z64);
602 /* There are no I128-bit loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
604 expr2vbits_Load */
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
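// DifD uses AND on V bits: 0 means defined, so a result bit is defined if
// it is defined in either argument.  For example (8-bit case):
// DifD8(0b00001111, 0b11110000) = 0b00000000, i.e. fully defined, since
// every bit position is defined in at least one of the two arguments.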
613 static IRAtom* mkDifD1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
614 tl_assert(isShadowAtom(mce,a1));
615 tl_assert(isShadowAtom(mce,a2));
616 return assignNew('V', mce, Ity_I1, binop(Iop_And1, a1, a2));
619 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
620 tl_assert(isShadowAtom(mce,a1));
621 tl_assert(isShadowAtom(mce,a2));
622 return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
625 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
626 tl_assert(isShadowAtom(mce,a1));
627 tl_assert(isShadowAtom(mce,a2));
628 return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
631 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
632 tl_assert(isShadowAtom(mce,a1));
633 tl_assert(isShadowAtom(mce,a2));
634 return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
637 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
638 tl_assert(isShadowAtom(mce,a1));
639 tl_assert(isShadowAtom(mce,a2));
640 return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
643 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
644 tl_assert(isShadowAtom(mce,a1));
645 tl_assert(isShadowAtom(mce,a2));
646 return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
649 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
650 tl_assert(isShadowAtom(mce,a1));
651 tl_assert(isShadowAtom(mce,a2));
652 return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
655 /* --------- Undefined-if-either-undefined --------- */
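// UifU uses OR on V bits: 1 means undefined, so a result bit is undefined
// if it is undefined in either argument.  For example (8-bit case):
// UifU8(0b00000011, 0b10000000) = 0b10000011.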
657 static IRAtom* mkUifU1 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
658 tl_assert(isShadowAtom(mce,a1));
659 tl_assert(isShadowAtom(mce,a2));
660 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, a1, a2));
663 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
664 tl_assert(isShadowAtom(mce,a1));
665 tl_assert(isShadowAtom(mce,a2));
666 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
669 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
670 tl_assert(isShadowAtom(mce,a1));
671 tl_assert(isShadowAtom(mce,a2));
672 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
675 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
676 tl_assert(isShadowAtom(mce,a1));
677 tl_assert(isShadowAtom(mce,a2));
678 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
681 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
682 tl_assert(isShadowAtom(mce,a1));
683 tl_assert(isShadowAtom(mce,a2));
684 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
687 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
688 IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
689 tl_assert(isShadowAtom(mce,a1));
690 tl_assert(isShadowAtom(mce,a2));
691 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
692 tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
693 tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
694 tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
695 tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
696 tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
698 return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
701 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
702 tl_assert(isShadowAtom(mce,a1));
703 tl_assert(isShadowAtom(mce,a2));
704 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
707 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
708 tl_assert(isShadowAtom(mce,a1));
709 tl_assert(isShadowAtom(mce,a2));
710 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
713 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
714 switch (vty) {
715 case Ity_I8: return mkUifU8(mce, a1, a2);
716 case Ity_I16: return mkUifU16(mce, a1, a2);
717 case Ity_I32: return mkUifU32(mce, a1, a2);
718 case Ity_I64: return mkUifU64(mce, a1, a2);
719 case Ity_I128: return mkUifU128(mce, a1, a2);
720 case Ity_V128: return mkUifUV128(mce, a1, a2);
721 case Ity_V256: return mkUifUV256(mce, a1, a2);
722 default:
723 VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
724 VG_(tool_panic)("memcheck:mkUifU");
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
731 tl_assert(isShadowAtom(mce,a1));
732 return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
735 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
736 tl_assert(isShadowAtom(mce,a1));
737 return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
740 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
741 tl_assert(isShadowAtom(mce,a1));
742 return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
745 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
746 tl_assert(isShadowAtom(mce,a1));
747 return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
750 /* --------- The Right-family of operations. --------- */
 752 /* Unfortunately these are a lot more expensive than their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
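// Worked example (32-bit case): mkRight32 ORs the value with itself
// shifted right by 1, 2, 4, 8 and 16, so each 1 (undefined) bit is
// smeared into every less-significant position; an input of 0x00100000
// therefore becomes 0x001FFFFF.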
756 static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 )
758 for (Int i = 1; i <= 16; i *= 2) {
759 // a1 |= (a1 >>u i)
760 IRAtom* tmp
761 = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i)));
762 a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp));
764 return a1;
767 static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 )
769 for (Int i = 1; i <= 32; i *= 2) {
770 // a1 |= (a1 >>u i)
771 IRAtom* tmp
772 = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i)));
773 a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp));
775 return a1;
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
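// Worked example (8 bits, illustrative values): for data = 0b01010000
// with vbits = 0b00001111 (low nibble undefined), ImproveAND gives
// data | vbits = 0b01011111.  The 0 (improved) positions are exactly
// those where data is a *defined* 0 -- there the AND result is certainly
// 0, hence defined, whatever the other operand is.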
783 static IRAtom* mkImproveAND1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
785 tl_assert(isOriginalAtom(mce, data));
786 tl_assert(isShadowAtom(mce, vbits));
787 tl_assert(sameKindedAtoms(data, vbits));
788 return assignNew('V', mce, Ity_I1, binop(Iop_Or1, data, vbits));
791 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
793 tl_assert(isOriginalAtom(mce, data));
794 tl_assert(isShadowAtom(mce, vbits));
795 tl_assert(sameKindedAtoms(data, vbits));
796 return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
799 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
801 tl_assert(isOriginalAtom(mce, data));
802 tl_assert(isShadowAtom(mce, vbits));
803 tl_assert(sameKindedAtoms(data, vbits));
804 return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
807 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
809 tl_assert(isOriginalAtom(mce, data));
810 tl_assert(isShadowAtom(mce, vbits));
811 tl_assert(sameKindedAtoms(data, vbits));
812 return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
815 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
817 tl_assert(isOriginalAtom(mce, data));
818 tl_assert(isShadowAtom(mce, vbits));
819 tl_assert(sameKindedAtoms(data, vbits));
820 return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
823 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
825 tl_assert(isOriginalAtom(mce, data));
826 tl_assert(isShadowAtom(mce, vbits));
827 tl_assert(sameKindedAtoms(data, vbits));
828 return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
831 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
833 tl_assert(isOriginalAtom(mce, data));
834 tl_assert(isShadowAtom(mce, vbits));
835 tl_assert(sameKindedAtoms(data, vbits));
836 return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
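// Worked example (8 bits, illustrative values): for data = 0b11000000
// with vbits = 0b00001111, ImproveOR gives ~data | vbits = 0b00111111.
// The 0 positions (bits 7 and 6) are where data is a *defined* 1 --
// there the OR result is certainly 1, hence defined, whatever the other
// operand is.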
842 static IRAtom* mkImproveOR1 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
844 tl_assert(isOriginalAtom(mce, data));
845 tl_assert(isShadowAtom(mce, vbits));
846 tl_assert(sameKindedAtoms(data, vbits));
847 return assignNew(
848 'V', mce, Ity_I1,
849 binop(Iop_Or1,
850 assignNew('V', mce, Ity_I1, unop(Iop_Not1, data)),
851 vbits) );
854 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
856 tl_assert(isOriginalAtom(mce, data));
857 tl_assert(isShadowAtom(mce, vbits));
858 tl_assert(sameKindedAtoms(data, vbits));
859 return assignNew(
860 'V', mce, Ity_I8,
861 binop(Iop_Or8,
862 assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
863 vbits) );
866 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
868 tl_assert(isOriginalAtom(mce, data));
869 tl_assert(isShadowAtom(mce, vbits));
870 tl_assert(sameKindedAtoms(data, vbits));
871 return assignNew(
872 'V', mce, Ity_I16,
873 binop(Iop_Or16,
874 assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
875 vbits) );
878 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
880 tl_assert(isOriginalAtom(mce, data));
881 tl_assert(isShadowAtom(mce, vbits));
882 tl_assert(sameKindedAtoms(data, vbits));
883 return assignNew(
884 'V', mce, Ity_I32,
885 binop(Iop_Or32,
886 assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
887 vbits) );
890 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
892 tl_assert(isOriginalAtom(mce, data));
893 tl_assert(isShadowAtom(mce, vbits));
894 tl_assert(sameKindedAtoms(data, vbits));
895 return assignNew(
896 'V', mce, Ity_I64,
897 binop(Iop_Or64,
898 assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
899 vbits) );
902 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
904 tl_assert(isOriginalAtom(mce, data));
905 tl_assert(isShadowAtom(mce, vbits));
906 tl_assert(sameKindedAtoms(data, vbits));
907 return assignNew(
908 'V', mce, Ity_V128,
909 binop(Iop_OrV128,
910 assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
911 vbits) );
914 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
916 tl_assert(isOriginalAtom(mce, data));
917 tl_assert(isShadowAtom(mce, vbits));
918 tl_assert(sameKindedAtoms(data, vbits));
919 return assignNew(
920 'V', mce, Ity_V256,
921 binop(Iop_OrV256,
922 assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
923 vbits) );
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
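// Worked example (I32 -> I32): PCast of 0x00000100 (one undefined bit)
// gives 0xFFFFFFFF; PCast of 0x00000000 (all defined) gives 0x00000000.
// For widening destinations, e.g. I32 -> I64 or I32 -> V128, the value is
// first collapsed as above and then replicated (or sign-extended) to fill
// the wider type.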
932 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
934 IRType src_ty;
935 IRAtom* tmp1;
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce,vbits));
939 src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
941 /* Fast-track some common cases */
942 if (src_ty == Ity_I32 && dst_ty == Ity_I32)
943 return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
945 if (src_ty == Ity_I64 && dst_ty == Ity_I64)
946 return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
948 if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
949 /* PCast the arg, then clone it. */
950 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
951 return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
954 if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
957 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
958 return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
961 if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
964 tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
965 tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
966 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
969 if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
971 the top half. */
972 IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
973 return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
976 if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
980 the lower half. */
981 // Generates vbits[127:64] : vbits[127:64]
982 IRAtom* hi64hi64
983 = assignNew('V', mce, Ity_V128,
984 binop(Iop_InterleaveHI64x2, vbits, vbits));
985 // Generates
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
988 IRAtom* lohi64
989 = mkUifUV128(mce, hi64hi64, vbits);
990 // Generates UifU(vbits[127:64],vbits[63:0])
991 IRAtom* lo64
992 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
993 // Generates
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
996 IRAtom* res
997 = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
998 return res;
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1003 tmp1 = NULL;
1004 switch (src_ty) {
1005 case Ity_I1:
1006 tmp1 = vbits;
1007 break;
1008 case Ity_I8:
1009 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
1010 break;
1011 case Ity_I16:
1012 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
1013 break;
1014 case Ity_I32:
1015 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
1016 break;
1017 case Ity_I64:
1018 tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
1019 break;
1020 case Ity_I128: {
1021 /* Gah. Chop it in half, OR the halves together, and compare
1022 that with zero. */
1023 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
1024 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
1025 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1026 tmp1 = assignNew('V', mce, Ity_I1,
1027 unop(Iop_CmpNEZ64, tmp4));
1028 break;
1030 case Ity_V128: {
1031 /* Chop it in half, OR the halves together, and compare that
1032 * with zero.
1034 IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
1035 IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
1036 IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
1037 tmp1 = assignNew('V', mce, Ity_I1,
1038 unop(Iop_CmpNEZ64, tmp4));
1039 break;
1041 default:
1042 ppIRType(src_ty);
1043 VG_(tool_panic)("mkPCastTo(1)");
1045 tl_assert(tmp1);
1046 /* Now widen up to the dst type. */
1047 switch (dst_ty) {
1048 case Ity_I1:
1049 return tmp1;
1050 case Ity_I8:
1051 return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
1052 case Ity_I16:
1053 return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
1054 case Ity_I32:
1055 return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
1056 case Ity_I64:
1057 return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1058 case Ity_V128:
1059 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1060 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
1061 return tmp1;
1062 case Ity_I128:
1063 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1064 tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
1065 return tmp1;
1066 case Ity_V256:
1067 tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
1068 tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
1069 tmp1, tmp1));
1070 tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
1071 tmp1, tmp1));
1072 return tmp1;
1073 default:
1074 ppIRType(dst_ty);
1075 VG_(tool_panic)("mkPCastTo(2)");
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
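// Worked example (I64 case): an input with at least one undefined bit
// produces 0x0000000000000001 (only the lsb marked undefined); an
// all-defined input produces 0 (everything defined).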
1083 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
1085 if (ty == Ity_V128) {
1086 /* --- Case for V128 --- */
1087 IRAtom* varg128 = varg;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1092 IRAtom* d63pc
1093 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
1094 // generates: Def--(64)--Def
1095 IRAtom* d64
1096 = definedOfType(Ity_I64);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1098 IRAtom* res
1099 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
1100 return res;
1102 if (ty == Ity_I64) {
1103 /* --- Case for I64 --- */
1104 // PCast to 64
1105 IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
1106 // Zero (Def) out the top 63 bits
1107 IRAtom* res
1108 = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
1109 return res;
1111 /*NOTREACHED*/
1112 tl_assert(0);
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
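// Worked example (32 bits): for vbits = 0xFFFFFFFF (nothing defined),
// (0xFFFFFFFF - 0x7FFFFFFF) >>s 31  =  0x80000000 >>s 31  =  0xFFFFFFFF.
// For any vbits with at least one 0 (defined) bit, e.g. 0xFFFFFFFE,
// (0xFFFFFFFE - 0x7FFFFFFF) >>s 31  =  0x7FFFFFFF >>s 31  =  0x00000000.
// The point is that vbits - (vbits >>u 1) == ceil(vbits/2), whose top bit
// is set only when vbits is all-ones; the arithmetic shift then smears
// that top bit across the whole word.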
1124 static IRAtom* mkOCastAt( MCEnv* mce, IRType ty, IRAtom* vbits )
1126 IROp opSUB, opSHR, opSAR;
1127 UInt sh;
1129 switch (ty) {
1130 case Ity_I64:
1131 opSUB = Iop_Sub64; opSHR = Iop_Shr64; opSAR = Iop_Sar64; sh = 63;
1132 break;
1133 case Ity_I32:
1134 opSUB = Iop_Sub32; opSHR = Iop_Shr32; opSAR = Iop_Sar32; sh = 31;
1135 break;
1136 case Ity_I16:
1137 opSUB = Iop_Sub16; opSHR = Iop_Shr16; opSAR = Iop_Sar16; sh = 15;
1138 break;
1139 case Ity_I8:
1140 opSUB = Iop_Sub8; opSHR = Iop_Shr8; opSAR = Iop_Sar8; sh = 7;
1141 break;
1142 default:
1143 ppIRType(ty);
1144         VG_(tool_panic)("mkOCastAt");
1147 IRAtom *shr1, *at;
1148 shr1 = assignNew('V', mce,ty, binop(opSHR, vbits, mkU8(1)));
1149 at = assignNew('V', mce,ty, binop(opSUB, vbits, shr1));
1150 at = assignNew('V', mce,ty, binop(opSAR, at, mkU8(sh)));
1151 return at;
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_U1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1165 The result is:
1167 PCastTo<1> (
1168 -- naive version
1169 UifU<sz>(vxx, vyy)
1171 `DifD<sz>`
1173 -- improvement term
1174 OCast<sz>(vec)
1177 where
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx, // 0 iff bit defined
1182 vyy, // 0 iff bit defined
1183 Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1188 1...1.
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194    which you can think of as an "optimistic cast" (OCast), the opposite of
1195    the normal "pessimistic cast" (PCast) family.  An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1209 a final version of:
1211 let naive = UifU<sz>(vxx, vyy)
1212        vec   = Or<sz>(naive, Not<sz>(Xor<sz>(xx, yy)))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1217 in July 2017.
1219 static IRAtom* expensiveCmpEQorNE ( MCEnv* mce,
1220 IRType ty,
1221 IRAtom* vxx, IRAtom* vyy,
1222 IRAtom* xx, IRAtom* yy )
1224 IRAtom *naive, *vec, *improved, *final_cast;
1225 IROp opDIFD, opUIFU, opOR, opXOR, opNOT;
1227 tl_assert(isShadowAtom(mce,vxx));
1228 tl_assert(isShadowAtom(mce,vyy));
1229 tl_assert(isOriginalAtom(mce,xx));
1230 tl_assert(isOriginalAtom(mce,yy));
1231 tl_assert(sameKindedAtoms(vxx,xx));
1232 tl_assert(sameKindedAtoms(vyy,yy));
1234 switch (ty) {
1235 case Ity_I8:
1236 opDIFD = Iop_And8;
1237 opUIFU = Iop_Or8;
1238 opOR = Iop_Or8;
1239 opXOR = Iop_Xor8;
1240 opNOT = Iop_Not8;
1241 break;
1242 case Ity_I16:
1243 opDIFD = Iop_And16;
1244 opUIFU = Iop_Or16;
1245 opOR = Iop_Or16;
1246 opXOR = Iop_Xor16;
1247 opNOT = Iop_Not16;
1248 break;
1249 case Ity_I32:
1250 opDIFD = Iop_And32;
1251 opUIFU = Iop_Or32;
1252 opOR = Iop_Or32;
1253 opXOR = Iop_Xor32;
1254 opNOT = Iop_Not32;
1255 break;
1256 case Ity_I64:
1257 opDIFD = Iop_And64;
1258 opUIFU = Iop_Or64;
1259 opOR = Iop_Or64;
1260 opXOR = Iop_Xor64;
1261 opNOT = Iop_Not64;
1262 break;
1263 default:
1264 VG_(tool_panic)("expensiveCmpEQorNE");
1267 naive
1268 = assignNew('V', mce, ty, binop(opUIFU, vxx, vyy));
1270 vec
1271 = assignNew(
1272 'V', mce,ty,
1273 binop( opOR,
1274 naive,
1275 assignNew(
1276 'V', mce,ty,
1277 unop(opNOT,
1278 assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
1280 improved
1281 = assignNew( 'V', mce,ty,
1282 binop(opDIFD, naive, mkOCastAt(mce, ty, vec)));
1284 final_cast
1285 = mkPCastTo( mce, Ity_I1, improved );
1287 return final_cast;
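// Worked example (8 bits, illustrative values): xx = 0b00000010,
// yy = 0b00000001, vxx = vyy = 0b11110000 (top nibbles undefined).
// naive = 0b11110000, Not(Xor(xx,yy)) = 0b11111100, so vec = 0b11111100,
// which is not all-ones, hence OCast(vec) = 0 and
// improved = DifD(naive, 0) = 0: the comparison result is defined,
// because the operands already differ in a bit that is defined in both,
// whereas the naive UifU-only scheme would have reported it undefined.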
1291 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1293 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1295 CmpORD32S(x,y) = 1<<3 if x <s y
1296 = 1<<2 if x >s y
1297 = 1<<1 if x == y
1299 and similarly the unsigned variant. The default interpretation is:
1301 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1302 & (7<<1)
1304 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1305 are zero and therefore defined (viz, zero).
1307 Also deal with a special case better:
1309 CmpORD32S(x,0)
1311 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1312 will be defined even if the rest of x isn't. In which case we do:
1314 CmpORD32S#(x,x#,0,{impliedly 0}#)
1315 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1316 | (x# >>u 31) << 3 -- LT# = x#[31]
1318 Analogous handling for CmpORD64{S,U}.
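// Sketch of the default case (32 bits): the shadow result is
//    PCast32(UifU32(x#, y#)) & 0x0000000E
// i.e. if any input bit is undefined then bits 3:1 of the result are all
// undefined, while the remaining bits, which the operation always sets to
// zero, stay defined.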
1320 static Bool isZeroU32 ( IRAtom* e )
1322 return
1323 toBool( e->tag == Iex_Const
1324 && e->Iex.Const.con->tag == Ico_U32
1325 && e->Iex.Const.con->Ico.U32 == 0 );
1328 static Bool isZeroU64 ( IRAtom* e )
1330 return
1331 toBool( e->tag == Iex_Const
1332 && e->Iex.Const.con->tag == Ico_U64
1333 && e->Iex.Const.con->Ico.U64 == 0 );
1336 static IRAtom* doCmpORD ( MCEnv* mce,
1337 IROp cmp_op,
1338 IRAtom* xxhash, IRAtom* yyhash,
1339 IRAtom* xx, IRAtom* yy )
1341 Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
1342 Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
1343 IROp opOR = m64 ? Iop_Or64 : Iop_Or32;
1344 IROp opAND = m64 ? Iop_And64 : Iop_And32;
1345 IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32;
1346 IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32;
1347 IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32;
1348 IRType ty = m64 ? Ity_I64 : Ity_I32;
1349 Int width = m64 ? 64 : 32;
1351 Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
1353 tl_assert(isShadowAtom(mce,xxhash));
1354 tl_assert(isShadowAtom(mce,yyhash));
1355 tl_assert(isOriginalAtom(mce,xx));
1356 tl_assert(isOriginalAtom(mce,yy));
1357 tl_assert(sameKindedAtoms(xxhash,xx));
1358 tl_assert(sameKindedAtoms(yyhash,yy));
1359 tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
1360 || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
1362 if (0) {
1363 ppIROp(cmp_op); VG_(printf)(" ");
1364 ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
1367 if (syned && isZero(yy)) {
1368 /* fancy interpretation */
1369 /* if yy is zero, then it must be fully defined (zero#). */
1370 tl_assert(isZero(yyhash));
1371 // This is still inaccurate, but I don't think it matters, since
1372 // nobody writes code of the form
1373 // "is <partially-undefined-value> signedly greater than zero?".
1374 // We therefore simply declare "x >s 0" to be undefined if any bit in
1375 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1376 // the highest order bit is a defined 1 then x is negative so it
1377 // doesn't matter whether the remaining bits are defined or not.
1378 IRAtom* t_0_gt_0_0
1379 = assignNew(
1380 'V', mce,ty,
1381 binop(
1382 opAND,
1383 mkPCastTo(mce,ty, xxhash),
1384 m64 ? mkU64(1<<2) : mkU32(1<<2)
1386 // For "x <s 0", we can just copy the definedness of the top bit of x
1387 // and we have a precise result.
1388 IRAtom* t_lt_0_0_0
1389 = assignNew(
1390 'V', mce,ty,
1391 binop(
1392 opSHL,
1393 assignNew(
1394 'V', mce,ty,
1395 binop(opSHR, xxhash, mkU8(width-1))),
1396 mkU8(3)
1398 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1399 IRAtom* t_0_0_eq_0
1400 = assignNew(
1401 'V', mce,ty,
1402 binop(
1403 opSHL,
1404 assignNew('V', mce,ty,
1405 unop(
1406 op1UtoWS,
1407 expensiveCmpEQorNE(mce, ty, xxhash, yyhash, xx, yy))
1409 mkU8(1)
1411 return
1412 binop(
1413 opOR,
1414 assignNew('V', mce,ty, binop(opOR, t_lt_0_0_0, t_0_gt_0_0)),
1415 t_0_0_eq_0
1417 } else {
1418 /* standard interpretation */
1419 IRAtom* sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
1420 return
1421 binop(
1422 opAND,
1423 mkPCastTo( mce,ty,
1424 mkUifU(mce,ty, xxhash,yyhash)),
1425 sevenLeft1
1431 /*------------------------------------------------------------*/
1432 /*--- Emit a test and complaint if something is undefined. ---*/
1433 /*------------------------------------------------------------*/
1435 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
1438 /* Set the annotations on a dirty helper to indicate that the stack
1439 pointer and instruction pointers might be read. This is the
1440 behaviour of all 'emit-a-complaint' style functions we might
1441 call. */
1443 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
1444 di->nFxState = 2;
1445 di->fxState[0].fx = Ifx_Read;
1446 di->fxState[0].offset = mce->layout->offset_SP;
1447 di->fxState[0].size = mce->layout->sizeof_SP;
1448 di->fxState[0].nRepeats = 0;
1449 di->fxState[0].repeatLen = 0;
1450 di->fxState[1].fx = Ifx_Read;
1451 di->fxState[1].offset = mce->layout->offset_IP;
1452 di->fxState[1].size = mce->layout->sizeof_IP;
1453 di->fxState[1].nRepeats = 0;
1454 di->fxState[1].repeatLen = 0;
1458 /* Check the supplied *original* |atom| for undefinedness, and emit a
1459 complaint if so. Once that happens, mark it as defined. This is
1460 possible because the atom is either a tmp or literal. If it's a
1461 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1462 be defined. In fact as mentioned above, we will have to allocate a
1463 new tmp to carry the new 'defined' shadow value, and update the
1464 original->tmp mapping accordingly; we cannot simply assign a new
1465 value to an existing shadow tmp as this breaks SSAness.
1467 The checks are performed, any resulting complaint emitted, and
1468 |atom|'s shadow temp set to 'defined', ONLY in the case that
1469 |guard| evaluates to True at run-time. If it evaluates to False
1470 then no action is performed. If |guard| is NULL (the usual case)
1471 then it is assumed to be always-true, and hence these actions are
1472 performed unconditionally.
1474 This routine does not generate code to check the definedness of
1475 |guard|. The caller is assumed to have taken care of that already.
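// Rough sketch of the IR this emits for a 32-bit |atom| without origin
// tracking (helper choice and temps vary with size and clo_mc_level):
//    t_v = <shadow of atom>
//    t_c = CmpNEZ32(t_v)                      -- PCast to Ity_I1
//    DIRTY t_c ::: MC_(helperc_value_check4_fail_no_o)()
//    <atom's shadow remapped to a fresh tmp holding the 'defined' value>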
1477 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
1479 IRAtom* vatom;
1480 IRType ty;
1481 Int sz;
1482 IRDirty* di;
1483 IRAtom* cond;
1484 IRAtom* origin;
1485 void* fn;
1486 const HChar* nm;
1487 IRExpr** args;
1488 Int nargs;
1490 // Don't do V bit tests if we're not reporting undefined value errors.
1491 if (MC_(clo_mc_level) == 1)
1492 return;
1494 if (guard)
1495 tl_assert(isOriginalAtom(mce, guard));
1497 /* Since the original expression is atomic, there's no duplicated
1498 work generated by making multiple V-expressions for it. So we
1499 don't really care about the possibility that someone else may
1500      also create a V-interpretation for it. */
1501 tl_assert(isOriginalAtom(mce, atom));
1502 vatom = expr2vbits( mce, atom, HuOth );
1503 tl_assert(isShadowAtom(mce, vatom));
1504 tl_assert(sameKindedAtoms(atom, vatom));
1506 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1508 /* sz is only used for constructing the error message */
1509 sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
1511 cond = mkPCastTo( mce, Ity_I1, vatom );
1512 /* cond will be 0 if all defined, and 1 if any not defined. */
1514 /* Get the origin info for the value we are about to check. At
1515 least, if we are doing origin tracking. If not, use a dummy
1516 zero origin. */
1517 if (MC_(clo_mc_level) == 3) {
1518 origin = schemeE( mce, atom );
1519 if (mce->hWordTy == Ity_I64) {
1520 origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
1522 } else {
1523 origin = NULL;
1526 fn = NULL;
1527 nm = NULL;
1528 args = NULL;
1529 nargs = -1;
1531 switch (sz) {
1532 case 0:
1533 if (origin) {
1534 fn = &MC_(helperc_value_check0_fail_w_o);
1535 nm = "MC_(helperc_value_check0_fail_w_o)";
1536 args = mkIRExprVec_1(origin);
1537 nargs = 1;
1538 } else {
1539 fn = &MC_(helperc_value_check0_fail_no_o);
1540 nm = "MC_(helperc_value_check0_fail_no_o)";
1541 args = mkIRExprVec_0();
1542 nargs = 0;
1544 break;
1545 case 1:
1546 if (origin) {
1547 fn = &MC_(helperc_value_check1_fail_w_o);
1548 nm = "MC_(helperc_value_check1_fail_w_o)";
1549 args = mkIRExprVec_1(origin);
1550 nargs = 1;
1551 } else {
1552 fn = &MC_(helperc_value_check1_fail_no_o);
1553 nm = "MC_(helperc_value_check1_fail_no_o)";
1554 args = mkIRExprVec_0();
1555 nargs = 0;
1557 break;
1558 case 4:
1559 if (origin) {
1560 fn = &MC_(helperc_value_check4_fail_w_o);
1561 nm = "MC_(helperc_value_check4_fail_w_o)";
1562 args = mkIRExprVec_1(origin);
1563 nargs = 1;
1564 } else {
1565 fn = &MC_(helperc_value_check4_fail_no_o);
1566 nm = "MC_(helperc_value_check4_fail_no_o)";
1567 args = mkIRExprVec_0();
1568 nargs = 0;
1570 break;
1571 case 8:
1572 if (origin) {
1573 fn = &MC_(helperc_value_check8_fail_w_o);
1574 nm = "MC_(helperc_value_check8_fail_w_o)";
1575 args = mkIRExprVec_1(origin);
1576 nargs = 1;
1577 } else {
1578 fn = &MC_(helperc_value_check8_fail_no_o);
1579 nm = "MC_(helperc_value_check8_fail_no_o)";
1580 args = mkIRExprVec_0();
1581 nargs = 0;
1583 break;
1584 case 2:
1585 case 16:
1586 if (origin) {
1587 fn = &MC_(helperc_value_checkN_fail_w_o);
1588 nm = "MC_(helperc_value_checkN_fail_w_o)";
1589 args = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
1590 nargs = 2;
1591 } else {
1592 fn = &MC_(helperc_value_checkN_fail_no_o);
1593 nm = "MC_(helperc_value_checkN_fail_no_o)";
1594 args = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
1595 nargs = 1;
1597 break;
1598 default:
1599 VG_(tool_panic)("unexpected szB");
1602 tl_assert(fn);
1603 tl_assert(nm);
1604 tl_assert(args);
1605 tl_assert(nargs >= 0 && nargs <= 2);
1606 tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
1607 || (MC_(clo_mc_level) == 2 && origin == NULL) );
1609 di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
1610 VG_(fnptr_to_fnentry)( fn ), args );
1611 di->guard = cond; // and cond is PCast-to-1(atom#)
1613 /* If the complaint is to be issued under a guard condition, AND
1614 that into the guard condition for the helper call. */
1615 if (guard) {
1616 IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
1617 IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
1618 IRAtom *e = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
1619 di->guard = assignNew('V', mce, Ity_I1, unop(Iop_32to1, e));
1622 setHelperAnns( mce, di );
1623 stmt( 'V', mce, IRStmt_Dirty(di));
1625 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1626 defined -- but only in the case where the guard evaluates to
1627 True at run-time. Do the update by setting the orig->shadow
1628 mapping for tmp to reflect the fact that this shadow is getting
1629 a new value. */
1630 tl_assert(isIRAtom(vatom));
1631 /* sameKindedAtoms ... */
1632 if (vatom->tag == Iex_RdTmp) {
1633 tl_assert(atom->tag == Iex_RdTmp);
1634 if (guard == NULL) {
1635 // guard is 'always True', hence update unconditionally
1636 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1637 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
1638 definedOfType(ty));
1639 } else {
1640 // update the temp only conditionally. Do this by copying
1641 // its old value when the guard is False.
1642 // The old value ..
1643 IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1644 newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
1645 IRAtom* new_tmpV
1646 = assignNew('V', mce, shadowTypeV(ty),
1647 IRExpr_ITE(guard, definedOfType(ty),
1648 mkexpr(old_tmpV)));
1649 assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
1655 /*------------------------------------------------------------*/
1656 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1657 /*------------------------------------------------------------*/
1659 /* Examine the always-defined sections declared in layout to see if
1660    the (offset,size) section is within one.  Note, it is an error to
1661 partially fall into such a region: (offset,size) should either be
1662 completely in such a region or completely not-in such a region.
1664 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
1666 Int minoffD, maxoffD, i;
1667 Int minoff = offset;
1668 Int maxoff = minoff + size - 1;
1669 tl_assert((minoff & ~0xFFFF) == 0);
1670 tl_assert((maxoff & ~0xFFFF) == 0);
1672 for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
1673 minoffD = mce->layout->alwaysDefd[i].offset;
1674 maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
1675 tl_assert((minoffD & ~0xFFFF) == 0);
1676 tl_assert((maxoffD & ~0xFFFF) == 0);
1678 if (maxoff < minoffD || maxoffD < minoff)
1679 continue; /* no overlap */
1680 if (minoff >= minoffD && maxoff <= maxoffD)
1681 return True; /* completely contained in an always-defd section */
1683 VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
1685 return False; /* could not find any containing section */
1689 /* Generate into bb suitable actions to shadow this Put. If the state
1690 slice is marked 'always defined', do nothing. Otherwise, write the
1691 supplied V bits to the shadow state. We can pass in either an
1692 original atom or a V-atom, but not both. In the former case the
1693 relevant V-bits are then generated from the original.
1694 We assume here that the definedness of GUARD has already been checked.
1696 static
1697 void do_shadow_PUT ( MCEnv* mce, Int offset,
1698 IRAtom* atom, IRAtom* vatom, IRExpr *guard )
1700 IRType ty;
1702 // Don't do shadow PUTs if we're not doing undefined value checking.
1703 // Their absence lets Vex's optimiser remove all the shadow computation
1704 // that they depend on, which includes GETs of the shadow registers.
1705 if (MC_(clo_mc_level) == 1)
1706 return;
1708 if (atom) {
1709 tl_assert(!vatom);
1710 tl_assert(isOriginalAtom(mce, atom));
1711 vatom = expr2vbits( mce, atom, HuOth );
1712 } else {
1713 tl_assert(vatom);
1714 tl_assert(isShadowAtom(mce, vatom));
1717 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
1718 tl_assert(ty != Ity_I1);
1719 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1720 /* later: no ... */
1721 /* emit code to emit a complaint if any of the vbits are 1. */
1722 /* complainIfUndefined(mce, atom); */
1723 } else {
1724 /* Do a plain shadow Put. */
1725 if (guard) {
1726 /* If the guard expression evaluates to false we simply Put the value
1727 that is already stored in the guest state slot */
1728 IRAtom *cond, *iffalse;
1730 cond = assignNew('V', mce, Ity_I1, guard);
1731 iffalse = assignNew('V', mce, ty,
1732 IRExpr_Get(offset + mce->layout->total_sizeB, ty));
1733 vatom = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
1735 stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
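/* Purely illustrative sketch (not tool code, and not compiled) of the
   guarded Put above: when the guard is false the slot is re-written
   with the V bits it already holds, so the shadow state is unchanged,
   which is what the IRExpr_ITE(cond, vatom, iffalse) achieves.  The
   names below (model_shadow_slot etc.) are hypothetical. */
#if 0
#include <stdint.h>
static uint32_t model_shadow_slot[64];      // models the shadow guest state
static void model_shadow_put32 ( int slot, uint32_t vnew, int guard )
{
   model_shadow_slot[slot] = guard ? vnew : model_shadow_slot[slot];
}
#endif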
1740 /* Generate into bb suitable actions to shadow this PutI. If the state
1741 slice is always defined, do nothing; otherwise write the V bits to it.
1743 static
1744 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
1746 IRAtom* vatom;
1747 IRType ty, tyS;
1748 Int arrSize;
1749 IRRegArray* descr = puti->descr;
1750 IRAtom* ix = puti->ix;
1751 Int bias = puti->bias;
1752 IRAtom* atom = puti->data;
1754 // Don't do shadow PUTIs if we're not doing undefined value checking.
1755 // Their absence lets Vex's optimiser remove all the shadow computation
1756 // that they depend on, which includes GETIs of the shadow registers.
1757 if (MC_(clo_mc_level) == 1)
1758 return;
1760 tl_assert(isOriginalAtom(mce,atom));
1761 vatom = expr2vbits( mce, atom, HuOth );
1762 tl_assert(sameKindedAtoms(atom, vatom));
1763 ty = descr->elemTy;
1764 tyS = shadowTypeV(ty);
1765 arrSize = descr->nElems * sizeofIRType(ty);
1766 tl_assert(ty != Ity_I1);
1767 tl_assert(isOriginalAtom(mce,ix));
1768 complainIfUndefined(mce, ix, NULL);
1769 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1770 /* later: no ... */
1771 /* emit code to emit a complaint if any of the vbits are 1. */
1772 /* complainIfUndefined(mce, atom); */
1773 } else {
1774 /* Do a cloned version of the Put that refers to the shadow
1775 area. */
1776 IRRegArray* new_descr
1777 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1778 tyS, descr->nElems);
1779 stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
1784 /* Return an expression which contains the V bits corresponding to the
1785 given GET (passed in in pieces).
1787 static
1788 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
1790 IRType tyS = shadowTypeV(ty);
1791 tl_assert(ty != Ity_I1);
1792 tl_assert(ty != Ity_I128);
1793 if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
1794 /* Always defined, return all zeroes of the relevant type */
1795 return definedOfType(tyS);
1796 } else {
1797 /* return a cloned version of the Get that refers to the shadow
1798 area. */
1799 /* FIXME: this isn't an atom! */
1800 return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
1805 /* Return an expression which contains the V bits corresponding to the
1806 given GETI (passed in in pieces).
1808 static
1809 IRExpr* shadow_GETI ( MCEnv* mce,
1810 IRRegArray* descr, IRAtom* ix, Int bias )
1812 IRType ty = descr->elemTy;
1813 IRType tyS = shadowTypeV(ty);
1814 Int arrSize = descr->nElems * sizeofIRType(ty);
1815 tl_assert(ty != Ity_I1);
1816 tl_assert(isOriginalAtom(mce,ix));
1817 complainIfUndefined(mce, ix, NULL);
1818 if (isAlwaysDefd(mce, descr->base, arrSize)) {
1819 /* Always defined, return all zeroes of the relevant type */
1820 return definedOfType(tyS);
1821 } else {
1822 /* return a cloned version of the Get that refers to the shadow
1823 area. */
1824 IRRegArray* new_descr
1825 = mkIRRegArray( descr->base + mce->layout->total_sizeB,
1826 tyS, descr->nElems);
1827 return IRExpr_GetI( new_descr, ix, bias );
1832 /*------------------------------------------------------------*/
1833 /*--- Generating approximations for unknown operations, ---*/
1834 /*--- using lazy-propagate semantics ---*/
1835 /*------------------------------------------------------------*/
1837 /* Lazy propagation of undefinedness from two values, resulting in the
1838 specified shadow type.
1840 static
1841 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
1843 IRAtom* at;
1844 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1845 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1846 tl_assert(isShadowAtom(mce,va1));
1847 tl_assert(isShadowAtom(mce,va2));
1849 /* The general case is inefficient because PCast is an expensive
1850 operation. Here are some special cases which use PCast only
1851 once rather than twice. */
1853 /* I64 x I64 -> I64 */
1854 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
1855 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
1856 at = mkUifU(mce, Ity_I64, va1, va2);
1857 at = mkPCastTo(mce, Ity_I64, at);
1858 return at;
1861 /* I64 x I64 -> I32 */
1862 if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
1863 if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
1864 at = mkUifU(mce, Ity_I64, va1, va2);
1865 at = mkPCastTo(mce, Ity_I32, at);
1866 return at;
1869 /* I32 x I32 -> I32 */
1870 if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
1871 if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
1872 at = mkUifU(mce, Ity_I32, va1, va2);
1873 at = mkPCastTo(mce, Ity_I32, at);
1874 return at;
1877 if (0) {
1878 VG_(printf)("mkLazy2 ");
1879 ppIRType(t1);
1880 VG_(printf)("_");
1881 ppIRType(t2);
1882 VG_(printf)("_");
1883 ppIRType(finalVty);
1884 VG_(printf)("\n");
1887 /* General case: force everything via 32-bit intermediaries. */
1888 at = mkPCastTo(mce, Ity_I32, va1);
1889 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
1890 at = mkPCastTo(mce, finalVty, at);
1891 return at;
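/* Purely illustrative sketch (not tool code, and not compiled): the
   V-bit algebra used above, modelled with plain integers in which a
   set bit means "undefined".  UifU is bitwise OR and PCast smears
   "any bit undefined" across the whole destination.  The model_*
   names are hypothetical. */
#if 0
#include <stdint.h>
static uint64_t model_UifU64 ( uint64_t va, uint64_t vb )
{
   return va | vb;                          // undefined if either is
}
static uint32_t model_PCast64to32 ( uint64_t v )
{
   return v ? 0xFFFFFFFFu : 0;              // all-undefined or all-defined
}
// The I64 x I64 -> I32 special case above: one UifU, then a single PCast.
static uint32_t model_mkLazy2_I64_I64_to_I32 ( uint64_t va1, uint64_t va2 )
{
   return model_PCast64to32( model_UifU64(va1, va2) );
}
#endif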
1895 /* 3-arg version of the above. */
1896 static
1897 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
1898 IRAtom* va1, IRAtom* va2, IRAtom* va3 )
1900 IRAtom* at;
1901 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
1902 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
1903 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
1904 tl_assert(isShadowAtom(mce,va1));
1905 tl_assert(isShadowAtom(mce,va2));
1906 tl_assert(isShadowAtom(mce,va3));
1908 /* The general case is inefficient because PCast is an expensive
1909 operation. Here are some special cases which use PCast only
1910 twice rather than three times. */
1912 /* I32 x I64 x I64 -> I64 */
1913 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1914 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1915 && finalVty == Ity_I64) {
1916 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
1917 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
1918 mode indication which is fully defined, this should get
1919 folded out later. */
1920 at = mkPCastTo(mce, Ity_I64, va1);
1921 /* Now fold in 2nd and 3rd args. */
1922 at = mkUifU(mce, Ity_I64, at, va2);
1923 at = mkUifU(mce, Ity_I64, at, va3);
1924 /* and PCast once again. */
1925 at = mkPCastTo(mce, Ity_I64, at);
1926 return at;
1929 /* I32 x I8 x I64 -> I64 */
1930 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
1931 && finalVty == Ity_I64) {
1932 if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
1933 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
1934 * rounding mode indication which is fully defined, this should
1935 * get folded out later.
1937 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
1938 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
1939 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
1940 at = mkUifU(mce, Ity_I64, at, va3);
1941 /* and PCast once again. */
1942 at = mkPCastTo(mce, Ity_I64, at);
1943 return at;
1946 /* I32 x I64 x I64 -> I32 */
1947 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
1948 && finalVty == Ity_I32) {
1949 if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
1950 at = mkPCastTo(mce, Ity_I64, va1);
1951 at = mkUifU(mce, Ity_I64, at, va2);
1952 at = mkUifU(mce, Ity_I64, at, va3);
1953 at = mkPCastTo(mce, Ity_I32, at);
1954 return at;
1957 /* I32 x I32 x I32 -> I32 */
1958 /* 32-bit FP idiom, as (eg) happens on ARM */
1959 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
1960 && finalVty == Ity_I32) {
1961 if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
1962 at = va1;
1963 at = mkUifU(mce, Ity_I32, at, va2);
1964 at = mkUifU(mce, Ity_I32, at, va3);
1965 at = mkPCastTo(mce, Ity_I32, at);
1966 return at;
1969 /* I32 x I16 x I16 -> I16 */
1970 /* 16-bit half-precision FP idiom, as (eg) happens on arm64 v8.2 onwards */
1971 if (t1 == Ity_I32 && t2 == Ity_I16 && t3 == Ity_I16
1972 && finalVty == Ity_I16) {
1973 if (0) VG_(printf)("mkLazy3: I32 x I16 x I16 -> I16\n");
1974 at = mkPCastTo(mce, Ity_I16, va1);
1975 at = mkUifU(mce, Ity_I16, at, va2);
1976 at = mkUifU(mce, Ity_I16, at, va3);
1977 at = mkPCastTo(mce, Ity_I16, at);
1978 return at;
1981 /* I32 x I128 x I128 -> I128 */
1982 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1983 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
1984 && finalVty == Ity_I128) {
1985 if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
1986 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
1987 mode indication which is fully defined, this should get
1988 folded out later. */
1989 at = mkPCastTo(mce, Ity_I128, va1);
1990 /* Now fold in 2nd and 3rd args. */
1991 at = mkUifU(mce, Ity_I128, at, va2);
1992 at = mkUifU(mce, Ity_I128, at, va3);
1993 /* and PCast once again. */
1994 at = mkPCastTo(mce, Ity_I128, at);
1995 return at;
1998 /* I32 x I8 x I128 -> I128 */
1999 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
2000 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
2001 && finalVty == Ity_I128) {
2002 if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
2003 /* Use I64 as an intermediate type, which means PCasting all 3
2004 args to I64 to start with. 1st arg is typically a rounding
2005 mode indication which is fully defined, so we hope that it
2006 will get folded out later. */
2007 IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
2008 IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
2009 IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
2010 /* Now UifU all three together. */
2011 at = mkUifU(mce, Ity_I64, at1, at2); // UifU(PCast(va1), PCast(va2))
2012 at = mkUifU(mce, Ity_I64, at, at3); // ... `UifU` PCast(va3)
2013 /* and PCast once again. */
2014 at = mkPCastTo(mce, Ity_I128, at);
2015 return at;
2017 if (1) {
2018 VG_(printf)("mkLazy3: ");
2019 ppIRType(t1);
2020 VG_(printf)(" x ");
2021 ppIRType(t2);
2022 VG_(printf)(" x ");
2023 ppIRType(t3);
2024 VG_(printf)(" -> ");
2025 ppIRType(finalVty);
2026 VG_(printf)("\n");
2029 tl_assert(0);
2030 /* General case: force everything via 32-bit intermediaries. */
2032 at = mkPCastTo(mce, Ity_I32, va1);
2033 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2034 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2035 at = mkPCastTo(mce, finalVty, at);
2036 return at;
2041 /* 4-arg version of the above. */
2042 static
2043 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
2044 IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
2046 IRAtom* at;
2047 IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
2048 IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
2049 IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
2050 IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
2051 tl_assert(isShadowAtom(mce,va1));
2052 tl_assert(isShadowAtom(mce,va2));
2053 tl_assert(isShadowAtom(mce,va3));
2054 tl_assert(isShadowAtom(mce,va4));
2056 /* The general case is inefficient because PCast is an expensive
2057 operation. Here are some special cases which use PCast only
2058 twice rather than four times. */
2060 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2062 if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
2063 && finalVty == Ity_I128) {
2064 if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2065 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2066 mode indication which is fully defined, this should get
2067 folded out later. */
2068 at = mkPCastTo(mce, Ity_I128, va1);
2069 /* Now fold in 2nd, 3rd, 4th args. */
2070 at = mkUifU(mce, Ity_I128, at, va2);
2071 at = mkUifU(mce, Ity_I128, at, va3);
2072 at = mkUifU(mce, Ity_I128, at, va4);
2073 /* and PCast once again. */
2074 at = mkPCastTo(mce, Ity_I128, at);
2075 return at;
2078 /* I32 x I64 x I64 x I64 -> I64 */
2079 if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
2080 && finalVty == Ity_I64) {
2081 if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2082 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2083 mode indication which is fully defined, this should get
2084 folded out later. */
2085 at = mkPCastTo(mce, Ity_I64, va1);
2086 /* Now fold in 2nd, 3rd, 4th args. */
2087 at = mkUifU(mce, Ity_I64, at, va2);
2088 at = mkUifU(mce, Ity_I64, at, va3);
2089 at = mkUifU(mce, Ity_I64, at, va4);
2090 /* and PCast once again. */
2091 at = mkPCastTo(mce, Ity_I64, at);
2092 return at;
2094 /* I32 x I32 x I32 x I32 -> I32 */
2095 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2096 if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
2097 && finalVty == Ity_I32) {
2098 if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2099 at = va1;
2100 /* Now fold in 2nd, 3rd, 4th args. */
2101 at = mkUifU(mce, Ity_I32, at, va2);
2102 at = mkUifU(mce, Ity_I32, at, va3);
2103 at = mkUifU(mce, Ity_I32, at, va4);
2104 at = mkPCastTo(mce, Ity_I32, at);
2105 return at;
2108 if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2109 && finalVty == Ity_I32) {
2110 if (0) VG_(printf)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2111 at = mkPCastTo(mce, Ity_I8, va1);
2112 /* Now fold in 2nd, 3rd, 4th args. */
2113 at = mkUifU(mce, Ity_I8, at, va2);
2114 at = mkUifU(mce, Ity_I8, at, va3);
2115 at = mkUifU(mce, Ity_I8, at, va4);
2116 at = mkPCastTo(mce, Ity_I32, at);
2117 return at;
2120 if (t1 == Ity_I64 && t2 == Ity_I8 && t3 == Ity_I8 && t4 == Ity_I8
2121 && finalVty == Ity_I64) {
2122 if (0) VG_(printf)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2123 at = mkPCastTo(mce, Ity_I8, va1);
2124 /* Now fold in 2nd, 3rd, 4th args. */
2125 at = mkUifU(mce, Ity_I8, at, va2);
2126 at = mkUifU(mce, Ity_I8, at, va3);
2127 at = mkUifU(mce, Ity_I8, at, va4);
2128 at = mkPCastTo(mce, Ity_I64, at);
2129 return at;
2132 if (1) {
2133 VG_(printf)("mkLazy4: ");
2134 ppIRType(t1);
2135 VG_(printf)(" x ");
2136 ppIRType(t2);
2137 VG_(printf)(" x ");
2138 ppIRType(t3);
2139 VG_(printf)(" x ");
2140 ppIRType(t4);
2141 VG_(printf)(" -> ");
2142 ppIRType(finalVty);
2143 VG_(printf)("\n");
2146 tl_assert(0);
2150 /* Do the lazy propagation game from a null-terminated vector of
2151 atoms. This is presumably the arguments to a helper call, so the
2152 IRCallee info is also supplied in order that we can know which
2153 arguments should be ignored (via the .mcx_mask field).
2155 static
2156 IRAtom* mkLazyN ( MCEnv* mce,
2157 IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
2159 Int i;
2160 IRAtom* here;
2161 IRAtom* curr;
2162 IRType mergeTy;
2163 Bool mergeTy64 = True;
2165 /* Decide on the type of the merge intermediary. If all relevant
2166 args are I64, then it's I64. In all other circumstances, use
2167 I32. */
2168 for (i = 0; exprvec[i]; i++) {
2169 tl_assert(i < 32);
2170 tl_assert(isOriginalAtom(mce, exprvec[i]));
2171 if (cee->mcx_mask & (1<<i))
2172 continue;
2173 if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
2174 mergeTy64 = False;
2177 mergeTy = mergeTy64 ? Ity_I64 : Ity_I32;
2178 curr = definedOfType(mergeTy);
2180 for (i = 0; exprvec[i]; i++) {
2181 tl_assert(i < 32);
2182 tl_assert(isOriginalAtom(mce, exprvec[i]));
2183 /* Only take notice of this arg if the callee's mc-exclusion
2184 mask does not say it is to be excluded. */
2185 if (cee->mcx_mask & (1<<i)) {
2186 /* the arg is to be excluded from definedness checking. Do
2187 nothing. */
2188 if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
2189 } else {
2190 /* calculate the arg's definedness, and pessimistically merge
2191 it in. */
2192 here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i], HuOth) );
2193 curr = mergeTy64
2194 ? mkUifU64(mce, here, curr)
2195 : mkUifU32(mce, here, curr);
2198 return mkPCastTo(mce, finalVtype, curr );
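/* Purely illustrative sketch (not tool code, and not compiled) of the
   mcx_mask convention used above: bit i of the callee's mcx_mask means
   "do not check argument i".  V bits are modelled as plain words in
   which a set bit means "undefined"; the model_ name is hypothetical. */
#if 0
#include <stdint.h>
static uint64_t model_mkLazyN ( const uint64_t* vargs, int nargs,
                                uint32_t mcx_mask )
{
   uint64_t curr = 0;                       // definedOfType: all defined
   for (int i = 0; i < nargs; i++) {
      if (mcx_mask & (1u << i))
         continue;                          // excluded from checking
      curr |= (vargs[i] ? ~0ULL : 0);       // PCast the arg, then UifU (OR)
   }
   return curr ? ~0ULL : 0;                 // final PCast to the result type
}
#endif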
2202 /*------------------------------------------------------------*/
2203 /*--- Generating expensive sequences for exact carry-chain ---*/
2204 /*--- propagation in add/sub and related operations. ---*/
2205 /*------------------------------------------------------------*/
2207 static
2208 IRAtom* expensiveAddSub ( MCEnv* mce,
2209 Bool add,
2210 IRType ty,
2211 IRAtom* qaa, IRAtom* qbb,
2212 IRAtom* aa, IRAtom* bb )
2214 IRAtom *a_min, *b_min, *a_max, *b_max;
2215 IROp opAND, opOR, opXOR, opNOT, opADD, opSUB;
2217 tl_assert(isShadowAtom(mce,qaa));
2218 tl_assert(isShadowAtom(mce,qbb));
2219 tl_assert(isOriginalAtom(mce,aa));
2220 tl_assert(isOriginalAtom(mce,bb));
2221 tl_assert(sameKindedAtoms(qaa,aa));
2222 tl_assert(sameKindedAtoms(qbb,bb));
2224 switch (ty) {
2225 case Ity_I32:
2226 opAND = Iop_And32;
2227 opOR = Iop_Or32;
2228 opXOR = Iop_Xor32;
2229 opNOT = Iop_Not32;
2230 opADD = Iop_Add32;
2231 opSUB = Iop_Sub32;
2232 break;
2233 case Ity_I64:
2234 opAND = Iop_And64;
2235 opOR = Iop_Or64;
2236 opXOR = Iop_Xor64;
2237 opNOT = Iop_Not64;
2238 opADD = Iop_Add64;
2239 opSUB = Iop_Sub64;
2240 break;
2241 default:
2242 VG_(tool_panic)("expensiveAddSub");
2245 // a_min = aa & ~qaa
2246 a_min = assignNew('V', mce,ty,
2247 binop(opAND, aa,
2248 assignNew('V', mce,ty, unop(opNOT, qaa))));
2250 // b_min = bb & ~qbb
2251 b_min = assignNew('V', mce,ty,
2252 binop(opAND, bb,
2253 assignNew('V', mce,ty, unop(opNOT, qbb))));
2255 // a_max = aa | qaa
2256 a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
2258 // b_max = bb | qbb
2259 b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
2261 if (add) {
2262 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2263 return
2264 assignNew('V', mce,ty,
2265 binop( opOR,
2266 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2267 assignNew('V', mce,ty,
2268 binop( opXOR,
2269 assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
2270 assignNew('V', mce,ty, binop(opADD, a_max, b_max))
2275 } else {
2276 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2277 return
2278 assignNew('V', mce,ty,
2279 binop( opOR,
2280 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
2281 assignNew('V', mce,ty,
2282 binop( opXOR,
2283 assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
2284 assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
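/* Purely illustrative, standalone check (not tool code) of the
   interval trick above for addition, on 32-bit values with a set V bit
   meaning "undefined".  a_min/a_max bracket the values aa could take
   given its undefined bits; result bits where the two extreme sums
   agree cannot depend on the undefined inputs.  Hypothetical names. */
#if 0
#include <stdint.h>
#include <stdio.h>
static uint32_t model_add32_vbits ( uint32_t aa, uint32_t qaa,
                                    uint32_t bb, uint32_t qbb )
{
   uint32_t a_min = aa & ~qaa, b_min = bb & ~qbb;
   uint32_t a_max = aa |  qaa, b_max = bb |  qbb;
   return (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max));
}
int main ( void )
{
   // 0x0F (fully defined) plus a value whose bit 0 is undefined: the
   // possible carry ripples, so bits 0..4 are flagged (prints 0000001f).
   printf("%08x\n", model_add32_vbits(0x0F, 0, 0x00, 0x01));
   return 0;
}
#endif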
2294 static
2295 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
2296 IRAtom* atom, IRAtom* vatom )
2298 IRType ty;
2299 IROp xorOp, subOp, andOp;
2300 IRExpr *one;
2301 IRAtom *improver, *improved;
2302 tl_assert(isShadowAtom(mce,vatom));
2303 tl_assert(isOriginalAtom(mce,atom));
2304 tl_assert(sameKindedAtoms(atom,vatom));
2306 switch (czop) {
2307 case Iop_Ctz32: case Iop_CtzNat32:
2308 ty = Ity_I32;
2309 xorOp = Iop_Xor32;
2310 subOp = Iop_Sub32;
2311 andOp = Iop_And32;
2312 one = mkU32(1);
2313 break;
2314 case Iop_Ctz64: case Iop_CtzNat64:
2315 ty = Ity_I64;
2316 xorOp = Iop_Xor64;
2317 subOp = Iop_Sub64;
2318 andOp = Iop_And64;
2319 one = mkU64(1);
2320 break;
2321 default:
2322 ppIROp(czop);
2323 VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
2326 // improver = atom ^ (atom - 1)
2328 // That is, improver has its low ctz(atom)+1 bits equal to one;
2329 // higher bits (if any) equal to zero. So it's exactly the right
2330 // mask to use to remove the irrelevant undefined input bits.
2331 /* Here are some examples:
2332 atom = U...U 1 0...0
2333 atom-1 = U...U 0 1...1
2334 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2335 actually influence the result
2336 A boundary case
2337 atom = 0...0
2338 atom-1 = 1...1
2339 ^ed = 11111, also a correct mask for the input: all input bits
2340 are relevant
2341 Another boundary case
2342 atom = 1..1 1
2343 atom-1 = 1..1 0
2344 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2345 is relevant
2346 Now with misc U bits interspersed:
2347 atom = U...U 1 0 U...U 0 1 0...0
2348 atom-1 = U...U 1 0 U...U 0 0 1...1
2349 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2350 (Per re-check/analysis of 14 Nov 2018)
2352 improver = assignNew('V', mce,ty,
2353 binop(xorOp,
2354 atom,
2355 assignNew('V', mce, ty,
2356 binop(subOp, atom, one))));
2358 // improved = vatom & improver
2360 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2361 // bits as "defined".
2362 improved = assignNew('V', mce, ty,
2363 binop(andOp, vatom, improver));
2365 // Return pessimizing cast of improved.
2366 return mkPCastTo(mce, ty, improved);
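/* Purely illustrative sketch (not tool code, and not compiled) of the
   ctz improver above, on 32 bits with a set V bit meaning "undefined":
   only the rightmost 1-bit of the input and the zeroes below it can
   influence ctz, so V bits above that point are masked away before the
   final pessimising cast.  The model_ name is hypothetical. */
#if 0
#include <stdint.h>
static uint32_t model_ctz32_vbits ( uint32_t atom, uint32_t vatom )
{
   uint32_t improver = atom ^ (atom - 1);   // low ctz(atom)+1 bits set
   uint32_t improved = vatom & improver;    // drop irrelevant V bits
   return improved ? 0xFFFFFFFFu : 0;       // PCast to the result
}
#endif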
2369 static
2370 IRAtom* expensiveCountLeadingZeroes ( MCEnv* mce, IROp czop,
2371 IRAtom* atom, IRAtom* vatom )
2373 IRType ty;
2374 IROp shrOp, notOp, andOp;
2375 IRAtom* (*mkRight)(MCEnv*, IRAtom*);
2376 IRAtom *improver, *improved;
2377 tl_assert(isShadowAtom(mce,vatom));
2378 tl_assert(isOriginalAtom(mce,atom));
2379 tl_assert(sameKindedAtoms(atom,vatom));
2381 switch (czop) {
2382 case Iop_Clz32: case Iop_ClzNat32:
2383 ty = Ity_I32;
2384 shrOp = Iop_Shr32;
2385 notOp = Iop_Not32;
2386 andOp = Iop_And32;
2387 mkRight = mkRight32;
2388 break;
2389 case Iop_Clz64: case Iop_ClzNat64:
2390 ty = Ity_I64;
2391 shrOp = Iop_Shr64;
2392 notOp = Iop_Not64;
2393 andOp = Iop_And64;
2394 mkRight = mkRight64;
2395 break;
2396 default:
2397 ppIROp(czop);
2398 VG_(tool_panic)("memcheck:expensiveCountLeadingZeroes");
2401 // This is in principle very similar to how expensiveCountTrailingZeroes
2402 // works. That function computed an "improver", which it used to mask
2403 // off all but the rightmost 1-bit and the zeroes to the right of it,
2404 // hence removing irrelevant bits from the input. Here, we play the
2405 // exact same game but with the left-vs-right roles interchanged.
2406 // Unfortunately calculation of the improver in this case is
2407 // significantly more expensive.
2409 // improver = ~(RIGHT(atom) >>u 1)
2411 // That is, improver has its upper clz(atom)+1 bits equal to one;
2412 // lower bits (if any) equal to zero. So it's exactly the right
2413 // mask to use to remove the irrelevant undefined input bits.
2414 /* Here are some examples:
2415 atom = 0...0 1 U...U
2416 R(atom) = 0...0 1 1...1
2417 R(atom) >>u 1 = 0...0 0 1...1
2418 ~(R(atom) >>u 1) = 1...1 1 0...0
2419 which correctly describes which bits of |atom|
2420 actually influence the result
2421 A boundary case
2422 atom = 0...0
2423 R(atom) = 0...0
2424 R(atom) >>u 1 = 0...0
2425 ~(R(atom) >>u 1) = 1...1
2426 also a correct mask for the input: all input bits
2427 are relevant
2428 Another boundary case
2429 atom = 1 1..1
2430 R(atom) = 1 1..1
2431 R(atom) >>u 1 = 0 1..1
2432 ~(R(atom) >>u 1) = 1 0..0
2433 also a correct mask: only the leftmost input bit
2434 is relevant
2435 Now with misc U bits interspersed:
2436 atom = 0...0 1 U...U 0 1 U...U
2437 R(atom) = 0...0 1 1...1 1 1 1...1
2438 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2439 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2440 (Per initial implementation of 15 Nov 2018)
2442 improver = mkRight(mce, atom);
2443 improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1)));
2444 improver = assignNew('V', mce, ty, unop(notOp, improver));
2446 // improved = vatom & improver
2448 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2449 // bits as "defined".
2450 improved = assignNew('V', mce, ty,
2451 binop(andOp, vatom, improver));
2453 // Return pessimizing cast of improved.
2454 return mkPCastTo(mce, ty, improved);
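/* Purely illustrative sketch (not tool code, and not compiled) of the
   clz improver above.  RIGHT(x) is modelled here by smearing the
   topmost 1-bit of x rightwards, which is the behaviour the worked
   examples above rely on.  The model_ names are hypothetical. */
#if 0
#include <stdint.h>
static uint32_t model_right32 ( uint32_t x )
{
   x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
   x |= x >> 8;  x |= x >> 16;
   return x;                                // topmost 1 smeared down to bit 0
}
static uint32_t model_clz32_vbits ( uint32_t atom, uint32_t vatom )
{
   uint32_t improver = ~(model_right32(atom) >> 1);
   uint32_t improved = vatom & improver;    // keep only relevant V bits
   return improved ? 0xFFFFFFFFu : 0;       // PCast to the result
}
#endif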
2458 /*------------------------------------------------------------*/
2459 /*--- Scalar shifts. ---*/
2460 /*------------------------------------------------------------*/
2462 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2463 idea is to shift the definedness bits by the original shift amount.
2464 This introduces 0s ("defined") in new positions for left shifts and
2465 unsigned right shifts, and copies the top definedness bit for
2466 signed right shifts. So, conveniently, applying the original shift
2467 operator to the definedness bits for the left arg is exactly the
2468 right thing to do:
2470 (qaa << bb)
2472 However if the shift amount is undefined then the whole result
2473 is undefined. Hence need:
2475 (qaa << bb) `UifU` PCast(qbb)
2477 If the shift amount bb is a literal then qbb will say 'all defined'
2478 and the UifU and PCast will get folded out by post-instrumentation
2479 optimisation.
2481 static IRAtom* scalarShift ( MCEnv* mce,
2482 IRType ty,
2483 IROp original_op,
2484 IRAtom* qaa, IRAtom* qbb,
2485 IRAtom* aa, IRAtom* bb )
2487 tl_assert(isShadowAtom(mce,qaa));
2488 tl_assert(isShadowAtom(mce,qbb));
2489 tl_assert(isOriginalAtom(mce,aa));
2490 tl_assert(isOriginalAtom(mce,bb));
2491 tl_assert(sameKindedAtoms(qaa,aa));
2492 tl_assert(sameKindedAtoms(qbb,bb));
2493 return
2494 assignNew(
2495 'V', mce, ty,
2496 mkUifU( mce, ty,
2497 assignNew('V', mce, ty, binop(original_op, qaa, bb)),
2498 mkPCastTo(mce, ty, qbb)
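/* Purely illustrative sketch (not tool code, and not compiled) of the
   scalar shift rule above, for a 32-bit left shift with a set V bit
   meaning "undefined".  Assumes bb < 32.  Hypothetical name. */
#if 0
#include <stdint.h>
static uint32_t model_shl32_vbits ( uint32_t qaa, uint8_t bb, uint8_t qbb )
{
   uint32_t shifted   = qaa << bb;               // (qaa << bb)
   uint32_t pcast_qbb = qbb ? 0xFFFFFFFFu : 0;   // PCast(qbb)
   return shifted | pcast_qbb;                   // ... `UifU` PCast(qbb)
}
#endif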
2504 /*------------------------------------------------------------*/
2505 /*--- Helpers for dealing with vector primops. ---*/
2506 /*------------------------------------------------------------*/
2508 /* Vector pessimisation -- pessimise within each lane individually. */
2510 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
2512 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
2515 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
2517 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
2520 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
2522 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
2525 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
2527 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
2530 static IRAtom* mkPCast128x1 ( MCEnv* mce, IRAtom* at )
2532 return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ128x1, at));
2535 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
2537 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
2540 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
2542 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
2545 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
2547 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
2550 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
2552 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
2555 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
2557 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
2560 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
2562 return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
2565 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
2567 return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
2570 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
2572 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
2575 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
2577 return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
2581 /* Here's a simple scheme capable of handling ops derived from SSE1
2582 code and while only generating ops that can be efficiently
2583 implemented in SSE1. */
2585 /* All-lanes versions are straightforward:
2587 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2589 unary32Fx4(x) ==> PCast32x4(x#)
2591 Lowest-lane-only versions are more complex:
2593 binary32F0x4(x,y) ==> SetV128lo32(
2594 x#,
2595 PCast32(V128to32(UifUV128(x#,y#)))
2598 This is perhaps not so obvious. In particular, it's faster to
2599 do a V128-bit UifU and then take the bottom 32 bits than the more
2600 obvious scheme of taking the bottom 32 bits of each operand
2601 and doing a 32-bit UifU. Basically this is because UifU is fast and
2602 chopping lanes off vector values is slow.
2604 Finally:
2606 unary32F0x4(x) ==> SetV128lo32(
2607 x#,
2608 PCast32(V128to32(x#))
2611 Where:
2613 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2614 PCast32x4(v#) = CmpNEZ32x4(v#)
2617 static
2618 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2620 IRAtom* at;
2621 tl_assert(isShadowAtom(mce, vatomX));
2622 tl_assert(isShadowAtom(mce, vatomY));
2623 at = mkUifUV128(mce, vatomX, vatomY);
2624 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
2625 return at;
2628 static
2629 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
2631 IRAtom* at;
2632 tl_assert(isShadowAtom(mce, vatomX));
2633 at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
2634 return at;
2637 static
2638 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2640 IRAtom* at;
2641 tl_assert(isShadowAtom(mce, vatomX));
2642 tl_assert(isShadowAtom(mce, vatomY));
2643 at = mkUifUV128(mce, vatomX, vatomY);
2644 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
2645 at = mkPCastTo(mce, Ity_I32, at);
2646 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2647 return at;
2650 static
2651 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
2653 IRAtom* at;
2654 tl_assert(isShadowAtom(mce, vatomX));
2655 at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
2656 at = mkPCastTo(mce, Ity_I32, at);
2657 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
2658 return at;
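/* Purely illustrative sketch (not tool code, and not compiled) of the
   lowest-lane-only scheme above: the whole 128-bit shadows are UifU'd,
   the bottom 32 bits are PCast'd, and the result is x's shadow with
   just its bottom lane replaced.  Types and names are hypothetical. */
#if 0
#include <stdint.h>
typedef struct { uint32_t lane[4]; } ModelV128;  // lane[0] == bits 31:0
static ModelV128 model_binary32F0x4_vbits ( ModelV128 vx, ModelV128 vy )
{
   uint32_t lo = vx.lane[0] | vy.lane[0];    // V128to32(UifUV128(x#,y#))
   vx.lane[0]  = lo ? 0xFFFFFFFFu : 0;       // PCast32, then SetV128lo32
   return vx;                                // upper lanes: x# unchanged
}
#endif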
2661 /* --- ... and ... 64Fx2 versions of the same ... --- */
2663 static
2664 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2666 IRAtom* at;
2667 tl_assert(isShadowAtom(mce, vatomX));
2668 tl_assert(isShadowAtom(mce, vatomY));
2669 at = mkUifUV128(mce, vatomX, vatomY);
2670 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
2671 return at;
2674 static
2675 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
2677 IRAtom* at;
2678 tl_assert(isShadowAtom(mce, vatomX));
2679 at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
2680 return at;
2683 static
2684 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2686 IRAtom* at;
2687 tl_assert(isShadowAtom(mce, vatomX));
2688 tl_assert(isShadowAtom(mce, vatomY));
2689 at = mkUifUV128(mce, vatomX, vatomY);
2690 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
2691 at = mkPCastTo(mce, Ity_I64, at);
2692 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2693 return at;
2696 static
2697 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
2699 IRAtom* at;
2700 tl_assert(isShadowAtom(mce, vatomX));
2701 at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
2702 at = mkPCastTo(mce, Ity_I64, at);
2703 at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
2704 return at;
2707 /* --- --- ... and ... 16Fx8 versions of the same --- --- */
2709 static
2710 IRAtom* binary16Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2712 IRAtom* at;
2713 tl_assert(isShadowAtom(mce, vatomX));
2714 tl_assert(isShadowAtom(mce, vatomY));
2715 at = mkUifUV128(mce, vatomX, vatomY);
2716 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, at));
2717 return at;
2720 static
2721 IRAtom* unary16Fx8 ( MCEnv* mce, IRAtom* vatomX )
2723 IRAtom* at;
2724 tl_assert(isShadowAtom(mce, vatomX));
2725 at = assignNew('V', mce, Ity_V128, mkPCast16x8(mce, vatomX));
2726 return at;
2729 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2730 implemented.
2733 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2735 static
2736 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2738 IRAtom* at;
2739 tl_assert(isShadowAtom(mce, vatomX));
2740 tl_assert(isShadowAtom(mce, vatomY));
2741 at = mkUifU64(mce, vatomX, vatomY);
2742 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
2743 return at;
2746 static
2747 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
2749 IRAtom* at;
2750 tl_assert(isShadowAtom(mce, vatomX));
2751 at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
2752 return at;
2755 /* --- ... and ... 64Fx4 versions of the same ... --- */
2757 static
2758 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2760 IRAtom* at;
2761 tl_assert(isShadowAtom(mce, vatomX));
2762 tl_assert(isShadowAtom(mce, vatomY));
2763 at = mkUifUV256(mce, vatomX, vatomY);
2764 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
2765 return at;
2768 static
2769 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
2771 IRAtom* at;
2772 tl_assert(isShadowAtom(mce, vatomX));
2773 at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
2774 return at;
2777 /* --- ... and ... 32Fx8 versions of the same ... --- */
2779 static
2780 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
2782 IRAtom* at;
2783 tl_assert(isShadowAtom(mce, vatomX));
2784 tl_assert(isShadowAtom(mce, vatomY));
2785 at = mkUifUV256(mce, vatomX, vatomY);
2786 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
2787 return at;
2790 static
2791 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
2793 IRAtom* at;
2794 tl_assert(isShadowAtom(mce, vatomX));
2795 at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
2796 return at;
2799 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2801 static
2802 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
2803 IRAtom* vatomX, IRAtom* vatomY )
2805 /* This is the same as binary64Fx2, except that we subsequently
2806 pessimise vRM (definedness of the rounding mode), widen to 128
2807 bits and UifU it into the result. As with the scalar cases, if
2808 the RM is a constant then it is defined and so this extra bit
2809 will get constant-folded out later. */
2810 // "do" the vector args
2811 IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
2812 // PCast the RM, and widen it to 128 bits
2813 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2814 // Roll it into the result
2815 t1 = mkUifUV128(mce, t1, t2);
2816 return t1;
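/* Purely illustrative sketch (not tool code, and not compiled) of the
   rounding-mode handling above: the lanes are treated as in
   binary64Fx2, and any undefinedness in the rounding mode makes the
   whole result undefined via the widening PCast plus UifU.  The types
   and names below are hypothetical. */
#if 0
#include <stdint.h>
typedef struct { uint64_t lane[2]; } ModelV128x64;
static ModelV128x64 model_binary64Fx2_w_rm_vbits ( uint32_t vRM,
                                                   ModelV128x64 vx,
                                                   ModelV128x64 vy )
{
   uint64_t rm_smear = vRM ? ~0ULL : 0;      // PCastTo(Ity_V128, vRM), per lane
   for (int i = 0; i < 2; i++) {
      uint64_t lane = vx.lane[i] | vy.lane[i];        // UifUV128
      vx.lane[i] = (lane ? ~0ULL : 0) | rm_smear;     // PCast64x2, UifU the RM
   }
   return vx;
}
#endif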
2819 /* --- ... and ... 32Fx4 versions of the same --- */
2821 static
2822 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2823 IRAtom* vatomX, IRAtom* vatomY )
2825 IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
2826 // PCast the RM, and widen it to 128 bits
2827 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2828 // Roll it into the result
2829 t1 = mkUifUV128(mce, t1, t2);
2830 return t1;
2833 /* --- ... and ... 64Fx4 versions of the same --- */
2835 static
2836 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
2837 IRAtom* vatomX, IRAtom* vatomY )
2839 IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
2840 // PCast the RM, and widen it to 256 bits
2841 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2842 // Roll it into the result
2843 t1 = mkUifUV256(mce, t1, t2);
2844 return t1;
2847 /* --- ... and ... 16Fx8 versions of the same --- */
2849 static
2850 IRAtom* binary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2851 IRAtom* vatomX, IRAtom* vatomY )
2853 IRAtom* t1 = binary16Fx8(mce, vatomX, vatomY);
2854 // PCast the RM, and widen it to 128 bits
2855 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2856 // Roll it into the result
2857 t1 = mkUifUV128(mce, t1, t2);
2858 return t1;
2861 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision IR is
2862 implemented.
2865 /* --- ... and ... 32Fx8 versions of the same --- */
2867 static
2868 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
2869 IRAtom* vatomX, IRAtom* vatomY )
2871 IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
2872 // PCast the RM, and widen it to 256 bits
2873 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2874 // Roll it into the result
2875 t1 = mkUifUV256(mce, t1, t2);
2876 return t1;
2879 /* --- 64Fx2 unary FP ops, with rounding mode --- */
2881 static
2882 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2884 /* Same scheme as binary64Fx2_w_rm. */
2885 // "do" the vector arg
2886 IRAtom* t1 = unary64Fx2(mce, vatomX);
2887 // PCast the RM, and widen it to 128 bits
2888 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2889 // Roll it into the result
2890 t1 = mkUifUV128(mce, t1, t2);
2891 return t1;
2894 /* --- ... and ... 32Fx4 versions of the same --- */
2896 static
2897 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2899 /* Same scheme as binary32Fx4_w_rm. */
2900 IRAtom* t1 = unary32Fx4(mce, vatomX);
2901 // PCast the RM, and widen it to 128 bits
2902 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2903 // Roll it into the result
2904 t1 = mkUifUV128(mce, t1, t2);
2905 return t1;
2908 /* --- ... and ... 16Fx8 versions of the same --- */
2910 static
2911 IRAtom* unary16Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2913 /* Same scheme as binary16Fx8_w_rm. */
2914 IRAtom* t1 = unary16Fx8(mce, vatomX);
2915 // PCast the RM, and widen it to 128 bits
2916 IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
2917 // Roll it into the result
2918 t1 = mkUifUV128(mce, t1, t2);
2919 return t1;
2922 /* --- ... and ... 32Fx8 versions of the same --- */
2924 static
2925 IRAtom* unary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
2927 /* Same scheme as binary32Fx8_w_rm. */
2928 IRAtom* t1 = unary32Fx8(mce, vatomX);
2929 // PCast the RM, and widen it to 256 bits
2930 IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
2931 // Roll it into the result
2932 t1 = mkUifUV256(mce, t1, t2);
2933 return t1;
2937 /* --- --- Vector saturated narrowing --- --- */
2939 /* We used to do something very clever here, but on closer inspection
2940 (2011-Jun-15), and in particular bug #279698, it turns out to be
2941 wrong. Part of the problem came from the fact that for a long
2942 time, the IR primops to do with saturated narrowing were
2943 underspecified and managed to confuse multiple cases which needed
2944 to be separate: the op names had a signedness qualifier, but in
2945 fact the source and destination signednesses needed to be specified
2946 independently, so the op names really need two independent
2947 signedness specifiers.
2949 As of 2011-Jun-15 (ish) the underspecification was sorted out
2950 properly. The incorrect instrumentation remained, though. That
2951 has now (2011-Oct-22) been fixed.
2953 What we now do is simple:
2955 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2956 number of lanes, X is the source lane width and signedness, and Y
2957 is the destination lane width and signedness. In all cases the
2958 destination lane width is half the source lane width, so the names
2959 have a bit of redundancy, but are at least easy to read.
2961 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2962 to unsigned 16s.
2964 Let Vanilla(OP) be a function that takes OP, one of these
2965 saturating narrowing ops, and produces the same "shaped" narrowing
2966 op which is not saturating, but merely dumps the most significant
2967 bits. "same shape" means that the lane numbers and widths are the
2968 same as with OP.
2970 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2971 = Iop_NarrowBin32to16x8,
2972 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2973 dumping the top half of each lane.
2975 So, with that in place, the scheme is simple, and it is simple to
2976 pessimise each lane individually and then apply Vanilla(OP) so as
2977 to get the result in the right "shape". If the original OP is
2978 QNarrowBinXtoYxZ then we produce
2980 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2982 or for the case when OP is unary (Iop_QNarrowUn*)
2984 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2986 static
2987 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
2989 switch (qnarrowOp) {
2990 /* Binary: (128, 128) -> 128 */
2991 case Iop_QNarrowBin16Sto8Ux16:
2992 case Iop_QNarrowBin16Sto8Sx16:
2993 case Iop_QNarrowBin16Uto8Ux16:
2994 case Iop_QNarrowBin64Sto32Sx4:
2995 case Iop_QNarrowBin64Uto32Ux4:
2996 return Iop_NarrowBin16to8x16;
2997 case Iop_QNarrowBin32Sto16Ux8:
2998 case Iop_QNarrowBin32Sto16Sx8:
2999 case Iop_QNarrowBin32Uto16Ux8:
3000 return Iop_NarrowBin32to16x8;
3001 /* Binary: (64, 64) -> 64 */
3002 case Iop_QNarrowBin32Sto16Sx4:
3003 return Iop_NarrowBin32to16x4;
3004 case Iop_QNarrowBin16Sto8Ux8:
3005 case Iop_QNarrowBin16Sto8Sx8:
3006 return Iop_NarrowBin16to8x8;
3007 /* Unary: 128 -> 64 */
3008 case Iop_QNarrowUn64Uto32Ux2:
3009 case Iop_QNarrowUn64Sto32Sx2:
3010 case Iop_QNarrowUn64Sto32Ux2:
3011 return Iop_NarrowUn64to32x2;
3012 case Iop_QNarrowUn32Uto16Ux4:
3013 case Iop_QNarrowUn32Sto16Sx4:
3014 case Iop_QNarrowUn32Sto16Ux4:
3015 case Iop_F32toF16x4_DEP:
3016 return Iop_NarrowUn32to16x4;
3017 case Iop_QNarrowUn16Uto8Ux8:
3018 case Iop_QNarrowUn16Sto8Sx8:
3019 case Iop_QNarrowUn16Sto8Ux8:
3020 return Iop_NarrowUn16to8x8;
3021 default:
3022 ppIROp(qnarrowOp);
3023 VG_(tool_panic)("vanillaNarrowingOpOfShape");
3027 static
3028 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
3029 IRAtom* vatom1, IRAtom* vatom2)
3031 IRAtom *at1, *at2, *at3;
3032 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3033 switch (narrow_op) {
3034 case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
3035 case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
3036 case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
3037 case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
3038 case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
3039 case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
3040 case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
3041 case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
3042 default: VG_(tool_panic)("vectorNarrowBinV128");
3044 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3045 tl_assert(isShadowAtom(mce,vatom1));
3046 tl_assert(isShadowAtom(mce,vatom2));
3047 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3048 at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
3049 at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
3050 return at3;
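/* Purely illustrative sketch (not tool code, and not compiled) of the
   saturated-narrowing scheme above, for the 32-to-16, 8-lane shape:
   pessimise each 32-bit source lane of both shadow operands, then
   narrow by keeping the low 16 bits of each lane, as Vanilla(OP)
   would.  Which operand feeds which half of the result mirrors the
   vanilla op and does not matter here, since every lane is already
   PCast'd.  Hypothetical names. */
#if 0
#include <stdint.h>
static void model_qnarrowbin32to16x8_vbits ( const uint32_t vsrc1[4],
                                             const uint32_t vsrc2[4],
                                             uint16_t vdst[8] )
{
   for (int i = 0; i < 4; i++) {
      vdst[i]     = vsrc1[i] ? 0xFFFF : 0;   // PCast32x4 lane, low half kept
      vdst[i + 4] = vsrc2[i] ? 0xFFFF : 0;
   }
}
#endif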
3053 static
3054 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
3055 IRAtom* vatom1, IRAtom* vatom2)
3057 IRAtom *at1, *at2, *at3;
3058 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3059 switch (narrow_op) {
3060 case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
3061 case Iop_QNarrowBin16Sto8Sx8: pcast = mkPCast16x4; break;
3062 case Iop_QNarrowBin16Sto8Ux8: pcast = mkPCast16x4; break;
3063 default: VG_(tool_panic)("vectorNarrowBin64");
3065 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3066 tl_assert(isShadowAtom(mce,vatom1));
3067 tl_assert(isShadowAtom(mce,vatom2));
3068 at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
3069 at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
3070 at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
3071 return at3;
3074 static
3075 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
3076 IRAtom* vatom1)
3078 IRAtom *at1, *at2;
3079 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3080 tl_assert(isShadowAtom(mce,vatom1));
3081 /* For vanilla narrowing (non-saturating), we can just apply
3082 the op directly to the V bits. */
3083 switch (narrow_op) {
3084 case Iop_NarrowUn16to8x8:
3085 case Iop_NarrowUn32to16x4:
3086 case Iop_NarrowUn64to32x2:
3087 case Iop_F32toF16x4_DEP:
3088 at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
3089 return at1;
3090 default:
3091 break; /* Do Plan B */
3093 /* Plan B: for ops that involve a saturation operation on the args,
3094 we must PCast before the vanilla narrow. */
3095 switch (narrow_op) {
3096 case Iop_QNarrowUn16Sto8Sx8: pcast = mkPCast16x8; break;
3097 case Iop_QNarrowUn16Sto8Ux8: pcast = mkPCast16x8; break;
3098 case Iop_QNarrowUn16Uto8Ux8: pcast = mkPCast16x8; break;
3099 case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
3100 case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
3101 case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
3102 case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
3103 case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
3104 case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
3105 default: VG_(tool_panic)("vectorNarrowUnV128");
3107 IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
3108 at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
3109 at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
3110 return at2;
3113 static
3114 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
3115 IRAtom* vatom1)
3117 IRAtom *at1, *at2;
3118 IRAtom* (*pcast)( MCEnv*, IRAtom* );
3119 switch (longen_op) {
3120 case Iop_Widen8Uto16x8: pcast = mkPCast16x8; break;
3121 case Iop_Widen8Sto16x8: pcast = mkPCast16x8; break;
3122 case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
3123 case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
3124 case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
3125 case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
3126 case Iop_F16toF32x4: pcast = mkPCast32x4; break;
3127 default: VG_(tool_panic)("vectorWidenI64");
3129 tl_assert(isShadowAtom(mce,vatom1));
3130 at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
3131 at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
3132 return at2;
3136 /* --- --- Vector integer arithmetic --- --- */
3138 /* Simple ... UifU the args and per-lane pessimise the results. */
3140 /* --- V256-bit versions --- */
3142 static
3143 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3145 IRAtom* at;
3146 at = mkUifUV256(mce, vatom1, vatom2);
3147 at = mkPCast8x32(mce, at);
3148 return at;
3151 static
3152 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3154 IRAtom* at;
3155 at = mkUifUV256(mce, vatom1, vatom2);
3156 at = mkPCast16x16(mce, at);
3157 return at;
3160 static
3161 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3163 IRAtom* at;
3164 at = mkUifUV256(mce, vatom1, vatom2);
3165 at = mkPCast32x8(mce, at);
3166 return at;
3169 static
3170 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3172 IRAtom* at;
3173 at = mkUifUV256(mce, vatom1, vatom2);
3174 at = mkPCast64x4(mce, at);
3175 return at;
3178 /* --- V128-bit versions --- */
3180 static
3181 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3183 IRAtom* at;
3184 at = mkUifUV128(mce, vatom1, vatom2);
3185 at = mkPCast8x16(mce, at);
3186 return at;
3189 static
3190 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3192 IRAtom* at;
3193 at = mkUifUV128(mce, vatom1, vatom2);
3194 at = mkPCast16x8(mce, at);
3195 return at;
3198 static
3199 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3201 IRAtom* at;
3202 at = mkUifUV128(mce, vatom1, vatom2);
3203 at = mkPCast32x4(mce, at);
3204 return at;
3207 static
3208 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3210 IRAtom* at;
3211 at = mkUifUV128(mce, vatom1, vatom2);
3212 at = mkPCast64x2(mce, at);
3213 return at;
3216 static
3217 IRAtom* binary128Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3219 IRAtom* at;
3220 at = mkUifUV128(mce, vatom1, vatom2);
3221 at = mkPCast128x1(mce, at);
3222 return at;
3225 /* --- 64-bit versions --- */
3227 static
3228 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3230 IRAtom* at;
3231 at = mkUifU64(mce, vatom1, vatom2);
3232 at = mkPCast8x8(mce, at);
3233 return at;
3236 static
3237 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3239 IRAtom* at;
3240 at = mkUifU64(mce, vatom1, vatom2);
3241 at = mkPCast16x4(mce, at);
3242 return at;
3245 static
3246 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3248 IRAtom* at;
3249 at = mkUifU64(mce, vatom1, vatom2);
3250 at = mkPCast32x2(mce, at);
3251 return at;
3254 static
3255 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3257 IRAtom* at;
3258 at = mkUifU64(mce, vatom1, vatom2);
3259 at = mkPCastTo(mce, Ity_I64, at);
3260 return at;
3263 /* --- 32-bit versions --- */
3265 static
3266 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3268 IRAtom* at;
3269 at = mkUifU32(mce, vatom1, vatom2);
3270 at = mkPCast8x4(mce, at);
3271 return at;
3274 static
3275 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
3277 IRAtom* at;
3278 at = mkUifU32(mce, vatom1, vatom2);
3279 at = mkPCast16x2(mce, at);
3280 return at;
3284 /*------------------------------------------------------------*/
3285 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3286 /*------------------------------------------------------------*/
3288 static
3289 IRAtom* expr2vbits_Qop ( MCEnv* mce,
3290 IROp op,
3291 IRAtom* atom1, IRAtom* atom2,
3292 IRAtom* atom3, IRAtom* atom4 )
3294 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3295 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3296 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3297 IRAtom* vatom4 = expr2vbits( mce, atom4, HuOth );
3299 tl_assert(isOriginalAtom(mce,atom1));
3300 tl_assert(isOriginalAtom(mce,atom2));
3301 tl_assert(isOriginalAtom(mce,atom3));
3302 tl_assert(isOriginalAtom(mce,atom4));
3303 tl_assert(isShadowAtom(mce,vatom1));
3304 tl_assert(isShadowAtom(mce,vatom2));
3305 tl_assert(isShadowAtom(mce,vatom3));
3306 tl_assert(isShadowAtom(mce,vatom4));
3307 tl_assert(sameKindedAtoms(atom1,vatom1));
3308 tl_assert(sameKindedAtoms(atom2,vatom2));
3309 tl_assert(sameKindedAtoms(atom3,vatom3));
3310 tl_assert(sameKindedAtoms(atom4,vatom4));
3311 switch (op) {
3312 case Iop_MAddF64:
3313 case Iop_MAddF64r32:
3314 case Iop_MSubF64:
3315 case Iop_MSubF64r32:
3316 /* I32(rm) x F64 x F64 x F64 -> F64 */
3317 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3319 case Iop_MAddF32:
3320 case Iop_MSubF32:
3321 /* I32(rm) x F32 x F32 x F32 -> F32 */
3322 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3324 case Iop_MAddF128:
3325 case Iop_MSubF128:
3326 case Iop_NegMAddF128:
3327 case Iop_NegMSubF128:
3328 /* I32(rm) x F128 x F128 x F128 -> F128 */
3329 return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
3331 /* V256-bit data-steering */
3332 case Iop_64x4toV256:
3333 return assignNew('V', mce, Ity_V256,
3334 IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
3336 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3337 case Iop_Rotx32:
3338 return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
3339 case Iop_Rotx64:
3340 return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
3341 default:
3342 ppIROp(op);
3343 VG_(tool_panic)("memcheck:expr2vbits_Qop");
3348 static
3349 IRAtom* expr2vbits_Triop ( MCEnv* mce,
3350 IROp op,
3351 IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
3353 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3354 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3355 IRAtom* vatom3 = expr2vbits( mce, atom3, HuOth );
3357 tl_assert(isOriginalAtom(mce,atom1));
3358 tl_assert(isOriginalAtom(mce,atom2));
3359 tl_assert(isOriginalAtom(mce,atom3));
3360 tl_assert(isShadowAtom(mce,vatom1));
3361 tl_assert(isShadowAtom(mce,vatom2));
3362 tl_assert(isShadowAtom(mce,vatom3));
3363 tl_assert(sameKindedAtoms(atom1,vatom1));
3364 tl_assert(sameKindedAtoms(atom2,vatom2));
3365 tl_assert(sameKindedAtoms(atom3,vatom3));
3366 switch (op) {
3367 case Iop_AddF128:
3368 case Iop_SubF128:
3369 case Iop_MulF128:
3370 case Iop_DivF128:
3371 case Iop_AddD128:
3372 case Iop_SubD128:
3373 case Iop_MulD128:
3374 case Iop_DivD128:
3375 case Iop_QuantizeD128:
3376 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3377 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3378 case Iop_AddF64:
3379 case Iop_AddD64:
3380 case Iop_AddF64r32:
3381 case Iop_SubF64:
3382 case Iop_SubD64:
3383 case Iop_SubF64r32:
3384 case Iop_MulF64:
3385 case Iop_MulD64:
3386 case Iop_MulF64r32:
3387 case Iop_DivF64:
3388 case Iop_DivD64:
3389 case Iop_DivF64r32:
3390 case Iop_ScaleF64:
3391 case Iop_Yl2xF64:
3392 case Iop_Yl2xp1F64:
3393 case Iop_AtanF64:
3394 case Iop_PRemF64:
3395 case Iop_PRem1F64:
3396 case Iop_QuantizeD64:
3397 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3398 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3399 case Iop_PRemC3210F64:
3400 case Iop_PRem1C3210F64:
3401 /* I32(rm) x F64 x F64 -> I32 */
3402 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3403 case Iop_AddF32:
3404 case Iop_SubF32:
3405 case Iop_MulF32:
3406 case Iop_DivF32:
3407 /* I32(rm) x F32 x F32 -> F32 */
3408 return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
3409 case Iop_AddF16:
3410 case Iop_SubF16:
3411 /* I32(rm) x F16 x F16 -> F16 */
3412 return mkLazy3(mce, Ity_I16, vatom1, vatom2, vatom3);
3413 case Iop_SignificanceRoundD64:
3414 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3415 return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
3416 case Iop_SignificanceRoundD128:
3417 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3418 return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
3419 case Iop_SliceV128:
3420 /* (V128, V128, I8) -> V128 */
3421 complainIfUndefined(mce, atom3, NULL);
3422 return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
3423 case Iop_Slice64:
3424 /* (I64, I64, I8) -> I64 */
3425 complainIfUndefined(mce, atom3, NULL);
3426 return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
3427 case Iop_SetElem8x8:
3428 case Iop_SetElem16x4:
3429 case Iop_SetElem32x2:
3430 complainIfUndefined(mce, atom2, NULL);
3431 return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
3433 case Iop_SetElem8x16:
3434 case Iop_SetElem16x8:
3435 case Iop_SetElem32x4:
3436 case Iop_SetElem64x2:
3437 complainIfUndefined(mce, atom2, NULL);
3438 return assignNew('V', mce, Ity_V128, triop(op, vatom1, atom2, vatom3));
3440 /* Int 128-bit Integer three arg */
3441 case Iop_2xMultU64Add128CarryOut:
3442 case Iop_Perm8x16x2:
3443 /* (V128, V128, V128) -> V128 */
3444 complainIfUndefined(mce, atom3, NULL);
3445 return mkUifUV128(
3446 mce,
3447 assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3)),
3448 mkPCast8x16(mce, vatom3)
3451 /* Vector FP with rounding mode as the first arg */
3452 case Iop_Add64Fx2:
3453 case Iop_Sub64Fx2:
3454 case Iop_Mul64Fx2:
3455 case Iop_Div64Fx2:
3456 case Iop_Scale2_64Fx2:
3457 return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
3459 case Iop_Add32Fx4:
3460 case Iop_Sub32Fx4:
3461 case Iop_Mul32Fx4:
3462 case Iop_Div32Fx4:
3463 case Iop_Scale2_32Fx4:
3464 return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3466 case Iop_Add64Fx4:
3467 case Iop_Sub64Fx4:
3468 case Iop_Mul64Fx4:
3469 case Iop_Div64Fx4:
3470 return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
3472 /* TODO: remaining versions of 16x4 FP ops when more of the half-precision
3473 IR is implemented.
3475 case Iop_Add16Fx8:
3476 case Iop_Sub16Fx8:
3477 return binary16Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3479 case Iop_Add32Fx8:
3480 case Iop_Sub32Fx8:
3481 case Iop_Mul32Fx8:
3482 case Iop_Div32Fx8:
3483 return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
3485 case Iop_F32x4_2toQ16x8:
3486 return assignNew('V', mce, Ity_V128,
3487 binop(Iop_PackEvenLanes16x8,
3488 unary32Fx4_w_rm(mce, vatom1, vatom2),
3489 unary32Fx4_w_rm(mce, vatom1, vatom3)));
3490 case Iop_F64x2_2toQ32x4:
3491 return assignNew('V', mce, Ity_V128,
3492 binop(Iop_PackEvenLanes32x4,
3493 unary64Fx2_w_rm(mce, vatom1, vatom2),
3494 unary64Fx2_w_rm(mce, vatom1, vatom3)));
3496 default:
3497 ppIROp(op);
3498 VG_(tool_panic)("memcheck:expr2vbits_Triop");
3503 static
3504 IRAtom* expr2vbits_Binop ( MCEnv* mce,
3505 IROp op,
3506 IRAtom* atom1, IRAtom* atom2,
3507 HowUsed hu/*use HuOth if unknown*/ )
3509 IRType and_or_ty = Ity_INVALID;
3510 IRAtom* (*uifu) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3511 IRAtom* (*difd) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3512 IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*) = NULL;
3514 IRAtom* vatom1 = expr2vbits( mce, atom1, HuOth );
3515 IRAtom* vatom2 = expr2vbits( mce, atom2, HuOth );
3517 tl_assert(isOriginalAtom(mce,atom1));
3518 tl_assert(isOriginalAtom(mce,atom2));
3519 tl_assert(isShadowAtom(mce,vatom1));
3520 tl_assert(isShadowAtom(mce,vatom2));
3521 tl_assert(sameKindedAtoms(atom1,vatom1));
3522 tl_assert(sameKindedAtoms(atom2,vatom2));
3523 switch (op) {
3525 /* 32-bit SIMD */
3527 case Iop_Add16x2:
3528 case Iop_HAdd16Ux2:
3529 case Iop_HAdd16Sx2:
3530 case Iop_Sub16x2:
3531 case Iop_HSub16Ux2:
3532 case Iop_HSub16Sx2:
3533 case Iop_QAdd16Sx2:
3534 case Iop_QSub16Sx2:
3535 case Iop_QSub16Ux2:
3536 case Iop_QAdd16Ux2:
3537 return binary16Ix2(mce, vatom1, vatom2);
3539 case Iop_Add8x4:
3540 case Iop_HAdd8Ux4:
3541 case Iop_HAdd8Sx4:
3542 case Iop_Sub8x4:
3543 case Iop_HSub8Ux4:
3544 case Iop_HSub8Sx4:
3545 case Iop_QSub8Ux4:
3546 case Iop_QAdd8Ux4:
3547 case Iop_QSub8Sx4:
3548 case Iop_QAdd8Sx4:
3549 return binary8Ix4(mce, vatom1, vatom2);
3551 /* 64-bit SIMD */
3553 case Iop_ShrN8x8:
3554 case Iop_ShrN16x4:
3555 case Iop_ShrN32x2:
3556 case Iop_SarN8x8:
3557 case Iop_SarN16x4:
3558 case Iop_SarN32x2:
3559 case Iop_ShlN16x4:
3560 case Iop_ShlN32x2:
3561 case Iop_ShlN8x8:
3562 /* Same scheme as with all other shifts. */
3563 complainIfUndefined(mce, atom2, NULL);
3564 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
3566 case Iop_QNarrowBin32Sto16Sx4:
3567 case Iop_QNarrowBin16Sto8Sx8:
3568 case Iop_QNarrowBin16Sto8Ux8:
3569 return vectorNarrowBin64(mce, op, vatom1, vatom2);
3571 case Iop_Min8Ux8:
3572 case Iop_Min8Sx8:
3573 case Iop_Max8Ux8:
3574 case Iop_Max8Sx8:
3575 case Iop_Avg8Ux8:
3576 case Iop_QSub8Sx8:
3577 case Iop_QSub8Ux8:
3578 case Iop_Sub8x8:
3579 case Iop_CmpGT8Sx8:
3580 case Iop_CmpGT8Ux8:
3581 case Iop_CmpEQ8x8:
3582 case Iop_QAdd8Sx8:
3583 case Iop_QAdd8Ux8:
3584 case Iop_QSal8x8:
3585 case Iop_QShl8x8:
3586 case Iop_Add8x8:
3587 case Iop_Mul8x8:
3588 case Iop_PolynomialMul8x8:
3589 return binary8Ix8(mce, vatom1, vatom2);
3591 case Iop_Min16Sx4:
3592 case Iop_Min16Ux4:
3593 case Iop_Max16Sx4:
3594 case Iop_Max16Ux4:
3595 case Iop_Avg16Ux4:
3596 case Iop_QSub16Ux4:
3597 case Iop_QSub16Sx4:
3598 case Iop_Sub16x4:
3599 case Iop_Mul16x4:
3600 case Iop_MulHi16Sx4:
3601 case Iop_MulHi16Ux4:
3602 case Iop_CmpGT16Sx4:
3603 case Iop_CmpGT16Ux4:
3604 case Iop_CmpEQ16x4:
3605 case Iop_QAdd16Sx4:
3606 case Iop_QAdd16Ux4:
3607 case Iop_QSal16x4:
3608 case Iop_QShl16x4:
3609 case Iop_Add16x4:
3610 case Iop_QDMulHi16Sx4:
3611 case Iop_QRDMulHi16Sx4:
3612 return binary16Ix4(mce, vatom1, vatom2);
3614 case Iop_Sub32x2:
3615 case Iop_Mul32x2:
3616 case Iop_Max32Sx2:
3617 case Iop_Max32Ux2:
3618 case Iop_Min32Sx2:
3619 case Iop_Min32Ux2:
3620 case Iop_CmpGT32Sx2:
3621 case Iop_CmpGT32Ux2:
3622 case Iop_CmpEQ32x2:
3623 case Iop_Add32x2:
3624 case Iop_QAdd32Ux2:
3625 case Iop_QAdd32Sx2:
3626 case Iop_QSub32Ux2:
3627 case Iop_QSub32Sx2:
3628 case Iop_QSal32x2:
3629 case Iop_QShl32x2:
3630 case Iop_QDMulHi32Sx2:
3631 case Iop_QRDMulHi32Sx2:
3632 return binary32Ix2(mce, vatom1, vatom2);
3634 case Iop_QSub64Ux1:
3635 case Iop_QSub64Sx1:
3636 case Iop_QAdd64Ux1:
3637 case Iop_QAdd64Sx1:
3638 case Iop_QSal64x1:
3639 case Iop_QShl64x1:
3640 case Iop_Sal64x1:
3641 return binary64Ix1(mce, vatom1, vatom2);
3643 case Iop_QShlNsatSU8x8:
3644 case Iop_QShlNsatUU8x8:
3645 case Iop_QShlNsatSS8x8:
3646 complainIfUndefined(mce, atom2, NULL);
3647 return mkPCast8x8(mce, vatom1);
3649 case Iop_QShlNsatSU16x4:
3650 case Iop_QShlNsatUU16x4:
3651 case Iop_QShlNsatSS16x4:
3652 complainIfUndefined(mce, atom2, NULL);
3653 return mkPCast16x4(mce, vatom1);
3655 case Iop_QShlNsatSU32x2:
3656 case Iop_QShlNsatUU32x2:
3657 case Iop_QShlNsatSS32x2:
3658 complainIfUndefined(mce, atom2, NULL);
3659 return mkPCast32x2(mce, vatom1);
3661 case Iop_QShlNsatSU64x1:
3662 case Iop_QShlNsatUU64x1:
3663 case Iop_QShlNsatSS64x1:
3664 complainIfUndefined(mce, atom2, NULL);
3665 return mkPCast32x2(mce, vatom1);
3667 case Iop_PwMax32Sx2:
3668 case Iop_PwMax32Ux2:
3669 case Iop_PwMin32Sx2:
3670 case Iop_PwMin32Ux2:
3671 case Iop_PwMax32Fx2:
3672 case Iop_PwMin32Fx2:
3673 return assignNew('V', mce, Ity_I64,
3674 binop(Iop_PwMax32Ux2,
3675 mkPCast32x2(mce, vatom1),
3676 mkPCast32x2(mce, vatom2)));
3678 case Iop_PwMax16Sx4:
3679 case Iop_PwMax16Ux4:
3680 case Iop_PwMin16Sx4:
3681 case Iop_PwMin16Ux4:
3682 return assignNew('V', mce, Ity_I64,
3683 binop(Iop_PwMax16Ux4,
3684 mkPCast16x4(mce, vatom1),
3685 mkPCast16x4(mce, vatom2)));
3687 case Iop_PwMax8Sx8:
3688 case Iop_PwMax8Ux8:
3689 case Iop_PwMin8Sx8:
3690 case Iop_PwMin8Ux8:
3691 return assignNew('V', mce, Ity_I64,
3692 binop(Iop_PwMax8Ux8,
3693 mkPCast8x8(mce, vatom1),
3694 mkPCast8x8(mce, vatom2)));
3696 case Iop_PwAdd32x2:
3697 case Iop_PwAdd32Fx2:
3698 return mkPCast32x2(mce,
3699 assignNew('V', mce, Ity_I64,
3700 binop(Iop_PwAdd32x2,
3701 mkPCast32x2(mce, vatom1),
3702 mkPCast32x2(mce, vatom2))));
3704 case Iop_PwAdd16x4:
3705 return mkPCast16x4(mce,
3706 assignNew('V', mce, Ity_I64,
3707 binop(op, mkPCast16x4(mce, vatom1),
3708 mkPCast16x4(mce, vatom2))));
3710 case Iop_PwAdd8x8:
3711 return mkPCast8x8(mce,
3712 assignNew('V', mce, Ity_I64,
3713 binop(op, mkPCast8x8(mce, vatom1),
3714 mkPCast8x8(mce, vatom2))));
3716 case Iop_Shl8x8:
3717 case Iop_Shr8x8:
3718 case Iop_Sar8x8:
3719 case Iop_Sal8x8:
3720 return mkUifU64(mce,
3721 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3722 mkPCast8x8(mce,vatom2)
3725 case Iop_Shl16x4:
3726 case Iop_Shr16x4:
3727 case Iop_Sar16x4:
3728 case Iop_Sal16x4:
3729 return mkUifU64(mce,
3730 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3731 mkPCast16x4(mce,vatom2)
3734 case Iop_Shl32x2:
3735 case Iop_Shr32x2:
3736 case Iop_Sar32x2:
3737 case Iop_Sal32x2:
3738 return mkUifU64(mce,
3739 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3740 mkPCast32x2(mce,vatom2)
3743 /* 64-bit data-steering */
3744 case Iop_InterleaveLO32x2:
3745 case Iop_InterleaveLO16x4:
3746 case Iop_InterleaveLO8x8:
3747 case Iop_InterleaveHI32x2:
3748 case Iop_InterleaveHI16x4:
3749 case Iop_InterleaveHI8x8:
3750 case Iop_CatOddLanes8x8:
3751 case Iop_CatEvenLanes8x8:
3752 case Iop_CatOddLanes16x4:
3753 case Iop_CatEvenLanes16x4:
3754 case Iop_InterleaveOddLanes8x8:
3755 case Iop_InterleaveEvenLanes8x8:
3756 case Iop_InterleaveOddLanes16x4:
3757 case Iop_InterleaveEvenLanes16x4:
3758 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
3760 case Iop_GetElem8x8:
3761 complainIfUndefined(mce, atom2, NULL);
3762 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
3763 case Iop_GetElem16x4:
3764 complainIfUndefined(mce, atom2, NULL);
3765 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
3766 case Iop_GetElem32x2:
3767 complainIfUndefined(mce, atom2, NULL);
3768 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
3770 /* Perm8x8: rearrange values in left arg using steering values from
3771 right arg. So rearrange the vbits in the same way but pessimise wrt
3772 steering values. We assume that unused bits in the steering value
3773       are defined zeros, so we can safely PCast within each lane of the
3774 steering value without having to take precautions to avoid a
3775 dependency on those unused bits.
3777 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3778 each lane, if bit 7 of the steering value is zero, then we'll steer
3779 the shadow value exactly as per Perm8x8. If that bit is one, then
3780 the operation will set the resulting (concrete) value to zero. That
3781 means it is defined, and should have a shadow value of zero. Hence
3782 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3783 as Perm8x8) and then pessimise against the steering values. */
3784 case Iop_Perm8x8:
3785 case Iop_PermOrZero8x8:
3786 return mkUifU64(
3787 mce,
3788 assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
3789 mkPCast8x8(mce, vatom2)
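      /* Editorial sketch (illustration only): a scalar model of the shadow
         value built just above for Iop_Perm8x8, with 1 V-bits meaning
         "undefined".  For each output lane i in 0..7:

            steer = getLane8(atom2, i) & 7;              // concrete steering
            vres  = getLane8(vatom1, steer)              // self-shadowed perm
                    | (getLane8(vatom2, i) ? 0xFF : 0);  // PCast8x8 of steering shadow

         getLane8 is a hypothetical accessor used only for this sketch; the
         real computation is the mkUifU64/mkPCast8x8 combination above. */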
3792 /* V128-bit SIMD */
3794 case Iop_I32StoF32x4:
3795 case Iop_F32toI32Sx4:
3796 case Iop_Sqrt16Fx8:
3797 return unary16Fx8_w_rm(mce, vatom1, vatom2);
3798 case Iop_Sqrt32Fx4:
3799 return unary32Fx4_w_rm(mce, vatom1, vatom2);
3800 case Iop_Sqrt64Fx2:
3801 return unary64Fx2_w_rm(mce, vatom1, vatom2);
3803 case Iop_ShrN8x16:
3804 case Iop_ShrN16x8:
3805 case Iop_ShrN32x4:
3806 case Iop_ShrN64x2:
3807 case Iop_SarN8x16:
3808 case Iop_SarN16x8:
3809 case Iop_SarN32x4:
3810 case Iop_SarN64x2:
3811 case Iop_ShlN8x16:
3812 case Iop_ShlN16x8:
3813 case Iop_ShlN32x4:
3814 case Iop_ShlN64x2:
3815 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3816 this is wrong now, scalar shifts are done properly lazily.
3817 Vector shifts should be fixed too. */
3818 complainIfUndefined(mce, atom2, NULL);
3819 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
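      /* Editorial sketch of the lazier treatment alluded to in the note
         above, modelled on the scalarShift helper used for the integer
         shifts later in this function: rather than complaining eagerly
         about an undefined shift amount, shift the V bits by the concrete
         amount and then fold the shift amount's undefinedness into every
         bit of the result, i.e. something like

            return mkUifUV128(
                      mce,
                      assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
                      mkPCastTo(mce, Ity_V128, vatom2));

         This only sketches the idea; it is not a change made here. */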
3821 /* V x V shifts/rotates are done using the standard lazy scheme. */
3822 /* For the non-rounding variants of bi-di vector x vector
3823 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3824 But note that this is overly pessimistic, because in fact only
3825 the bottom 8 bits of each lane of the second argument are taken
3826 into account when shifting. So really we ought to ignore
3827 undefinedness in bits 8 and above of each lane in the
3828 second argument. */
3829 case Iop_Shl8x16:
3830 case Iop_Shr8x16:
3831 case Iop_Sar8x16:
3832 case Iop_Sal8x16:
3833 case Iop_Rol8x16:
3834 case Iop_Sh8Sx16:
3835 case Iop_Sh8Ux16:
3836 return mkUifUV128(mce,
3837 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3838 mkPCast8x16(mce,vatom2)
3841 case Iop_Shl16x8:
3842 case Iop_Shr16x8:
3843 case Iop_Sar16x8:
3844 case Iop_Sal16x8:
3845 case Iop_Rol16x8:
3846 case Iop_Sh16Sx8:
3847 case Iop_Sh16Ux8:
3848 return mkUifUV128(mce,
3849 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3850 mkPCast16x8(mce,vatom2)
3853 case Iop_Shl32x4:
3854 case Iop_Shr32x4:
3855 case Iop_Sar32x4:
3856 case Iop_Sal32x4:
3857 case Iop_Rol32x4:
3858 case Iop_Sh32Sx4:
3859 case Iop_Sh32Ux4:
3860 return mkUifUV128(mce,
3861 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3862 mkPCast32x4(mce,vatom2)
3865 case Iop_Shl64x2:
3866 case Iop_Shr64x2:
3867 case Iop_Sar64x2:
3868 case Iop_Sal64x2:
3869 case Iop_Rol64x2:
3870 case Iop_Sh64Sx2:
3871 case Iop_Sh64Ux2:
3872 return mkUifUV128(mce,
3873 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
3874 mkPCast64x2(mce,vatom2)
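      /* Editorial sketch of the refinement suggested above (not applied):
         per lane, only the low 8 bits of the shift vector matter, so a
         tighter scalar model for, say, the 64x2 variants would be

            vres = getLane64(shiftedV, i)                          // op on vatom1
                   | ((getLane64(vatom2, i) & 0xFF) ? ~0ULL : 0);  // low byte only

         where getLane64 and shiftedV are illustrative names.  The code
         above instead pessimises against the whole lane of vatom2. */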
3877 /* For the rounding variants of bi-di vector x vector shifts, the
3878 rounding adjustment can cause undefinedness to propagate through
3879 the entire lane, in the worst case. Too complex to handle
3880 properly .. just UifU the arguments and then PCast them.
3881 Suboptimal but safe. */
3882 case Iop_Rsh8Sx16:
3883 case Iop_Rsh8Ux16:
3884 return binary8Ix16(mce, vatom1, vatom2);
3885 case Iop_Rsh16Sx8:
3886 case Iop_Rsh16Ux8:
3887 return binary16Ix8(mce, vatom1, vatom2);
3888 case Iop_Rsh32Sx4:
3889 case Iop_Rsh32Ux4:
3890 return binary32Ix4(mce, vatom1, vatom2);
3891 case Iop_Rsh64Sx2:
3892 case Iop_Rsh64Ux2:
3893 return binary64Ix2(mce, vatom1, vatom2);
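      /* Editorial note: the binaryNIxM helpers used just above all follow
         the same "UifU then per-lane PCast" shape; binary64Ix2, for
         instance, is conceptually

            IRAtom* at = mkUifUV128(mce, vatom1, vatom2);  // merge undefinedness
            return mkPCast64x2(mce, at);                   // smear across each lane

         (a reminder of the scheme only, not a redefinition of the helper). */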
3895 case Iop_F32ToFixed32Ux4_RZ:
3896 case Iop_F32ToFixed32Sx4_RZ:
3897 case Iop_Fixed32UToF32x4_RN:
3898 case Iop_Fixed32SToF32x4_RN:
3899 complainIfUndefined(mce, atom2, NULL);
3900 return mkPCast32x4(mce, vatom1);
3902 case Iop_F32ToFixed32Ux2_RZ:
3903 case Iop_F32ToFixed32Sx2_RZ:
3904 case Iop_Fixed32UToF32x2_RN:
3905 case Iop_Fixed32SToF32x2_RN:
3906 complainIfUndefined(mce, atom2, NULL);
3907 return mkPCast32x2(mce, vatom1);
3909 case Iop_QSub8Ux16:
3910 case Iop_QSub8Sx16:
3911 case Iop_Sub8x16:
3912 case Iop_Min8Ux16:
3913 case Iop_Min8Sx16:
3914 case Iop_Max8Ux16:
3915 case Iop_Max8Sx16:
3916 case Iop_CmpGT8Sx16:
3917 case Iop_CmpGT8Ux16:
3918 case Iop_CmpEQ8x16:
3919 case Iop_Avg8Ux16:
3920 case Iop_Avg8Sx16:
3921 case Iop_QAdd8Ux16:
3922 case Iop_QAdd8Sx16:
3923 case Iop_QAddExtUSsatSS8x16:
3924 case Iop_QAddExtSUsatUU8x16:
3925 case Iop_QSal8x16:
3926 case Iop_QShl8x16:
3927 case Iop_Add8x16:
3928 case Iop_Mul8x16:
3929 case Iop_MulHi8Sx16:
3930 case Iop_MulHi8Ux16:
3931 case Iop_PolynomialMul8x16:
3932 case Iop_PolynomialMulAdd8x16:
3933 return binary8Ix16(mce, vatom1, vatom2);
3935 case Iop_QSub16Ux8:
3936 case Iop_QSub16Sx8:
3937 case Iop_Sub16x8:
3938 case Iop_Mul16x8:
3939 case Iop_MulHi16Sx8:
3940 case Iop_MulHi16Ux8:
3941 case Iop_Min16Sx8:
3942 case Iop_Min16Ux8:
3943 case Iop_Max16Sx8:
3944 case Iop_Max16Ux8:
3945 case Iop_CmpGT16Sx8:
3946 case Iop_CmpGT16Ux8:
3947 case Iop_CmpEQ16x8:
3948 case Iop_Avg16Ux8:
3949 case Iop_Avg16Sx8:
3950 case Iop_QAdd16Ux8:
3951 case Iop_QAdd16Sx8:
3952 case Iop_QAddExtUSsatSS16x8:
3953 case Iop_QAddExtSUsatUU16x8:
3954 case Iop_QSal16x8:
3955 case Iop_QShl16x8:
3956 case Iop_Add16x8:
3957 case Iop_QDMulHi16Sx8:
3958 case Iop_QRDMulHi16Sx8:
3959 case Iop_PolynomialMulAdd16x8:
3960 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
3961 16-bit chunk of the output is formed from corresponding 16-bit chunks
3962          of the input args, so we can treat it like any other binary 16x8
3963 operation. That's despite it having '8x16' in its name. */
3964 case Iop_PwExtUSMulQAdd8x16:
3965 return binary16Ix8(mce, vatom1, vatom2);
3967 case Iop_Sub32x4:
3968 case Iop_CmpGT32Sx4:
3969 case Iop_CmpGT32Ux4:
3970 case Iop_CmpEQ32x4:
3971 case Iop_QAdd32Sx4:
3972 case Iop_QAdd32Ux4:
3973 case Iop_QSub32Sx4:
3974 case Iop_QSub32Ux4:
3975 case Iop_QAddExtUSsatSS32x4:
3976 case Iop_QAddExtSUsatUU32x4:
3977 case Iop_QSal32x4:
3978 case Iop_QShl32x4:
3979 case Iop_Avg32Ux4:
3980 case Iop_Avg32Sx4:
3981 case Iop_Add32x4:
3982 case Iop_Max32Ux4:
3983 case Iop_Max32Sx4:
3984 case Iop_Min32Ux4:
3985 case Iop_Min32Sx4:
3986 case Iop_Mul32x4:
3987 case Iop_MulHi32Sx4:
3988 case Iop_MulHi32Ux4:
3989 case Iop_QDMulHi32Sx4:
3990 case Iop_QRDMulHi32Sx4:
3991 case Iop_PolynomialMulAdd32x4:
3992 return binary32Ix4(mce, vatom1, vatom2);
3994 case Iop_Sub64x2:
3995 case Iop_Add64x2:
3996 case Iop_Avg64Ux2:
3997 case Iop_Avg64Sx2:
3998 case Iop_Max64Sx2:
3999 case Iop_Max64Ux2:
4000 case Iop_Min64Sx2:
4001 case Iop_Min64Ux2:
4002 case Iop_CmpEQ64x2:
4003 case Iop_CmpGT64Sx2:
4004 case Iop_CmpGT64Ux2:
4005 case Iop_QSal64x2:
4006 case Iop_QShl64x2:
4007 case Iop_QAdd64Ux2:
4008 case Iop_QAdd64Sx2:
4009 case Iop_QSub64Ux2:
4010 case Iop_QSub64Sx2:
4011 case Iop_QAddExtUSsatSS64x2:
4012 case Iop_QAddExtSUsatUU64x2:
4013 case Iop_PolynomialMulAdd64x2:
4014 case Iop_CipherV128:
4015 case Iop_CipherLV128:
4016 case Iop_NCipherV128:
4017 case Iop_NCipherLV128:
4018 case Iop_MulI128by10E:
4019 case Iop_MulI128by10ECarry:
4020 return binary64Ix2(mce, vatom1, vatom2);
4022 case Iop_Add128x1:
4023 case Iop_Sub128x1:
4024 case Iop_CmpNEZ128x1:
4025 return binary128Ix1(mce, vatom1, vatom2);
4027 case Iop_DivU128:
4028 case Iop_DivS128:
4029 case Iop_DivU128E:
4030 case Iop_DivS128E:
4031 case Iop_ModU128:
4032 case Iop_ModS128:
4033 /* I128 x I128 -> I128 */
4034 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4036 case Iop_QNarrowBin64Sto32Sx4:
4037 case Iop_QNarrowBin64Uto32Ux4:
4038 case Iop_QNarrowBin32Sto16Sx8:
4039 case Iop_QNarrowBin32Uto16Ux8:
4040 case Iop_QNarrowBin32Sto16Ux8:
4041 case Iop_QNarrowBin16Sto8Sx16:
4042 case Iop_QNarrowBin16Uto8Ux16:
4043 case Iop_QNarrowBin16Sto8Ux16:
4044 return vectorNarrowBinV128(mce, op, vatom1, vatom2);
4046 case Iop_Min64Fx2:
4047 case Iop_Max64Fx2:
4048 case Iop_CmpLT64Fx2:
4049 case Iop_CmpLE64Fx2:
4050 case Iop_CmpEQ64Fx2:
4051 case Iop_CmpUN64Fx2:
4052 case Iop_RecipStep64Fx2:
4053 case Iop_RSqrtStep64Fx2:
4054 return binary64Fx2(mce, vatom1, vatom2);
4056 case Iop_CmpLT16Fx8:
4057 case Iop_CmpLE16Fx8:
4058 case Iop_CmpEQ16Fx8:
4059 return binary16Fx8(mce, vatom1, vatom2);
4061 case Iop_Sub64F0x2:
4062 case Iop_Mul64F0x2:
4063 case Iop_Min64F0x2:
4064 case Iop_Max64F0x2:
4065 case Iop_Div64F0x2:
4066 case Iop_CmpLT64F0x2:
4067 case Iop_CmpLE64F0x2:
4068 case Iop_CmpEQ64F0x2:
4069 case Iop_CmpUN64F0x2:
4070 case Iop_Add64F0x2:
4071 return binary64F0x2(mce, vatom1, vatom2);
4073 case Iop_Min32Fx4:
4074 case Iop_Max32Fx4:
4075 case Iop_CmpLT32Fx4:
4076 case Iop_CmpLE32Fx4:
4077 case Iop_CmpEQ32Fx4:
4078 case Iop_CmpUN32Fx4:
4079 case Iop_CmpGT32Fx4:
4080 case Iop_CmpGE32Fx4:
4081 case Iop_RecipStep32Fx4:
4082 case Iop_RSqrtStep32Fx4:
4083 return binary32Fx4(mce, vatom1, vatom2);
4085 case Iop_Sub32Fx2:
4086 case Iop_Mul32Fx2:
4087 case Iop_Min32Fx2:
4088 case Iop_Max32Fx2:
4089 case Iop_CmpEQ32Fx2:
4090 case Iop_CmpGT32Fx2:
4091 case Iop_CmpGE32Fx2:
4092 case Iop_Add32Fx2:
4093 case Iop_RecipStep32Fx2:
4094 case Iop_RSqrtStep32Fx2:
4095 return binary32Fx2(mce, vatom1, vatom2);
4097 case Iop_Sub32F0x4:
4098 case Iop_Mul32F0x4:
4099 case Iop_Min32F0x4:
4100 case Iop_Max32F0x4:
4101 case Iop_Div32F0x4:
4102 case Iop_CmpLT32F0x4:
4103 case Iop_CmpLE32F0x4:
4104 case Iop_CmpEQ32F0x4:
4105 case Iop_CmpUN32F0x4:
4106 case Iop_Add32F0x4:
4107 return binary32F0x4(mce, vatom1, vatom2);
4109 case Iop_QShlNsatSU8x16:
4110 case Iop_QShlNsatUU8x16:
4111 case Iop_QShlNsatSS8x16:
4112 complainIfUndefined(mce, atom2, NULL);
4113 return mkPCast8x16(mce, vatom1);
4115 case Iop_QShlNsatSU16x8:
4116 case Iop_QShlNsatUU16x8:
4117 case Iop_QShlNsatSS16x8:
4118 complainIfUndefined(mce, atom2, NULL);
4119 return mkPCast16x8(mce, vatom1);
4121 case Iop_QShlNsatSU32x4:
4122 case Iop_QShlNsatUU32x4:
4123 case Iop_QShlNsatSS32x4:
4124 complainIfUndefined(mce, atom2, NULL);
4125 return mkPCast32x4(mce, vatom1);
4127 case Iop_QShlNsatSU64x2:
4128 case Iop_QShlNsatUU64x2:
4129 case Iop_QShlNsatSS64x2:
4130 complainIfUndefined(mce, atom2, NULL);
4131 return mkPCast32x4(mce, vatom1);
4133 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4134 To make this simpler, do the following:
4135 * complain if the shift amount (the I8) is undefined
4136 * pcast each lane at the wide width
4137 * truncate each lane to half width
4138 * pcast the resulting 64-bit value to a single bit and use
4139 that as the least significant bit of the upper half of the
4140 result. */
4141 case Iop_QandQShrNnarrow64Uto32Ux2:
4142 case Iop_QandQSarNnarrow64Sto32Sx2:
4143 case Iop_QandQSarNnarrow64Sto32Ux2:
4144 case Iop_QandQRShrNnarrow64Uto32Ux2:
4145 case Iop_QandQRSarNnarrow64Sto32Sx2:
4146 case Iop_QandQRSarNnarrow64Sto32Ux2:
4147 case Iop_QandQShrNnarrow32Uto16Ux4:
4148 case Iop_QandQSarNnarrow32Sto16Sx4:
4149 case Iop_QandQSarNnarrow32Sto16Ux4:
4150 case Iop_QandQRShrNnarrow32Uto16Ux4:
4151 case Iop_QandQRSarNnarrow32Sto16Sx4:
4152 case Iop_QandQRSarNnarrow32Sto16Ux4:
4153 case Iop_QandQShrNnarrow16Uto8Ux8:
4154 case Iop_QandQSarNnarrow16Sto8Sx8:
4155 case Iop_QandQSarNnarrow16Sto8Ux8:
4156 case Iop_QandQRShrNnarrow16Uto8Ux8:
4157 case Iop_QandQRSarNnarrow16Sto8Sx8:
4158 case Iop_QandQRSarNnarrow16Sto8Ux8:
4160 IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
4161 IROp opNarrow = Iop_INVALID;
4162 switch (op) {
4163 case Iop_QandQShrNnarrow64Uto32Ux2:
4164 case Iop_QandQSarNnarrow64Sto32Sx2:
4165 case Iop_QandQSarNnarrow64Sto32Ux2:
4166 case Iop_QandQRShrNnarrow64Uto32Ux2:
4167 case Iop_QandQRSarNnarrow64Sto32Sx2:
4168 case Iop_QandQRSarNnarrow64Sto32Ux2:
4169 fnPessim = mkPCast64x2;
4170 opNarrow = Iop_NarrowUn64to32x2;
4171 break;
4172 case Iop_QandQShrNnarrow32Uto16Ux4:
4173 case Iop_QandQSarNnarrow32Sto16Sx4:
4174 case Iop_QandQSarNnarrow32Sto16Ux4:
4175 case Iop_QandQRShrNnarrow32Uto16Ux4:
4176 case Iop_QandQRSarNnarrow32Sto16Sx4:
4177 case Iop_QandQRSarNnarrow32Sto16Ux4:
4178 fnPessim = mkPCast32x4;
4179 opNarrow = Iop_NarrowUn32to16x4;
4180 break;
4181 case Iop_QandQShrNnarrow16Uto8Ux8:
4182 case Iop_QandQSarNnarrow16Sto8Sx8:
4183 case Iop_QandQSarNnarrow16Sto8Ux8:
4184 case Iop_QandQRShrNnarrow16Uto8Ux8:
4185 case Iop_QandQRSarNnarrow16Sto8Sx8:
4186 case Iop_QandQRSarNnarrow16Sto8Ux8:
4187 fnPessim = mkPCast16x8;
4188 opNarrow = Iop_NarrowUn16to8x8;
4189 break;
4190 default:
4191 tl_assert(0);
4193 complainIfUndefined(mce, atom2, NULL);
4194 // Pessimised shift result
4195 IRAtom* shV
4196 = fnPessim(mce, vatom1);
4197 // Narrowed, pessimised shift result
4198 IRAtom* shVnarrowed
4199 = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
4200 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4201 IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
4202 // and assemble the result
4203 return assignNew('V', mce, Ity_V128,
4204 binop(Iop_64HLtoV128, qV, shVnarrowed));
4207 case Iop_Mull32Sx2:
4208 case Iop_Mull32Ux2:
4209 case Iop_QDMull32Sx2:
4210 return vectorWidenI64(mce, Iop_Widen32Sto64x2,
4211 mkUifU64(mce, vatom1, vatom2));
4213 case Iop_Mull16Sx4:
4214 case Iop_Mull16Ux4:
4215 case Iop_QDMull16Sx4:
4216 return vectorWidenI64(mce, Iop_Widen16Sto32x4,
4217 mkUifU64(mce, vatom1, vatom2));
4219 case Iop_Mull8Sx8:
4220 case Iop_Mull8Ux8:
4221 case Iop_PolynomialMull8x8:
4222 return vectorWidenI64(mce, Iop_Widen8Sto16x8,
4223 mkUifU64(mce, vatom1, vatom2));
4225 case Iop_PwAdd32x4:
4226 return mkPCast32x4(mce,
4227 assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
4228 mkPCast32x4(mce, vatom2))));
4230 case Iop_PwAdd16x8:
4231 return mkPCast16x8(mce,
4232 assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
4233 mkPCast16x8(mce, vatom2))));
4235 case Iop_PwAdd8x16:
4236 return mkPCast8x16(mce,
4237 assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
4238 mkPCast8x16(mce, vatom2))));
4240 /* V128-bit data-steering */
4241 case Iop_SetV128lo32:
4242 case Iop_SetV128lo64:
4243 case Iop_64HLtoV128:
4244 case Iop_InterleaveLO64x2:
4245 case Iop_InterleaveLO32x4:
4246 case Iop_InterleaveLO16x8:
4247 case Iop_InterleaveLO8x16:
4248 case Iop_InterleaveHI64x2:
4249 case Iop_InterleaveHI32x4:
4250 case Iop_InterleaveHI16x8:
4251 case Iop_InterleaveHI8x16:
4252 case Iop_CatOddLanes8x16:
4253 case Iop_CatOddLanes16x8:
4254 case Iop_CatOddLanes32x4:
4255 case Iop_CatEvenLanes8x16:
4256 case Iop_CatEvenLanes16x8:
4257 case Iop_CatEvenLanes32x4:
4258 case Iop_InterleaveOddLanes8x16:
4259 case Iop_InterleaveOddLanes16x8:
4260 case Iop_InterleaveOddLanes32x4:
4261 case Iop_InterleaveEvenLanes8x16:
4262 case Iop_InterleaveEvenLanes16x8:
4263 case Iop_InterleaveEvenLanes32x4:
4264 case Iop_PackOddLanes8x16:
4265 case Iop_PackOddLanes16x8:
4266 case Iop_PackOddLanes32x4:
4267 case Iop_PackEvenLanes8x16:
4268 case Iop_PackEvenLanes16x8:
4269 case Iop_PackEvenLanes32x4:
4270 return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
4272 case Iop_GetElem8x16:
4273 complainIfUndefined(mce, atom2, NULL);
4274 return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
4275 case Iop_GetElem16x8:
4276 complainIfUndefined(mce, atom2, NULL);
4277 return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
4278 case Iop_GetElem32x4:
4279 complainIfUndefined(mce, atom2, NULL);
4280 return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
4281 case Iop_GetElem64x2:
4282 complainIfUndefined(mce, atom2, NULL);
4283 return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
4285 /* Perm8x16: rearrange values in left arg using steering values
4286 from right arg. So rearrange the vbits in the same way but
4287 pessimise wrt steering values. Perm32x4 ditto. */
4288 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4289 case Iop_Perm8x16:
4290 case Iop_PermOrZero8x16:
4291 return mkUifUV128(
4292 mce,
4293 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4294 mkPCast8x16(mce, vatom2)
4296 case Iop_Perm32x4:
4297 return mkUifUV128(
4298 mce,
4299 assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
4300 mkPCast32x4(mce, vatom2)
4303 /* These two take the lower half of each 16-bit lane, sign/zero
4304 extend it to 32, and multiply together, producing a 32x4
4305 result (and implicitly ignoring half the operand bits). So
4306 treat it as a bunch of independent 16x8 operations, but then
4307 do 32-bit shifts left-right to copy the lower half results
4308 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4309 into the upper half of each result lane. */
4310 case Iop_MullEven16Ux8:
4311 case Iop_MullEven16Sx8: {
4312 IRAtom* at;
4313 at = binary16Ix8(mce,vatom1,vatom2);
4314 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
4315 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
4316 return at;
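      /* Editorial worked example for the Shl/Sar trick above: suppose, for
         one 32-bit result lane, binary16Ix8 yields 0xFFFF (undefined) in the
         low 16-bit half and 0x0000 in the high half.  ShlN32x4 by 16 gives
         0xFFFF0000, and the arithmetic SarN32x4 by 16 then smears the sign
         bit back down, giving 0xFFFFFFFF: the whole lane is marked undefined
         and the ignored high-half shadow is discarded, as intended. */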
4319 /* Same deal as Iop_MullEven16{S,U}x8 */
4320 case Iop_MullEven8Ux16:
4321 case Iop_MullEven8Sx16: {
4322 IRAtom* at;
4323 at = binary8Ix16(mce,vatom1,vatom2);
4324 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
4325 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
4326 return at;
4329 /* Same deal as Iop_MullEven16{S,U}x8 */
4330 case Iop_MullEven32Ux4:
4331 case Iop_MullEven32Sx4: {
4332 IRAtom* at;
4333 at = binary32Ix4(mce,vatom1,vatom2);
4334 at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
4335 at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
4336 return at;
4339 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4340 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4341       Simply apply the same op to the V bits, since this is really no
4342       more than a data steering operation. */
4343 case Iop_NarrowBin32to16x8:
4344 case Iop_NarrowBin16to8x16:
4345 case Iop_NarrowBin64to32x4:
4346 return assignNew('V', mce, Ity_V128,
4347 binop(op, vatom1, vatom2));
4349 case Iop_ShrV128:
4350 case Iop_SarV128:
4351 case Iop_ShlV128:
4352 case Iop_I128StoBCD128:
4353 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4354 this is wrong now, scalar shifts are done properly lazily.
4355 Vector shifts should be fixed too. */
4356 complainIfUndefined(mce, atom2, NULL);
4357 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4359 case Iop_I128UtoF128: /* I128 -> F128 */
4360 case Iop_I128StoF128: /* I128 -> F128 */
4361 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4363 case Iop_BCDAdd:
4364 case Iop_BCDSub:
4365 return mkLazy2(mce, Ity_V128, vatom1, vatom2);
4367 /* SHA Iops */
4368 case Iop_SHA256:
4369 case Iop_SHA512:
4370 complainIfUndefined(mce, atom2, NULL);
4371 return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
4373 /* I128-bit data-steering */
4374 case Iop_64HLto128:
4375 return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
4377 /* V256-bit SIMD */
4379 case Iop_Max64Fx4:
4380 case Iop_Min64Fx4:
4381 return binary64Fx4(mce, vatom1, vatom2);
4383 case Iop_Max32Fx8:
4384 case Iop_Min32Fx8:
4385 return binary32Fx8(mce, vatom1, vatom2);
4387 /* V256-bit data-steering */
4388 case Iop_V128HLtoV256:
4389 return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
4391 /* Scalar floating point */
4393 case Iop_F32toI64S:
4394 case Iop_F32toI64U:
4395 /* I32(rm) x F32 -> I64 */
4396 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4398 case Iop_I64StoF32:
4399 /* I32(rm) x I64 -> F32 */
4400 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4402 case Iop_RoundF64toInt:
4403 case Iop_RoundF64toF32:
4404 case Iop_F64toI64S:
4405 case Iop_F64toI64U:
4406 case Iop_I64StoF64:
4407 case Iop_I64UtoF64:
4408 case Iop_SinF64:
4409 case Iop_CosF64:
4410 case Iop_TanF64:
4411 case Iop_2xm1F64:
4412 case Iop_SqrtF64:
4413 case Iop_RecpExpF64:
4414 /* I32(rm) x I64/F64 -> I64/F64 */
4415 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4417 case Iop_ShlD64:
4418 case Iop_ShrD64:
4419 case Iop_RoundD64toInt:
4420 /* I32(rm) x D64 -> D64 */
4421 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4423 case Iop_ShlD128:
4424 case Iop_ShrD128:
4425 case Iop_RoundD128toInt:
4426 /* I32(rm) x D128 -> D128 */
4427 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4429 case Iop_RoundF128toInt:
4430 /* I32(rm) x F128 -> F128 */
4431 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4433 case Iop_D64toI64S:
4434 case Iop_D64toI64U:
4435 case Iop_I64StoD64:
4436 case Iop_I64UtoD64:
4437 /* I32(rm) x I64/D64 -> D64/I64 */
4438 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4440 case Iop_F32toD32:
4441 case Iop_F64toD32:
4442 case Iop_F128toD32:
4443 case Iop_D32toF32:
4444 case Iop_D64toF32:
4445 case Iop_D128toF32:
4446 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4447 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4449 case Iop_F32toD64:
4450 case Iop_F64toD64:
4451 case Iop_F128toD64:
4452 case Iop_D32toF64:
4453 case Iop_D64toF64:
4454 case Iop_D128toF64:
4455 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4456 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4458 case Iop_F32toD128:
4459 case Iop_F64toD128:
4460 case Iop_F128toD128:
4461 case Iop_D32toF128:
4462 case Iop_D64toF128:
4463 case Iop_D128toF128:
4464 case Iop_I128StoD128:
4465 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4466 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4468 case Iop_SqrtF16:
4469 /* I32(rm) x F16 -> F16 */
4470 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4472 case Iop_RoundF32toInt:
4473 case Iop_SqrtF32:
4474 case Iop_RecpExpF32:
4475 /* I32(rm) x I32/F32 -> I32/F32 */
4476 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4478 case Iop_SqrtF128:
4479 /* I32(rm) x F128 -> F128 */
4480 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4482 case Iop_I32StoF32:
4483 case Iop_I32UtoF32:
4484 case Iop_F32toI32S:
4485 case Iop_F32toI32U:
4486 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4487 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4489 case Iop_F64toF16:
4490 case Iop_F32toF16:
4491 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4492 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4494 case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32 */
4495 case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4496 case Iop_F128toF32: /* IRRoundingMode(I32) x F128 -> F32 */
4497 case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32 */
4498 case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4499 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4501 case Iop_F128toI128S: /* IRRoundingMode(I32) x F128 -> signed I128 */
4502 case Iop_RndF128: /* IRRoundingMode(I32) x F128 -> F128 */
4503 case Iop_D128toI128S: /* IRRoundingMode(I32) x D128 -> signed I128 */
4504 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4506 case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64 */
4507 case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4508 case Iop_F128toF64: /* IRRoundingMode(I32) x F128 -> F64 */
4509 case Iop_D128toD64: /* IRRoundingMode(I64) x D128 -> D64 */
4510 case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64 */
4511 case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4512 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4514 case Iop_F64HLtoF128:
4515 case Iop_D64HLtoD128:
4516 return assignNew('V', mce, Ity_I128,
4517 binop(Iop_64HLto128, vatom1, vatom2));
4519 case Iop_F64toI32U:
4520 case Iop_F64toI32S:
4521 case Iop_F64toF32:
4522 case Iop_I64UtoF32:
4523 case Iop_D64toI32U:
4524 case Iop_D64toI32S:
4525 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4526 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4528 case Iop_D64toD32:
4529 /* First arg is I32 (rounding mode), second is D64 (data). */
4530 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4532 case Iop_F64toI16S:
4533 /* First arg is I32 (rounding mode), second is F64 (data). */
4534 return mkLazy2(mce, Ity_I16, vatom1, vatom2);
4536 case Iop_InsertExpD64:
4537 /* I64 x I64 -> D64 */
4538 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4540 case Iop_InsertExpD128:
4541 /* I64 x I128 -> D128 */
4542 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4544 case Iop_CmpF16:
4545 case Iop_CmpF32:
4546 case Iop_CmpF64:
4547 case Iop_CmpF128:
4548 case Iop_CmpD64:
4549 case Iop_CmpD128:
4550 case Iop_CmpExpD64:
4551 case Iop_CmpExpD128:
4552 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4554 case Iop_MaxNumF32:
4555 case Iop_MinNumF32:
4556 /* F32 x F32 -> F32 */
4557 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4559 case Iop_MaxNumF64:
4560 case Iop_MinNumF64:
4561 /* F64 x F64 -> F64 */
4562 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4564 /* non-FP after here */
4566 case Iop_DivModU64to32:
4567 case Iop_DivModS64to32:
4568 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4570 case Iop_DivModU128to64:
4571 case Iop_DivModS128to64:
4572 return mkLazy2(mce, Ity_I128, vatom1, vatom2);
4574 case Iop_8HLto16:
4575 return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
4576 case Iop_16HLto32:
4577 return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
4578 case Iop_32HLto64:
4579 return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
4581 case Iop_DivModU64to64:
4582 case Iop_DivModS64to64: {
4583 IRAtom* vTmp64 = mkLazy2(mce, Ity_I64, vatom1, vatom2);
4584 return assignNew('V', mce, Ity_I128,
4585 binop(Iop_64HLto128, vTmp64, vTmp64));
4588 case Iop_MullS64:
4589 case Iop_MullU64: {
4590 IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4591 IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
4592 return assignNew('V', mce, Ity_I128,
4593 binop(Iop_64HLto128, vHi64, vLo64));
4596 case Iop_DivModU32to32:
4597 case Iop_DivModS32to32: {
4598 IRAtom* vTmp32 = mkLazy2(mce, Ity_I32, vatom1, vatom2);
4599 return assignNew('V', mce, Ity_I64,
4600 binop(Iop_32HLto64, vTmp32, vTmp32));
4603 case Iop_MullS32:
4604 case Iop_MullU32: {
4605 IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4606 IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
4607 return assignNew('V', mce, Ity_I64,
4608 binop(Iop_32HLto64, vHi32, vLo32));
4611 case Iop_MullS16:
4612 case Iop_MullU16: {
4613 IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4614 IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
4615 return assignNew('V', mce, Ity_I32,
4616 binop(Iop_16HLto32, vHi16, vLo16));
4619 case Iop_MullS8:
4620 case Iop_MullU8: {
4621 IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4622 IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
4623 return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
4626 case Iop_Sad8Ux4: /* maybe we could do better? ftm, do mkLazy2. */
4627 case Iop_DivS32:
4628 case Iop_DivU32:
4629 case Iop_DivU32E:
4630 case Iop_DivS32E:
4631 case Iop_QAdd32S: /* could probably do better */
4632 case Iop_QSub32S: /* could probably do better */
4633 return mkLazy2(mce, Ity_I32, vatom1, vatom2);
4635 case Iop_DivS64:
4636 case Iop_DivU64:
4637 case Iop_DivS64E:
4638 case Iop_DivU64E:
4639 return mkLazy2(mce, Ity_I64, vatom1, vatom2);
4641 case Iop_Add32:
4642 if (mce->dlbo.dl_Add32 == DLexpensive
4643 || (mce->dlbo.dl_Add32 == DLauto && hu == HuOth)) {
4644 return expensiveAddSub(mce,True,Ity_I32,
4645 vatom1,vatom2, atom1,atom2);
4646 } else {
4647 goto cheap_AddSub32;
4649 case Iop_Sub32:
4650 if (mce->dlbo.dl_Sub32 == DLexpensive
4651 || (mce->dlbo.dl_Sub32 == DLauto && hu == HuOth)) {
4652 return expensiveAddSub(mce,False,Ity_I32,
4653 vatom1,vatom2, atom1,atom2);
4654 } else {
4655 goto cheap_AddSub32;
4658 cheap_AddSub32:
4659 case Iop_Mul32:
4660 return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
4662 case Iop_CmpORD32S:
4663 case Iop_CmpORD32U:
4664 case Iop_CmpORD64S:
4665 case Iop_CmpORD64U:
4666 return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
4668 case Iop_Add64:
4669 if (mce->dlbo.dl_Add64 == DLexpensive
4670 || (mce->dlbo.dl_Add64 == DLauto && hu == HuOth)) {
4671 return expensiveAddSub(mce,True,Ity_I64,
4672 vatom1,vatom2, atom1,atom2);
4673 } else {
4674 goto cheap_AddSub64;
4676 case Iop_Sub64:
4677 if (mce->dlbo.dl_Sub64 == DLexpensive
4678 || (mce->dlbo.dl_Sub64 == DLauto && hu == HuOth)) {
4679 return expensiveAddSub(mce,False,Ity_I64,
4680 vatom1,vatom2, atom1,atom2);
4681 } else {
4682 goto cheap_AddSub64;
4685 cheap_AddSub64:
4686 case Iop_Mul64:
4687 return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
4689 case Iop_Mul16:
4690 case Iop_Add16:
4691 case Iop_Sub16:
4692 return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
4694 case Iop_Mul8:
4695 case Iop_Sub8:
4696 case Iop_Add8:
4697 return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
4699 ////---- CmpXX64
4700 case Iop_CmpEQ64: case Iop_CmpNE64:
4701 if (mce->dlbo.dl_CmpEQ64_CmpNE64 == DLexpensive)
4702 goto expensive_cmp64;
4703 else
4704 goto cheap_cmp64;
4706 expensive_cmp64:
4707 case Iop_ExpCmpNE64:
4708 return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
4710 cheap_cmp64:
4711 case Iop_CmpLE64S: case Iop_CmpLE64U:
4712 case Iop_CmpLT64U: case Iop_CmpLT64S:
4713 return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
4715 ////---- CmpXX32
4716 case Iop_CmpEQ32: case Iop_CmpNE32:
4717 if (mce->dlbo.dl_CmpEQ32_CmpNE32 == DLexpensive)
4718 goto expensive_cmp32;
4719 else
4720 goto cheap_cmp32;
4722 expensive_cmp32:
4723 case Iop_ExpCmpNE32:
4724 return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
4726 cheap_cmp32:
4727 case Iop_CmpLE32S: case Iop_CmpLE32U:
4728 case Iop_CmpLT32U: case Iop_CmpLT32S:
4729 return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
4731 ////---- CmpXX16
4732 case Iop_CmpEQ16: case Iop_CmpNE16:
4733 if (mce->dlbo.dl_CmpEQ16_CmpNE16 == DLexpensive)
4734 goto expensive_cmp16;
4735 else
4736 goto cheap_cmp16;
4738 expensive_cmp16:
4739 case Iop_ExpCmpNE16:
4740 return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
4742 cheap_cmp16:
4743 return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
4745 ////---- CmpXX8
4746 case Iop_CmpEQ8: case Iop_CmpNE8:
4747 if (mce->dlbo.dl_CmpEQ8_CmpNE8 == DLexpensive)
4748 goto expensive_cmp8;
4749 else
4750 goto cheap_cmp8;
4752 expensive_cmp8:
4753 return expensiveCmpEQorNE(mce,Ity_I8, vatom1,vatom2, atom1,atom2 );
4755 cheap_cmp8:
4756 return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
4758 ////---- end CmpXX{64,32,16,8}
4760 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
4761 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
4762 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
4763 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
4764 /* Just say these all produce a defined result, regardless
4765 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4766 return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
4768 case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
4769 return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
4771 case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
4772 return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
4774 case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
4775 return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
4777 case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
4778 return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
4780 case Iop_AndV256:
4781 uifu = mkUifUV256; difd = mkDifDV256;
4782 and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
4783 case Iop_AndV128:
4784 uifu = mkUifUV128; difd = mkDifDV128;
4785 and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
4786 case Iop_And64:
4787 uifu = mkUifU64; difd = mkDifD64;
4788 and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
4789 case Iop_And32:
4790 uifu = mkUifU32; difd = mkDifD32;
4791 and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
4792 case Iop_And16:
4793 uifu = mkUifU16; difd = mkDifD16;
4794 and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
4795 case Iop_And8:
4796 uifu = mkUifU8; difd = mkDifD8;
4797 and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
4798 case Iop_And1:
4799 uifu = mkUifU1; difd = mkDifD1;
4800 and_or_ty = Ity_I1; improve = mkImproveAND1; goto do_And_Or;
4802 case Iop_OrV256:
4803 uifu = mkUifUV256; difd = mkDifDV256;
4804 and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
4805 case Iop_OrV128:
4806 uifu = mkUifUV128; difd = mkDifDV128;
4807 and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
4808 case Iop_Or64:
4809 uifu = mkUifU64; difd = mkDifD64;
4810 and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
4811 case Iop_Or32:
4812 uifu = mkUifU32; difd = mkDifD32;
4813 and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
4814 case Iop_Or16:
4815 uifu = mkUifU16; difd = mkDifD16;
4816 and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
4817 case Iop_Or8:
4818 uifu = mkUifU8; difd = mkDifD8;
4819 and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
4820 case Iop_Or1:
4821 uifu = mkUifU1; difd = mkDifD1;
4822 and_or_ty = Ity_I1; improve = mkImproveOR1; goto do_And_Or;
4824 do_And_Or:
4825 return
4826 assignNew(
4827 'V', mce,
4828 and_or_ty,
4829 difd(mce, uifu(mce, vatom1, vatom2),
4830 difd(mce, improve(mce, atom1, vatom1),
4831 improve(mce, atom2, vatom2) ) ) );
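      /* Editorial sketch: a bit-level restatement of the And/Or scheme just
         above, with 1 V-bits meaning "undefined".  For the And ops:

            vres = (v1 | v2)      // naive: undefined if either input is
                   & (a1 | v1)    // ...unless arg1's bit is a *defined* 0
                   & (a2 | v2);   // ...unless arg2's bit is a *defined* 0

         For the Or ops the two "improvement" terms become (~a1 | v1) and
         (~a2 | v2): a defined 1 in either operand forces that result bit to
         be defined.  Here v1/v2 and a1/a2 are shorthand for the shadow and
         original atoms; this merely restates the uifu/difd/improve
         expression built above. */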
4837 case Iop_Xor8:
4838 return mkUifU8(mce, vatom1, vatom2);
4839 case Iop_Xor16:
4840 return mkUifU16(mce, vatom1, vatom2);
4841 case Iop_Xor32:
4842 return mkUifU32(mce, vatom1, vatom2);
4843 case Iop_Xor64:
4844 return mkUifU64(mce, vatom1, vatom2);
4845 case Iop_XorV128:
4846 return mkUifUV128(mce, vatom1, vatom2);
4847 case Iop_XorV256:
4848 return mkUifUV256(mce, vatom1, vatom2);
4850 /* V256-bit SIMD */
4852 case Iop_ShrN16x16:
4853 case Iop_ShrN32x8:
4854 case Iop_ShrN64x4:
4855 case Iop_SarN16x16:
4856 case Iop_SarN32x8:
4857 case Iop_ShlN16x16:
4858 case Iop_ShlN32x8:
4859 case Iop_ShlN64x4:
4860 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4861 this is wrong now, scalar shifts are done properly lazily.
4862 Vector shifts should be fixed too. */
4863 complainIfUndefined(mce, atom2, NULL);
4864 return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
4866 case Iop_QSub8Ux32:
4867 case Iop_QSub8Sx32:
4868 case Iop_Sub8x32:
4869 case Iop_Min8Ux32:
4870 case Iop_Min8Sx32:
4871 case Iop_Max8Ux32:
4872 case Iop_Max8Sx32:
4873 case Iop_CmpGT8Sx32:
4874 case Iop_CmpEQ8x32:
4875 case Iop_Avg8Ux32:
4876 case Iop_QAdd8Ux32:
4877 case Iop_QAdd8Sx32:
4878 case Iop_Add8x32:
4879 return binary8Ix32(mce, vatom1, vatom2);
4881 case Iop_QSub16Ux16:
4882 case Iop_QSub16Sx16:
4883 case Iop_Sub16x16:
4884 case Iop_Mul16x16:
4885 case Iop_MulHi16Sx16:
4886 case Iop_MulHi16Ux16:
4887 case Iop_Min16Sx16:
4888 case Iop_Min16Ux16:
4889 case Iop_Max16Sx16:
4890 case Iop_Max16Ux16:
4891 case Iop_CmpGT16Sx16:
4892 case Iop_CmpEQ16x16:
4893 case Iop_Avg16Ux16:
4894 case Iop_QAdd16Ux16:
4895 case Iop_QAdd16Sx16:
4896 case Iop_Add16x16:
4897 return binary16Ix16(mce, vatom1, vatom2);
4899 case Iop_Sub32x8:
4900 case Iop_CmpGT32Sx8:
4901 case Iop_CmpEQ32x8:
4902 case Iop_Add32x8:
4903 case Iop_Max32Ux8:
4904 case Iop_Max32Sx8:
4905 case Iop_Min32Ux8:
4906 case Iop_Min32Sx8:
4907 case Iop_Mul32x8:
4908 return binary32Ix8(mce, vatom1, vatom2);
4910 case Iop_Sub64x4:
4911 case Iop_Add64x4:
4912 case Iop_CmpEQ64x4:
4913 case Iop_CmpGT64Sx4:
4914 return binary64Ix4(mce, vatom1, vatom2);
4916 case Iop_I32StoF32x8:
4917 case Iop_F32toI32Sx8:
4918 return unary32Fx8_w_rm(mce, vatom1, vatom2);
4920 /* Perm32x8: rearrange values in left arg using steering values
4921 from right arg. So rearrange the vbits in the same way but
4922 pessimise wrt steering values. */
4923 case Iop_Perm32x8:
4924 return mkUifUV256(
4925 mce,
4926 assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
4927 mkPCast32x8(mce, vatom2)
4930 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4931 Handle the shifted results in the same way that other
4932 binary Q ops are handled, eg QSub: UifU the two args,
4933 then pessimise -- which is binaryNIxM. But for the upper
4934          V128, we only need to generate 1 bit, which is the
4935          pessimised shift result, with 127 defined zeroes above it.
4937          Note that this is overly pessimistic, in that only the
4938 bottom 8 bits of each lane of the second arg determine the shift
4939 amount. Really we ought to ignore any undefinedness in the
4940 rest of the lanes of the second arg. */
4941 case Iop_QandSQsh64x2: case Iop_QandUQsh64x2:
4942 case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
4943 case Iop_QandSQsh32x4: case Iop_QandUQsh32x4:
4944 case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
4945 case Iop_QandSQsh16x8: case Iop_QandUQsh16x8:
4946 case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
4947 case Iop_QandSQsh8x16: case Iop_QandUQsh8x16:
4948 case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
4950 // The function to generate the pessimised shift result
4951 IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
4952 switch (op) {
4953 case Iop_QandSQsh64x2:
4954 case Iop_QandUQsh64x2:
4955 case Iop_QandSQRsh64x2:
4956 case Iop_QandUQRsh64x2:
4957 binaryNIxM = binary64Ix2;
4958 break;
4959 case Iop_QandSQsh32x4:
4960 case Iop_QandUQsh32x4:
4961 case Iop_QandSQRsh32x4:
4962 case Iop_QandUQRsh32x4:
4963 binaryNIxM = binary32Ix4;
4964 break;
4965 case Iop_QandSQsh16x8:
4966 case Iop_QandUQsh16x8:
4967 case Iop_QandSQRsh16x8:
4968 case Iop_QandUQRsh16x8:
4969 binaryNIxM = binary16Ix8;
4970 break;
4971 case Iop_QandSQsh8x16:
4972 case Iop_QandUQsh8x16:
4973 case Iop_QandSQRsh8x16:
4974 case Iop_QandUQRsh8x16:
4975 binaryNIxM = binary8Ix16;
4976 break;
4977 default:
4978 tl_assert(0);
4980 tl_assert(binaryNIxM);
4981 // Pessimised shift result, shV[127:0]
4982 IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
4983 // Generates: Def--(127)--Def PCast-to-I1(shV)
4984 IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
4985 // and assemble the result
4986 return assignNew('V', mce, Ity_V256,
4987 binop(Iop_V128HLtoV256, qV, shV));
4990 case Iop_F32toF16x4: {
4991 // First, PCast the input vector, retaining the 32x4 format.
4992 IRAtom* pcasted = mkPCast32x4(mce, vatom2); // :: 32x4
4993 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
4994 // the input, we're not going to lose any information.
4995 IRAtom* pcHI64
4996 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, pcasted));//32x2
4997 IRAtom* pcLO64
4998 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, pcasted)); // 32x2
4999 IRAtom* narrowed
5000 = assignNew('V', mce, Ity_I64, binop(Iop_NarrowBin32to16x4,
5001 pcHI64, pcLO64)); // 16x4
5002 // Finally, roll in any badness from the rounding mode.
5003 IRAtom* rmPCasted = mkPCastTo(mce, Ity_I64, vatom1);
5004 return mkUifU64(mce, narrowed, rmPCasted);
5007 case Iop_F32toF16x8: {
5008 // Same scheme as for Iop_F32toF16x4.
5009 IRAtom* pcasted = mkPCast32x8(mce, vatom2); // :: 32x8
5010 IRAtom* pcHI128
5011 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_1,
5012 pcasted)); // 32x4
5013 IRAtom* pcLO128
5014 = assignNew('V', mce, Ity_V128, unop(Iop_V256toV128_0,
5015 pcasted)); // 32x4
5016 IRAtom* narrowed
5017 = assignNew('V', mce, Ity_V128, binop(Iop_NarrowBin32to16x8,
5018 pcHI128, pcLO128)); // 16x8
5019 // Finally, roll in any badness from the rounding mode.
5020 IRAtom* rmPCasted = mkPCastTo(mce, Ity_V128, vatom1);
5021 return mkUifUV128(mce, narrowed, rmPCasted);
5024 default:
5025 ppIROp(op);
5026 VG_(tool_panic)("memcheck:expr2vbits_Binop");
5031 static
5032 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
5034 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
5035 selection of shadow operation implicitly duplicates the logic in
5036 do_shadow_LoadG and should be kept in sync (in the very unlikely
5037 event that the interpretation of such widening ops changes in
5038 future). See comment in do_shadow_LoadG. */
5039 IRAtom* vatom = expr2vbits( mce, atom, HuOth );
5040 tl_assert(isOriginalAtom(mce,atom));
5041 switch (op) {
5043 case Iop_Abs64Fx2:
5044 case Iop_Neg64Fx2:
5045 case Iop_RSqrtEst64Fx2:
5046 case Iop_RecipEst64Fx2:
5047 case Iop_Log2_64Fx2:
5048 return unary64Fx2(mce, vatom);
5050 case Iop_Sqrt64F0x2:
5051 return unary64F0x2(mce, vatom);
5053 case Iop_Sqrt32Fx8:
5054 case Iop_RSqrtEst32Fx8:
5055 case Iop_RecipEst32Fx8:
5056 return unary32Fx8(mce, vatom);
5058 case Iop_Sqrt64Fx4:
5059 return unary64Fx4(mce, vatom);
5061 case Iop_RecipEst32Fx4:
5062 case Iop_I32UtoF32x4_DEP:
5063 case Iop_I32StoF32x4_DEP:
5064 case Iop_QF32toI32Ux4_RZ:
5065 case Iop_QF32toI32Sx4_RZ:
5066 case Iop_RoundF32x4_RM:
5067 case Iop_RoundF32x4_RP:
5068 case Iop_RoundF32x4_RN:
5069 case Iop_RoundF32x4_RZ:
5070 case Iop_RecipEst32Ux4:
5071 case Iop_Abs32Fx4:
5072 case Iop_Neg32Fx4:
5073 case Iop_RSqrtEst32Fx4:
5074 case Iop_Log2_32Fx4:
5075 case Iop_Exp2_32Fx4:
5076 return unary32Fx4(mce, vatom);
5078 case Iop_I32UtoF32x2_DEP:
5079 case Iop_I32StoF32x2_DEP:
5080 case Iop_RecipEst32Fx2:
5081 case Iop_RecipEst32Ux2:
5082 case Iop_Abs32Fx2:
5083 case Iop_Neg32Fx2:
5084 case Iop_RSqrtEst32Fx2:
5085 return unary32Fx2(mce, vatom);
5087 case Iop_Sqrt32F0x4:
5088 case Iop_RSqrtEst32F0x4:
5089 case Iop_RecipEst32F0x4:
5090 return unary32F0x4(mce, vatom);
5092 case Iop_Abs16Fx8:
5093 case Iop_Neg16Fx8:
5094 return unary16Fx8(mce, vatom);
5096 // These are self-shadowing.
5097 case Iop_32UtoV128:
5098 case Iop_64UtoV128:
5099 case Iop_Dup8x16:
5100 case Iop_Dup16x8:
5101 case Iop_Dup32x4:
5102 case Iop_Reverse1sIn8_x16:
5103 case Iop_Reverse8sIn16_x8:
5104 case Iop_Reverse8sIn32_x4:
5105 case Iop_Reverse16sIn32_x4:
5106 case Iop_Reverse8sIn64_x2:
5107 case Iop_Reverse16sIn64_x2:
5108 case Iop_Reverse32sIn64_x2:
5109 case Iop_V256toV128_1: case Iop_V256toV128_0:
5110 case Iop_ZeroHI64ofV128:
5111 case Iop_ZeroHI96ofV128:
5112 case Iop_ZeroHI112ofV128:
5113 case Iop_ZeroHI120ofV128:
5114 case Iop_ReinterpI128asV128: /* I128 -> V128 */
5115 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5117 case Iop_F128HItoF64: /* F128 -> high half of F128 */
5118 case Iop_D128HItoD64: /* D128 -> high half of D128 */
5119 return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
5121 case Iop_F128LOtoF64: /* F128 -> low half of F128 */
5122 case Iop_D128LOtoD64: /* D128 -> low half of D128 */
5123 return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
5125 case Iop_NegF128:
5126 case Iop_AbsF128:
5127 case Iop_RndF128:
5128 case Iop_TruncF128toI128S: /* F128 -> I128S */
5129 case Iop_TruncF128toI128U: /* F128 -> I128U */
5130 case Iop_ReinterpV128asI128: /* V128 -> I128 */
5131 case Iop_ReinterpI128asF128:
5132 case Iop_ReinterpF128asI128:
5133 return mkPCastTo(mce, Ity_I128, vatom);
5135 case Iop_BCD128toI128S:
5136 case Iop_MulI128by10:
5137 case Iop_MulI128by10Carry:
5138 case Iop_F16toF64x2:
5139 case Iop_F64toF16x2_DEP:
5140 // FIXME JRS 2018-Nov-15. This is surely not correct!
5141 return vatom;
5143 case Iop_ReinterpI32asF32:
5144 case Iop_ReinterpF32asI32:
5145 return assignNew('V', mce, Ity_I32, vatom);
5147 case Iop_ReinterpF64asI64:
5148 case Iop_ReinterpI64asF64:
5149 case Iop_ReinterpI64asD64:
5150 case Iop_ReinterpD64asI64:
5151 return assignNew('V', mce, Ity_I64, vatom);
5153 case Iop_I32StoF128: /* signed I32 -> F128 */
5154 case Iop_I64StoF128: /* signed I64 -> F128 */
5155 case Iop_I32UtoF128: /* unsigned I32 -> F128 */
5156 case Iop_I64UtoF128: /* unsigned I64 -> F128 */
5157 case Iop_F32toF128: /* F32 -> F128 */
5158 case Iop_F64toF128: /* F64 -> F128 */
5159 case Iop_I32StoD128: /* signed I64 -> D128 */
5160 case Iop_I64StoD128: /* signed I64 -> D128 */
5161 case Iop_I32UtoD128: /* unsigned I32 -> D128 */
5162 case Iop_I64UtoD128: /* unsigned I64 -> D128 */
5163 return mkPCastTo(mce, Ity_I128, vatom);
5165 case Iop_F16toF64:
5166 case Iop_F32toF64:
5167 case Iop_I32StoF64:
5168 case Iop_I32UtoF64:
5169 case Iop_NegF64:
5170 case Iop_AbsF64:
5171 case Iop_RSqrtEst5GoodF64:
5172 case Iop_RoundF64toF64_NEAREST:
5173 case Iop_RoundF64toF64_NegINF:
5174 case Iop_RoundF64toF64_PosINF:
5175 case Iop_RoundF64toF64_ZERO:
5176 case Iop_D32toD64:
5177 case Iop_I32StoD64:
5178 case Iop_I32UtoD64:
5179 case Iop_ExtractExpD64: /* D64 -> I64 */
5180 case Iop_ExtractExpD128: /* D128 -> I64 */
5181 case Iop_ExtractSigD64: /* D64 -> I64 */
5182 case Iop_ExtractSigD128: /* D128 -> I64 */
5183 case Iop_DPBtoBCD:
5184 case Iop_BCDtoDPB:
5185 return mkPCastTo(mce, Ity_I64, vatom);
5187 case Iop_D64toD128:
5188 return mkPCastTo(mce, Ity_I128, vatom);
5190 case Iop_TruncF64asF32:
5191 case Iop_NegF32:
5192 case Iop_AbsF32:
5193 case Iop_F16toF32:
5194 return mkPCastTo(mce, Ity_I32, vatom);
5196 case Iop_AbsF16:
5197 case Iop_NegF16:
5198 return mkPCastTo(mce, Ity_I16, vatom);
5200 case Iop_Ctz32: case Iop_CtzNat32:
5201 case Iop_Ctz64: case Iop_CtzNat64:
5202 return expensiveCountTrailingZeroes(mce, op, atom, vatom);
5204 case Iop_Clz32: case Iop_ClzNat32:
5205 case Iop_Clz64: case Iop_ClzNat64:
5206 return expensiveCountLeadingZeroes(mce, op, atom, vatom);
5208 // PopCount32: this is slightly pessimistic. It is true that the
5209 // result depends on all input bits, so that aspect of the PCast is
5210       // correct.  However, regardless of the input, only the lowest 6 bits
5211       // of the output can ever be undefined (the result lies in 0..32).  So
5212       // we could actually "improve" the results here by marking the top 26
5213       // bits of output as defined.  A similar comment applies for PopCount64.
5214 case Iop_PopCount32:
5215 return mkPCastTo(mce, Ity_I32, vatom);
5216 case Iop_PopCount64:
5217 return mkPCastTo(mce, Ity_I64, vatom);
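      /* Editorial sketch of the improvement described above (not applied
         here).  Since PopCount32's result lies in 0..32, a tighter shadow
         would force bits 6..31 to "defined", for example

            return assignNew('V', mce, Ity_I32,
                             binop(Iop_And32,
                                   mkPCastTo(mce, Ity_I32, vatom),
                                   IRExpr_Const(IRConst_U32(0x3F))));

         and analogously with a 0x7F mask (Iop_And64) for PopCount64. */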
5219 // These are self-shadowing.
5220 case Iop_1Uto64:
5221 case Iop_1Sto64:
5222 case Iop_8Uto64:
5223 case Iop_8Sto64:
5224 case Iop_16Uto64:
5225 case Iop_16Sto64:
5226 case Iop_32Sto64:
5227 case Iop_32Uto64:
5228 case Iop_V128to64:
5229 case Iop_V128HIto64:
5230 case Iop_128HIto64:
5231 case Iop_128to64:
5232 case Iop_Dup8x8:
5233 case Iop_Dup16x4:
5234 case Iop_Dup32x2:
5235 case Iop_Reverse8sIn16_x4:
5236 case Iop_Reverse8sIn32_x2:
5237 case Iop_Reverse16sIn32_x2:
5238 case Iop_Reverse8sIn64_x1:
5239 case Iop_Reverse16sIn64_x1:
5240 case Iop_Reverse32sIn64_x1:
5241 case Iop_V256to64_0: case Iop_V256to64_1:
5242 case Iop_V256to64_2: case Iop_V256to64_3:
5243 return assignNew('V', mce, Ity_I64, unop(op, vatom));
5245 // These are self-shadowing.
5246 case Iop_64to32:
5247 case Iop_64HIto32:
5248 case Iop_1Uto32:
5249 case Iop_1Sto32:
5250 case Iop_8Uto32:
5251 case Iop_16Uto32:
5252 case Iop_16Sto32:
5253 case Iop_8Sto32:
5254 case Iop_V128to32:
5255 case Iop_Reverse8sIn32_x1:
5256 return assignNew('V', mce, Ity_I32, unop(op, vatom));
5258 // These are self-shadowing.
5259 case Iop_1Sto16:
5260 case Iop_8Sto16:
5261 case Iop_8Uto16:
5262 case Iop_32to16:
5263 case Iop_32HIto16:
5264 case Iop_64to16:
5265 case Iop_GetMSBs8x16:
5266 return assignNew('V', mce, Ity_I16, unop(op, vatom));
5268 // These are self-shadowing.
5269 case Iop_1Uto8:
5270 case Iop_1Sto8:
5271 case Iop_16to8:
5272 case Iop_16HIto8:
5273 case Iop_32to8:
5274 case Iop_64to8:
5275 case Iop_GetMSBs8x8:
5276 return assignNew('V', mce, Ity_I8, unop(op, vatom));
5278 case Iop_32to1:
5279 return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
5281 case Iop_64to1:
5282 return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
5284 case Iop_NotV256:
5285 case Iop_NotV128:
5286 case Iop_Not64:
5287 case Iop_Not32:
5288 case Iop_Not16:
5289 case Iop_Not8:
5290 case Iop_Not1:
5291 // FIXME JRS 2018-Nov-15. This is surely not correct!
5292 return vatom;
5294 case Iop_CmpNEZ8x8:
5295 case Iop_Cnt8x8:
5296 case Iop_Clz8x8:
5297 case Iop_Cls8x8:
5298 case Iop_Abs8x8:
5299 return mkPCast8x8(mce, vatom);
5301 case Iop_CmpNEZ8x16:
5302 case Iop_Cnt8x16:
5303 case Iop_Clz8x16:
5304 case Iop_Cls8x16:
5305 case Iop_Abs8x16:
5306 case Iop_Ctz8x16:
5307 return mkPCast8x16(mce, vatom);
5309 case Iop_CmpNEZ16x4:
5310 case Iop_Clz16x4:
5311 case Iop_Cls16x4:
5312 case Iop_Abs16x4:
5313 return mkPCast16x4(mce, vatom);
5315 case Iop_CmpNEZ16x8:
5316 case Iop_Clz16x8:
5317 case Iop_Cls16x8:
5318 case Iop_Abs16x8:
5319 case Iop_Ctz16x8:
5320 return mkPCast16x8(mce, vatom);
5322 case Iop_CmpNEZ32x2:
5323 case Iop_Clz32x2:
5324 case Iop_Cls32x2:
5325 case Iop_F32toI32Ux2_RZ:
5326 case Iop_F32toI32Sx2_RZ:
5327 case Iop_Abs32x2:
5328 return mkPCast32x2(mce, vatom);
5330 case Iop_CmpNEZ32x4:
5331 case Iop_Clz32x4:
5332 case Iop_Cls32x4:
5333 case Iop_F32toI32Ux4_RZ:
5334 case Iop_F32toI32Sx4_RZ:
5335 case Iop_Abs32x4:
5336 case Iop_RSqrtEst32Ux4:
5337 case Iop_Ctz32x4:
5338 return mkPCast32x4(mce, vatom);
5340 case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
5341 case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
5342 case Iop_CmpwNEZ32:
5343 return mkPCastTo(mce, Ity_I32, vatom);
5345 case Iop_TruncF128toI64S: /* F128 -> I64S */
5346 case Iop_TruncF128toI64U: /* F128 -> I64U */
5347 case Iop_CmpwNEZ64:
5348 return mkPCastTo(mce, Ity_I64, vatom);
5350 case Iop_CmpNEZ64x2:
5351 case Iop_CipherSV128:
5352 case Iop_Clz64x2:
5353 case Iop_Abs64x2:
5354 case Iop_Ctz64x2:
5355 return mkPCast64x2(mce, vatom);
5357 // This is self-shadowing.
5358 case Iop_PwBitMtxXpose64x2:
5359 return assignNew('V', mce, Ity_V128, unop(op, vatom));
5361 case Iop_NarrowUn16to8x8:
5362 case Iop_NarrowUn32to16x4:
5363 case Iop_NarrowUn64to32x2:
5364 case Iop_QNarrowUn16Sto8Sx8:
5365 case Iop_QNarrowUn16Sto8Ux8:
5366 case Iop_QNarrowUn16Uto8Ux8:
5367 case Iop_QNarrowUn32Sto16Sx4:
5368 case Iop_QNarrowUn32Sto16Ux4:
5369 case Iop_QNarrowUn32Uto16Ux4:
5370 case Iop_QNarrowUn64Sto32Sx2:
5371 case Iop_QNarrowUn64Sto32Ux2:
5372 case Iop_QNarrowUn64Uto32Ux2:
5373 return vectorNarrowUnV128(mce, op, vatom);
5375 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5376 // right.
5377 case Iop_F32toF16x4_DEP:
5378 return vectorNarrowUnV128(mce, op, vatom);
5380 case Iop_Widen8Sto16x8:
5381 case Iop_Widen8Uto16x8:
5382 case Iop_Widen16Sto32x4:
5383 case Iop_Widen16Uto32x4:
5384 case Iop_Widen32Sto64x2:
5385 case Iop_Widen32Uto64x2:
5386 return vectorWidenI64(mce, op, vatom);
5388 case Iop_F16toF32x4:
5389 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5390 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5391 // will generate an output 32-bit lane with at least one 1 bit
5392 // set if there's one or more 1 bits set in the input 16 bits. More
5393 // correct code for this is just below, but commented out, so as to
5394 // avoid short-term backend failures on targets that can't do
5395 // Iop_Interleave{LO,HI}16x4.
5396 return vectorWidenI64(mce, op, vatom);
5398 case Iop_F16toF32x8: {
5399 // PCast the input at 16x8. This makes each lane hold either all
5400 // zeroes or all ones.
5401 IRAtom* pcasted = mkPCast16x8(mce, vatom); // :: I16x8
5402 // Now double the width of each lane to 32 bits. Because the lanes are
5403 // all zeroes or all ones, we can just copy each lane twice into
5404 // the result. Here's the low half:
5405 IRAtom* widenedLO // :: I32x4
5406 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveLO16x8,
5407 pcasted, pcasted));
5408 // And the high half:
5409 IRAtom* widenedHI // :: I32x4
5410 = assignNew('V', mce, Ity_V128, binop(Iop_InterleaveHI16x8,
5411 pcasted, pcasted));
5412 // Glue them back together:
5413 return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
5414 widenedHI, widenedLO));
5417 // See comment just above, for Iop_F16toF32x4
5418 //case Iop_F16toF32x4: {
5419 // // Same scheme as F16toF32x4
5420 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5421 // IRAtom* widenedLO // :: I32x2
5422 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5423 // pcasted, pcasted));
5424 // IRAtom* widenedHI // :: I32x2
5425 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5426 // pcasted, pcasted));
5427 // // Glue them back together:
5428 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5429 // widenedHI, widenedLO));
5432 case Iop_PwAddL32Ux2:
5433 case Iop_PwAddL32Sx2:
5434 return mkPCastTo(mce, Ity_I64,
5435 assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
5437 case Iop_PwAddL16Ux4:
5438 case Iop_PwAddL16Sx4:
5439 return mkPCast32x2(mce,
5440 assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
5442 case Iop_PwAddL8Ux8:
5443 case Iop_PwAddL8Sx8:
5444 return mkPCast16x4(mce,
5445 assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
5447 case Iop_PwAddL32Ux4:
5448 case Iop_PwAddL32Sx4:
5449 return mkPCast64x2(mce,
5450 assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
5452 case Iop_PwAddL64Ux2:
5453 return mkPCast128x1(mce,
5454 assignNew('V', mce, Ity_V128, unop(op, mkPCast64x2(mce, vatom))));
5456 case Iop_PwAddL16Ux8:
5457 case Iop_PwAddL16Sx8:
5458 return mkPCast32x4(mce,
5459 assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
5461 case Iop_PwAddL8Ux16:
5462 case Iop_PwAddL8Sx16:
5463 return mkPCast16x8(mce,
5464 assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
5466 case Iop_I64UtoF32:
5467 default:
5468 ppIROp(op);
5469 VG_(tool_panic)("memcheck:expr2vbits_Unop");
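/* Editorial illustration (not part of the original source): the lane
   doubling used in the Iop_F16toF32x8 case above, modelled on a single
   lane with plain integer types.  After PCast-ing at 16x8, every lane
   is either all zeroes or all ones, so interleaving a lane with itself
   produces the correctly PCast-ed 32-bit lane.  A minimal sketch only;
   the function name is hypothetical. */
static __attribute__((unused))
UInt f16_to_f32_lane_vbits_model ( UShort vbits16 )
{
   UShort pcasted = vbits16 ? 0xFFFF : 0x0000;    /* mkPCast16x8, per lane */
   return ((UInt)pcasted << 16) | (UInt)pcasted;  /* InterleaveLO/HI16x8   */
}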
5474 /* Worker function -- do not call directly. See comments on
5475 expr2vbits_Load for the meaning of |guard|.
5477 Generates IR to (1) perform a definedness test of |addr|, (2)
5478 perform a validity test of |addr|, and (3) return the Vbits for the
5479 location indicated by |addr|. All of this only happens when
5480 |guard| is NULL or |guard| evaluates to True at run time.
5482 If |guard| evaluates to False at run time, the returned value is
5483 the IR-mandated 0x55..55 value, and no checks or shadow loads are
5484 performed.
5486 The definedness of |guard| itself is not checked. That is assumed
5487 to have been done before this point, by the caller. */
5488 static
5489 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
5490 IREndness end, IRType ty,
5491 IRAtom* addr, UInt bias, IRAtom* guard )
5493 tl_assert(isOriginalAtom(mce,addr));
5494 tl_assert(end == Iend_LE || end == Iend_BE);
5496 /* First, emit a definedness test for the address. This also sets
5497 the address (shadow) to 'defined' following the test. */
5498 complainIfUndefined( mce, addr, guard );
5500 /* Now cook up a call to the relevant helper function, to read the data V
5501 bits from shadow memory. Note that I128 loads are done by pretending
5502 we're doing a V128 load, and then converting the resulting V128 vbits
5503 word to an I128, right at the end of this function -- see `castedToI128`
5504 below. (It's only a minor hack :-) This pertains to bug 444399. */
5505 ty = shadowTypeV(ty);
5507 void* helper = NULL;
5508 const HChar* hname = NULL;
5509 Bool ret_via_outparam = False;
5511 if (end == Iend_LE) {
5512 switch (ty) {
5513 case Ity_V256: helper = &MC_(helperc_LOADV256le);
5514 hname = "MC_(helperc_LOADV256le)";
5515 ret_via_outparam = True;
5516 break;
5517 case Ity_I128: // fallthrough. See comment above.
5518 case Ity_V128: helper = &MC_(helperc_LOADV128le);
5519 hname = "MC_(helperc_LOADV128le)";
5520 ret_via_outparam = True;
5521 break;
5522 case Ity_I64: helper = &MC_(helperc_LOADV64le);
5523 hname = "MC_(helperc_LOADV64le)";
5524 break;
5525 case Ity_I32: helper = &MC_(helperc_LOADV32le);
5526 hname = "MC_(helperc_LOADV32le)";
5527 break;
5528 case Ity_I16: helper = &MC_(helperc_LOADV16le);
5529 hname = "MC_(helperc_LOADV16le)";
5530 break;
5531 case Ity_I8: helper = &MC_(helperc_LOADV8);
5532 hname = "MC_(helperc_LOADV8)";
5533 break;
5534 default: ppIRType(ty);
5535 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
5537 } else {
5538 switch (ty) {
5539 case Ity_V256: helper = &MC_(helperc_LOADV256be);
5540 hname = "MC_(helperc_LOADV256be)";
5541 ret_via_outparam = True;
5542 break;
5543 case Ity_V128: helper = &MC_(helperc_LOADV128be);
5544 hname = "MC_(helperc_LOADV128be)";
5545 ret_via_outparam = True;
5546 break;
5547 case Ity_I64: helper = &MC_(helperc_LOADV64be);
5548 hname = "MC_(helperc_LOADV64be)";
5549 break;
5550 case Ity_I32: helper = &MC_(helperc_LOADV32be);
5551 hname = "MC_(helperc_LOADV32be)";
5552 break;
5553 case Ity_I16: helper = &MC_(helperc_LOADV16be);
5554 hname = "MC_(helperc_LOADV16be)";
5555 break;
5556 case Ity_I8: helper = &MC_(helperc_LOADV8);
5557 hname = "MC_(helperc_LOADV8)";
5558 break;
5559 default: ppIRType(ty);
5560 VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
5564 tl_assert(helper);
5565 tl_assert(hname);
5567 /* Generate the actual address into addrAct. */
5568 IRAtom* addrAct;
5569 if (bias == 0) {
5570 addrAct = addr;
5571 } else {
5572 IROp mkAdd;
5573 IRAtom* eBias;
5574 IRType tyAddr = mce->hWordTy;
5575 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5576 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5577 eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
5578 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
5581 /* We need to have a place to park the V bits we're just about to
5582 read. */
5583 IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
5585 /* Here's the call. */
5586 IRDirty* di;
5587 if (ret_via_outparam) {
5588 di = unsafeIRDirty_1_N( datavbits,
5589 2/*regparms*/,
5590 hname, VG_(fnptr_to_fnentry)( helper ),
5591 mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
5592 } else {
5593 di = unsafeIRDirty_1_N( datavbits,
5594 1/*regparms*/,
5595 hname, VG_(fnptr_to_fnentry)( helper ),
5596 mkIRExprVec_1( addrAct ) );
5599 setHelperAnns( mce, di );
5600 if (guard) {
5601 di->guard = guard;
5602 /* Ideally the didn't-happen return value here would be all-ones
5603 (all-undefined), so it'd be obvious if it got used
5604 inadvertently. We can get by with the IR-mandated default
5605 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5606 undefined if it ever leaks out. */
5608 stmt( 'V', mce, IRStmt_Dirty(di) );
5610 if (ty == Ity_I128) {
5611 IRAtom* castedToI128
5612 = assignNew('V', mce, Ity_I128,
5613 unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
5614 return castedToI128;
5615 } else {
5616 return mkexpr(datavbits);
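/* Editorial illustration (not part of the original source): a minimal
   value-level model of the guarded-load convention implemented above,
   assuming a hypothetical |load_helper| standing in for the
   MC_(helperc_LOADV*) functions.  When the guard is false the dirty
   call does not run, and the destination keeps the IR-mandated default
   of 0x55..55, which still reads as (mostly) undefined V bits. */
static __attribute__((unused))
ULong guarded_shadow_load64_model ( Bool guard,
                                    ULong (*load_helper)(Addr), /* hypothetical */
                                    Addr a )
{
   if (!guard)
      return 0x5555555555555555ULL;  /* didn't-happen default value */
   return load_helper(a);            /* V bits for the 8 bytes at |a| */
}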
5621 /* Generate IR to do a shadow load. The helper is expected to check
5622 the validity of the address and return the V bits for that address.
5623 This can optionally be controlled by a guard, which is assumed to
5624 be True if NULL. In the case where the guard is False at runtime,
5625 the helper will return the didn't-do-the-call value of 0x55..55.
5626 Since that means "completely undefined result", the caller of
5627 this function will need to fix up the result somehow in that
5628 case.
5630 Caller of this function is also expected to have checked the
5631 definedness of |guard| before this point.
5633 static
5634 IRAtom* expr2vbits_Load ( MCEnv* mce,
5635 IREndness end, IRType ty,
5636 IRAtom* addr, UInt bias,
5637 IRAtom* guard )
5639 tl_assert(end == Iend_LE || end == Iend_BE);
5640 switch (shadowTypeV(ty)) {
5641 case Ity_I8:
5642 case Ity_I16:
5643 case Ity_I32:
5644 case Ity_I64:
5645 case Ity_I128:
5646 case Ity_V128:
5647 case Ity_V256:
5648 return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
5649 default:
5650 VG_(tool_panic)("expr2vbits_Load");
5655 /* The most general handler for guarded loads. Assumes the
5656 definedness of GUARD has already been checked by the caller. A
5657 GUARD of NULL is assumed to mean "always True". Generates code to
5658 check the definedness and validity of ADDR.
5660 Generate IR to do a shadow load from ADDR and return the V bits.
5661 The loaded type is TY. The loaded data is then (shadow) widened by
5662 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5663 evaluates to False at run time then the returned Vbits are simply
5664 VALT instead. Note therefore that the argument type of VWIDEN must
5665 be TY and the result type of VWIDEN must equal the type of VALT.
5667 static
5668 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
5669 IREndness end, IRType ty,
5670 IRAtom* addr, UInt bias,
5671 IRAtom* guard,
5672 IROp vwiden, IRAtom* valt )
5674 /* Sanity check the conversion operation, and also set TYWIDE. */
5675 IRType tyWide = Ity_INVALID;
5676 switch (vwiden) {
5677 case Iop_INVALID:
5678 tyWide = ty;
5679 break;
5680 case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
5681 tyWide = Ity_I32;
5682 break;
5683 default:
5684 VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
5687 /* If the guard evaluates to True, this will hold the loaded V bits
5688 at TY. If the guard evaluates to False, this will be the
5689 IR-mandated 0x55..55 value (mostly undefined), in which case we will have to
5690 replace it using an ITE below. */
5691 IRAtom* iftrue1
5692 = assignNew('V', mce, ty,
5693 expr2vbits_Load(mce, end, ty, addr, bias, guard));
5694 /* Now (shadow-) widen the loaded V bits to the desired width. In
5695 the guard-is-False case, the allowable widening operators will
5696 in the worst case (unsigned widening) at least leave the
5697 pre-widened part as being marked all-undefined, and in the best
5698 case (signed widening) mark the whole widened result as
5699 undefined. Anyway, it doesn't matter really, since in this case
5700 we will replace said value with the default value |valt| using an
5701 ITE. */
5702 IRAtom* iftrue2
5703 = vwiden == Iop_INVALID
5704 ? iftrue1
5705 : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
5706 /* These are the V bits we will return if the load doesn't take
5707 place. */
5708 IRAtom* iffalse
5709 = valt;
5710 /* Prepare the cond for the ITE. Convert a NULL cond into
5711 something that iropt knows how to fold out later. */
5712 IRAtom* cond
5713 = guard == NULL ? mkU1(1) : guard;
5714 /* And assemble the final result. */
5715 return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
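/* Editorial illustration (not part of the original source): the
   value-level effect of the general guarded-load handler above, for the
   case vwiden == Iop_8Sto32.  |loaded_vbits8| stands for the V bits the
   shadow load would produce; the names are hypothetical. */
static __attribute__((unused))
UInt guarded_widening_load_model ( Bool guard, UChar loaded_vbits8,
                                   UInt valt /* V bits used if guard is false */ )
{
   /* Signed widening is self-shadowing: sign-extend the loaded V bits. */
   UInt widened = (UInt)(Int)(signed char)loaded_vbits8;
   /* The final ITE selects the widened V bits when the guard holds. */
   return guard ? widened : valt;
}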
5719 /* A simpler handler for guarded loads, in which there is no
5720 conversion operation, and the default V bit return (when the guard
5721 evaluates to False at runtime) is "all defined". If there is no
5722 guard expression or the guard is always TRUE this function behaves
5723 like expr2vbits_Load. It is assumed that definedness of GUARD has
5724 already been checked at the call site. */
5725 static
5726 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
5727 IREndness end, IRType ty,
5728 IRAtom* addr, UInt bias,
5729 IRAtom *guard )
5731 return expr2vbits_Load_guarded_General(
5732 mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
5737 static
5738 IRAtom* expr2vbits_ITE ( MCEnv* mce,
5739 IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
5741 IRAtom *vbitsC, *vbits0, *vbits1;
5742 IRType ty;
5743 /* Given ITE(cond, iftrue, iffalse), generate
5744 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5745 That is, steer the V bits like the originals, but trash the
5746 result if the steering value is undefined. This gives
5747 lazy propagation. */
5748 tl_assert(isOriginalAtom(mce, cond));
5749 tl_assert(isOriginalAtom(mce, iftrue));
5750 tl_assert(isOriginalAtom(mce, iffalse));
5752 vbitsC = expr2vbits(mce, cond, HuOth); // could we use HuPCa here?
5753 vbits1 = expr2vbits(mce, iftrue, HuOth);
5754 vbits0 = expr2vbits(mce, iffalse, HuOth);
5755 ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
5757 return
5758 mkUifU(mce, ty, assignNew('V', mce, ty,
5759 IRExpr_ITE(cond, vbits1, vbits0)),
5760 mkPCastTo(mce, ty, vbitsC) );
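/* Editorial illustration (not part of the original source): the scheme
   above expressed on 64-bit V words, with 1-bits meaning "undefined".
   UifU is bitwise OR, and PCast-ing the condition's V bit smears any
   uncertainty about the condition over the whole result. */
static __attribute__((unused))
ULong ite_vbits_model ( Bool cond, Bool cond_is_undefined,
                        ULong vbits1, ULong vbits0 )
{
   ULong steered = cond ? vbits1 : vbits0;            /* ITE on the shadows  */
   ULong pcastC  = cond_is_undefined ? ~0ULL : 0ULL;  /* PCastTo(ty, vbitsC) */
   return steered | pcastC;                           /* UifU                */
}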
5763 /* --------- This is the main expression-handling function. --------- */
5765 static
5766 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e,
5767 HowUsed hu/*use HuOth if unknown*/ )
5769 switch (e->tag) {
5771 case Iex_Get:
5772 return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
5774 case Iex_GetI:
5775 return shadow_GETI( mce, e->Iex.GetI.descr,
5776 e->Iex.GetI.ix, e->Iex.GetI.bias );
5778 case Iex_RdTmp:
5779 return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
5781 case Iex_Const:
5782 return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
5784 case Iex_Qop:
5785 return expr2vbits_Qop(
5786 mce,
5787 e->Iex.Qop.details->op,
5788 e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
5789 e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
5792 case Iex_Triop:
5793 return expr2vbits_Triop(
5794 mce,
5795 e->Iex.Triop.details->op,
5796 e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
5797 e->Iex.Triop.details->arg3
5800 case Iex_Binop:
5801 return expr2vbits_Binop(
5802 mce,
5803 e->Iex.Binop.op,
5804 e->Iex.Binop.arg1, e->Iex.Binop.arg2,
5808 case Iex_Unop:
5809 return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
5811 case Iex_Load:
5812 return expr2vbits_Load( mce, e->Iex.Load.end,
5813 e->Iex.Load.ty,
5814 e->Iex.Load.addr, 0/*addr bias*/,
5815 NULL/* guard == "always True"*/ );
5817 case Iex_CCall:
5818 return mkLazyN( mce, e->Iex.CCall.args,
5819 e->Iex.CCall.retty,
5820 e->Iex.CCall.cee );
5822 case Iex_ITE:
5823 return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
5824 e->Iex.ITE.iffalse);
5826 default:
5827 VG_(printf)("\n");
5828 ppIRExpr(e);
5829 VG_(printf)("\n");
5830 VG_(tool_panic)("memcheck: expr2vbits");
5835 /*------------------------------------------------------------*/
5836 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5837 /*------------------------------------------------------------*/
5839 /* Widen a value to the host word size. */
5841 static
5842 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
5844 IRType ty, tyH;
5846 /* vatom is a vbits-value and as such can only have a shadow type. */
5847 tl_assert(isShadowAtom(mce,vatom));
5849 ty = typeOfIRExpr(mce->sb->tyenv, vatom);
5850 tyH = mce->hWordTy;
5852 if (tyH == Ity_I32) {
5853 switch (ty) {
5854 case Ity_I32:
5855 return vatom;
5856 case Ity_I16:
5857 return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
5858 case Ity_I8:
5859 return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
5860 default:
5861 goto unhandled;
5863 } else
5864 if (tyH == Ity_I64) {
5865 switch (ty) {
5866 case Ity_I32:
5867 return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
5868 case Ity_I16:
5869 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5870 assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
5871 case Ity_I8:
5872 return assignNew('V', mce, tyH, unop(Iop_32Uto64,
5873 assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
5874 default:
5875 goto unhandled;
5877 } else {
5878 goto unhandled;
5880 unhandled:
5881 VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
5882 VG_(tool_panic)("zwidenToHostWord");
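/* Editorial illustration (not part of the original source): why zero
   extension is acceptable in zwidenToHostWord.  The padding bits become
   zero ("defined"), and the STOREV8/16/32 helpers only consult the
   low-order part of the widened word, so nothing extra is ever marked
   as undefined.  A standalone sketch with a hypothetical name. */
static __attribute__((unused))
ULong zwiden16to64_vbits_model ( UShort vbits16 )
{
   return (ULong)vbits16;  /* upper 48 bits become 0, ie "defined" */
}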
5886 /* Generate a shadow store. |addr| is always the original address
5887 atom. You can pass in either originals or V-bits for the data
5888 atom, but obviously not both. This function generates a check for
5889 the definedness and (indirectly) the validity of |addr|, but only
5890 when |guard| evaluates to True at run time (or is NULL).
5892 |guard| :: Ity_I1 controls whether the store really happens; NULL
5893 means it unconditionally does. Note that |guard| itself is not
5894 checked for definedness; the caller of this function must do that
5895 if necessary.
5897 static
5898 void do_shadow_Store ( MCEnv* mce,
5899 IREndness end,
5900 IRAtom* addr, UInt bias,
5901 IRAtom* data, IRAtom* vdata,
5902 IRAtom* guard )
5904 IROp mkAdd;
5905 IRType ty, tyAddr;
5906 void* helper = NULL;
5907 const HChar* hname = NULL;
5908 IRConst* c;
5910 tyAddr = mce->hWordTy;
5911 mkAdd = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
5912 tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
5913 tl_assert( end == Iend_LE || end == Iend_BE );
5915 if (data) {
5916 tl_assert(!vdata);
5917 tl_assert(isOriginalAtom(mce, data));
5918 tl_assert(bias == 0);
5919 vdata = expr2vbits( mce, data, HuOth );
5920 } else {
5921 tl_assert(vdata);
5924 tl_assert(isOriginalAtom(mce,addr));
5925 tl_assert(isShadowAtom(mce,vdata));
5927 if (guard) {
5928 tl_assert(isOriginalAtom(mce, guard));
5929 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
5932 ty = typeOfIRExpr(mce->sb->tyenv, vdata);
5934 // If we're not doing undefined value checking, pretend that this value
5935 // is "all valid". That lets Vex's optimiser remove some of the V bit
5936 // shadow computation ops that precede it.
5937 if (MC_(clo_mc_level) == 1) {
5938 switch (ty) {
5939 case Ity_V256: // V256 weirdness -- used four times
5940 c = IRConst_V256(V_BITS32_DEFINED); break;
5941 case Ity_V128: // V128 weirdness -- used twice
5942 c = IRConst_V128(V_BITS16_DEFINED); break;
5943 case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
5944 case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break;
5945 case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break;
5946 case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break;
5947 case Ity_I8: c = IRConst_U8 (V_BITS8_DEFINED); break;
5948 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5950 vdata = IRExpr_Const( c );
5953 /* First, emit a definedness test for the address. This also sets
5954 the address (shadow) to 'defined' following the test. Both of
5955 those actions are gated on |guard|. */
5956 complainIfUndefined( mce, addr, guard );
5958 /* Now decide which helper function to call to write the data V
5959 bits into shadow memory. */
5960 if (end == Iend_LE) {
5961 switch (ty) {
5962 case Ity_V256: /* we'll use the helper four times */
5963 case Ity_V128: /* we'll use the helper twice */
5964 case Ity_I128: /* we'll use the helper twice */
5965 case Ity_I64: helper = &MC_(helperc_STOREV64le);
5966 hname = "MC_(helperc_STOREV64le)";
5967 break;
5968 case Ity_I32: helper = &MC_(helperc_STOREV32le);
5969 hname = "MC_(helperc_STOREV32le)";
5970 break;
5971 case Ity_I16: helper = &MC_(helperc_STOREV16le);
5972 hname = "MC_(helperc_STOREV16le)";
5973 break;
5974 case Ity_I8: helper = &MC_(helperc_STOREV8);
5975 hname = "MC_(helperc_STOREV8)";
5976 break;
5977 default: VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
5979 } else {
5980 switch (ty) {
5981 case Ity_V128: /* we'll use the helper twice */
5982 case Ity_I64: helper = &MC_(helperc_STOREV64be);
5983 hname = "MC_(helperc_STOREV64be)";
5984 break;
5985 case Ity_I32: helper = &MC_(helperc_STOREV32be);
5986 hname = "MC_(helperc_STOREV32be)";
5987 break;
5988 case Ity_I16: helper = &MC_(helperc_STOREV16be);
5989 hname = "MC_(helperc_STOREV16be)";
5990 break;
5991 case Ity_I8: helper = &MC_(helperc_STOREV8);
5992 hname = "MC_(helperc_STOREV8)";
5993 break;
5994 /* Note, no V256 case here, because no big-endian target that
5995 we support has 256-bit vectors. */
5996 default: VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
6000 if (UNLIKELY(ty == Ity_V256)) {
6002 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
6003 Q3 being the most significant lane. */
6004 /* These are the offsets of the Qs in memory. */
6005 Int offQ0, offQ1, offQ2, offQ3;
6007 /* Various bits for constructing the 4 lane helper calls */
6008 IRDirty *diQ0, *diQ1, *diQ2, *diQ3;
6009 IRAtom *addrQ0, *addrQ1, *addrQ2, *addrQ3;
6010 IRAtom *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
6011 IRAtom *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
6013 if (end == Iend_LE) {
6014 offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
6015 } else {
6016 offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
6019 eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
6020 addrQ0 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
6021 vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
6022 diQ0 = unsafeIRDirty_0_N(
6023 1/*regparms*/,
6024 hname, VG_(fnptr_to_fnentry)( helper ),
6025 mkIRExprVec_2( addrQ0, vdataQ0 )
6028 eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
6029 addrQ1 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
6030 vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
6031 diQ1 = unsafeIRDirty_0_N(
6032 1/*regparms*/,
6033 hname, VG_(fnptr_to_fnentry)( helper ),
6034 mkIRExprVec_2( addrQ1, vdataQ1 )
6037 eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
6038 addrQ2 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
6039 vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
6040 diQ2 = unsafeIRDirty_0_N(
6041 1/*regparms*/,
6042 hname, VG_(fnptr_to_fnentry)( helper ),
6043 mkIRExprVec_2( addrQ2, vdataQ2 )
6046 eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
6047 addrQ3 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
6048 vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
6049 diQ3 = unsafeIRDirty_0_N(
6050 1/*regparms*/,
6051 hname, VG_(fnptr_to_fnentry)( helper ),
6052 mkIRExprVec_2( addrQ3, vdataQ3 )
6055 if (guard)
6056 diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
6058 setHelperAnns( mce, diQ0 );
6059 setHelperAnns( mce, diQ1 );
6060 setHelperAnns( mce, diQ2 );
6061 setHelperAnns( mce, diQ3 );
6062 stmt( 'V', mce, IRStmt_Dirty(diQ0) );
6063 stmt( 'V', mce, IRStmt_Dirty(diQ1) );
6064 stmt( 'V', mce, IRStmt_Dirty(diQ2) );
6065 stmt( 'V', mce, IRStmt_Dirty(diQ3) );
6068 else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
6070 /* V128/I128-bit case */
6071 /* See comment in next clause re 64-bit regparms */
6072 /* also, need to be careful about endianness */
6074 Int offLo64, offHi64;
6075 IRDirty *diLo64, *diHi64;
6076 IRAtom *addrLo64, *addrHi64;
6077 IRAtom *vdataLo64, *vdataHi64;
6078 IRAtom *eBiasLo64, *eBiasHi64;
6079 IROp opGetLO64, opGetHI64;
6081 if (end == Iend_LE) {
6082 offLo64 = 0;
6083 offHi64 = 8;
6084 } else {
6085 offLo64 = 8;
6086 offHi64 = 0;
6089 if (ty == Ity_V128) {
6090 opGetLO64 = Iop_V128to64;
6091 opGetHI64 = Iop_V128HIto64;
6092 } else {
6093 opGetLO64 = Iop_128to64;
6094 opGetHI64 = Iop_128HIto64;
6097 eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
6098 addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
6099 vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
6100 diLo64 = unsafeIRDirty_0_N(
6101 1/*regparms*/,
6102 hname, VG_(fnptr_to_fnentry)( helper ),
6103 mkIRExprVec_2( addrLo64, vdataLo64 )
6105 eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
6106 addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
6107 vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
6108 diHi64 = unsafeIRDirty_0_N(
6109 1/*regparms*/,
6110 hname, VG_(fnptr_to_fnentry)( helper ),
6111 mkIRExprVec_2( addrHi64, vdataHi64 )
6113 if (guard) diLo64->guard = guard;
6114 if (guard) diHi64->guard = guard;
6115 setHelperAnns( mce, diLo64 );
6116 setHelperAnns( mce, diHi64 );
6117 stmt( 'V', mce, IRStmt_Dirty(diLo64) );
6118 stmt( 'V', mce, IRStmt_Dirty(diHi64) );
6120 } else {
6122 IRDirty *di;
6123 IRAtom *addrAct;
6125 /* 8/16/32/64-bit cases */
6126 /* Generate the actual address into addrAct. */
6127 if (bias == 0) {
6128 addrAct = addr;
6129 } else {
6130 IRAtom* eBias = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
6131 addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
6134 if (ty == Ity_I64) {
6135 /* We can't do this with regparm 2 on 32-bit platforms, since
6136 the back ends aren't clever enough to handle 64-bit
6137 regparm args. Therefore be different. */
6138 di = unsafeIRDirty_0_N(
6139 1/*regparms*/,
6140 hname, VG_(fnptr_to_fnentry)( helper ),
6141 mkIRExprVec_2( addrAct, vdata )
6143 } else {
6144 di = unsafeIRDirty_0_N(
6145 2/*regparms*/,
6146 hname, VG_(fnptr_to_fnentry)( helper ),
6147 mkIRExprVec_2( addrAct,
6148 zwidenToHostWord( mce, vdata ))
6151 if (guard) di->guard = guard;
6152 setHelperAnns( mce, di );
6153 stmt( 'V', mce, IRStmt_Dirty(di) );
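/* Editorial illustration (not part of the original source): the lane
   offsets used above when a 256-bit store is split into four 64-bit
   helper calls, with Q0 being the least significant lane.  The function
   name is hypothetical. */
static __attribute__((unused))
void v256_store_offsets_model ( Bool bigEndian, Int* offQ /* Int[4] */ )
{
   if (!bigEndian) {
      offQ[0] = 0;  offQ[1] = 8;  offQ[2] = 16;  offQ[3] = 24;
   } else {
      offQ[3] = 0;  offQ[2] = 8;  offQ[1] = 16;  offQ[0] = 24;
   }
}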
6159 /* Do lazy pessimistic propagation through a dirty helper call, by
6160 looking at the annotations on it. This is the most complex part of
6161 Memcheck. */
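/* Editorial illustration (not part of the original source): the overall
   shape of the propagation performed by do_shadow_Dirty below, modelled
   on 32-bit V words.  Each input's V bits are pessimistically squashed
   ("PCast") to a single all-zeroes/all-ones word, the words are merged
   with UifU (bitwise OR), and the merged word is later smeared over
   every output.  The function name is hypothetical. */
static __attribute__((unused))
UInt dirty_propagation_model ( const UInt* input_vbits, Int n_inputs )
{
   UInt curr = 0;                               /* definedOfType(Ity_I32) */
   for (Int i = 0; i < n_inputs; i++) {
      UInt here = input_vbits[i] ? ~0u : 0u;    /* mkPCastTo(Ity_I32, ..) */
      curr = curr | here;                       /* mkUifU32               */
   }
   return curr;  /* each output then receives mkPCastTo(tyDst, curr) */
}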
6163 static IRType szToITy ( Int n )
6165 switch (n) {
6166 case 1: return Ity_I8;
6167 case 2: return Ity_I16;
6168 case 4: return Ity_I32;
6169 case 8: return Ity_I64;
6170 default: VG_(tool_panic)("szToITy(memcheck)");
6174 static
6175 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
6177 Int i, k, n, toDo, gSz, gOff;
6178 IRAtom *src, *here, *curr;
6179 IRType tySrc, tyDst;
6180 IRTemp dst;
6181 IREndness end;
6183 /* What's the native endianness? We need to know this. */
6184 # if defined(VG_BIGENDIAN)
6185 end = Iend_BE;
6186 # elif defined(VG_LITTLEENDIAN)
6187 end = Iend_LE;
6188 # else
6189 # error "Unknown endianness"
6190 # endif
6192 /* First check the guard. */
6193 complainIfUndefined(mce, d->guard, NULL);
6195 /* Now round up all inputs and PCast over them. */
6196 curr = definedOfType(Ity_I32);
6198 /* Inputs: unmasked args
6199 Note: arguments are evaluated REGARDLESS of the guard expression */
6200 for (i = 0; d->args[i]; i++) {
6201 IRAtom* arg = d->args[i];
6202 if ( (d->cee->mcx_mask & (1<<i))
6203 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
6204 /* ignore this arg */
6205 } else {
6206 here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg, HuOth) );
6207 curr = mkUifU32(mce, here, curr);
6211 /* Inputs: guest state that we read. */
6212 for (i = 0; i < d->nFxState; i++) {
6213 tl_assert(d->fxState[i].fx != Ifx_None);
6214 if (d->fxState[i].fx == Ifx_Write)
6215 continue;
6217 /* Enumerate the described state segments */
6218 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6219 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6220 gSz = d->fxState[i].size;
6222 /* Ignore any sections marked as 'always defined'. */
6223 if (isAlwaysDefd(mce, gOff, gSz)) {
6224 if (0)
6225 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6226 gOff, gSz);
6227 continue;
6230 /* This state element is read or modified. So we need to
6231 consider it. If larger than 8 bytes, deal with it in
6232 8-byte chunks. */
6233 while (True) {
6234 tl_assert(gSz >= 0);
6235 if (gSz == 0) break;
6236 n = gSz <= 8 ? gSz : 8;
6237 /* update 'curr' with UifU of the state slice
6238 gOff .. gOff+n-1 */
6239 tySrc = szToITy( n );
6241 /* Observe the guard expression. If it is false use an
6242 all-bits-defined bit pattern */
6243 IRAtom *cond, *iffalse, *iftrue;
6245 cond = assignNew('V', mce, Ity_I1, d->guard);
6246 iftrue = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
6247 iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
6248 src = assignNew('V', mce, tySrc,
6249 IRExpr_ITE(cond, iftrue, iffalse));
6251 here = mkPCastTo( mce, Ity_I32, src );
6252 curr = mkUifU32(mce, here, curr);
6253 gSz -= n;
6254 gOff += n;
6259 /* Inputs: memory. First set up some info needed regardless of
6260 whether we're doing reads or writes. */
6262 if (d->mFx != Ifx_None) {
6263 /* Because we may do multiple shadow loads/stores from the same
6264 base address, it's best to do a single test of its
6265 definedness right now. Post-instrumentation optimisation
6266 should remove all but this test. */
6267 IRType tyAddr;
6268 tl_assert(d->mAddr);
6269 complainIfUndefined(mce, d->mAddr, d->guard);
6271 tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
6272 tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
6273 tl_assert(tyAddr == mce->hWordTy); /* not really right */
6276 /* Deal with memory inputs (reads or modifies) */
6277 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
6278 toDo = d->mSize;
6279 /* chew off 32-bit chunks. We don't care about the endianness
6280 since it's all going to be condensed down to a single bit,
6281 but nevertheless choose an endianness which is hopefully
6282 native to the platform. */
6283 while (toDo >= 4) {
6284 here = mkPCastTo(
6285 mce, Ity_I32,
6286 expr2vbits_Load_guarded_Simple(
6287 mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
6289 curr = mkUifU32(mce, here, curr);
6290 toDo -= 4;
6292 /* chew off 16-bit chunks */
6293 while (toDo >= 2) {
6294 here = mkPCastTo(
6295 mce, Ity_I32,
6296 expr2vbits_Load_guarded_Simple(
6297 mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
6299 curr = mkUifU32(mce, here, curr);
6300 toDo -= 2;
6302 /* chew off the remaining 8-bit chunk, if any */
6303 if (toDo == 1) {
6304 here = mkPCastTo(
6305 mce, Ity_I32,
6306 expr2vbits_Load_guarded_Simple(
6307 mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
6309 curr = mkUifU32(mce, here, curr);
6310 toDo -= 1;
6312 tl_assert(toDo == 0);
6315 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6316 all the inputs to the helper. Now we need to re-distribute the
6317 results to all destinations. */
6319 /* Outputs: the destination temporary, if there is one. */
6320 if (d->tmp != IRTemp_INVALID) {
6321 dst = findShadowTmpV(mce, d->tmp);
6322 tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
6323 assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
6326 /* Outputs: guest state that we write or modify. */
6327 for (i = 0; i < d->nFxState; i++) {
6328 tl_assert(d->fxState[i].fx != Ifx_None);
6329 if (d->fxState[i].fx == Ifx_Read)
6330 continue;
6332 /* Enumerate the described state segments */
6333 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
6334 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
6335 gSz = d->fxState[i].size;
6337 /* Ignore any sections marked as 'always defined'. */
6338 if (isAlwaysDefd(mce, gOff, gSz))
6339 continue;
6341 /* This state element is written or modified. So we need to
6342 consider it. If larger than 8 bytes, deal with it in
6343 8-byte chunks. */
6344 while (True) {
6345 tl_assert(gSz >= 0);
6346 if (gSz == 0) break;
6347 n = gSz <= 8 ? gSz : 8;
6348 /* Write suitably-casted 'curr' to the state slice
6349 gOff .. gOff+n-1 */
6350 tyDst = szToITy( n );
6351 do_shadow_PUT( mce, gOff,
6352 NULL, /* original atom */
6353 mkPCastTo( mce, tyDst, curr ), d->guard );
6354 gSz -= n;
6355 gOff += n;
6360 /* Outputs: memory that we write or modify. Same comments about
6361 endianness as above apply. */
6362 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
6363 toDo = d->mSize;
6364 /* chew off 32-bit chunks */
6365 while (toDo >= 4) {
6366 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6367 NULL, /* original data */
6368 mkPCastTo( mce, Ity_I32, curr ),
6369 d->guard );
6370 toDo -= 4;
6372 /* chew off 16-bit chunks */
6373 while (toDo >= 2) {
6374 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6375 NULL, /* original data */
6376 mkPCastTo( mce, Ity_I16, curr ),
6377 d->guard );
6378 toDo -= 2;
6380 /* chew off the remaining 8-bit chunk, if any */
6381 if (toDo == 1) {
6382 do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
6383 NULL, /* original data */
6384 mkPCastTo( mce, Ity_I8, curr ),
6385 d->guard );
6386 toDo -= 1;
6388 tl_assert(toDo == 0);
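/* Editorial illustration (not part of the original source): the 4/2/1
   chunking used above for a dirty call's memory effects, as a
   standalone sketch.  It enumerates the (offset, size) pairs that the
   shadow loads/stores cover for a given mSize; |visit| is a
   hypothetical callback. */
static __attribute__((unused))
void dirty_mem_chunks_model ( Int mSize, void (*visit)(Int offset, Int szB) )
{
   Int toDo = mSize;
   while (toDo >= 4) { visit(mSize - toDo, 4); toDo -= 4; }  /* 32-bit chunks */
   while (toDo >= 2) { visit(mSize - toDo, 2); toDo -= 2; }  /* 16-bit chunks */
   if (toDo == 1)    { visit(mSize - toDo, 1); toDo -= 1; }  /* final byte    */
   /* Every mSize is covered exactly once: whole 32-bit chunks, then at
      most one 16-bit chunk, then at most one byte. */
}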
6394 /* We have an ABI hint telling us that [base .. base+len-1] is to
6395 become undefined ("writable"). Generate code to call a helper to
6396 notify the A/V bit machinery of this fact.
6398 We call
6399 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6400 Addr nia );
6402 static
6403 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
6405 IRDirty* di;
6407 if (MC_(clo_mc_level) == 3) {
6408 di = unsafeIRDirty_0_N(
6409 3/*regparms*/,
6410 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6411 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
6412 mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
6414 } else {
6415 /* We ignore the supplied nia, since it is irrelevant. */
6416 tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
6417 /* Special-case the len==128 case, since that is for amd64-ELF,
6418 which is a very common target. */
6419 if (len == 128) {
6420 di = unsafeIRDirty_0_N(
6421 1/*regparms*/,
6422 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6423 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
6424 mkIRExprVec_1( base )
6426 } else {
6427 di = unsafeIRDirty_0_N(
6428 2/*regparms*/,
6429 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6430 VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
6431 mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
6436 stmt( 'V', mce, IRStmt_Dirty(di) );
6440 /* ------ Dealing with IRCAS (big and complex) ------ */
6442 /* FWDS */
6443 static IRAtom* gen_load_b ( MCEnv* mce, Int szB,
6444 IRAtom* baseaddr, Int offset );
6445 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
6446 static void gen_store_b ( MCEnv* mce, Int szB,
6447 IRAtom* baseaddr, Int offset, IRAtom* dataB,
6448 IRAtom* guard );
6450 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
6451 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
6454 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6455 IRExpr.Consts, else this asserts. If they are both Consts, it
6456 doesn't do anything. So that just leaves the RdTmp case.
6458 In which case: this assigns the shadow value SHADOW to the IR
6459 shadow temporary associated with ORIG. That is, ORIG, being an
6460 original temporary, will have a shadow temporary associated with
6461 it. However, in the case envisaged here, there will so far have
6462 been no IR emitted to actually write a shadow value into that
6463 temporary. What this routine does is to (emit IR to) copy the
6464 value in SHADOW into said temporary, so that after this call,
6465 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6466 value in SHADOW.
6468 Point is to allow callers to compute "by hand" a shadow value for
6469 ORIG, and force it to be associated with ORIG.
6471 How do we know that the shadow associated with ORIG has not so far
6472 been assigned to? Well, we don't per se know that, but supposing
6473 it had. Then this routine would create a second assignment to it,
6474 and later the IR sanity checker would barf. But that never
6475 happens. QED.
6477 static void bind_shadow_tmp_to_orig ( UChar how,
6478 MCEnv* mce,
6479 IRAtom* orig, IRAtom* shadow )
6481 tl_assert(isOriginalAtom(mce, orig));
6482 tl_assert(isShadowAtom(mce, shadow));
6483 switch (orig->tag) {
6484 case Iex_Const:
6485 tl_assert(shadow->tag == Iex_Const);
6486 break;
6487 case Iex_RdTmp:
6488 tl_assert(shadow->tag == Iex_RdTmp);
6489 if (how == 'V') {
6490 assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
6491 shadow);
6492 } else {
6493 tl_assert(how == 'B');
6494 assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
6495 shadow);
6497 break;
6498 default:
6499 tl_assert(0);
6504 static
6505 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
6507 /* Scheme is (both single- and double- cases):
6509 1. fetch data#,dataB (the proposed new value)
6511 2. fetch expd#,expdB (what we expect to see at the address)
6513 3. check definedness of address
6515 4. load old#,oldB from shadow memory; this also checks
6516 addressability of the address
6518 5. the CAS itself
6520 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6522 7. if "expected == old" (as computed by (6))
6523 store data#,dataB to shadow memory
6525 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6526 'data' but 7 stores 'data#'. Hence it is possible for the
6527 shadow data to be incorrectly checked and/or updated:
6529 * 7 is at least gated correctly, since the 'expected == old'
6530 condition is derived from outputs of 5. However, the shadow
6531 write could happen too late: imagine after 5 we are
6532 descheduled, a different thread runs, writes a different
6533 (shadow) value at the address, and then we resume, hence
6534 overwriting the shadow value written by the other thread.
6536 Because the original memory access is atomic, there's no way to
6537 make both the original and shadow accesses into a single atomic
6538 thing, hence this is unavoidable.
6540 At least as Valgrind stands, I don't think it's a problem, since
6541 we're single threaded *and* we guarantee that there are no
6542 context switches during the execution of any specific superblock
6543 -- context switches can only happen at superblock boundaries.
6545 If Valgrind ever becomes MT in the future, then it might be more
6546 of a problem. A possible kludge would be to artificially
6547 associate with the location, a lock, which we must acquire and
6548 release around the transaction as a whole. Hmm, that probably
6549 wouldn't work properly since it only guards us against other
6550 threads doing CASs on the same location, not against other
6551 threads doing normal reads and writes.
6553 ------------------------------------------------------------
6555 COMMENT_ON_CasCmpEQ:
6557 Note two things. Firstly, in the sequence above, we compute
6558 "expected == old", but we don't check definedness of it. Why
6559 not? Also, the x86 and amd64 front ends use
6560 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6561 determination (expected == old ?) for themselves, and we also
6562 don't check definedness for those primops; we just say that the
6563 result is defined. Why? Details follow.
6565 x86/amd64 contains various forms of locked insns:
6566 * lock prefix before all basic arithmetic insns;
6567 eg lock xorl %reg1,(%reg2)
6568 * atomic exchange reg-mem
6569 * compare-and-swaps
6571 Rather than attempt to represent them all, which would be a
6572 royal PITA, I used a result from Maurice Herlihy
6573 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6574 demonstrates that compare-and-swap is a primitive more general
6575 than the other two, and so can be used to represent all of them.
6576 So the translation scheme for (eg) lock incl (%reg) is as
6577 follows:
6579 again:
6580 old = * %reg
6581 new = old + 1
6582 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6584 The "atomically" is the CAS bit. The scheme is always the same:
6585 get old value from memory, compute new value, atomically stuff
6586 new value back in memory iff the old value has not changed (iow,
6587 no other thread modified it in the meantime). If it has changed
6588 then we've been out-raced and we have to start over.
6590 Now that's all very neat, but it has the bad side effect of
6591 introducing an explicit equality test into the translation.
6592 Consider the behaviour of said code on a memory location which
6593 is uninitialised. We will wind up doing a comparison on
6594 uninitialised data, and mc duly complains.
6596 What's difficult about this is, the common case is that the
6597 location is uncontended, and so we're usually comparing the same
6598 value (* %reg) with itself. So we shouldn't complain even if it
6599 is undefined. But mc doesn't know that.
6601 My solution is to mark the == in the IR specially, so as to tell
6602 mc that it almost certainly compares a value with itself, and we
6603 should just regard the result as always defined. Rather than
6604 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6605 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6607 So there's always the question of, can this give a false
6608 negative? eg, imagine that initially, * %reg is defined; and we
6609 read that; but then in the gap between the read and the CAS, a
6610 different thread writes an undefined (and different) value at
6611 the location. Then the CAS in this thread will fail and we will
6612 go back to "again:", but without knowing that the trip back
6613 there was based on an undefined comparison. No matter; at least
6614 the other thread won the race and the location is correctly
6615 marked as undefined. What if it wrote an uninitialised version
6616 of the same value that was there originally, though?
6618 etc etc. Seems like there's a small corner case in which we
6619 might lose the fact that something's defined -- we're out-raced
6620 in between the "old = * reg" and the "atomically {", _and_ the
6621 other thread is writing in an undefined version of what's
6622 already there. Well, that seems pretty unlikely.
6626 If we ever need to reinstate it .. code which generates a
6627 definedness test for "expected == old" was removed at r10432 of
6628 this file.
6630 if (cas->oldHi == IRTemp_INVALID) {
6631 do_shadow_CAS_single( mce, cas );
6632 } else {
6633 do_shadow_CAS_double( mce, cas );
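/* Editorial illustration (not part of the original source): the
   "lock incl as a CAS loop" translation scheme described in
   COMMENT_ON_CasCmpEQ above, written out with the GCC/Clang __sync
   builtin.  This is only a sketch of the concept at the C level;
   Memcheck itself operates on the IR form. */
static __attribute__((unused))
void locked_increment_model ( UInt* p )
{
   while (1) {
      UInt old     = *p;
      UInt new_val = old + 1;
      /* atomically { if (*p == old) { *p = new_val } else retry } */
      if (__sync_val_compare_and_swap(p, old, new_val) == old)
         break;
   }
}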
6638 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
6640 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6641 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6642 IRAtom *voldLo = NULL, *boldLo = NULL;
6643 IRAtom *expd_eq_old = NULL;
6644 IROp opCasCmpEQ;
6645 Int elemSzB;
6646 IRType elemTy;
6647 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6649 /* single CAS */
6650 tl_assert(cas->oldHi == IRTemp_INVALID);
6651 tl_assert(cas->expdHi == NULL);
6652 tl_assert(cas->dataHi == NULL);
6654 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6655 switch (elemTy) {
6656 case Ity_I8: elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8; break;
6657 case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
6658 case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
6659 case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
6660 default: tl_assert(0); /* IR defn disallows any other types */
6663 /* 1. fetch data# (the proposed new value) */
6664 tl_assert(isOriginalAtom(mce, cas->dataLo));
6665 vdataLo
6666 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6667 tl_assert(isShadowAtom(mce, vdataLo));
6668 if (otrak) {
6669 bdataLo
6670 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6671 tl_assert(isShadowAtom(mce, bdataLo));
6674 /* 2. fetch expected# (what we expect to see at the address) */
6675 tl_assert(isOriginalAtom(mce, cas->expdLo));
6676 vexpdLo
6677 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6678 tl_assert(isShadowAtom(mce, vexpdLo));
6679 if (otrak) {
6680 bexpdLo
6681 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6682 tl_assert(isShadowAtom(mce, bexpdLo));
6685 /* 3. check definedness of address */
6686 /* 4. fetch old# from shadow memory; this also checks
6687 addressability of the address */
6688 voldLo
6689 = assignNew(
6690 'V', mce, elemTy,
6691 expr2vbits_Load(
6692 mce,
6693 cas->end, elemTy, cas->addr, 0/*Addr bias*/,
6694 NULL/*always happens*/
6696 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6697 if (otrak) {
6698 boldLo
6699 = assignNew('B', mce, Ity_I32,
6700 gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
6701 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6704 /* 5. the CAS itself */
6705 stmt( 'C', mce, IRStmt_CAS(cas) );
6707 /* 6. compute "expected == old" */
6708 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6709 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6710 tree, but it's not copied from the input block. */
6711 expd_eq_old
6712 = assignNew('C', mce, Ity_I1,
6713 binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
6715 /* 7. if "expected == old"
6716 store data# to shadow memory */
6717 do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
6718 NULL/*data*/, vdataLo/*vdata*/,
6719 expd_eq_old/*guard for store*/ );
6720 if (otrak) {
6721 gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
6722 bdataLo/*bdata*/,
6723 expd_eq_old/*guard for store*/ );
6728 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
6730 IRAtom *vdataHi = NULL, *bdataHi = NULL;
6731 IRAtom *vdataLo = NULL, *bdataLo = NULL;
6732 IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
6733 IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
6734 IRAtom *voldHi = NULL, *boldHi = NULL;
6735 IRAtom *voldLo = NULL, *boldLo = NULL;
6736 IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
6737 IRAtom *expd_eq_old = NULL, *zero = NULL;
6738 IROp opCasCmpEQ, opOr, opXor;
6739 Int elemSzB, memOffsLo, memOffsHi;
6740 IRType elemTy;
6741 Bool otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
6743 /* double CAS */
6744 tl_assert(cas->oldHi != IRTemp_INVALID);
6745 tl_assert(cas->expdHi != NULL);
6746 tl_assert(cas->dataHi != NULL);
6748 elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
6749 switch (elemTy) {
6750 case Ity_I8:
6751 opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
6752 elemSzB = 1; zero = mkU8(0);
6753 break;
6754 case Ity_I16:
6755 opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
6756 elemSzB = 2; zero = mkU16(0);
6757 break;
6758 case Ity_I32:
6759 opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
6760 elemSzB = 4; zero = mkU32(0);
6761 break;
6762 case Ity_I64:
6763 opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
6764 elemSzB = 8; zero = mkU64(0);
6765 break;
6766 default:
6767 tl_assert(0); /* IR defn disallows any other types */
6770 /* 1. fetch data# (the proposed new value) */
6771 tl_assert(isOriginalAtom(mce, cas->dataHi));
6772 tl_assert(isOriginalAtom(mce, cas->dataLo));
6773 vdataHi
6774 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi, HuOth));
6775 vdataLo
6776 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo, HuOth));
6777 tl_assert(isShadowAtom(mce, vdataHi));
6778 tl_assert(isShadowAtom(mce, vdataLo));
6779 if (otrak) {
6780 bdataHi
6781 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
6782 bdataLo
6783 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
6784 tl_assert(isShadowAtom(mce, bdataHi));
6785 tl_assert(isShadowAtom(mce, bdataLo));
6788 /* 2. fetch expected# (what we expect to see at the address) */
6789 tl_assert(isOriginalAtom(mce, cas->expdHi));
6790 tl_assert(isOriginalAtom(mce, cas->expdLo));
6791 vexpdHi
6792 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi, HuOth));
6793 vexpdLo
6794 = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo, HuOth));
6795 tl_assert(isShadowAtom(mce, vexpdHi));
6796 tl_assert(isShadowAtom(mce, vexpdLo));
6797 if (otrak) {
6798 bexpdHi
6799 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
6800 bexpdLo
6801 = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
6802 tl_assert(isShadowAtom(mce, bexpdHi));
6803 tl_assert(isShadowAtom(mce, bexpdLo));
6806 /* 3. check definedness of address */
6807 /* 4. fetch old# from shadow memory; this also checks
6808 addressability of the address */
6809 if (cas->end == Iend_LE) {
6810 memOffsLo = 0;
6811 memOffsHi = elemSzB;
6812 } else {
6813 tl_assert(cas->end == Iend_BE);
6814 memOffsLo = elemSzB;
6815 memOffsHi = 0;
6817 voldHi
6818 = assignNew(
6819 'V', mce, elemTy,
6820 expr2vbits_Load(
6821 mce,
6822 cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
6823 NULL/*always happens*/
6825 voldLo
6826 = assignNew(
6827 'V', mce, elemTy,
6828 expr2vbits_Load(
6829 mce,
6830 cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
6831 NULL/*always happens*/
6833 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
6834 bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
6835 if (otrak) {
6836 boldHi
6837 = assignNew('B', mce, Ity_I32,
6838 gen_load_b(mce, elemSzB, cas->addr,
6839 memOffsHi/*addr bias*/));
6840 boldLo
6841 = assignNew('B', mce, Ity_I32,
6842 gen_load_b(mce, elemSzB, cas->addr,
6843 memOffsLo/*addr bias*/));
6844 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
6845 bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
6848 /* 5. the CAS itself */
6849 stmt( 'C', mce, IRStmt_CAS(cas) );
6851 /* 6. compute "expected == old" */
6852 /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
6853 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6854 tree, but it's not copied from the input block. */
6856 xHi = oldHi ^ expdHi;
6857 xLo = oldLo ^ expdLo;
6858 xHL = xHi | xLo;
6859 expd_eq_old = xHL == 0;
6861 xHi = assignNew('C', mce, elemTy,
6862 binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
6863 xLo = assignNew('C', mce, elemTy,
6864 binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
6865 xHL = assignNew('C', mce, elemTy,
6866 binop(opOr, xHi, xLo));
6867 expd_eq_old
6868 = assignNew('C', mce, Ity_I1,
6869 binop(opCasCmpEQ, xHL, zero));
6871 /* 7. if "expected == old"
6872 store data# to shadow memory */
6873 do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
6874 NULL/*data*/, vdataHi/*vdata*/,
6875 expd_eq_old/*guard for store*/ );
6876 do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
6877 NULL/*data*/, vdataLo/*vdata*/,
6878 expd_eq_old/*guard for store*/ );
6879 if (otrak) {
6880 gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
6881 bdataHi/*bdata*/,
6882 expd_eq_old/*guard for store*/ );
6883 gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
6884 bdataLo/*bdata*/,
6885 expd_eq_old/*guard for store*/ );
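/* Editorial illustration (not part of the original source): the
   "expected == old" computation used above for the double-width CAS,
   reduced to plain integers.  Both halves are checked with a single
   comparison by OR-ing the two XOR differences.  The function name is
   hypothetical. */
static __attribute__((unused))
Bool dcas_expected_eq_old_model ( ULong expdHi, ULong expdLo,
                                  ULong oldHi, ULong oldLo )
{
   ULong xHi = expdHi ^ oldHi;
   ULong xLo = expdLo ^ oldLo;
   ULong xHL = xHi | xLo;
   return xHL == 0;  /* corresponds to binop(opCasCmpEQ, xHL, zero) */
}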
6890 /* ------ Dealing with LL/SC (not difficult) ------ */
6892 static void do_shadow_LLSC ( MCEnv* mce,
6893 IREndness stEnd,
6894 IRTemp stResult,
6895 IRExpr* stAddr,
6896 IRExpr* stStoredata )
6898 /* In short: treat a load-linked like a normal load followed by an
6899 assignment of the loaded (shadow) data to the result temporary.
6900 Treat a store-conditional like a normal store, and mark the
6901 result temporary as defined. */
6902 IRType resTy = typeOfIRTemp(mce->sb->tyenv, stResult);
6903 IRTemp resTmp = findShadowTmpV(mce, stResult);
6905 tl_assert(isIRAtom(stAddr));
6906 if (stStoredata)
6907 tl_assert(isIRAtom(stStoredata));
6909 if (stStoredata == NULL) {
6910 /* Load Linked */
6911 /* Just treat this as a normal load, followed by an assignment of
6912 the value to .result. */
6913 /* Stay sane */
6914 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
6915 || resTy == Ity_I16 || resTy == Ity_I8);
6916 assign( 'V', mce, resTmp,
6917 expr2vbits_Load(
6918 mce, stEnd, resTy, stAddr, 0/*addr bias*/,
6919 NULL/*always happens*/) );
6920 } else {
6921 /* Store Conditional */
6922 /* Stay sane */
6923 IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
6924 stStoredata);
6925 tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
6926 || dataTy == Ity_I16 || dataTy == Ity_I8);
6927 do_shadow_Store( mce, stEnd,
6928 stAddr, 0/* addr bias */,
6929 stStoredata,
6930 NULL /* shadow data */,
6931 NULL/*guard*/ );
6932 /* This is a store conditional, so it writes to .result a value
6933 indicating whether or not the store succeeded. Just claim
6934 this value is always defined. In the PowerPC interpretation
6935 of store-conditional, definedness of the success indication
6936 depends on whether the address of the store matches the
6937 reservation address. But we can't tell that here (and
6938 anyway, we're not being PowerPC-specific). At least we are
6939 guaranteed that the definedness of the store address, and its
6940 addressability, will be checked as per normal. So it seems
6941 pretty safe to just say that the success indication is always
6942 defined.
6944 In schemeS, for origin tracking, we must correspondingly set
6945 a no-origin value for the origin shadow of .result.
6947 tl_assert(resTy == Ity_I1);
6948 assign( 'V', mce, resTmp, definedOfType(resTy) );
6953 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6955 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
6957 complainIfUndefined(mce, sg->guard, NULL);
6958 /* do_shadow_Store will generate code to check the definedness and
6959 validity of sg->addr, in the case where sg->guard evaluates to
6960 True at run-time. */
6961 do_shadow_Store( mce, sg->end,
6962 sg->addr, 0/* addr bias */,
6963 sg->data,
6964 NULL /* shadow data */,
6965 sg->guard );
6968 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
6970 complainIfUndefined(mce, lg->guard, NULL);
6971 /* expr2vbits_Load_guarded_General will generate code to check the
6972 definedness and validity of lg->addr, in the case where
6973 lg->guard evaluates to True at run-time. */
6975 /* Look at the LoadG's built-in conversion operation, to determine
6976 the source (actual loaded data) type, and the equivalent IROp.
6977 NOTE that implicitly we are taking a widening operation to be
6978 applied to original atoms and producing one that applies to V
6979 bits. Since signed and unsigned widening are self-shadowing,
6980 this is a straight copy of the op (modulo swapping from the
6981 IRLoadGOp form to the IROp form). Note also therefore that this
6982 implicitly duplicates the logic to do with said widening ops in
6983 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
6984 IROp vwiden = Iop_INVALID;
6985 IRType loadedTy = Ity_INVALID;
6986 switch (lg->cvt) {
6987 case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
6988 case ILGop_Ident64: loadedTy = Ity_I64; vwiden = Iop_INVALID; break;
6989 case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
6990 case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
6991 case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
6992 case ILGop_8Uto32: loadedTy = Ity_I8; vwiden = Iop_8Uto32; break;
6993 case ILGop_8Sto32: loadedTy = Ity_I8; vwiden = Iop_8Sto32; break;
6994 default: VG_(tool_panic)("do_shadow_LoadG");
6997 IRAtom* vbits_alt
6998 = expr2vbits( mce, lg->alt, HuOth );
6999 IRAtom* vbits_final
7000 = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
7001 lg->addr, 0/*addr bias*/,
7002 lg->guard, vwiden, vbits_alt );
7003 /* And finally, bind the V bits to the destination temporary. */
7004 assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
7008 /*------------------------------------------------------------*/
7009 /*--- Origin tracking stuff ---*/
7010 /*------------------------------------------------------------*/
7012 /* Almost identical to findShadowTmpV. */
7013 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
7015 TempMapEnt* ent;
7016 /* VG_(indexXA) range-checks 'orig', hence no need to check
7017 here. */
7018 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7019 tl_assert(ent->kind == Orig);
7020 if (ent->shadowB == IRTemp_INVALID) {
7021 IRTemp tmpB
7022 = newTemp( mce, Ity_I32, BSh );
7023 /* newTemp may cause mce->tmpMap to resize, hence previous results
7024 from VG_(indexXA) are invalid. */
7025 ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
7026 tl_assert(ent->kind == Orig);
7027 tl_assert(ent->shadowB == IRTemp_INVALID);
7028 ent->shadowB = tmpB;
7030 return ent->shadowB;
7033 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
7035 return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
7039 /* Make a guarded origin load, with no special handling in the
7040 didn't-happen case. A GUARD of NULL is assumed to mean "always
7041 True".
7043 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
7044 return the otag. The loaded size is SZB. If GUARD evaluates to
7045 False at run time then the returned otag is zero.
7047 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
7048 IRAtom* baseaddr,
7049 Int offset, IRExpr* guard )
7051 void* hFun;
7052 const HChar* hName;
7053 IRTemp bTmp;
7054 IRDirty* di;
7055 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7056 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7057 IRAtom* ea = baseaddr;
7058 if (offset != 0) {
7059 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7060 : mkU64( (Long)(Int)offset );
7061 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7063 bTmp = newTemp(mce, mce->hWordTy, BSh);
7065 switch (szB) {
7066 case 1: hFun = (void*)&MC_(helperc_b_load1);
7067 hName = "MC_(helperc_b_load1)";
7068 break;
7069 case 2: hFun = (void*)&MC_(helperc_b_load2);
7070 hName = "MC_(helperc_b_load2)";
7071 break;
7072 case 4: hFun = (void*)&MC_(helperc_b_load4);
7073 hName = "MC_(helperc_b_load4)";
7074 break;
7075 case 8: hFun = (void*)&MC_(helperc_b_load8);
7076 hName = "MC_(helperc_b_load8)";
7077 break;
7078 case 16: hFun = (void*)&MC_(helperc_b_load16);
7079 hName = "MC_(helperc_b_load16)";
7080 break;
7081 case 32: hFun = (void*)&MC_(helperc_b_load32);
7082 hName = "MC_(helperc_b_load32)";
7083 break;
7084 default:
7085 VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
7086 tl_assert(0);
7088 di = unsafeIRDirty_1_N(
7089 bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
7090 mkIRExprVec_1( ea )
7092 if (guard) {
7093 di->guard = guard;
7094 /* Ideally the didn't-happen return value here would be
7095 all-zeroes (unknown-origin), so it'd be harmless if it got
7096 used inadvertently. We slum it out with the IR-mandated
7097 default value (0b01 repeating, 0x55 etc) as that'll probably
7098 trump all legitimate otags via Max32, and it's pretty
7099 obviously bogus. */
7101 /* no need to mess with any annotations. This call accesses
7102 neither guest state nor guest memory. */
7103 stmt( 'B', mce, IRStmt_Dirty(di) );
7104 if (mce->hWordTy == Ity_I64) {
7105 /* 64-bit host */
7106 IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
7107 assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
7108 return mkexpr(bTmp32);
7109 } else {
7110 /* 32-bit host */
7111 return mkexpr(bTmp);
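/* Editor's note: an illustrative, standalone sketch (not part of Memcheck)
   modelling the didn't-happen path described above on a 64-bit host: the
   guarded dirty call leaves the IR-mandated default 0x5555555555555555 in
   its destination, the Iop_64to32 narrowing turns that into 0x55555555, and
   -- assuming, as the comment above itself hedges, that legitimate otags are
   numerically smaller -- a Max32U merge keeps the obviously-bogus value.
   The otag value used here is made up purely for illustration. */
#if 0
#include <assert.h>
#include <stdint.h>

static uint32_t max32u ( uint32_t a, uint32_t b ) { return a > b ? a : b; }

int main ( void )
{
   uint64_t dflt64 = 0x5555555555555555ULL;   /* IR default for guarded dirty */
   uint32_t dflt32 = (uint32_t)dflt64;        /* models Iop_64to32 */
   assert( dflt32 == 0x55555555u );

   uint32_t some_otag = 100000u;              /* hypothetical legitimate otag */
   assert( max32u(dflt32, some_otag) == dflt32 );
   return 0;
}
#endif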
7116 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
7117 loaded size is SZB. The load is regarded as unconditional (always
7118 happens).
7120 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
7121 Int offset )
7123 return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
7127 /* The most general handler for guarded origin loads. A GUARD of NULL
7128 is assumed to mean "always True".
7130 Generate IR to do a shadow origin load from ADDR+BIAS and return
7131 the B bits. The loaded type is TY. If GUARD evaluates to False at
7132 run time then the returned B bits are simply BALT instead.
7134 static
7135 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
7136 IRType ty,
7137 IRAtom* addr, UInt bias,
7138 IRAtom* guard, IRAtom* balt )
7140 /* If the guard evaluates to True, this will hold the loaded
7141 origin. If the guard evaluates to False, this will be zero,
7142 meaning "unknown origin", in which case we will have to replace
7143 it using an ITE below. */
7144 IRAtom* iftrue
7145 = assignNew('B', mce, Ity_I32,
7146 gen_guarded_load_b(mce, sizeofIRType(ty),
7147 addr, bias, guard));
7148 /* These are the bits we will return if the load doesn't take
7149 place. */
7150 IRAtom* iffalse
7151 = balt;
7152 /* Prepare the cond for the ITE. Convert a NULL cond into
7153 something that iropt knows how to fold out later. */
7154 IRAtom* cond
7155 = guard == NULL ? mkU1(1) : guard;
7156 /* And assemble the final result. */
7157 return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
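/* Editor's note: an illustrative, standalone model (not part of Memcheck) of
   the value computed just above: if the guard holds, the otag loaded from
   shadow memory is used, otherwise the caller-supplied alternative BALT is
   used, and a NULL guard means "always True".  Names and otag values are
   made up. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t toy_guarded_load_otag ( const bool* guard,
                                        uint32_t loaded, uint32_t balt )
{
   bool cond = guard ? *guard : true;   /* NULL guard == always True */
   return cond ? loaded : balt;
}

int main ( void )
{
   bool yes = true, no = false;
   assert( toy_guarded_load_otag(&yes, 7, 99) == 7  );
   assert( toy_guarded_load_otag(&no,  7, 99) == 99 );
   assert( toy_guarded_load_otag(NULL, 7, 99) == 7  );
   return 0;
}
#endif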
7161 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7162 the store really happens; NULL means it unconditionally does. */
7163 static void gen_store_b ( MCEnv* mce, Int szB,
7164 IRAtom* baseaddr, Int offset, IRAtom* dataB,
7165 IRAtom* guard )
7167 void* hFun;
7168 const HChar* hName;
7169 IRDirty* di;
7170 IRType aTy = typeOfIRExpr( mce->sb->tyenv, baseaddr );
7171 IROp opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
7172 IRAtom* ea = baseaddr;
7173 if (guard) {
7174 tl_assert(isOriginalAtom(mce, guard));
7175 tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
7177 if (offset != 0) {
7178 IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
7179 : mkU64( (Long)(Int)offset );
7180 ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
7182 if (mce->hWordTy == Ity_I64)
7183 dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
7185 switch (szB) {
7186 case 1: hFun = (void*)&MC_(helperc_b_store1);
7187 hName = "MC_(helperc_b_store1)";
7188 break;
7189 case 2: hFun = (void*)&MC_(helperc_b_store2);
7190 hName = "MC_(helperc_b_store2)";
7191 break;
7192 case 4: hFun = (void*)&MC_(helperc_b_store4);
7193 hName = "MC_(helperc_b_store4)";
7194 break;
7195 case 8: hFun = (void*)&MC_(helperc_b_store8);
7196 hName = "MC_(helperc_b_store8)";
7197 break;
7198 case 16: hFun = (void*)&MC_(helperc_b_store16);
7199 hName = "MC_(helperc_b_store16)";
7200 break;
7201 case 32: hFun = (void*)&MC_(helperc_b_store32);
7202 hName = "MC_(helperc_b_store32)";
7203 break;
7204 default:
7205 tl_assert(0);
7207 di = unsafeIRDirty_0_N( 2/*regparms*/,
7208 hName, VG_(fnptr_to_fnentry)( hFun ),
7209 mkIRExprVec_2( ea, dataB )
7211 /* no need to mess with any annotations. This call accesses
7212 neither guest state nor guest memory. */
7213 if (guard) di->guard = guard;
7214 stmt( 'B', mce, IRStmt_Dirty(di) );
7217 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
7218 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7219 if (eTy == Ity_I64)
7220 return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
7221 if (eTy == Ity_I32)
7222 return e;
7223 tl_assert(0);
7226 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
7227 IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
7228 tl_assert(eTy == Ity_I32);
7229 if (dstTy == Ity_I64)
7230 return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
7231 tl_assert(0);
7235 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
7237 tl_assert(MC_(clo_mc_level) == 3);
7239 switch (e->tag) {
7241 case Iex_GetI: {
7242 IRRegArray* descr_b;
7243 IRAtom *t1, *t2, *t3, *t4;
7244 IRRegArray* descr = e->Iex.GetI.descr;
7245 IRType equivIntTy
7246 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7247 /* If this array is unshadowable for whatever reason, use the
7248 usual approximation. */
7249 if (equivIntTy == Ity_INVALID)
7250 return mkU32(0);
7251 tl_assert(sizeofIRType(equivIntTy) >= 4);
7252 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7253 descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7254 equivIntTy, descr->nElems );
7255 /* Do a shadow indexed get of the same size, giving t1. Take
7256 the bottom 32 bits of it, giving t2. Compute into t3 the
7257 origin for the index (almost certainly zero, but there's
7258 no harm in being completely general here, since iropt will
7259 remove any useless code), and fold it in, giving a final
7260 value t4. */
7261 t1 = assignNew( 'B', mce, equivIntTy,
7262 IRExpr_GetI( descr_b, e->Iex.GetI.ix,
7263 e->Iex.GetI.bias ));
7264 t2 = narrowTo32( mce, t1 );
7265 t3 = schemeE( mce, e->Iex.GetI.ix );
7266 t4 = gen_maxU32( mce, t2, t3 );
7267 return t4;
7269 case Iex_CCall: {
7270 Int i;
7271 IRAtom* here;
7272 IRExpr** args = e->Iex.CCall.args;
7273 IRAtom* curr = mkU32(0);
7274 for (i = 0; args[i]; i++) {
7275 tl_assert(i < 32);
7276 tl_assert(isOriginalAtom(mce, args[i]));
7277 /* Only take notice of this arg if the callee's
7278 mc-exclusion mask does not say it is to be excluded. */
7279 if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
7280 /* the arg is to be excluded from definedness checking.
7281 Do nothing. */
7282 if (0) VG_(printf)("excluding %s(%d)\n",
7283 e->Iex.CCall.cee->name, i);
7284 } else {
7285 /* calculate the arg's origin, and pessimistically
7286 merge it in. */
7287 here = schemeE( mce, args[i] );
7288 curr = gen_maxU32( mce, curr, here );
7291 return curr;
7293 case Iex_Load: {
7294 Int dszB;
7295 dszB = sizeofIRType(e->Iex.Load.ty);
7296 /* assert that the B value for the address is already
7297 available (somewhere) */
7298 tl_assert(isIRAtom(e->Iex.Load.addr));
7299 tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
7300 return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
7302 case Iex_ITE: {
7303 IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
7304 IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
7305 IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
7306 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
7308 case Iex_Qop: {
7309 IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
7310 IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
7311 IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
7312 IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
7313 return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
7314 gen_maxU32( mce, b3, b4 ) );
7316 case Iex_Triop: {
7317 IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
7318 IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
7319 IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
7320 return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
7322 case Iex_Binop: {
7323 switch (e->Iex.Binop.op) {
7324 case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
7325 case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
7326 case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
7327 case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
7328 /* Just say these all produce a defined result,
7329 regardless of their arguments. See
7330 COMMENT_ON_CasCmpEQ in this file. */
7331 return mkU32(0);
7332 default: {
7333 IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
7334 IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
7335 return gen_maxU32( mce, b1, b2 );
7338 tl_assert(0);
7339 /*NOTREACHED*/
7341 case Iex_Unop: {
7342 IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
7343 return b1;
7345 case Iex_Const:
7346 return mkU32(0);
7347 case Iex_RdTmp:
7348 return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
7349 case Iex_Get: {
7350 Int b_offset = MC_(get_otrack_shadow_offset)(
7351 e->Iex.Get.offset,
7352 sizeofIRType(e->Iex.Get.ty)
7354 tl_assert(b_offset >= -1
7355 && b_offset <= mce->layout->total_sizeB -4);
7356 if (b_offset >= 0) {
7357 /* FIXME: this isn't an atom! */
7358 return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
7359 Ity_I32 );
7361 return mkU32(0);
7363 default:
7364 VG_(printf)("mc_translate.c: schemeE: unhandled: ");
7365 ppIRExpr(e);
7366 VG_(tool_panic)("memcheck:schemeE");
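/* Editor's note: an illustrative, standalone toy (not part of Memcheck)
   mirroring the shape of schemeE above: constants carry no origin (otag 0),
   and for compound expressions the operands' otags are pessimistically
   merged with an unsigned max.  Only a few expression forms are modelled;
   the real code also handles GetI, CCall argument masking, loads, etc.
   All names and otag values are made up. */
#if 0
#include <assert.h>
#include <stdint.h>

typedef enum { TOY_CONST, TOY_TMP, TOY_BINOP, TOY_ITE } ToyTag;

typedef struct ToyExpr {
   ToyTag tag;
   uint32_t otag;               /* for TOY_TMP: its B (origin) shadow value */
   struct ToyExpr *a, *b, *c;   /* operands, where applicable */
} ToyExpr;

static uint32_t max32u ( uint32_t x, uint32_t y ) { return x > y ? x : y; }

static uint32_t toy_schemeE ( const ToyExpr* e )
{
   switch (e->tag) {
      case TOY_CONST: return 0;
      case TOY_TMP:   return e->otag;
      case TOY_BINOP: return max32u(toy_schemeE(e->a), toy_schemeE(e->b));
      case TOY_ITE:   return max32u(toy_schemeE(e->a),
                                    max32u(toy_schemeE(e->b),
                                           toy_schemeE(e->c)));
   }
   return 0;
}

int main ( void )
{
   ToyExpr k   = { TOY_CONST, 0,  0, 0, 0 };
   ToyExpr t1  = { TOY_TMP,   7,  0, 0, 0 };
   ToyExpr t2  = { TOY_TMP,   42, 0, 0, 0 };
   ToyExpr add = { TOY_BINOP, 0,  &t1, &k,   0 };
   ToyExpr ite = { TOY_ITE,   0,  &t2, &add, &k };
   assert( toy_schemeE(&add) == 7 );
   assert( toy_schemeE(&ite) == 42 );
   return 0;
}
#endif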
7371 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
7373 // This is a hacked version of do_shadow_Dirty
7374 Int i, k, n, toDo, gSz, gOff;
7375 IRAtom *here, *curr;
7376 IRTemp dst;
7378 /* First check the guard. */
7379 curr = schemeE( mce, d->guard );
7381 /* Now round up all inputs and maxU32 over them. */
7383 /* Inputs: unmasked args
7384 Note: arguments are evaluated REGARDLESS of the guard expression */
7385 for (i = 0; d->args[i]; i++) {
7386 IRAtom* arg = d->args[i];
7387 if ( (d->cee->mcx_mask & (1<<i))
7388 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
7389 /* ignore this arg */
7390 } else {
7391 here = schemeE( mce, arg );
7392 curr = gen_maxU32( mce, curr, here );
7396 /* Inputs: guest state that we read. */
7397 for (i = 0; i < d->nFxState; i++) {
7398 tl_assert(d->fxState[i].fx != Ifx_None);
7399 if (d->fxState[i].fx == Ifx_Write)
7400 continue;
7402 /* Enumerate the described state segments */
7403 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7404 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7405 gSz = d->fxState[i].size;
7407 /* Ignore any sections marked as 'always defined'. */
7408 if (isAlwaysDefd(mce, gOff, gSz)) {
7409 if (0)
7410 VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7411 gOff, gSz);
7412 continue;
7415 /* This state element is read or modified. So we need to
7416 consider it. If larger than 4 bytes, deal with it in
7417 4-byte chunks. */
7418 while (True) {
7419 Int b_offset;
7420 tl_assert(gSz >= 0);
7421 if (gSz == 0) break;
7422 n = gSz <= 4 ? gSz : 4;
7423 /* update 'curr' with maxU32 of the state slice
7424 gOff .. gOff+n-1 */
7425 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7426 if (b_offset != -1) {
7427 /* Observe the guard expression. If it is false use 0, i.e.
7428 nothing is known about the origin */
7429 IRAtom *cond, *iffalse, *iftrue;
7431 cond = assignNew( 'B', mce, Ity_I1, d->guard);
7432 iffalse = mkU32(0);
7433 iftrue = assignNew( 'B', mce, Ity_I32,
7434 IRExpr_Get(b_offset
7435 + 2*mce->layout->total_sizeB,
7436 Ity_I32));
7437 here = assignNew( 'B', mce, Ity_I32,
7438 IRExpr_ITE(cond, iftrue, iffalse));
7439 curr = gen_maxU32( mce, curr, here );
7441 gSz -= n;
7442 gOff += n;
7447 /* Inputs: memory */
7449 if (d->mFx != Ifx_None) {
7450 /* Because we may do multiple shadow loads/stores from the same
7451 base address, it's best to compute its origin tag just once,
7452 right now. Post-instrumentation optimisation should remove
7453 all but this computation. */
7454 tl_assert(d->mAddr);
7455 here = schemeE( mce, d->mAddr );
7456 curr = gen_maxU32( mce, curr, here );
7459 /* Deal with memory inputs (reads or modifies) */
7460 if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
7461 toDo = d->mSize;
7462 /* chew off 32-bit chunks. We don't care about the endianness
7463 since it's all going to be max-merged into a single origin tag,
7464 but nevertheless choose an endianness which is hopefully
7465 native to the platform. */
7466 while (toDo >= 4) {
7467 here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
7468 d->guard );
7469 curr = gen_maxU32( mce, curr, here );
7470 toDo -= 4;
7472 /* handle possible 16-bit excess */
7473 while (toDo >= 2) {
7474 here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
7475 d->guard );
7476 curr = gen_maxU32( mce, curr, here );
7477 toDo -= 2;
7479 /* chew off the remaining 8-bit chunk, if any */
7480 if (toDo == 1) {
7481 here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
7482 d->guard );
7483 curr = gen_maxU32( mce, curr, here );
7484 toDo -= 1;
7486 tl_assert(toDo == 0);
7489 /* Whew! So curr is a 32-bit B-value which should give an origin
7490 of some use if any of the inputs to the helper are undefined.
7491 Now we need to re-distribute the results to all destinations. */
7493 /* Outputs: the destination temporary, if there is one. */
7494 if (d->tmp != IRTemp_INVALID) {
7495 dst = findShadowTmpB(mce, d->tmp);
7496 assign( 'B', mce, dst, curr ); /* 'B': this is an origin (B) shadow temp */
7499 /* Outputs: guest state that we write or modify. */
7500 for (i = 0; i < d->nFxState; i++) {
7501 tl_assert(d->fxState[i].fx != Ifx_None);
7502 if (d->fxState[i].fx == Ifx_Read)
7503 continue;
7505 /* Enumerate the described state segments */
7506 for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
7507 gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
7508 gSz = d->fxState[i].size;
7510 /* Ignore any sections marked as 'always defined'. */
7511 if (isAlwaysDefd(mce, gOff, gSz))
7512 continue;
7514 /* This state element is written or modified. So we need to
7515 consider it. If larger than 4 bytes, deal with it in
7516 4-byte chunks. */
7517 while (True) {
7518 Int b_offset;
7519 tl_assert(gSz >= 0);
7520 if (gSz == 0) break;
7521 n = gSz <= 4 ? gSz : 4;
7522 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7523 b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
7524 if (b_offset != -1) {
7526 /* If the guard expression evaluates to false we simply Put
7527 the value that is already stored in the guest state slot */
7528 IRAtom *cond, *iffalse;
7530 cond = assignNew('B', mce, Ity_I1,
7531 d->guard);
7532 iffalse = assignNew('B', mce, Ity_I32,
7533 IRExpr_Get(b_offset +
7534 2*mce->layout->total_sizeB,
7535 Ity_I32));
7536 curr = assignNew('B', mce, Ity_I32,
7537 IRExpr_ITE(cond, curr, iffalse));
7539 stmt( 'B', mce, IRStmt_Put(b_offset
7540 + 2*mce->layout->total_sizeB,
7541 curr ));
7543 gSz -= n;
7544 gOff += n;
7549 /* Outputs: memory that we write or modify. Same comments about
7550 endianness as above apply. */
7551 if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
7552 toDo = d->mSize;
7553 /* chew off 32-bit chunks */
7554 while (toDo >= 4) {
7555 gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
7556 d->guard );
7557 toDo -= 4;
7559 /* handle possible 16-bit excess */
7560 while (toDo >= 2) {
7561 gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
7562 d->guard );
7563 toDo -= 2;
7565 /* chew off the remaining 8-bit chunk, if any */
7566 if (toDo == 1) {
7567 gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
7568 d->guard );
7569 toDo -= 1;
7571 tl_assert(toDo == 0);
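/* Editor's note: an illustrative, standalone check (not part of Memcheck) of
   the chunking pattern used above for memory inputs and outputs: a transfer
   of mSize bytes is processed as 4-byte chunks, then 2-byte, then a final
   1-byte chunk, with each chunk's offset being mSize - toDo.  The sketch
   verifies that these (offset, size) pairs exactly tile [0, mSize). */
#if 0
#include <assert.h>

static void check_chunking ( int mSize )
{
   int covered = 0;
   int toDo    = mSize;
   while (toDo >= 4) { assert(mSize - toDo == covered); covered += 4; toDo -= 4; }
   while (toDo >= 2) { assert(mSize - toDo == covered); covered += 2; toDo -= 2; }
   if    (toDo == 1) { assert(mSize - toDo == covered); covered += 1; toDo -= 1; }
   assert(toDo == 0 && covered == mSize);
}

int main ( void )
{
   for (int sz = 0; sz <= 64; sz++)
      check_chunking(sz);
   return 0;
}
#endif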
7576 /* Generate IR for origin shadowing for a general guarded store. */
7577 static void do_origins_Store_guarded ( MCEnv* mce,
7578 IREndness stEnd,
7579 IRExpr* stAddr,
7580 IRExpr* stData,
7581 IRExpr* guard )
7583 Int dszB;
7584 IRAtom* dataB;
7585 /* assert that the B value for the address is already available
7586 (somewhere), since the call to schemeE will want to see it.
7587 XXXX how does this actually ensure that?? */
7588 tl_assert(isIRAtom(stAddr));
7589 tl_assert(isIRAtom(stData));
7590 dszB = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
7591 dataB = schemeE( mce, stData );
7592 gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
7596 /* Generate IR for origin shadowing for a plain store. */
7597 static void do_origins_Store_plain ( MCEnv* mce,
7598 IREndness stEnd,
7599 IRExpr* stAddr,
7600 IRExpr* stData )
7602 do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
7603 NULL/*guard*/ );
7607 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7609 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
7611 do_origins_Store_guarded( mce, sg->end, sg->addr,
7612 sg->data, sg->guard );
7615 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
7617 IRType loadedTy = Ity_INVALID;
7618 switch (lg->cvt) {
7619 case ILGop_IdentV128: loadedTy = Ity_V128; break;
7620 case ILGop_Ident64: loadedTy = Ity_I64; break;
7621 case ILGop_Ident32: loadedTy = Ity_I32; break;
7622 case ILGop_16Uto32: loadedTy = Ity_I16; break;
7623 case ILGop_16Sto32: loadedTy = Ity_I16; break;
7624 case ILGop_8Uto32: loadedTy = Ity_I8; break;
7625 case ILGop_8Sto32: loadedTy = Ity_I8; break;
7626 default: VG_(tool_panic)("schemeS.IRLoadG");
7628 IRAtom* ori_alt
7629 = schemeE( mce,lg->alt );
7630 IRAtom* ori_final
7631 = expr2ori_Load_guarded_General(mce, loadedTy,
7632 lg->addr, 0/*addr bias*/,
7633 lg->guard, ori_alt );
7634 /* And finally, bind the origin to the destination temporary. */
7635 assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
7639 static void schemeS ( MCEnv* mce, IRStmt* st )
7641 tl_assert(MC_(clo_mc_level) == 3);
7643 switch (st->tag) {
7645 case Ist_AbiHint:
7646 /* The value-check instrumenter handles this - by arranging
7647 to pass the address of the next instruction to
7648 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7649 happen for origin tracking w.r.t. AbiHints. So there is
7650 nothing to do here. */
7651 break;
7653 case Ist_PutI: {
7654 IRPutI *puti = st->Ist.PutI.details;
7655 IRRegArray* descr_b;
7656 IRAtom *t1, *t2, *t3, *t4;
7657 IRRegArray* descr = puti->descr;
7658 IRType equivIntTy
7659 = MC_(get_otrack_reg_array_equiv_int_type)(descr);
7660 /* If this array is unshadowable for whatever reason,
7661 generate no code. */
7662 if (equivIntTy == Ity_INVALID)
7663 break;
7664 tl_assert(sizeofIRType(equivIntTy) >= 4);
7665 tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
7666 descr_b
7667 = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
7668 equivIntTy, descr->nElems );
7669 /* Compute a value to Put - the conjoinment of the origin for
7670 the data to be Put-ted (obviously) and of the index value
7671 (not so obviously). */
7672 t1 = schemeE( mce, puti->data );
7673 t2 = schemeE( mce, puti->ix );
7674 t3 = gen_maxU32( mce, t1, t2 );
7675 t4 = zWidenFrom32( mce, equivIntTy, t3 );
7676 stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
7677 puti->bias, t4) ));
7678 break;
7681 case Ist_Dirty:
7682 do_origins_Dirty( mce, st->Ist.Dirty.details );
7683 break;
7685 case Ist_Store:
7686 do_origins_Store_plain( mce, st->Ist.Store.end,
7687 st->Ist.Store.addr,
7688 st->Ist.Store.data );
7689 break;
7691 case Ist_StoreG:
7692 do_origins_StoreG( mce, st->Ist.StoreG.details );
7693 break;
7695 case Ist_LoadG:
7696 do_origins_LoadG( mce, st->Ist.LoadG.details );
7697 break;
7699 case Ist_LLSC: {
7700 /* In short: treat a load-linked like a normal load followed
7701 by an assignment of the loaded (shadow) data to the result
7702 temporary. Treat a store-conditional like a normal store,
7703 and mark the result temporary as defined. */
7704 if (st->Ist.LLSC.storedata == NULL) {
7705 /* Load Linked */
7706 IRType resTy
7707 = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
7708 IRExpr* vanillaLoad
7709 = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
7710 tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
7711 || resTy == Ity_I16 || resTy == Ity_I8);
7712 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7713 schemeE(mce, vanillaLoad));
7714 } else {
7715 /* Store conditional */
7716 do_origins_Store_plain( mce, st->Ist.LLSC.end,
7717 st->Ist.LLSC.addr,
7718 st->Ist.LLSC.storedata );
7719 /* For the rationale behind this, see comments at the
7720 place where the V-shadow for .result is constructed, in
7721 do_shadow_LLSC. In short, we regard .result as
7722 always-defined. */
7723 assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
7724 mkU32(0) );
7726 break;
7729 case Ist_Put: {
7730 Int b_offset
7731 = MC_(get_otrack_shadow_offset)(
7732 st->Ist.Put.offset,
7733 sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
7735 if (b_offset >= 0) {
7736 /* FIXME: this isn't an atom! */
7737 stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
7738 schemeE( mce, st->Ist.Put.data )) );
7740 break;
7743 case Ist_WrTmp:
7744 assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
7745 schemeE(mce, st->Ist.WrTmp.data) );
7746 break;
7748 case Ist_MBE:
7749 case Ist_NoOp:
7750 case Ist_Exit:
7751 case Ist_IMark:
7752 break;
7754 default:
7755 VG_(printf)("mc_translate.c: schemeS: unhandled: ");
7756 ppIRStmt(st);
7757 VG_(tool_panic)("memcheck:schemeS");
7762 /*------------------------------------------------------------*/
7763 /*--- Post-tree-build final tidying ---*/
7764 /*------------------------------------------------------------*/
7766 /* This exploits the observation that Memcheck often produces
7767 repeated conditional calls of the form
7769 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7771 with the same guard expression G guarding the same helper call.
7772 The second and subsequent calls are redundant. This usually
7773 results from instrumentation of guest code containing multiple
7774 memory references at different constant offsets from the same base
7775 register. After optimisation of the instrumentation, you get a
7776 test for the definedness of the base register for each memory
7777 reference, which is kinda pointless. MC_(final_tidy) therefore
7778 looks for such repeated calls and removes all but the first. */
7781 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7782 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7783 get almost all the benefits of this transformation whilst causing
7784 the slide-back case to happen just often enough to be verifiably
7785 correct. For posterity, the numbers are:
7787 bz2-32
7789 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7790 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7791 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7792 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7793 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7794 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7795 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7796 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7797 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7798 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7799 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7800 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7801 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7803 bz2-64
7805 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7806 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7807 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7808 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7809 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7810 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7811 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7812 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7813 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7814 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7815 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7816 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7817 inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7820 /* Structs for recording which (helper, guard) pairs we have already
7821 seen. */
7823 #define N_TIDYING_PAIRS 16
7825 typedef
7826 struct { void* entry; IRExpr* guard; }
7827 Pair;
7829 typedef
7830 struct {
7831 Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
7832 UInt pairsUsed;
7834 Pairs;
7837 /* Return True if e1 and e2 definitely denote the same value (used to
7838 compare guards). Return False if unknown; False is the safe
7839 answer. Since guest registers and guest memory do not have the
7840 SSA property we must return False if any Gets or Loads appear in
7841 the expression. This implicitly assumes that e1 and e2 have the
7842 same IR type, which is always true here -- the type is Ity_I1. */
7844 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
7846 if (e1->tag != e2->tag)
7847 return False;
7848 switch (e1->tag) {
7849 case Iex_Const:
7850 return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
7851 case Iex_Binop:
7852 return e1->Iex.Binop.op == e2->Iex.Binop.op
7853 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
7854 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
7855 case Iex_Unop:
7856 return e1->Iex.Unop.op == e2->Iex.Unop.op
7857 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
7858 case Iex_RdTmp:
7859 return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
7860 case Iex_ITE:
7861 return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
7862 && sameIRValue( e1->Iex.ITE.iftrue, e2->Iex.ITE.iftrue )
7863 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
7864 case Iex_Qop:
7865 case Iex_Triop:
7866 case Iex_CCall:
7867 /* be lazy. Could define equality for these, but they never
7868 appear to be used. */
7869 return False;
7870 case Iex_Get:
7871 case Iex_GetI:
7872 case Iex_Load:
7873 /* be conservative - these may not give the same value each
7874 time */
7875 return False;
7876 case Iex_Binder:
7877 /* should never see this */
7878 /* fallthrough */
7879 default:
7880 VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
7881 ppIRExpr(e1);
7882 VG_(tool_panic)("memcheck:sameIRValue");
7883 return False;
7887 /* See if 'pairs' already has an entry for (entry, guard). Return
7888 True if so. If not, add an entry. */
7890 static
7891 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
7893 UInt i, n = tidyingEnv->pairsUsed;
7894 tl_assert(n <= N_TIDYING_PAIRS);
7895 for (i = 0; i < n; i++) {
7896 if (tidyingEnv->pairs[i].entry == entry
7897 && sameIRValue(tidyingEnv->pairs[i].guard, guard))
7898 return True;
7900 /* (guard, entry) wasn't found in the array. Add it at the end.
7901 If the array is already full, slide the entries one slot
7902 backwards. This means we will lose the ability to detect
7903 duplicates from the pair in slot zero, but that happens so
7904 rarely that it's unlikely to have much effect on overall code
7905 quality. Also, the entry this strategy discards is the check for
7906 the oldest tracked exit (memory reference, basically), which is
7907 (I'd guess) the one least likely to be re-used after this point. */
7908 tl_assert(i == n);
7909 if (n == N_TIDYING_PAIRS) {
7910 for (i = 1; i < N_TIDYING_PAIRS; i++) {
7911 tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
7913 tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
7914 tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
7915 } else {
7916 tl_assert(n < N_TIDYING_PAIRS);
7917 tidyingEnv->pairs[n].entry = entry;
7918 tidyingEnv->pairs[n].guard = guard;
7919 n++;
7920 tidyingEnv->pairsUsed = n;
7922 return False;
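/* Editor's note: an illustrative, shrunken, standalone model (not part of
   Memcheck) of check_or_add above: a 4-entry table instead of 16, and plain
   ints instead of (helper address, guard expression) pairs.  It shows the
   slide-back behaviour once the table is full: the oldest entry (slot zero)
   is the one whose duplicate-detection is sacrificed. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#define N_PAIRS 4

typedef struct { int entry; int guard; } ToyPair;
typedef struct { ToyPair pairs[N_PAIRS]; unsigned used; } ToyPairs;

static bool toy_check_or_add ( ToyPairs* t, int guard, int entry )
{
   unsigned i, n = t->used;
   for (i = 0; i < n; i++)
      if (t->pairs[i].entry == entry && t->pairs[i].guard == guard)
         return true;
   if (n == N_PAIRS) {
      /* full: slide everything back one slot, dropping the oldest pair */
      memmove(&t->pairs[0], &t->pairs[1], (N_PAIRS-1) * sizeof(ToyPair));
      t->pairs[N_PAIRS-1].entry = entry;
      t->pairs[N_PAIRS-1].guard = guard;
   } else {
      t->pairs[n].entry = entry;
      t->pairs[n].guard = guard;
      t->used = n + 1;
   }
   return false;
}

int main ( void )
{
   ToyPairs t = { { {0,0} }, 0 };
   for (int k = 0; k < 5; k++)                 /* fill one past capacity */
      assert( !toy_check_or_add(&t, 0, k) );
   assert(  toy_check_or_add(&t, 0, 4) );      /* recent entry still detected */
   assert( !toy_check_or_add(&t, 0, 0) );      /* oldest entry was forgotten  */
   return 0;
}
#endif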
7925 static Bool is_helperc_value_checkN_fail ( const HChar* name )
7927 /* This is expensive because it happens a lot. We are checking to
7928 see whether |name| is one of the following 8 strings:
7930 MC_(helperc_value_check8_fail_no_o)
7931 MC_(helperc_value_check4_fail_no_o)
7932 MC_(helperc_value_check0_fail_no_o)
7933 MC_(helperc_value_check1_fail_no_o)
7934 MC_(helperc_value_check8_fail_w_o)
7935 MC_(helperc_value_check0_fail_w_o)
7936 MC_(helperc_value_check1_fail_w_o)
7937 MC_(helperc_value_check4_fail_w_o)
7939 To speed it up, check the common prefix just once, rather than
7940 all 8 times.
7942 const HChar* prefix = "MC_(helperc_value_check";
7944 HChar n, p;
7945 while (True) {
7946 n = *name;
7947 p = *prefix;
7948 if (p == 0) break; /* ran off the end of the prefix */
7949 /* We still have some prefix to use */
7950 if (n == 0) return False; /* have prefix, but name ran out */
7951 if (n != p) return False; /* have both pfx and name, but no match */
7952 name++;
7953 prefix++;
7956 /* Check the part after the prefix. */
7957 tl_assert(*prefix == 0 && *name != 0);
7958 return 0==VG_(strcmp)(name, "8_fail_no_o)")
7959 || 0==VG_(strcmp)(name, "4_fail_no_o)")
7960 || 0==VG_(strcmp)(name, "0_fail_no_o)")
7961 || 0==VG_(strcmp)(name, "1_fail_no_o)")
7962 || 0==VG_(strcmp)(name, "8_fail_w_o)")
7963 || 0==VG_(strcmp)(name, "4_fail_w_o)")
7964 || 0==VG_(strcmp)(name, "0_fail_w_o)")
7965 || 0==VG_(strcmp)(name, "1_fail_w_o)");
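/* Editor's note: an illustrative, standalone behavioural model (not part of
   Memcheck) of the matcher above: check the shared prefix once, then accept
   only the eight listed suffixes.  Unlike the original it uses the C library
   string functions rather than a hand-rolled loop, and it simply returns
   false (rather than asserting) if a name consists of the prefix alone. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <string.h>

static bool toy_is_checkN_fail ( const char* name )
{
   const char* prefix = "MC_(helperc_value_check";
   size_t plen = strlen(prefix);
   if (strncmp(name, prefix, plen) != 0)
      return false;
   name += plen;
   return 0==strcmp(name, "8_fail_no_o)") || 0==strcmp(name, "4_fail_no_o)")
       || 0==strcmp(name, "0_fail_no_o)") || 0==strcmp(name, "1_fail_no_o)")
       || 0==strcmp(name, "8_fail_w_o)")  || 0==strcmp(name, "4_fail_w_o)")
       || 0==strcmp(name, "0_fail_w_o)")  || 0==strcmp(name, "1_fail_w_o)");
}

int main ( void )
{
   assert(  toy_is_checkN_fail("MC_(helperc_value_check4_fail_no_o)") );
   assert(  toy_is_checkN_fail("MC_(helperc_value_check1_fail_w_o)") );
   assert( !toy_is_checkN_fail("MC_(helperc_b_load4)") );
   assert( !toy_is_checkN_fail("MC_(helperc_value_check2_fail_no_o)") );
   return 0;
}
#endif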
7968 IRSB* MC_(final_tidy) ( IRSB* sb_in )
7970 Int i;
7971 IRStmt* st;
7972 IRDirty* di;
7973 IRExpr* guard;
7974 IRCallee* cee;
7975 Bool alreadyPresent;
7976 Pairs pairs;
7978 pairs.pairsUsed = 0;
7980 pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
7981 pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
7983 /* Scan forwards through the statements. Each time a call to one
7984 of the relevant helpers is seen, check if we have made a
7985 previous call to the same helper using the same guard
7986 expression, and if so, delete the call. */
7987 for (i = 0; i < sb_in->stmts_used; i++) {
7988 st = sb_in->stmts[i];
7989 tl_assert(st);
7990 if (st->tag != Ist_Dirty)
7991 continue;
7992 di = st->Ist.Dirty.details;
7993 guard = di->guard;
7994 tl_assert(guard);
7995 if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
7996 cee = di->cee;
7997 if (!is_helperc_value_checkN_fail( cee->name ))
7998 continue;
7999 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
8000 guard 'guard'. Check if we have already seen a call to this
8001 function with the same guard. If so, delete it. If not,
8002 add it to the set of calls we do know about. */
8003 alreadyPresent = check_or_add( &pairs, guard, cee->addr );
8004 if (alreadyPresent) {
8005 sb_in->stmts[i] = IRStmt_NoOp();
8006 if (0) VG_(printf)("XX\n");
8010 tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
8011 tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
8013 return sb_in;
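/* Editor's note: an illustrative, standalone model (not part of Memcheck) of
   the overall tidy-up pass: walk a sequence of guarded helper calls and blank
   out every call whose (helper, guard) pair has already been seen, so only
   the first occurrence survives.  Helpers and guards are plain ints here and
   the example sequence is made up. */
#if 0
#include <assert.h>
#include <stdbool.h>

#define N_CALLS 6

typedef struct { int helper; int guard; bool kept; } ToyCall;

int main ( void )
{
   ToyCall calls[N_CALLS] = {
      {1, 10, true}, {1, 10, true}, {2, 10, true},
      {1, 11, true}, {1, 10, true}, {2, 10, true}
   };
   int seen_h[N_CALLS], seen_g[N_CALLS], nSeen = 0;

   for (int i = 0; i < N_CALLS; i++) {
      bool present = false;
      for (int k = 0; k < nSeen; k++)
         if (seen_h[k] == calls[i].helper && seen_g[k] == calls[i].guard)
            { present = true; break; }
      if (present) {
         calls[i].kept = false;        /* stands in for IRStmt_NoOp() */
      } else {
         seen_h[nSeen] = calls[i].helper;
         seen_g[nSeen] = calls[i].guard;
         nSeen++;
      }
   }
   assert(  calls[0].kept && !calls[1].kept &&  calls[2].kept );
   assert(  calls[3].kept && !calls[4].kept && !calls[5].kept );
   return 0;
}
#endif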
8016 #undef N_TIDYING_PAIRS
8019 /*------------------------------------------------------------*/
8020 /*--- Startup assertion checking ---*/
8021 /*------------------------------------------------------------*/
8023 void MC_(do_instrumentation_startup_checks)( void )
8025 /* Make a best-effort check to see that is_helperc_value_checkN_fail
8026 is working as we expect. */
8028 # define CHECK(_expected, _string) \
8029 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
8031 /* It should identify these 8, and no others, as targets. */
8032 CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
8033 CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
8034 CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
8035 CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
8036 CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
8037 CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
8038 CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
8039 CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
8041 /* Ad-hoc selection of other strings gathered via a quick test. */
8042 CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
8043 CHECK(False, "amd64g_dirtyhelper_RDTSC");
8044 CHECK(False, "MC_(helperc_b_load1)");
8045 CHECK(False, "MC_(helperc_b_load2)");
8046 CHECK(False, "MC_(helperc_b_load4)");
8047 CHECK(False, "MC_(helperc_b_load8)");
8048 CHECK(False, "MC_(helperc_b_load16)");
8049 CHECK(False, "MC_(helperc_b_load32)");
8050 CHECK(False, "MC_(helperc_b_store1)");
8051 CHECK(False, "MC_(helperc_b_store2)");
8052 CHECK(False, "MC_(helperc_b_store4)");
8053 CHECK(False, "MC_(helperc_b_store8)");
8054 CHECK(False, "MC_(helperc_b_store16)");
8055 CHECK(False, "MC_(helperc_b_store32)");
8056 CHECK(False, "MC_(helperc_LOADV8)");
8057 CHECK(False, "MC_(helperc_LOADV16le)");
8058 CHECK(False, "MC_(helperc_LOADV32le)");
8059 CHECK(False, "MC_(helperc_LOADV64le)");
8060 CHECK(False, "MC_(helperc_LOADV128le)");
8061 CHECK(False, "MC_(helperc_LOADV256le)");
8062 CHECK(False, "MC_(helperc_STOREV16le)");
8063 CHECK(False, "MC_(helperc_STOREV32le)");
8064 CHECK(False, "MC_(helperc_STOREV64le)");
8065 CHECK(False, "MC_(helperc_STOREV8)");
8066 CHECK(False, "track_die_mem_stack_8");
8067 CHECK(False, "track_new_mem_stack_8_w_ECU");
8068 CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
8069 CHECK(False, "VG_(unknown_SP_update_w_ECU)");
8071 # undef CHECK
8075 /*------------------------------------------------------------*/
8076 /*--- Memcheck main ---*/
8077 /*------------------------------------------------------------*/
8079 static Bool isBogusAtom ( IRAtom* at )
8081 if (at->tag == Iex_RdTmp)
8082 return False;
8083 tl_assert(at->tag == Iex_Const);
8085 ULong n = 0;
8086 IRConst* con = at->Iex.Const.con;
8087 switch (con->tag) {
8088 case Ico_U1: return False;
8089 case Ico_U8: n = (ULong)con->Ico.U8; break;
8090 case Ico_U16: n = (ULong)con->Ico.U16; break;
8091 case Ico_U32: n = (ULong)con->Ico.U32; break;
8092 case Ico_U64: n = (ULong)con->Ico.U64; break;
8093 case Ico_F32: return False;
8094 case Ico_F64: return False;
8095 case Ico_F32i: return False;
8096 case Ico_F64i: return False;
8097 case Ico_V128: return False;
8098 case Ico_V256: return False;
8099 default: ppIRExpr(at); tl_assert(0);
8101 /* VG_(printf)("%llx\n", n); */
8102 /* Shortcuts */
8103 if (LIKELY(n <= 0x0000000000001000ULL)) return False;
8104 if (LIKELY(n >= 0xFFFFFFFFFFFFF000ULL)) return False;
8105 /* The list of bogus atoms is: */
8106 return (/*32*/ n == 0xFEFEFEFFULL
8107 /*32*/ || n == 0x80808080ULL
8108 /*32*/ || n == 0x7F7F7F7FULL
8109 /*32*/ || n == 0x7EFEFEFFULL
8110 /*32*/ || n == 0x81010100ULL
8111 /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
8112 /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
8113 /*64*/ || n == 0x0000000000008080ULL
8114 /*64*/ || n == 0x8080808080808080ULL
8115 /*64*/ || n == 0x0101010101010101ULL
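/* Editor's note: an illustrative, standalone model (not part of Memcheck) of
   the constant classifier above.  The two range shortcuts cheaply reject
   ordinary small and small-negative constants, and only the handful of
   listed literals is reported as bogus.  The test values in main() are
   chosen for illustration only. */
#if 0
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool toy_is_bogus ( uint64_t n )
{
   if (n <= 0x0000000000001000ULL) return false;   /* small */
   if (n >= 0xFFFFFFFFFFFFF000ULL) return false;   /* small negative */
   return n == 0xFEFEFEFFULL         || n == 0x80808080ULL
       || n == 0x7F7F7F7FULL         || n == 0x7EFEFEFFULL
       || n == 0x81010100ULL
       || n == 0xFFFFFFFFFEFEFEFFULL || n == 0xFEFEFEFEFEFEFEFFULL
       || n == 0x0000000000008080ULL || n == 0x8080808080808080ULL
       || n == 0x0101010101010101ULL;
}

int main ( void )
{
   assert( !toy_is_bogus(42) );                    /* shortcut: small        */
   assert( !toy_is_bogus(0xFFFFFFFFFFFFFFFFULL) ); /* shortcut: -1           */
   assert(  toy_is_bogus(0x80808080ULL) );         /* listed magic value     */
   assert( !toy_is_bogus(0x123456789ULL) );        /* in range but unlisted  */
   return 0;
}
#endif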
8120 /* Does 'st' mention any of the literals identified/listed in
8121 isBogusAtom()? */
8122 static inline Bool containsBogusLiterals ( /*FLAT*/ IRStmt* st )
8124 Int i;
8125 IRExpr* e;
8126 IRDirty* d;
8127 IRCAS* cas;
8128 switch (st->tag) {
8129 case Ist_WrTmp:
8130 e = st->Ist.WrTmp.data;
8131 switch (e->tag) {
8132 case Iex_Get:
8133 case Iex_RdTmp:
8134 return False;
8135 case Iex_Const:
8136 return isBogusAtom(e);
8137 case Iex_Unop:
8138 return isBogusAtom(e->Iex.Unop.arg)
8139 || e->Iex.Unop.op == Iop_GetMSBs8x16;
8140 case Iex_GetI:
8141 return isBogusAtom(e->Iex.GetI.ix);
8142 case Iex_Binop:
8143 return isBogusAtom(e->Iex.Binop.arg1)
8144 || isBogusAtom(e->Iex.Binop.arg2);
8145 case Iex_Triop:
8146 return isBogusAtom(e->Iex.Triop.details->arg1)
8147 || isBogusAtom(e->Iex.Triop.details->arg2)
8148 || isBogusAtom(e->Iex.Triop.details->arg3);
8149 case Iex_Qop:
8150 return isBogusAtom(e->Iex.Qop.details->arg1)
8151 || isBogusAtom(e->Iex.Qop.details->arg2)
8152 || isBogusAtom(e->Iex.Qop.details->arg3)
8153 || isBogusAtom(e->Iex.Qop.details->arg4);
8154 case Iex_ITE:
8155 return isBogusAtom(e->Iex.ITE.cond)
8156 || isBogusAtom(e->Iex.ITE.iftrue)
8157 || isBogusAtom(e->Iex.ITE.iffalse);
8158 case Iex_Load:
8159 return isBogusAtom(e->Iex.Load.addr);
8160 case Iex_CCall:
8161 for (i = 0; e->Iex.CCall.args[i]; i++)
8162 if (isBogusAtom(e->Iex.CCall.args[i]))
8163 return True;
8164 return False;
8165 default:
8166 goto unhandled;
8168 case Ist_Dirty:
8169 d = st->Ist.Dirty.details;
8170 for (i = 0; d->args[i]; i++) {
8171 IRAtom* atom = d->args[i];
8172 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
8173 if (isBogusAtom(atom))
8174 return True;
8177 if (isBogusAtom(d->guard))
8178 return True;
8179 if (d->mAddr && isBogusAtom(d->mAddr))
8180 return True;
8181 return False;
8182 case Ist_Put:
8183 return isBogusAtom(st->Ist.Put.data);
8184 case Ist_PutI:
8185 return isBogusAtom(st->Ist.PutI.details->ix)
8186 || isBogusAtom(st->Ist.PutI.details->data);
8187 case Ist_Store:
8188 return isBogusAtom(st->Ist.Store.addr)
8189 || isBogusAtom(st->Ist.Store.data);
8190 case Ist_StoreG: {
8191 IRStoreG* sg = st->Ist.StoreG.details;
8192 return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
8193 || isBogusAtom(sg->guard);
8195 case Ist_LoadG: {
8196 IRLoadG* lg = st->Ist.LoadG.details;
8197 return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
8198 || isBogusAtom(lg->guard);
8200 case Ist_Exit:
8201 return isBogusAtom(st->Ist.Exit.guard);
8202 case Ist_AbiHint:
8203 return isBogusAtom(st->Ist.AbiHint.base)
8204 || isBogusAtom(st->Ist.AbiHint.nia);
8205 case Ist_NoOp:
8206 case Ist_IMark:
8207 case Ist_MBE:
8208 return False;
8209 case Ist_CAS:
8210 cas = st->Ist.CAS.details;
8211 return isBogusAtom(cas->addr)
8212 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
8213 || isBogusAtom(cas->expdLo)
8214 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
8215 || isBogusAtom(cas->dataLo);
8216 case Ist_LLSC:
8217 return isBogusAtom(st->Ist.LLSC.addr)
8218 || (st->Ist.LLSC.storedata
8219 ? isBogusAtom(st->Ist.LLSC.storedata)
8220 : False);
8221 default:
8222 unhandled:
8223 ppIRStmt(st);
8224 VG_(tool_panic)("containsBogusLiterals");
8229 /* This is the pre-instrumentation analysis. It does a backwards pass over
8230 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8231 the block.
8233 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8234 as a positive result from that is a strong indication that we need to
8235 expensively instrument add/sub in the block. We do both analyses in one
8236 pass, even though they are independent, so as to avoid the overhead of
8237 having to traverse the whole block twice.
8239 The usage pass proceeds as follows. Let max= be the max operation in the
8240 HowUsed lattice, hence
8242 X max= Y means X = max(X, Y)
8244 then
8246 for t in original tmps . useEnv[t] = HuUnU
8248 for t used in the block's .next field
8249 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8251 for st iterating *backwards* in the block
8253 match st
8255 case "t1 = load(t2)" // case 1
8256 useEnv[t2] max= HuPCa
8258 case "t1 = add(t2, t3)" // case 2
8259 useEnv[t2] max= useEnv[t1]
8260 useEnv[t3] max= useEnv[t1]
8262 other
8263 for t in st.usedTmps // case 3
8264 useEnv[t] max= HuOth
8265 // same as useEnv[t] = HuOth
8267 The general idea is that we accumulate, in useEnv[], information about
8268 how each tmp is used. That can be updated as we work further back
8269 through the block and find more uses of it, but its HowUsed value can
8270 only ascend the lattice, not descend.
8272 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8273 be used as a memory address, then its use is at least HuPCa. The point
8274 is that for a memory address we will add instrumentation to check if any
8275 bit of the address is undefined, which means that we won't need expensive
8276 V-bit propagation through an add expression that computed the address --
8277 cheap add instrumentation will be equivalent.
8279 Note in case (1) that if we have previously seen a non-memory-address use
8280 of the tmp, then its use will already be HuOth and will be unchanged by
8281 the max= operation. And if it turns out that the source of the tmp was
8282 an add, then we'll have to expensively instrument the add, because we
8283 can't prove that, for the previous non-memory-address use of the tmp,
8284 cheap and expensive instrumentation will be equivalent.
8286 In case 2, we propagate the usage-mode of the result of an add back
8287 through to its operands. Again, we use max= so as to take account of the
8288 fact that t2 or t3 might later in the block (viz, earlier in the
8289 iteration) have been used in a way that requires expensive add
8290 instrumentation.
8292 In case 3, we deal with all other tmp uses. We assume that we'll need a
8293 result that is as accurate as possible, so we max= HuOth into its use
8294 mode. Since HuOth is the top of the lattice, that's equivalent to just
8295 setting its use to HuOth.
8297 The net result of all this is that:
8299 tmps that are used either
8300 - only as a memory address, or
8301 - only as part of a tree of adds that computes a memory address,
8302 and has no other use
8303 are marked as HuPCa, and so we can instrument their generating Add
8304 nodes cheaply, which is the whole point of this analysis
8306 tmps that are used any other way at all are marked as HuOth
8308 tmps that are unused are marked as HuUnU. We don't expect to see any
8309 since we expect that the incoming IR has had all dead assignments
8310 removed by previous optimisation passes. Nevertheless the analysis is
8311 correct even in the presence of dead tmps.
8313 A final comment on dead tmps. In case 1 and case 2, we could actually
8314 conditionalise the updates thusly:
8316 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8318 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8319 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8321 In other words, if the assigned-to tmp |t1| is never used, then there's
8322 no point in propagating any use through to its operands. That won't
8323 change the final HuPCa-vs-HuOth results, which is what we care about.
8324 Given that we expect to get dead-code-free inputs, there's no point in
8325 adding this extra refinement.
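/* Editor's note: an illustrative, standalone toy (not part of Memcheck)
   walking the backwards use-analysis sketched above over a made-up
   three-statement block:
        t2 = Add64(t0, t1)   ; t2 is only ever used as a load address
        t3 = LDle:I64(t2)
        PUT(offs) = t3
   The lattice is HuUnU < HuPCa < HuOth and all updates are "max=".  The
   enum names carry a trailing underscore to make clear they are stand-ins
   for the real HowUsed values. */
#if 0
#include <assert.h>

typedef enum { HuUnU_ = 0, HuPCa_ = 1, HuOth_ = 2 } ToyHowUsed;

static void max_into ( ToyHowUsed* env, int t, ToyHowUsed u )
{
   if (u > env[t]) env[t] = u;
}

int main ( void )
{
   enum { t0, t1, t2, t3, NTMPS };
   ToyHowUsed env[NTMPS] = { HuUnU_, HuUnU_, HuUnU_, HuUnU_ };

   /* Walk the block backwards, as preInstrumentationAnalysis does. */
   max_into(env, t3, HuOth_);       /* PUT: data used in an unknown way   */
   max_into(env, t2, HuPCa_);       /* load: the address is only PCasted  */
   max_into(env, t0, env[t2]);      /* add: propagate t2's demand ...     */
   max_into(env, t1, env[t2]);      /* ... to both operands               */

   assert(env[t0] == HuPCa_ && env[t1] == HuPCa_);  /* => cheap Add64 is fine */
   assert(env[t2] == HuPCa_ && env[t3] == HuOth_);
   return 0;
}
#endif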
8328 /* Helper for |preInstrumentationAnalysis|. */
8329 static inline void noteTmpUsesIn ( /*MOD*/HowUsed* useEnv,
8330 UInt tyenvUsed,
8331 HowUsed newUse, IRAtom* at )
8333 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8334 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8335 use info. */
8336 switch (at->tag) {
8337 case Iex_GSPTR:
8338 case Iex_VECRET:
8339 case Iex_Const:
8340 return;
8341 case Iex_RdTmp: {
8342 IRTemp t = at->Iex.RdTmp.tmp;
8343 tl_assert(t < tyenvUsed); // "is an original tmp"
8344 // The "max" operation in the lattice
8345 if (newUse > useEnv[t]) useEnv[t] = newUse;
8346 return;
8348 default:
8349 // We should never get here -- it implies non-flat IR
8350 ppIRExpr(at);
8351 VG_(tool_panic)("noteTmpUsesIn");
8353 /*NOTREACHED*/
8354 tl_assert(0);
8358 static void preInstrumentationAnalysis ( /*OUT*/HowUsed** useEnvP,
8359 /*OUT*/Bool* hasBogusLiteralsP,
8360 const IRSB* sb_in )
8362 const UInt nOrigTmps = (UInt)sb_in->tyenv->types_used;
8364 // We've seen no bogus literals so far.
8365 Bool bogus = False;
8367 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8368 HowUsed* useEnv = VG_(calloc)("mc.preInstrumentationAnalysis.1",
8369 nOrigTmps, sizeof(HowUsed));
8371 // Firstly, roll in contributions from the final dst address.
8372 bogus = isBogusAtom(sb_in->next);
8373 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, sb_in->next);
8375 // Now work backwards through the stmts.
8376 for (Int i = sb_in->stmts_used-1; i >= 0; i--) {
8377 IRStmt* st = sb_in->stmts[i];
8379 // Deal with literals.
8380 if (LIKELY(!bogus)) {
8381 bogus = containsBogusLiterals(st);
8384 // Deal with tmp uses.
8385 switch (st->tag) {
8386 case Ist_WrTmp: {
8387 IRTemp dst = st->Ist.WrTmp.tmp;
8388 IRExpr* rhs = st->Ist.WrTmp.data;
8389 // This is the one place where we have to consider all possible
8390 // tags for |rhs|, and can't just assume it is a tmp or a const.
8391 switch (rhs->tag) {
8392 case Iex_RdTmp:
8393 // just propagate demand for |dst| into this tmp use.
8394 noteTmpUsesIn(useEnv, nOrigTmps, useEnv[dst], rhs);
8395 break;
8396 case Iex_Unop:
8397 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.Unop.arg);
8398 break;
8399 case Iex_Binop:
8400 if (rhs->Iex.Binop.op == Iop_Add64
8401 || rhs->Iex.Binop.op == Iop_Add32) {
8402 // propagate demand for |dst| through to the operands.
8403 noteTmpUsesIn(useEnv, nOrigTmps,
8404 useEnv[dst], rhs->Iex.Binop.arg1);
8405 noteTmpUsesIn(useEnv, nOrigTmps,
8406 useEnv[dst], rhs->Iex.Binop.arg2);
8407 } else {
8408 // just say that the operands are used in some unknown way.
8409 noteTmpUsesIn(useEnv, nOrigTmps,
8410 HuOth, rhs->Iex.Binop.arg1);
8411 noteTmpUsesIn(useEnv, nOrigTmps,
8412 HuOth, rhs->Iex.Binop.arg2);
8414 break;
8415 case Iex_Triop: {
8416 // All operands are used in some unknown way.
8417 IRTriop* tri = rhs->Iex.Triop.details;
8418 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg1);
8419 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg2);
8420 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, tri->arg3);
8421 break;
8423 case Iex_Qop: {
8424 // All operands are used in some unknown way.
8425 IRQop* qop = rhs->Iex.Qop.details;
8426 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg1);
8427 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg2);
8428 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg3);
8429 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, qop->arg4);
8430 break;
8432 case Iex_Load:
8433 // The address will be checked (== PCasted).
8434 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.Load.addr);
8435 break;
8436 case Iex_ITE:
8437 // The condition is PCasted, the then- and else-values
8438 // aren't.
8439 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.ITE.cond);
8440 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iftrue);
8441 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, rhs->Iex.ITE.iffalse);
8442 break;
8443 case Iex_CCall:
8444 // The args are used in unknown ways.
8445 for (IRExpr** args = rhs->Iex.CCall.args; *args; args++) {
8446 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8448 break;
8449 case Iex_GetI: {
8450 // The index will be checked/PCasted (see do_shadow_GETI)
8451 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, rhs->Iex.GetI.ix);
8452 break;
8454 case Iex_Const:
8455 case Iex_Get:
8456 break;
8457 default:
8458 ppIRExpr(rhs);
8459 VG_(tool_panic)("preInstrumentationAnalysis:"
8460 " unhandled IRExpr");
8462 break;
8464 case Ist_Store:
8465 // The address will be checked (== PCasted). The data will be
8466 // used in some unknown way.
8467 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Store.addr);
8468 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Store.data);
8469 break;
8470 case Ist_Exit:
8471 // The guard will be checked (== PCasted)
8472 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.Exit.guard);
8473 break;
8474 case Ist_Put:
8475 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.Put.data);
8476 break;
8477 case Ist_PutI: {
8478 IRPutI* putI = st->Ist.PutI.details;
8479 // The index will be checked/PCasted (see do_shadow_PUTI). The
8480 // data will be used in an unknown way.
8481 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, putI->ix);
8482 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, putI->data);
8483 break;
8485 case Ist_Dirty: {
8486 IRDirty* d = st->Ist.Dirty.details;
8487 // The guard will be checked (== PCasted)
8488 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, d->guard);
8489 // The args will be used in unknown ways.
8490 for (IRExpr** args = d->args; *args; args++) {
8491 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, *args);
8493 break;
8495 case Ist_CAS: {
8496 IRCAS* cas = st->Ist.CAS.details;
8497 // Address will be pcasted, everything else used as unknown
8498 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, cas->addr);
8499 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdLo);
8500 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataLo);
8501 if (cas->expdHi)
8502 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->expdHi);
8503 if (cas->dataHi)
8504 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, cas->dataHi);
8505 break;
8507 case Ist_AbiHint:
8508 // Both exprs are used in unknown ways. TODO: can we safely
8509 // just ignore AbiHints?
8510 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.base);
8511 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.AbiHint.nia);
8512 break;
8513 case Ist_StoreG: {
8514 // We might be able to do better, and use HuPCa for the addr.
8515 // It's not immediately obvious that we can, because the address
8516 // is regarded as "used" only when the guard is true.
8517 IRStoreG* sg = st->Ist.StoreG.details;
8518 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->addr);
8519 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->data);
8520 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, sg->guard);
8521 break;
8523 case Ist_LoadG: {
8524 // Per similar comments to Ist_StoreG .. not sure whether this
8525 // is really optimal.
8526 IRLoadG* lg = st->Ist.LoadG.details;
8527 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->addr);
8528 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->alt);
8529 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, lg->guard);
8530 break;
8532 case Ist_LLSC: {
8533 noteTmpUsesIn(useEnv, nOrigTmps, HuPCa, st->Ist.LLSC.addr);
8534 if (st->Ist.LLSC.storedata)
8535 noteTmpUsesIn(useEnv, nOrigTmps, HuOth, st->Ist.LLSC.storedata);
8536 break;
8538 case Ist_MBE:
8539 case Ist_IMark:
8540 case Ist_NoOp:
8541 break;
8542 default: {
8543 ppIRStmt(st);
8544 VG_(tool_panic)("preInstrumentationAnalysis: unhandled IRStmt");
8547 } // Now work backwards through the stmts.
8549 // Return the computed use env and the bogus-atom flag.
8550 tl_assert(*useEnvP == NULL);
8551 *useEnvP = useEnv;
8553 tl_assert(*hasBogusLiteralsP == False);
8554 *hasBogusLiteralsP = bogus;
8558 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
8559 IRSB* sb_in,
8560 const VexGuestLayout* layout,
8561 const VexGuestExtents* vge,
8562 const VexArchInfo* archinfo_host,
8563 IRType gWordTy, IRType hWordTy )
8565 Bool verboze = 0||False;
8566 Int i, j, first_stmt;
8567 IRStmt* st;
8568 MCEnv mce;
8569 IRSB* sb_out;
8571 if (gWordTy != hWordTy) {
8572 /* We don't currently support this case. */
8573 VG_(tool_panic)("host/guest word size mismatch");
8576 /* Check we're not completely nuts */
8577 tl_assert(sizeof(UWord) == sizeof(void*));
8578 tl_assert(sizeof(Word) == sizeof(void*));
8579 tl_assert(sizeof(Addr) == sizeof(void*));
8580 tl_assert(sizeof(ULong) == 8);
8581 tl_assert(sizeof(Long) == 8);
8582 tl_assert(sizeof(UInt) == 4);
8583 tl_assert(sizeof(Int) == 4);
8585 tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
8587 /* Set up SB */
8588 sb_out = deepCopyIRSBExceptStmts(sb_in);
8590 /* Set up the running environment. Both .sb and .tmpMap are
8591 modified as we go along. Note that tmps are added to both
8592 .sb->tyenv and .tmpMap together, so the valid index-set for
8593 those two arrays should always be identical. */
8594 VG_(memset)(&mce, 0, sizeof(mce));
8595 mce.sb = sb_out;
8596 mce.trace = verboze;
8597 mce.layout = layout;
8598 mce.hWordTy = hWordTy;
8599 mce.tmpHowUsed = NULL;
8601 /* BEGIN decide on expense levels for instrumentation. */
8603 /* Initially, select the cheap version of everything for which we have an
8604 option. */
8605 DetailLevelByOp__set_all( &mce.dlbo, DLcheap );
8607 /* Take account of the --expensive-definedness-checks= flag. */
8608 if (MC_(clo_expensive_definedness_checks) == EdcNO) {
8609 /* We just selected 'cheap for everything', so we don't need to do
8610 anything here. mce.tmpHowUsed remains NULL. */
8612 else if (MC_(clo_expensive_definedness_checks) == EdcYES) {
8613 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8614 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8616 else {
8617 tl_assert(MC_(clo_expensive_definedness_checks) == EdcAUTO);
8618 /* We'll make our own selection, based on known per-target constraints
8619 and also on analysis of the block to be instrumented. First, set
8620 up default values for detail levels.
8622 On x86 and amd64, we'll routinely encounter code optimised by LLVM
8623 5 and above. Enable accurate interpretation of the following.
8624 LLVM uses adds for some bitfield inserts, and we get a lot of false
8625 errors if the cheap interpretation is used, alas. Could solve this
8626 much better if we knew which of such adds came from x86/amd64 LEA
8627 instructions, since these are the only ones really needing the
8628 expensive interpretation, but that would require some way to tag
8629 them in the _toIR.c front ends, which is a lot of faffing around.
8630 So for now we use preInstrumentationAnalysis() to detect adds which
8631 are used only to construct memory addresses, which is an
8632 approximation to the above, and is self-contained. */
8633 # if defined(VGA_x86)
8634 mce.dlbo.dl_Add32 = DLauto;
8635 mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
8636 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8637 # elif defined(VGA_amd64)
8638 mce.dlbo.dl_Add32 = DLexpensive;
8639 mce.dlbo.dl_Add64 = DLauto;
8640 mce.dlbo.dl_CmpEQ16_CmpNE16 = DLexpensive;
8641 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8642 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8643 # elif defined(VGA_ppc64le)
8644 // Needed by (at least) set_AV_CR6() in the front end.
8645 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8646 # elif defined(VGA_arm64)
8647 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8648 mce.dlbo.dl_CmpEQ64_CmpNE64 = DLexpensive;
8649 # elif defined(VGA_arm)
8650 mce.dlbo.dl_CmpEQ32_CmpNE32 = DLexpensive;
8651 # endif
8653 /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
8654 fill it in. */
8655 Bool hasBogusLiterals = False;
8656 preInstrumentationAnalysis( &mce.tmpHowUsed, &hasBogusLiterals, sb_in );
8658 if (hasBogusLiterals) {
8659 /* This happens very rarely. In this case just select expensive
8660 for everything, and throw away the tmp-use analysis results. */
8661 DetailLevelByOp__set_all( &mce.dlbo, DLexpensive );
8662 VG_(free)( mce.tmpHowUsed );
8663 mce.tmpHowUsed = NULL;
8664 } else {
8665 /* Nothing. mce.tmpHowUsed contains tmp-use analysis results,
8666 which will be used for some subset of Iop_{Add,Sub}{32,64},
8667 based on which ones are set to DLauto for this target. */
8671 DetailLevelByOp__check_sanity( &mce.dlbo );
8673 if (0) {
8674 // Debug printing: which tmps have been identified as PCast-only use
8675 if (mce.tmpHowUsed) {
8676 VG_(printf)("Cheapies: ");
8677 for (UInt q = 0; q < sb_in->tyenv->types_used; q++) {
8678 if (mce.tmpHowUsed[q] == HuPCa) {
8679 VG_(printf)("t%u ", q);
8682 VG_(printf)("\n");
8685 // Debug printing: number of ops by detail level
8686 UChar nCheap = DetailLevelByOp__count( &mce.dlbo, DLcheap );
8687 UChar nAuto = DetailLevelByOp__count( &mce.dlbo, DLauto );
8688 UChar nExpensive = DetailLevelByOp__count( &mce.dlbo, DLexpensive );
8689 tl_assert(nCheap + nAuto + nExpensive == 8);
8691 VG_(printf)("%u,%u,%u ", nCheap, nAuto, nExpensive);
8693 /* END decide on expense levels for instrumentation. */
8695 /* Initialise the running tmp environment. */
8697 mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
8698 sizeof(TempMapEnt));
8699 VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
8700 for (i = 0; i < sb_in->tyenv->types_used; i++) {
8701 TempMapEnt ent;
8702 ent.kind = Orig;
8703 ent.shadowV = IRTemp_INVALID;
8704 ent.shadowB = IRTemp_INVALID;
8705 VG_(addToXA)( mce.tmpMap, &ent );
8707 tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
8709 /* Finally, begin instrumentation. */
8710 /* Copy verbatim any IR preamble preceding the first IMark */
8712 tl_assert(mce.sb == sb_out);
8713 tl_assert(mce.sb != sb_in);
8715 i = 0;
8716 while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
8718 st = sb_in->stmts[i];
8719 tl_assert(st);
8720 tl_assert(isFlatIRStmt(st));
8722 stmt( 'C', &mce, sb_in->stmts[i] );
8723 i++;
8726 /* Nasty problem. IR optimisation of the pre-instrumented IR may
8727 cause the IR following the preamble to contain references to IR
8728 temporaries defined in the preamble. Because the preamble isn't
8729 instrumented, these temporaries don't have any shadows.
8730 Nevertheless uses of them following the preamble will cause
8731 memcheck to generate references to their shadows. End effect is
8732 to cause IR sanity check failures, due to references to
8733 non-existent shadows. This is only evident for the complex
8734 preambles used for function wrapping on TOC-afflicted platforms
8735 (ppc64-linux).
8737 The following loop therefore scans the preamble looking for
8738 assignments to temporaries. For each one found it creates an
8739 assignment to the corresponding (V) shadow temp, marking it as
8740 'defined'. This is the same resulting IR as if the main
8741 instrumentation loop before had been applied to the statement
8742 'tmp = CONSTANT'.
8744 Similarly, if origin tracking is enabled, we must generate an
8745 assignment for the corresponding origin (B) shadow, claiming
8746 no-origin, as appropriate for a defined value.
8747 */
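/* For instance (illustrative temp number and guest offset only): if
   the preamble contains

      t11 = GET:I64(OFFSET_OF_SOME_GUEST_REG)

   then this loop adds, after the copied preamble,

      t11_v = 0x0:I64      (V shadow: all bits defined)
      t11_b = 0x0:I32      (B/origin shadow: no origin; mc-level 3 only)

   where t11_v and t11_b stand for whatever shadow temps
   findShadowTmpV and findShadowTmpB hand back for t11. */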
8748 for (j = 0; j < i; j++) {
8749 if (sb_in->stmts[j]->tag == Ist_WrTmp) {
8750 /* findShadowTmpV checks its arg is an original tmp;
8751 no need to assert that here. */
8752 IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
8753 IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
8754 IRType ty_v = typeOfIRTemp(sb_out->tyenv, tmp_v);
8755 assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
8756 if (MC_(clo_mc_level) == 3) {
8757 IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
8758 tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
8759 assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
8760 }
8761 if (0) {
8762 VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
8763 ppIRType( ty_v );
8764 VG_(printf)("\n");
8765 }
8766 }
8767 }
8769 /* Iterate over the remaining stmts to generate instrumentation. */
8771 tl_assert(sb_in->stmts_used > 0);
8772 tl_assert(i >= 0);
8773 tl_assert(i < sb_in->stmts_used);
8774 tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
8776 for (/* use current i*/; i < sb_in->stmts_used; i++) {
8778 st = sb_in->stmts[i];
8779 first_stmt = sb_out->stmts_used;
8781 if (verboze) {
8782 VG_(printf)("\n");
8783 ppIRStmt(st);
8784 VG_(printf)("\n");
8785 }
8787 if (MC_(clo_mc_level) == 3) {
8788 /* See comments on case Ist_CAS below. */
8789 if (st->tag != Ist_CAS)
8790 schemeS( &mce, st );
8791 }
8793 /* Generate instrumentation code for each stmt ... */
8795 switch (st->tag) {
8797 case Ist_WrTmp: {
8798 IRTemp dst = st->Ist.WrTmp.tmp;
8799 tl_assert(dst < (UInt)sb_in->tyenv->types_used);
8800 HowUsed hu = mce.tmpHowUsed ? mce.tmpHowUsed[dst]
8801 : HuOth/*we don't know, so play safe*/;
8802 assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
8803 expr2vbits( &mce, st->Ist.WrTmp.data, hu ));
8804 break;
8805 }
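/* Illustrative example (temp numbers hypothetical): for an input
   statement

      t7 = Add32(t5,t6)

   this case emits an assignment of the definedness approximation
   computed by expr2vbits to t7's V-shadow temp, roughly

      t7_v = <vbits expression derived from Add32(t5,t6)>

   while the original statement itself is copied to sb_out further
   down, after the switch. */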
8807 case Ist_Put:
8808 do_shadow_PUT( &mce,
8809 st->Ist.Put.offset,
8810 st->Ist.Put.data,
8811 NULL /* shadow atom */, NULL /* guard */ );
8812 break;
8814 case Ist_PutI:
8815 do_shadow_PUTI( &mce, st->Ist.PutI.details);
8816 break;
8818 case Ist_Store:
8819 do_shadow_Store( &mce, st->Ist.Store.end,
8820 st->Ist.Store.addr, 0/* addr bias */,
8821 st->Ist.Store.data,
8822 NULL /* shadow data */,
8823 NULL/*guard*/ );
8824 break;
8826 case Ist_StoreG:
8827 do_shadow_StoreG( &mce, st->Ist.StoreG.details );
8828 break;
8830 case Ist_LoadG:
8831 do_shadow_LoadG( &mce, st->Ist.LoadG.details );
8832 break;
8834 case Ist_Exit:
8835 complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
8836 break;
8838 case Ist_IMark:
8839 break;
8841 case Ist_NoOp:
8842 case Ist_MBE:
8843 break;
8845 case Ist_Dirty:
8846 do_shadow_Dirty( &mce, st->Ist.Dirty.details );
8847 break;
8849 case Ist_AbiHint:
8850 do_AbiHint( &mce, st->Ist.AbiHint.base,
8851 st->Ist.AbiHint.len,
8852 st->Ist.AbiHint.nia );
8853 break;
8855 case Ist_CAS:
8856 do_shadow_CAS( &mce, st->Ist.CAS.details );
8857 /* Note, do_shadow_CAS copies the CAS itself to the output
8858 block, because it needs to add instrumentation both
8859 before and after it. Hence skip the copy below. Also
8860 skip the origin-tracking stuff (call to schemeS) above,
8861 since that's all tangled up with it too; do_shadow_CAS
8862 does it all. */
8863 break;
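/* A sketch of the shape do_shadow_CAS produces (heavily simplified,
   single-element case, names illustrative):

      check-definedness( addr )                    before the CAS
      oldV   = shadow-load( addr )                 V bits of the old value
      old    = CASle( addr :: expd -> new )        the CAS itself
      expdEq = CmpEQ( old, expd )                  did the store succeed?
      if (expdEq) shadow-store( addr, newV )       update V bits on success

   which is why the statement cannot simply be copied after the switch
   like the other cases. */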
8865 case Ist_LLSC:
8866 do_shadow_LLSC( &mce,
8867 st->Ist.LLSC.end,
8868 st->Ist.LLSC.result,
8869 st->Ist.LLSC.addr,
8870 st->Ist.LLSC.storedata );
8871 break;
8873 default:
8874 VG_(printf)("\n");
8875 ppIRStmt(st);
8876 VG_(printf)("\n");
8877 VG_(tool_panic)("memcheck: unhandled IRStmt");
8879 } /* switch (st->tag) */
8881 if (0 && verboze) {
8882 for (j = first_stmt; j < sb_out->stmts_used; j++) {
8883 VG_(printf)(" ");
8884 ppIRStmt(sb_out->stmts[j]);
8885 VG_(printf)("\n");
8886 }
8887 VG_(printf)("\n");
8888 }
8890 /* ... and finally copy the stmt itself to the output. Except,
8891 skip the copy of IRCASs; see comments on case Ist_CAS
8892 above. */
8893 if (st->tag != Ist_CAS)
8894 stmt('C', &mce, st);
8896 }
8897 /* Now we need to complain if the jump target is undefined. */
8898 first_stmt = sb_out->stmts_used;
8900 if (verboze) {
8901 VG_(printf)("sb_in->next = ");
8902 ppIRExpr(sb_in->next);
8903 VG_(printf)("\n\n");
8904 }
8906 complainIfUndefined( &mce, sb_in->next, NULL );
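/* Roughly what complainIfUndefined expands to here (sketch only; the
   exact helper and arguments depend on operand width and on whether
   origin tracking is active):

      guard = <1-bit pessimising cast of the V bits of sb_in->next>
      DIRTY guard ::: MC_(helperc_value_checkN_fail_*)( ... )

   i.e. a guarded helper call that reports an error at run time only
   when the branch target actually contains undefined bits. */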
8908 if (0 && verboze) {
8909 for (j = first_stmt; j < sb_out->stmts_used; j++) {
8910 VG_(printf)(" ");
8911 ppIRStmt(sb_out->stmts[j]);
8912 VG_(printf)("\n");
8913 }
8914 VG_(printf)("\n");
8915 }
8917 /* If this fails, there's been some serious snafu with tmp management,
8918 which should be investigated. */
8919 tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
8920 VG_(deleteXA)( mce.tmpMap );
8922 if (mce.tmpHowUsed) {
8923 VG_(free)( mce.tmpHowUsed );
8924 }
8926 tl_assert(mce.sb == sb_out);
8927 return sb_out;
8928 }
8931 /*--------------------------------------------------------------------*/
8932 /*--- end mc_translate.c ---*/
8933 /*--------------------------------------------------------------------*/