2 /*--------------------------------------------------------------------*/
3 /*--- Instrument IR to perform memory checking operations. ---*/
4 /*--- mc_translate.c ---*/
5 /*--------------------------------------------------------------------*/
8 This file is part of MemCheck, a heavyweight Valgrind tool for
9 detecting memory errors.
11 Copyright (C) 2000-2017 Julian Seward
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License as
16 published by the Free Software Foundation; either version 2 of the
17 License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
27 The GNU General Public License is contained in the file COPYING.
30 #include "pub_tool_basics.h"
31 #include "pub_tool_poolalloc.h" // For mc_include.h
32 #include "pub_tool_hashtable.h" // For mc_include.h
33 #include "pub_tool_libcassert.h"
34 #include "pub_tool_libcprint.h"
35 #include "pub_tool_tooliface.h"
36 #include "pub_tool_machine.h" // VG_(fnptr_to_fnentry)
37 #include "pub_tool_xarray.h"
38 #include "pub_tool_mallocfree.h"
39 #include "pub_tool_libcbase.h"
41 #include "mc_include.h"
44 /* FIXMEs JRS 2011-June-16.
46 Check the interpretation for vector narrowing and widening ops,
47 particularly the saturating ones. I suspect they are either overly
48 pessimistic and/or wrong.
50 Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
51 saturating shifts): the interpretation is overly pessimistic.
52 See comments on the relevant cases below for details.
54 Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
55 both rounding and non-rounding variants): ditto
58 /* This file implements the Memcheck instrumentation, and in
59 particular contains the core of its undefined value detection
60 machinery. For a comprehensive background of the terminology,
61 algorithms and rationale used herein, read:
63 Using Valgrind to detect undefined value errors with
66 Julian Seward and Nicholas Nethercote
68 2005 USENIX Annual Technical Conference (General Track),
69 Anaheim, CA, USA, April 10-15, 2005.
73 Here is as good a place as any to record exactly when V bits are and
74 should be checked, why, and what function is responsible.
77 Memcheck complains when an undefined value is used:
79 1. In the condition of a conditional branch. Because it could cause
80 incorrect control flow, and thus cause incorrect externally-visible
81 behaviour. [mc_translate.c:complainIfUndefined]
83 2. As an argument to a system call, or as the value that specifies
84 the system call number. Because it could cause an incorrect
85 externally-visible side effect. [mc_translate.c:mc_pre_reg_read]
87 3. As the address in a load or store. Because it could cause an
88 incorrect value to be used later, which could cause externally-visible
89 behaviour (eg. via incorrect control flow or an incorrect system call
90 argument) [complainIfUndefined]
92 4. As the target address of a branch. Because it could cause incorrect
93 control flow. [complainIfUndefined]
95 5. As an argument to setenv, unsetenv, or putenv. Because it could put
96 an incorrect value into the external environment.
97 [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
99 6. As the index in a GETI or PUTI operation. I'm not sure why... (njn).
100 [complainIfUndefined]
102 7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
103 VALGRIND_CHECK_VALUE_IS_DEFINED client requests. Because the user
104 requested it. [in memcheck.h]
107 Memcheck also complains, but should not, when an undefined value is used:
109 8. As the shift value in certain SIMD shift operations (but not in the
110 standard integer shift operations). This inconsistency is due to
111 historical reasons. [complainIfUndefined]
114 Memcheck does not complain, but should, when an undefined value is used:
116 9. As an input to a client request. Because the client request may
117 affect the visible behaviour -- see bug #144362 for an example
118 involving the malloc replacements in vg_replace_malloc.c and
119 VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
120 isn't identified. That bug report also has some info on how to solve
121 the problem. [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
124 In practice, 1 and 2 account for the vast majority of cases.
127 /* Generation of addr-definedness, addr-validity and
128 guard-definedness checks pertaining to loads and stores (Iex_Load,
129 Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
130 loads/stores) was re-checked 11 May 2013. */
133 /*------------------------------------------------------------*/
134 /*--- Forward decls ---*/
135 /*------------------------------------------------------------*/
139 // See below for comments explaining what this is for.
141 enum __attribute__((packed
)) { HuUnU
=0, HuPCa
=1, HuOth
=2 }
144 static IRType
shadowTypeV ( IRType ty
);
145 static IRExpr
* expr2vbits ( struct _MCEnv
* mce
, IRExpr
* e
,
146 HowUsed hu
/*use HuOth if unknown*/ );
147 static IRTemp
findShadowTmpB ( struct _MCEnv
* mce
, IRTemp orig
);
149 static IRExpr
*i128_const_zero(void);
152 /*------------------------------------------------------------*/
153 /*--- Memcheck running state, and tmp management. ---*/
154 /*------------------------------------------------------------*/
156 /* For a few (maybe 1%) IROps, we have both a cheaper, less exact vbit
157 propagation scheme, and a more expensive, more precise vbit propagation
158 scheme. This enum describes, for such an IROp, which scheme to use. */
161 // Use the cheaper, less-exact variant.
163 // Choose between cheap and expensive based on analysis of the block
164 // to be instrumented. Note that the choice may be done on a
165 // per-instance basis of the IROp that this DetailLevel describes.
167 // Use the more expensive, more-exact variant.
173 /* A readonly part of the running state. For IROps that have both a
174 less-exact and more-exact interpretation, records which interpretation is
178 // For Add32/64 and Sub32/64, all 3 settings are allowed. For the
179 // DLauto case, a per-instance decision is to be made by inspecting
180 // the associated tmp's entry in MCEnv.tmpHowUsed.
181 DetailLevel dl_Add32
;
182 DetailLevel dl_Add64
;
183 DetailLevel dl_Sub32
;
184 DetailLevel dl_Sub64
;
185 // For Cmp{EQ,NE}{64,32,16,8}, only DLcheap and DLexpensive are
187 DetailLevel dl_CmpEQ64_CmpNE64
;
188 DetailLevel dl_CmpEQ32_CmpNE32
;
189 DetailLevel dl_CmpEQ16_CmpNE16
;
190 DetailLevel dl_CmpEQ8_CmpNE8
;
194 static void DetailLevelByOp__set_all ( /*OUT*/DetailLevelByOp
* dlbo
,
201 dlbo
->dl_CmpEQ64_CmpNE64
= dl
;
202 dlbo
->dl_CmpEQ32_CmpNE32
= dl
;
203 dlbo
->dl_CmpEQ16_CmpNE16
= dl
;
204 dlbo
->dl_CmpEQ8_CmpNE8
= dl
;
207 static void DetailLevelByOp__check_sanity ( const DetailLevelByOp
* dlbo
)
209 tl_assert(dlbo
->dl_Add32
>= DLcheap
&& dlbo
->dl_Add32
<= DLexpensive
);
210 tl_assert(dlbo
->dl_Add64
>= DLcheap
&& dlbo
->dl_Add64
<= DLexpensive
);
211 tl_assert(dlbo
->dl_Sub32
>= DLcheap
&& dlbo
->dl_Sub32
<= DLexpensive
);
212 tl_assert(dlbo
->dl_Sub64
>= DLcheap
&& dlbo
->dl_Sub64
<= DLexpensive
);
213 tl_assert(dlbo
->dl_CmpEQ64_CmpNE64
== DLcheap
214 || dlbo
->dl_CmpEQ64_CmpNE64
== DLexpensive
);
215 tl_assert(dlbo
->dl_CmpEQ32_CmpNE32
== DLcheap
216 || dlbo
->dl_CmpEQ32_CmpNE32
== DLexpensive
);
217 tl_assert(dlbo
->dl_CmpEQ16_CmpNE16
== DLcheap
218 || dlbo
->dl_CmpEQ16_CmpNE16
== DLexpensive
);
219 tl_assert(dlbo
->dl_CmpEQ8_CmpNE8
== DLcheap
220 || dlbo
->dl_CmpEQ8_CmpNE8
== DLexpensive
);
223 static UInt
DetailLevelByOp__count ( const DetailLevelByOp
* dlbo
,
227 n
+= (dlbo
->dl_Add32
== dl
? 1 : 0);
228 n
+= (dlbo
->dl_Add64
== dl
? 1 : 0);
229 n
+= (dlbo
->dl_Sub32
== dl
? 1 : 0);
230 n
+= (dlbo
->dl_Sub64
== dl
? 1 : 0);
231 n
+= (dlbo
->dl_CmpEQ64_CmpNE64
== dl
? 1 : 0);
232 n
+= (dlbo
->dl_CmpEQ32_CmpNE32
== dl
? 1 : 0);
233 n
+= (dlbo
->dl_CmpEQ16_CmpNE16
== dl
? 1 : 0);
234 n
+= (dlbo
->dl_CmpEQ8_CmpNE8
== dl
? 1 : 0);
239 /* Carries info about a particular tmp. The tmp's number is not
240 recorded, as this is implied by (equal to) its index in the tmpMap
241 in MCEnv. The tmp's type is also not recorded, as this is present
244 When .kind is Orig, .shadowV and .shadowB may give the identities
245 of the temps currently holding the associated definedness (shadowV)
246 and origin (shadowB) values, or these may be IRTemp_INVALID if code
247 to compute such values has not yet been emitted.
249 When .kind is VSh or BSh then the tmp is holds a V- or B- value,
250 and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
251 illogical for a shadow tmp itself to be shadowed.
254 enum { Orig
=1, VSh
=2, BSh
=3 }
266 /* A |HowUsed| value carries analysis results about how values are used,
267 pertaining to whether we need to instrument integer adds expensively or
268 not. The running state carries a (readonly) mapping from original tmp to
269 a HowUsed value for it. A usage value can be one of three values,
270 forming a 3-point chain lattice.
272 HuOth ("Other") used in some arbitrary way
274 HuPCa ("PCast") used *only* in effectively a PCast, in which all
275 | we care about is the all-defined vs not-all-defined distinction
277 HuUnU ("Unused") not used at all.
279 The "safe" (don't-know) end of the lattice is "HuOth". See comments
280 below in |preInstrumentationAnalysis| for further details.
284 enum __attribute__((packed)) { HuUnU=0, HuPCa=1, HuOth=2 }
288 // Not actually necessary, but we don't want to waste D1 space.
289 STATIC_ASSERT(sizeof(HowUsed
) == 1);
292 /* Carries around state during memcheck instrumentation. */
295 /* MODIFIED: the superblock being constructed. IRStmts are
300 /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
301 current kind and possibly shadow temps for each temp in the
302 IRSB being constructed. Note that it does not contain the
303 type of each tmp. If you want to know the type, look at the
304 relevant entry in sb->tyenv. It follows that at all times
305 during the instrumentation process, the valid indices for
306 tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
307 total number of Orig, V- and B- temps allocated so far.
309 The reason for this strange split (types in one place, all
310 other info in another) is that we need the types to be
311 attached to sb so as to make it possible to do
312 "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
313 instrumentation process. */
314 XArray
* /* of TempMapEnt */ tmpMap
;
316 /* READONLY: contains details of which ops should be expensively
318 DetailLevelByOp dlbo
;
320 /* READONLY: for each original tmp, how the tmp is used. This is
321 computed by |preInstrumentationAnalysis|. Valid indices are
322 0 .. #temps_in_sb-1 (same as for tmpMap). */
325 /* READONLY: the guest layout. This indicates which parts of
326 the guest state should be regarded as 'always defined'. */
327 const VexGuestLayout
* layout
;
329 /* READONLY: the host word type. Needed for constructing
330 arguments of type 'HWord' to be passed to helper functions.
331 Ity_I32 or Ity_I64 only. */
337 /* SHADOW TMP MANAGEMENT. Shadow tmps are allocated lazily (on
338 demand), as they are encountered. This is for two reasons.
340 (1) (less important reason): Many original tmps are unused due to
341 initial IR optimisation, and we do not want to waste space in tables
344 Shadow IRTemps are therefore allocated on demand. mce.tmpMap is a
345 table indexed [0 .. n_types-1], which gives the current shadow for
346 each original tmp, or INVALID_IRTEMP if none is so far assigned.
347 It is necessary to support making multiple assignments to a shadow
348 -- specifically, after testing a shadow for definedness, it needs
349 to be made defined. But IR's SSA property disallows this.
351 (2) (more important reason): Therefore, when a shadow needs to get
352 a new value, a new temporary is created, the value is assigned to
353 that, and the tmpMap is updated to reflect the new binding.
355 A corollary is that if the tmpMap maps a given tmp to
356 IRTemp_INVALID and we are hoping to read that shadow tmp, it means
357 there's a read-before-write error in the original tmps. The IR
358 sanity checker should catch all such anomalies, however.
361 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
362 both the table in mce->sb and to our auxiliary mapping. Note that
363 newTemp may cause mce->tmpMap to resize, hence previous results
364 from VG_(indexXA)(mce->tmpMap) are invalidated. */
365 static IRTemp
newTemp ( MCEnv
* mce
, IRType ty
, TempKind kind
)
369 IRTemp tmp
= newIRTemp(mce
->sb
->tyenv
, ty
);
371 ent
.shadowV
= IRTemp_INVALID
;
372 ent
.shadowB
= IRTemp_INVALID
;
373 newIx
= VG_(addToXA
)( mce
->tmpMap
, &ent
);
374 tl_assert(newIx
== (Word
)tmp
);
379 /* Find the tmp currently shadowing the given original tmp. If none
380 so far exists, allocate one. */
381 static IRTemp
findShadowTmpV ( MCEnv
* mce
, IRTemp orig
)
384 /* VG_(indexXA) range-checks 'orig', hence no need to check
386 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
387 tl_assert(ent
->kind
== Orig
);
388 if (ent
->shadowV
== IRTemp_INVALID
) {
390 = newTemp( mce
, shadowTypeV(mce
->sb
->tyenv
->types
[orig
]), VSh
);
391 /* newTemp may cause mce->tmpMap to resize, hence previous results
392 from VG_(indexXA) are invalid. */
393 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
394 tl_assert(ent
->kind
== Orig
);
395 tl_assert(ent
->shadowV
== IRTemp_INVALID
);
401 /* Allocate a new shadow for the given original tmp. This means any
402 previous shadow is abandoned. This is needed because it is
403 necessary to give a new value to a shadow once it has been tested
404 for undefinedness, but unfortunately IR's SSA property disallows
405 this. Instead we must abandon the old shadow, allocate a new one
406 and use that instead.
408 This is the same as findShadowTmpV, except we don't bother to see
409 if a shadow temp already existed -- we simply allocate a new one
411 static void newShadowTmpV ( MCEnv
* mce
, IRTemp orig
)
414 /* VG_(indexXA) range-checks 'orig', hence no need to check
416 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
417 tl_assert(ent
->kind
== Orig
);
420 = newTemp( mce
, shadowTypeV(mce
->sb
->tyenv
->types
[orig
]), VSh
);
421 /* newTemp may cause mce->tmpMap to resize, hence previous results
422 from VG_(indexXA) are invalid. */
423 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
424 tl_assert(ent
->kind
== Orig
);
430 /*------------------------------------------------------------*/
431 /*--- IRAtoms -- a subset of IRExprs ---*/
432 /*------------------------------------------------------------*/
434 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
435 isIRAtom() in libvex_ir.h. Because this instrumenter expects flat
436 input, most of this code deals in atoms. Usefully, a value atom
437 always has a V-value which is also an atom: constants are shadowed
438 by constants, and temps are shadowed by the corresponding shadow
441 typedef IRExpr IRAtom
;
443 /* (used for sanity checks only): is this an atom which looks
444 like it's from original code? */
445 static Bool
isOriginalAtom ( MCEnv
* mce
, IRAtom
* a1
)
447 if (a1
->tag
== Iex_Const
)
449 if (a1
->tag
== Iex_RdTmp
) {
450 TempMapEnt
* ent
= VG_(indexXA
)( mce
->tmpMap
, a1
->Iex
.RdTmp
.tmp
);
451 return ent
->kind
== Orig
;
456 /* (used for sanity checks only): is this an atom which looks
457 like it's from shadow code? */
458 static Bool
isShadowAtom ( MCEnv
* mce
, IRAtom
* a1
)
460 if (a1
->tag
== Iex_Const
)
462 if (a1
->tag
== Iex_RdTmp
) {
463 TempMapEnt
* ent
= VG_(indexXA
)( mce
->tmpMap
, a1
->Iex
.RdTmp
.tmp
);
464 return ent
->kind
== VSh
|| ent
->kind
== BSh
;
469 /* (used for sanity checks only): check that both args are atoms and
470 are identically-kinded. */
471 static Bool
sameKindedAtoms ( IRAtom
* a1
, IRAtom
* a2
)
473 if (a1
->tag
== Iex_RdTmp
&& a2
->tag
== Iex_RdTmp
)
475 if (a1
->tag
== Iex_Const
&& a2
->tag
== Iex_Const
)
481 /*------------------------------------------------------------*/
482 /*--- Type management ---*/
483 /*------------------------------------------------------------*/
485 /* Shadow state is always accessed using integer types. This returns
486 an integer type with the same size (as per sizeofIRType) as the
487 given type. The only valid shadow types are Bit, I8, I16, I32,
488 I64, I128, V128, V256. */
490 static IRType
shadowTypeV ( IRType ty
)
498 case Ity_I128
: return ty
;
499 case Ity_F16
: return Ity_I16
;
500 case Ity_F32
: return Ity_I32
;
501 case Ity_D32
: return Ity_I32
;
502 case Ity_F64
: return Ity_I64
;
503 case Ity_D64
: return Ity_I64
;
504 case Ity_F128
: return Ity_I128
;
505 case Ity_D128
: return Ity_I128
;
506 case Ity_V128
: return Ity_V128
;
507 case Ity_V256
: return Ity_V256
;
508 default: ppIRType(ty
);
509 VG_(tool_panic
)("memcheck:shadowTypeV");
513 /* Produce a 'defined' value of the given shadow type. Should only be
514 supplied shadow types (Bit/I8/I16/I32/UI64). */
515 static IRExpr
* definedOfType ( IRType ty
) {
517 case Ity_I1
: return IRExpr_Const(IRConst_U1(False
));
518 case Ity_I8
: return IRExpr_Const(IRConst_U8(0));
519 case Ity_I16
: return IRExpr_Const(IRConst_U16(0));
520 case Ity_I32
: return IRExpr_Const(IRConst_U32(0));
521 case Ity_I64
: return IRExpr_Const(IRConst_U64(0));
522 case Ity_I128
: return i128_const_zero();
523 case Ity_V128
: return IRExpr_Const(IRConst_V128(0x0000));
524 case Ity_V256
: return IRExpr_Const(IRConst_V256(0x00000000));
525 default: VG_(tool_panic
)("memcheck:definedOfType");
530 /*------------------------------------------------------------*/
531 /*--- Constructing IR fragments ---*/
532 /*------------------------------------------------------------*/
534 /* add stmt to a bb */
535 static inline void stmt ( HChar cat
, MCEnv
* mce
, IRStmt
* st
) {
537 VG_(printf
)(" %c: ", cat
);
541 addStmtToIRSB(mce
->sb
, st
);
544 /* assign value to tmp */
546 void assign ( HChar cat
, MCEnv
* mce
, IRTemp tmp
, IRExpr
* expr
) {
547 stmt(cat
, mce
, IRStmt_WrTmp(tmp
,expr
));
550 /* build various kinds of expressions */
551 #define triop(_op, _arg1, _arg2, _arg3) \
552 IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
553 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
554 #define unop(_op, _arg) IRExpr_Unop((_op),(_arg))
555 #define mkU1(_n) IRExpr_Const(IRConst_U1(_n))
556 #define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
557 #define mkU16(_n) IRExpr_Const(IRConst_U16(_n))
558 #define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
559 #define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
560 #define mkV128(_n) IRExpr_Const(IRConst_V128(_n))
561 #define mkexpr(_tmp) IRExpr_RdTmp((_tmp))
563 /* Bind the given expression to a new temporary, and return the
564 temporary. This effectively converts an arbitrary expression into
567 'ty' is the type of 'e' and hence the type that the new temporary
568 needs to be. But passing it in is redundant, since we can deduce
569 the type merely by inspecting 'e'. So at least use that fact to
570 assert that the two types agree. */
571 static IRAtom
* assignNew ( HChar cat
, MCEnv
* mce
, IRType ty
, IRExpr
* e
)
575 IRType tyE
= typeOfIRExpr(mce
->sb
->tyenv
, e
);
577 tl_assert(tyE
== ty
); /* so 'ty' is redundant (!) */
579 case 'V': k
= VSh
; break;
580 case 'B': k
= BSh
; break;
581 case 'C': k
= Orig
; break;
582 /* happens when we are making up new "orig"
583 expressions, for IRCAS handling */
584 default: tl_assert(0);
586 t
= newTemp(mce
, ty
, k
);
587 assign(cat
, mce
, t
, e
);
592 /*------------------------------------------------------------*/
593 /*--- Helper functions for 128-bit ops ---*/
594 /*------------------------------------------------------------*/
596 static IRExpr
*i128_const_zero(void)
598 IRAtom
* z64
= IRExpr_Const(IRConst_U64(0));
599 return binop(Iop_64HLto128
, z64
, z64
);
602 /* There are no I128-bit loads and/or stores [as generated by any
603 current front ends]. So we do not need to worry about that in
607 /*------------------------------------------------------------*/
608 /*--- Constructing definedness primitive ops ---*/
609 /*------------------------------------------------------------*/
611 /* --------- Defined-if-either-defined --------- */
613 static IRAtom
* mkDifD1 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
614 tl_assert(isShadowAtom(mce
,a1
));
615 tl_assert(isShadowAtom(mce
,a2
));
616 return assignNew('V', mce
, Ity_I1
, binop(Iop_And1
, a1
, a2
));
619 static IRAtom
* mkDifD8 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
620 tl_assert(isShadowAtom(mce
,a1
));
621 tl_assert(isShadowAtom(mce
,a2
));
622 return assignNew('V', mce
, Ity_I8
, binop(Iop_And8
, a1
, a2
));
625 static IRAtom
* mkDifD16 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
626 tl_assert(isShadowAtom(mce
,a1
));
627 tl_assert(isShadowAtom(mce
,a2
));
628 return assignNew('V', mce
, Ity_I16
, binop(Iop_And16
, a1
, a2
));
631 static IRAtom
* mkDifD32 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
632 tl_assert(isShadowAtom(mce
,a1
));
633 tl_assert(isShadowAtom(mce
,a2
));
634 return assignNew('V', mce
, Ity_I32
, binop(Iop_And32
, a1
, a2
));
637 static IRAtom
* mkDifD64 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
638 tl_assert(isShadowAtom(mce
,a1
));
639 tl_assert(isShadowAtom(mce
,a2
));
640 return assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, a1
, a2
));
643 static IRAtom
* mkDifDV128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
644 tl_assert(isShadowAtom(mce
,a1
));
645 tl_assert(isShadowAtom(mce
,a2
));
646 return assignNew('V', mce
, Ity_V128
, binop(Iop_AndV128
, a1
, a2
));
649 static IRAtom
* mkDifDV256 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
650 tl_assert(isShadowAtom(mce
,a1
));
651 tl_assert(isShadowAtom(mce
,a2
));
652 return assignNew('V', mce
, Ity_V256
, binop(Iop_AndV256
, a1
, a2
));
655 /* --------- Undefined-if-either-undefined --------- */
657 static IRAtom
* mkUifU1 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
658 tl_assert(isShadowAtom(mce
,a1
));
659 tl_assert(isShadowAtom(mce
,a2
));
660 return assignNew('V', mce
, Ity_I1
, binop(Iop_Or1
, a1
, a2
));
663 static IRAtom
* mkUifU8 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
664 tl_assert(isShadowAtom(mce
,a1
));
665 tl_assert(isShadowAtom(mce
,a2
));
666 return assignNew('V', mce
, Ity_I8
, binop(Iop_Or8
, a1
, a2
));
669 static IRAtom
* mkUifU16 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
670 tl_assert(isShadowAtom(mce
,a1
));
671 tl_assert(isShadowAtom(mce
,a2
));
672 return assignNew('V', mce
, Ity_I16
, binop(Iop_Or16
, a1
, a2
));
675 static IRAtom
* mkUifU32 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
676 tl_assert(isShadowAtom(mce
,a1
));
677 tl_assert(isShadowAtom(mce
,a2
));
678 return assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, a1
, a2
));
681 static IRAtom
* mkUifU64 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
682 tl_assert(isShadowAtom(mce
,a1
));
683 tl_assert(isShadowAtom(mce
,a2
));
684 return assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, a1
, a2
));
687 static IRAtom
* mkUifU128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
688 IRAtom
*tmp1
, *tmp2
, *tmp3
, *tmp4
, *tmp5
, *tmp6
;
689 tl_assert(isShadowAtom(mce
,a1
));
690 tl_assert(isShadowAtom(mce
,a2
));
691 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, a1
));
692 tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, a1
));
693 tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, a2
));
694 tmp4
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, a2
));
695 tmp5
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp1
, tmp3
));
696 tmp6
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp4
));
698 return assignNew('V', mce
, Ity_I128
, binop(Iop_64HLto128
, tmp6
, tmp5
));
701 static IRAtom
* mkUifUV128 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
702 tl_assert(isShadowAtom(mce
,a1
));
703 tl_assert(isShadowAtom(mce
,a2
));
704 return assignNew('V', mce
, Ity_V128
, binop(Iop_OrV128
, a1
, a2
));
707 static IRAtom
* mkUifUV256 ( MCEnv
* mce
, IRAtom
* a1
, IRAtom
* a2
) {
708 tl_assert(isShadowAtom(mce
,a1
));
709 tl_assert(isShadowAtom(mce
,a2
));
710 return assignNew('V', mce
, Ity_V256
, binop(Iop_OrV256
, a1
, a2
));
713 static IRAtom
* mkUifU ( MCEnv
* mce
, IRType vty
, IRAtom
* a1
, IRAtom
* a2
) {
715 case Ity_I8
: return mkUifU8(mce
, a1
, a2
);
716 case Ity_I16
: return mkUifU16(mce
, a1
, a2
);
717 case Ity_I32
: return mkUifU32(mce
, a1
, a2
);
718 case Ity_I64
: return mkUifU64(mce
, a1
, a2
);
719 case Ity_I128
: return mkUifU128(mce
, a1
, a2
);
720 case Ity_V128
: return mkUifUV128(mce
, a1
, a2
);
721 case Ity_V256
: return mkUifUV256(mce
, a1
, a2
);
723 VG_(printf
)("\n"); ppIRType(vty
); VG_(printf
)("\n");
724 VG_(tool_panic
)("memcheck:mkUifU");
728 /* --------- The Left-family of operations. --------- */
730 static IRAtom
* mkLeft8 ( MCEnv
* mce
, IRAtom
* a1
) {
731 tl_assert(isShadowAtom(mce
,a1
));
732 return assignNew('V', mce
, Ity_I8
, unop(Iop_Left8
, a1
));
735 static IRAtom
* mkLeft16 ( MCEnv
* mce
, IRAtom
* a1
) {
736 tl_assert(isShadowAtom(mce
,a1
));
737 return assignNew('V', mce
, Ity_I16
, unop(Iop_Left16
, a1
));
740 static IRAtom
* mkLeft32 ( MCEnv
* mce
, IRAtom
* a1
) {
741 tl_assert(isShadowAtom(mce
,a1
));
742 return assignNew('V', mce
, Ity_I32
, unop(Iop_Left32
, a1
));
745 static IRAtom
* mkLeft64 ( MCEnv
* mce
, IRAtom
* a1
) {
746 tl_assert(isShadowAtom(mce
,a1
));
747 return assignNew('V', mce
, Ity_I64
, unop(Iop_Left64
, a1
));
750 /* --------- The Right-family of operations. --------- */
752 /* Unfortunately these are a lot more expensive then their Left
753 counterparts. Fortunately they are only very rarely used -- only for
754 count-leading-zeroes instrumentation. */
756 static IRAtom
* mkRight32 ( MCEnv
* mce
, IRAtom
* a1
)
758 for (Int i
= 1; i
<= 16; i
*= 2) {
761 = assignNew('V', mce
, Ity_I32
, binop(Iop_Shr32
, a1
, mkU8(i
)));
762 a1
= assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, a1
, tmp
));
767 static IRAtom
* mkRight64 ( MCEnv
* mce
, IRAtom
* a1
)
769 for (Int i
= 1; i
<= 32; i
*= 2) {
772 = assignNew('V', mce
, Ity_I64
, binop(Iop_Shr64
, a1
, mkU8(i
)));
773 a1
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, a1
, tmp
));
778 /* --------- 'Improvement' functions for AND/OR. --------- */
780 /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give
781 defined (0); all other -> undefined (1).
783 static IRAtom
* mkImproveAND1 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
785 tl_assert(isOriginalAtom(mce
, data
));
786 tl_assert(isShadowAtom(mce
, vbits
));
787 tl_assert(sameKindedAtoms(data
, vbits
));
788 return assignNew('V', mce
, Ity_I1
, binop(Iop_Or1
, data
, vbits
));
791 static IRAtom
* mkImproveAND8 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
793 tl_assert(isOriginalAtom(mce
, data
));
794 tl_assert(isShadowAtom(mce
, vbits
));
795 tl_assert(sameKindedAtoms(data
, vbits
));
796 return assignNew('V', mce
, Ity_I8
, binop(Iop_Or8
, data
, vbits
));
799 static IRAtom
* mkImproveAND16 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
801 tl_assert(isOriginalAtom(mce
, data
));
802 tl_assert(isShadowAtom(mce
, vbits
));
803 tl_assert(sameKindedAtoms(data
, vbits
));
804 return assignNew('V', mce
, Ity_I16
, binop(Iop_Or16
, data
, vbits
));
807 static IRAtom
* mkImproveAND32 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
809 tl_assert(isOriginalAtom(mce
, data
));
810 tl_assert(isShadowAtom(mce
, vbits
));
811 tl_assert(sameKindedAtoms(data
, vbits
));
812 return assignNew('V', mce
, Ity_I32
, binop(Iop_Or32
, data
, vbits
));
815 static IRAtom
* mkImproveAND64 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
817 tl_assert(isOriginalAtom(mce
, data
));
818 tl_assert(isShadowAtom(mce
, vbits
));
819 tl_assert(sameKindedAtoms(data
, vbits
));
820 return assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, data
, vbits
));
823 static IRAtom
* mkImproveANDV128 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
825 tl_assert(isOriginalAtom(mce
, data
));
826 tl_assert(isShadowAtom(mce
, vbits
));
827 tl_assert(sameKindedAtoms(data
, vbits
));
828 return assignNew('V', mce
, Ity_V128
, binop(Iop_OrV128
, data
, vbits
));
831 static IRAtom
* mkImproveANDV256 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
833 tl_assert(isOriginalAtom(mce
, data
));
834 tl_assert(isShadowAtom(mce
, vbits
));
835 tl_assert(sameKindedAtoms(data
, vbits
));
836 return assignNew('V', mce
, Ity_V256
, binop(Iop_OrV256
, data
, vbits
));
839 /* ImproveOR(data, vbits) = ~data OR vbits. Defined (0) data 1s give
840 defined (0); all other -> undefined (1).
842 static IRAtom
* mkImproveOR1 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
844 tl_assert(isOriginalAtom(mce
, data
));
845 tl_assert(isShadowAtom(mce
, vbits
));
846 tl_assert(sameKindedAtoms(data
, vbits
));
850 assignNew('V', mce
, Ity_I1
, unop(Iop_Not1
, data
)),
854 static IRAtom
* mkImproveOR8 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
856 tl_assert(isOriginalAtom(mce
, data
));
857 tl_assert(isShadowAtom(mce
, vbits
));
858 tl_assert(sameKindedAtoms(data
, vbits
));
862 assignNew('V', mce
, Ity_I8
, unop(Iop_Not8
, data
)),
866 static IRAtom
* mkImproveOR16 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
868 tl_assert(isOriginalAtom(mce
, data
));
869 tl_assert(isShadowAtom(mce
, vbits
));
870 tl_assert(sameKindedAtoms(data
, vbits
));
874 assignNew('V', mce
, Ity_I16
, unop(Iop_Not16
, data
)),
878 static IRAtom
* mkImproveOR32 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
880 tl_assert(isOriginalAtom(mce
, data
));
881 tl_assert(isShadowAtom(mce
, vbits
));
882 tl_assert(sameKindedAtoms(data
, vbits
));
886 assignNew('V', mce
, Ity_I32
, unop(Iop_Not32
, data
)),
890 static IRAtom
* mkImproveOR64 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
892 tl_assert(isOriginalAtom(mce
, data
));
893 tl_assert(isShadowAtom(mce
, vbits
));
894 tl_assert(sameKindedAtoms(data
, vbits
));
898 assignNew('V', mce
, Ity_I64
, unop(Iop_Not64
, data
)),
902 static IRAtom
* mkImproveORV128 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
904 tl_assert(isOriginalAtom(mce
, data
));
905 tl_assert(isShadowAtom(mce
, vbits
));
906 tl_assert(sameKindedAtoms(data
, vbits
));
910 assignNew('V', mce
, Ity_V128
, unop(Iop_NotV128
, data
)),
914 static IRAtom
* mkImproveORV256 ( MCEnv
* mce
, IRAtom
* data
, IRAtom
* vbits
)
916 tl_assert(isOriginalAtom(mce
, data
));
917 tl_assert(isShadowAtom(mce
, vbits
));
918 tl_assert(sameKindedAtoms(data
, vbits
));
922 assignNew('V', mce
, Ity_V256
, unop(Iop_NotV256
, data
)),
926 /* --------- Pessimising casts. --------- */
928 /* The function returns an expression of type DST_TY. If any of the VBITS
929 is undefined (value == 1) the resulting expression has all bits set to
930 1. Otherwise, all bits are 0. */
932 static IRAtom
* mkPCastTo( MCEnv
* mce
, IRType dst_ty
, IRAtom
* vbits
)
937 /* Note, dst_ty is a shadow type, not an original type. */
938 tl_assert(isShadowAtom(mce
,vbits
));
939 src_ty
= typeOfIRExpr(mce
->sb
->tyenv
, vbits
);
941 /* Fast-track some common cases */
942 if (src_ty
== Ity_I32
&& dst_ty
== Ity_I32
)
943 return assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
945 if (src_ty
== Ity_I64
&& dst_ty
== Ity_I64
)
946 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, vbits
));
948 if (src_ty
== Ity_I32
&& dst_ty
== Ity_I64
) {
949 /* PCast the arg, then clone it. */
950 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
951 return assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
954 if (src_ty
== Ity_I32
&& dst_ty
== Ity_V128
) {
955 /* PCast the arg, then clone it 4 times. */
956 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
957 tmp
= assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
958 return assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp
, tmp
));
961 if (src_ty
== Ity_I32
&& dst_ty
== Ity_V256
) {
962 /* PCast the arg, then clone it 8 times. */
963 IRAtom
* tmp
= assignNew('V', mce
, Ity_I32
, unop(Iop_CmpwNEZ32
, vbits
));
964 tmp
= assignNew('V', mce
, Ity_I64
, binop(Iop_32HLto64
, tmp
, tmp
));
965 tmp
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp
, tmp
));
966 return assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
, tmp
, tmp
));
969 if (src_ty
== Ity_I64
&& dst_ty
== Ity_I32
) {
970 /* PCast the arg. This gives all 0s or all 1s. Then throw away
972 IRAtom
* tmp
= assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, vbits
));
973 return assignNew('V', mce
, Ity_I32
, unop(Iop_64to32
, tmp
));
976 if (src_ty
== Ity_V128
&& dst_ty
== Ity_I64
) {
977 /* Use InterleaveHI64x2 to copy the top half of the vector into
978 the bottom half. Then we can UifU it with the original, throw
979 away the upper half of the result, and PCast-I64-to-I64
981 // Generates vbits[127:64] : vbits[127:64]
983 = assignNew('V', mce
, Ity_V128
,
984 binop(Iop_InterleaveHI64x2
, vbits
, vbits
));
986 // UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
987 // == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
989 = mkUifUV128(mce
, hi64hi64
, vbits
);
990 // Generates UifU(vbits[127:64],vbits[63:0])
992 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, lohi64
));
994 // PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
995 // == PCast-to-I64( vbits[127:0] )
997 = assignNew('V', mce
, Ity_I64
, unop(Iop_CmpwNEZ64
, lo64
));
1001 /* Else do it the slow way .. */
1002 /* First of all, collapse vbits down to a single bit. */
1009 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ8
, vbits
));
1012 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ16
, vbits
));
1015 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ32
, vbits
));
1018 tmp1
= assignNew('V', mce
, Ity_I1
, unop(Iop_CmpNEZ64
, vbits
));
1021 /* Gah. Chop it in half, OR the halves together, and compare
1023 IRAtom
* tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, vbits
));
1024 IRAtom
* tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, vbits
));
1025 IRAtom
* tmp4
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp3
));
1026 tmp1
= assignNew('V', mce
, Ity_I1
,
1027 unop(Iop_CmpNEZ64
, tmp4
));
1031 /* Chop it in half, OR the halves together, and compare that
1034 IRAtom
* tmp2
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, vbits
));
1035 IRAtom
* tmp3
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, vbits
));
1036 IRAtom
* tmp4
= assignNew('V', mce
, Ity_I64
, binop(Iop_Or64
, tmp2
, tmp3
));
1037 tmp1
= assignNew('V', mce
, Ity_I1
,
1038 unop(Iop_CmpNEZ64
, tmp4
));
1043 VG_(tool_panic
)("mkPCastTo(1)");
1046 /* Now widen up to the dst type. */
1051 return assignNew('V', mce
, Ity_I8
, unop(Iop_1Sto8
, tmp1
));
1053 return assignNew('V', mce
, Ity_I16
, unop(Iop_1Sto16
, tmp1
));
1055 return assignNew('V', mce
, Ity_I32
, unop(Iop_1Sto32
, tmp1
));
1057 return assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1059 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1060 tmp1
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, tmp1
, tmp1
));
1063 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1064 tmp1
= assignNew('V', mce
, Ity_I128
, binop(Iop_64HLto128
, tmp1
, tmp1
));
1067 tmp1
= assignNew('V', mce
, Ity_I64
, unop(Iop_1Sto64
, tmp1
));
1068 tmp1
= assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
,
1070 tmp1
= assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
,
1075 VG_(tool_panic
)("mkPCastTo(2)");
1079 /* This is a minor variant. It takes an arg of some type and returns
1080 a value of the same type. The result consists entirely of Defined
1081 (zero) bits except its least significant bit, which is a PCast of
1082 the entire argument down to a single bit. */
1083 static IRAtom
* mkPCastXXtoXXlsb ( MCEnv
* mce
, IRAtom
* varg
, IRType ty
)
1085 if (ty
== Ity_V128
) {
1086 /* --- Case for V128 --- */
1087 IRAtom
* varg128
= varg
;
1088 // generates: PCast-to-I64(varg128)
1089 IRAtom
* pcdTo64
= mkPCastTo(mce
, Ity_I64
, varg128
);
1090 // Now introduce zeros (defined bits) in the top 63 places
1091 // generates: Def--(63)--Def PCast-to-I1(varg128)
1093 = assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, pcdTo64
, mkU64(1)));
1094 // generates: Def--(64)--Def
1096 = definedOfType(Ity_I64
);
1097 // generates: Def--(127)--Def PCast-to-I1(varg128)
1099 = assignNew('V', mce
, Ity_V128
, binop(Iop_64HLtoV128
, d64
, d63pc
));
1102 if (ty
== Ity_I64
) {
1103 /* --- Case for I64 --- */
1105 IRAtom
* pcd
= mkPCastTo(mce
, Ity_I64
, varg
);
1106 // Zero (Def) out the top 63 bits
1108 = assignNew('V', mce
, Ity_I64
, binop(Iop_And64
, pcd
, mkU64(1)));
1115 /* --------- Optimistic casts. --------- */
1117 /* The function takes and returns an expression of type TY. If any of the
1118 VBITS indicate defined (value == 0) the resulting expression has all bits
1119 set to 0. Otherwise, all bits are 1. In words, if any bits are defined
1120 then all bits are made to be defined.
1122 In short we compute (vbits - (vbits >>u 1)) >>s (bitsize(vbits)-1).
1124 static IRAtom
* mkOCastAt( MCEnv
* mce
, IRType ty
, IRAtom
* vbits
)
1126 IROp opSUB
, opSHR
, opSAR
;
1131 opSUB
= Iop_Sub64
; opSHR
= Iop_Shr64
; opSAR
= Iop_Sar64
; sh
= 63;
1134 opSUB
= Iop_Sub32
; opSHR
= Iop_Shr32
; opSAR
= Iop_Sar32
; sh
= 31;
1137 opSUB
= Iop_Sub16
; opSHR
= Iop_Shr16
; opSAR
= Iop_Sar16
; sh
= 15;
1140 opSUB
= Iop_Sub8
; opSHR
= Iop_Shr8
; opSAR
= Iop_Sar8
; sh
= 7;
1144 VG_(tool_panic
)("mkOCastTo");
1148 shr1
= assignNew('V', mce
,ty
, binop(opSHR
, vbits
, mkU8(1)));
1149 at
= assignNew('V', mce
,ty
, binop(opSUB
, vbits
, shr1
));
1150 at
= assignNew('V', mce
,ty
, binop(opSAR
, at
, mkU8(sh
)));
1155 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
1157 Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
1158 PCasting to Ity_U1. However, sometimes it is necessary to be more
1159 accurate. The insight is that the result is defined if two
1160 corresponding bits can be found, one from each argument, so that
1161 both bits are defined but are different -- that makes EQ say "No"
1162 and NE say "Yes". Hence, we compute an improvement term and DifD
1163 it onto the "normal" (UifU) result.
1178 vec contains 0 (defined) bits where the corresponding arg bits
1179 are defined but different, and 1 bits otherwise.
1181 vec = Or<sz>( vxx, // 0 iff bit defined
1182 vyy, // 0 iff bit defined
1183 Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
1186 If any bit of vec is 0, the result is defined and so the
1187 improvement term should produce 0...0, else it should produce
1190 Hence require for the improvement term:
1192 OCast(vec) = if vec == 1...1 then 1...1 else 0...0
1194 which you can think of as an "optimistic cast" (OCast, the opposite of
1195 the normal "pessimistic cast" (PCast) family. An OCast says all bits
1196 are defined if any bit is defined.
1198 It is possible to show that
1200 if vec == 1...1 then 1...1 else 0...0
1202 can be implemented in straight-line code as
1204 (vec - (vec >>u 1)) >>s (word-size-in-bits - 1)
1206 We note that vec contains the sub-term Or<sz>(vxx, vyy). Since UifU is
1207 implemented with Or (since 1 signifies undefinedness), this is a
1208 duplicate of the UifU<sz>(vxx, vyy) term and so we can CSE it out, giving
1211 let naive = UifU<sz>(vxx, vyy)
1212 vec = Or<sz>(naive, Not<sz>(Xor<sz)(xx, yy))
1214 PCastTo<1>( DifD<sz>(naive, OCast<sz>(vec)) )
1216 This was extensively re-analysed and checked on 6 July 05 and again
1219 static IRAtom
* expensiveCmpEQorNE ( MCEnv
* mce
,
1221 IRAtom
* vxx
, IRAtom
* vyy
,
1222 IRAtom
* xx
, IRAtom
* yy
)
1224 IRAtom
*naive
, *vec
, *improved
, *final_cast
;
1225 IROp opDIFD
, opUIFU
, opOR
, opXOR
, opNOT
;
1227 tl_assert(isShadowAtom(mce
,vxx
));
1228 tl_assert(isShadowAtom(mce
,vyy
));
1229 tl_assert(isOriginalAtom(mce
,xx
));
1230 tl_assert(isOriginalAtom(mce
,yy
));
1231 tl_assert(sameKindedAtoms(vxx
,xx
));
1232 tl_assert(sameKindedAtoms(vyy
,yy
));
1264 VG_(tool_panic
)("expensiveCmpEQorNE");
1268 = assignNew('V', mce
, ty
, binop(opUIFU
, vxx
, vyy
));
1278 assignNew('V', mce
,ty
, binop(opXOR
, xx
, yy
))))));
1281 = assignNew( 'V', mce
,ty
,
1282 binop(opDIFD
, naive
, mkOCastAt(mce
, ty
, vec
)));
1285 = mkPCastTo( mce
, Ity_I1
, improved
);
1291 /* --------- Semi-accurate interpretation of CmpORD. --------- */
1293 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
1295 CmpORD32S(x,y) = 1<<3 if x <s y
1299 and similarly the unsigned variant. The default interpretation is:
1301 CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
1304 The "& (7<<1)" reflects the fact that all result bits except 3,2,1
1305 are zero and therefore defined (viz, zero).
1307 Also deal with a special case better:
1311 Here, bit 3 (LT) of the result is a copy of the top bit of x and
1312 will be defined even if the rest of x isn't. In which case we do:
1314 CmpORD32S#(x,x#,0,{impliedly 0}#)
1315 = PCast(x#) & (3<<1) -- standard interp for GT#,EQ#
1316 | (x# >>u 31) << 3 -- LT# = x#[31]
1318 Analogous handling for CmpORD64{S,U}.
1320 static Bool
isZeroU32 ( IRAtom
* e
)
1323 toBool( e
->tag
== Iex_Const
1324 && e
->Iex
.Const
.con
->tag
== Ico_U32
1325 && e
->Iex
.Const
.con
->Ico
.U32
== 0 );
1328 static Bool
isZeroU64 ( IRAtom
* e
)
1331 toBool( e
->tag
== Iex_Const
1332 && e
->Iex
.Const
.con
->tag
== Ico_U64
1333 && e
->Iex
.Const
.con
->Ico
.U64
== 0 );
1336 static IRAtom
* doCmpORD ( MCEnv
* mce
,
1338 IRAtom
* xxhash
, IRAtom
* yyhash
,
1339 IRAtom
* xx
, IRAtom
* yy
)
1341 Bool m64
= cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD64U
;
1342 Bool syned
= cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD32S
;
1343 IROp opOR
= m64
? Iop_Or64
: Iop_Or32
;
1344 IROp opAND
= m64
? Iop_And64
: Iop_And32
;
1345 IROp opSHL
= m64
? Iop_Shl64
: Iop_Shl32
;
1346 IROp opSHR
= m64
? Iop_Shr64
: Iop_Shr32
;
1347 IROp op1UtoWS
= m64
? Iop_1Uto64
: Iop_1Uto32
;
1348 IRType ty
= m64
? Ity_I64
: Ity_I32
;
1349 Int width
= m64
? 64 : 32;
1351 Bool (*isZero
)(IRAtom
*) = m64
? isZeroU64
: isZeroU32
;
1353 tl_assert(isShadowAtom(mce
,xxhash
));
1354 tl_assert(isShadowAtom(mce
,yyhash
));
1355 tl_assert(isOriginalAtom(mce
,xx
));
1356 tl_assert(isOriginalAtom(mce
,yy
));
1357 tl_assert(sameKindedAtoms(xxhash
,xx
));
1358 tl_assert(sameKindedAtoms(yyhash
,yy
));
1359 tl_assert(cmp_op
== Iop_CmpORD32S
|| cmp_op
== Iop_CmpORD32U
1360 || cmp_op
== Iop_CmpORD64S
|| cmp_op
== Iop_CmpORD64U
);
1363 ppIROp(cmp_op
); VG_(printf
)(" ");
1364 ppIRExpr(xx
); VG_(printf
)(" "); ppIRExpr( yy
); VG_(printf
)("\n");
1367 if (syned
&& isZero(yy
)) {
1368 /* fancy interpretation */
1369 /* if yy is zero, then it must be fully defined (zero#). */
1370 tl_assert(isZero(yyhash
));
1371 // This is still inaccurate, but I don't think it matters, since
1372 // nobody writes code of the form
1373 // "is <partially-undefined-value> signedly greater than zero?".
1374 // We therefore simply declare "x >s 0" to be undefined if any bit in
1375 // x is undefined. That's clearly suboptimal in some cases. Eg, if
1376 // the highest order bit is a defined 1 then x is negative so it
1377 // doesn't matter whether the remaining bits are defined or not.
1383 mkPCastTo(mce
,ty
, xxhash
),
1384 m64
? mkU64(1<<2) : mkU32(1<<2)
1386 // For "x <s 0", we can just copy the definedness of the top bit of x
1387 // and we have a precise result.
1395 binop(opSHR
, xxhash
, mkU8(width
-1))),
1398 // For "x == 0" we can hand the problem off to expensiveCmpEQorNE.
1404 assignNew('V', mce
,ty
,
1407 expensiveCmpEQorNE(mce
, ty
, xxhash
, yyhash
, xx
, yy
))
1414 assignNew('V', mce
,ty
, binop(opOR
, t_lt_0_0_0
, t_0_gt_0_0
)),
1418 /* standard interpretation */
1419 IRAtom
* sevenLeft1
= m64
? mkU64(7<<1) : mkU32(7<<1);
1424 mkUifU(mce
,ty
, xxhash
,yyhash
)),
1431 /*------------------------------------------------------------*/
1432 /*--- Emit a test and complaint if something is undefined. ---*/
1433 /*------------------------------------------------------------*/
1435 static IRAtom
* schemeE ( MCEnv
* mce
, IRExpr
* e
); /* fwds */
1438 /* Set the annotations on a dirty helper to indicate that the stack
1439 pointer and instruction pointers might be read. This is the
1440 behaviour of all 'emit-a-complaint' style functions we might
1443 static void setHelperAnns ( MCEnv
* mce
, IRDirty
* di
) {
1445 di
->fxState
[0].fx
= Ifx_Read
;
1446 di
->fxState
[0].offset
= mce
->layout
->offset_SP
;
1447 di
->fxState
[0].size
= mce
->layout
->sizeof_SP
;
1448 di
->fxState
[0].nRepeats
= 0;
1449 di
->fxState
[0].repeatLen
= 0;
1450 di
->fxState
[1].fx
= Ifx_Read
;
1451 di
->fxState
[1].offset
= mce
->layout
->offset_IP
;
1452 di
->fxState
[1].size
= mce
->layout
->sizeof_IP
;
1453 di
->fxState
[1].nRepeats
= 0;
1454 di
->fxState
[1].repeatLen
= 0;
1458 /* Check the supplied *original* |atom| for undefinedness, and emit a
1459 complaint if so. Once that happens, mark it as defined. This is
1460 possible because the atom is either a tmp or literal. If it's a
1461 tmp, it will be shadowed by a tmp, and so we can set the shadow to
1462 be defined. In fact as mentioned above, we will have to allocate a
1463 new tmp to carry the new 'defined' shadow value, and update the
1464 original->tmp mapping accordingly; we cannot simply assign a new
1465 value to an existing shadow tmp as this breaks SSAness.
1467 The checks are performed, any resulting complaint emitted, and
1468 |atom|'s shadow temp set to 'defined', ONLY in the case that
1469 |guard| evaluates to True at run-time. If it evaluates to False
1470 then no action is performed. If |guard| is NULL (the usual case)
1471 then it is assumed to be always-true, and hence these actions are
1472 performed unconditionally.
1474 This routine does not generate code to check the definedness of
1475 |guard|. The caller is assumed to have taken care of that already.
1477 static void complainIfUndefined ( MCEnv
* mce
, IRAtom
* atom
, IRExpr
*guard
)
1490 // Don't do V bit tests if we're not reporting undefined value errors.
1491 if (MC_(clo_mc_level
) == 1)
1495 tl_assert(isOriginalAtom(mce
, guard
));
1497 /* Since the original expression is atomic, there's no duplicated
1498 work generated by making multiple V-expressions for it. So we
1499 don't really care about the possibility that someone else may
1500 also create a V-interpretion for it. */
1501 tl_assert(isOriginalAtom(mce
, atom
));
1502 vatom
= expr2vbits( mce
, atom
, HuOth
);
1503 tl_assert(isShadowAtom(mce
, vatom
));
1504 tl_assert(sameKindedAtoms(atom
, vatom
));
1506 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
1508 /* sz is only used for constructing the error message */
1509 sz
= ty
==Ity_I1
? 0 : sizeofIRType(ty
);
1511 cond
= mkPCastTo( mce
, Ity_I1
, vatom
);
1512 /* cond will be 0 if all defined, and 1 if any not defined. */
1514 /* Get the origin info for the value we are about to check. At
1515 least, if we are doing origin tracking. If not, use a dummy
1517 if (MC_(clo_mc_level
) == 3) {
1518 origin
= schemeE( mce
, atom
);
1519 if (mce
->hWordTy
== Ity_I64
) {
1520 origin
= assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, origin
) );
1534 fn
= &MC_(helperc_value_check0_fail_w_o
);
1535 nm
= "MC_(helperc_value_check0_fail_w_o)";
1536 args
= mkIRExprVec_1(origin
);
1539 fn
= &MC_(helperc_value_check0_fail_no_o
);
1540 nm
= "MC_(helperc_value_check0_fail_no_o)";
1541 args
= mkIRExprVec_0();
1547 fn
= &MC_(helperc_value_check1_fail_w_o
);
1548 nm
= "MC_(helperc_value_check1_fail_w_o)";
1549 args
= mkIRExprVec_1(origin
);
1552 fn
= &MC_(helperc_value_check1_fail_no_o
);
1553 nm
= "MC_(helperc_value_check1_fail_no_o)";
1554 args
= mkIRExprVec_0();
1560 fn
= &MC_(helperc_value_check4_fail_w_o
);
1561 nm
= "MC_(helperc_value_check4_fail_w_o)";
1562 args
= mkIRExprVec_1(origin
);
1565 fn
= &MC_(helperc_value_check4_fail_no_o
);
1566 nm
= "MC_(helperc_value_check4_fail_no_o)";
1567 args
= mkIRExprVec_0();
1573 fn
= &MC_(helperc_value_check8_fail_w_o
);
1574 nm
= "MC_(helperc_value_check8_fail_w_o)";
1575 args
= mkIRExprVec_1(origin
);
1578 fn
= &MC_(helperc_value_check8_fail_no_o
);
1579 nm
= "MC_(helperc_value_check8_fail_no_o)";
1580 args
= mkIRExprVec_0();
1587 fn
= &MC_(helperc_value_checkN_fail_w_o
);
1588 nm
= "MC_(helperc_value_checkN_fail_w_o)";
1589 args
= mkIRExprVec_2( mkIRExpr_HWord( sz
), origin
);
1592 fn
= &MC_(helperc_value_checkN_fail_no_o
);
1593 nm
= "MC_(helperc_value_checkN_fail_no_o)";
1594 args
= mkIRExprVec_1( mkIRExpr_HWord( sz
) );
1599 VG_(tool_panic
)("unexpected szB");
1605 tl_assert(nargs
>= 0 && nargs
<= 2);
1606 tl_assert( (MC_(clo_mc_level
) == 3 && origin
!= NULL
)
1607 || (MC_(clo_mc_level
) == 2 && origin
== NULL
) );
1609 di
= unsafeIRDirty_0_N( nargs
/*regparms*/, nm
,
1610 VG_(fnptr_to_fnentry
)( fn
), args
);
1611 di
->guard
= cond
; // and cond is PCast-to-1(atom#)
1613 /* If the complaint is to be issued under a guard condition, AND
1614 that into the guard condition for the helper call. */
1616 IRAtom
*g1
= assignNew('V', mce
, Ity_I32
, unop(Iop_1Uto32
, di
->guard
));
1617 IRAtom
*g2
= assignNew('V', mce
, Ity_I32
, unop(Iop_1Uto32
, guard
));
1618 IRAtom
*e
= assignNew('V', mce
, Ity_I32
, binop(Iop_And32
, g1
, g2
));
1619 di
->guard
= assignNew('V', mce
, Ity_I1
, unop(Iop_32to1
, e
));
1622 setHelperAnns( mce
, di
);
1623 stmt( 'V', mce
, IRStmt_Dirty(di
));
1625 /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
1626 defined -- but only in the case where the guard evaluates to
1627 True at run-time. Do the update by setting the orig->shadow
1628 mapping for tmp to reflect the fact that this shadow is getting
1630 tl_assert(isIRAtom(vatom
));
1631 /* sameKindedAtoms ... */
1632 if (vatom
->tag
== Iex_RdTmp
) {
1633 tl_assert(atom
->tag
== Iex_RdTmp
);
1634 if (guard
== NULL
) {
1635 // guard is 'always True', hence update unconditionally
1636 newShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1637 assign('V', mce
, findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
),
1640 // update the temp only conditionally. Do this by copying
1641 // its old value when the guard is False.
1643 IRTemp old_tmpV
= findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1644 newShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
);
1646 = assignNew('V', mce
, shadowTypeV(ty
),
1647 IRExpr_ITE(guard
, definedOfType(ty
),
1649 assign('V', mce
, findShadowTmpV(mce
, atom
->Iex
.RdTmp
.tmp
), new_tmpV
);
1655 /*------------------------------------------------------------*/
1656 /*--- Shadowing PUTs/GETs, and indexed variants thereof ---*/
1657 /*------------------------------------------------------------*/
1659 /* Examine the always-defined sections declared in layout to see if
1660 the (offset,size) section is within one. Note, is is an error to
1661 partially fall into such a region: (offset,size) should either be
1662 completely in such a region or completely not-in such a region.
1664 static Bool
isAlwaysDefd ( MCEnv
* mce
, Int offset
, Int size
)
1666 Int minoffD
, maxoffD
, i
;
1667 Int minoff
= offset
;
1668 Int maxoff
= minoff
+ size
- 1;
1669 tl_assert((minoff
& ~0xFFFF) == 0);
1670 tl_assert((maxoff
& ~0xFFFF) == 0);
1672 for (i
= 0; i
< mce
->layout
->n_alwaysDefd
; i
++) {
1673 minoffD
= mce
->layout
->alwaysDefd
[i
].offset
;
1674 maxoffD
= minoffD
+ mce
->layout
->alwaysDefd
[i
].size
- 1;
1675 tl_assert((minoffD
& ~0xFFFF) == 0);
1676 tl_assert((maxoffD
& ~0xFFFF) == 0);
1678 if (maxoff
< minoffD
|| maxoffD
< minoff
)
1679 continue; /* no overlap */
1680 if (minoff
>= minoffD
&& maxoff
<= maxoffD
)
1681 return True
; /* completely contained in an always-defd section */
1683 VG_(tool_panic
)("memcheck:isAlwaysDefd:partial overlap");
1685 return False
; /* could not find any containing section */
1689 /* Generate into bb suitable actions to shadow this Put. If the state
1690 slice is marked 'always defined', do nothing. Otherwise, write the
1691 supplied V bits to the shadow state. We can pass in either an
1692 original atom or a V-atom, but not both. In the former case the
1693 relevant V-bits are then generated from the original.
1694 We assume here, that the definedness of GUARD has already been checked.
1697 void do_shadow_PUT ( MCEnv
* mce
, Int offset
,
1698 IRAtom
* atom
, IRAtom
* vatom
, IRExpr
*guard
)
1702 // Don't do shadow PUTs if we're not doing undefined value checking.
1703 // Their absence lets Vex's optimiser remove all the shadow computation
1704 // that they depend on, which includes GETs of the shadow registers.
1705 if (MC_(clo_mc_level
) == 1)
1710 tl_assert(isOriginalAtom(mce
, atom
));
1711 vatom
= expr2vbits( mce
, atom
, HuOth
);
1714 tl_assert(isShadowAtom(mce
, vatom
));
1717 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
1718 tl_assert(ty
!= Ity_I1
);
1719 if (isAlwaysDefd(mce
, offset
, sizeofIRType(ty
))) {
1721 /* emit code to emit a complaint if any of the vbits are 1. */
1722 /* complainIfUndefined(mce, atom); */
1724 /* Do a plain shadow Put. */
1726 /* If the guard expression evaluates to false we simply Put the value
1727 that is already stored in the guest state slot */
1728 IRAtom
*cond
, *iffalse
;
1730 cond
= assignNew('V', mce
, Ity_I1
, guard
);
1731 iffalse
= assignNew('V', mce
, ty
,
1732 IRExpr_Get(offset
+ mce
->layout
->total_sizeB
, ty
));
1733 vatom
= assignNew('V', mce
, ty
, IRExpr_ITE(cond
, vatom
, iffalse
));
1735 stmt( 'V', mce
, IRStmt_Put( offset
+ mce
->layout
->total_sizeB
, vatom
));
1740 /* Return an expression which contains the V bits corresponding to the
1741 given GETI (passed in in pieces).
1744 void do_shadow_PUTI ( MCEnv
* mce
, IRPutI
*puti
)
1749 IRRegArray
* descr
= puti
->descr
;
1750 IRAtom
* ix
= puti
->ix
;
1751 Int bias
= puti
->bias
;
1752 IRAtom
* atom
= puti
->data
;
1754 // Don't do shadow PUTIs if we're not doing undefined value checking.
1755 // Their absence lets Vex's optimiser remove all the shadow computation
1756 // that they depend on, which includes GETIs of the shadow registers.
1757 if (MC_(clo_mc_level
) == 1)
1760 tl_assert(isOriginalAtom(mce
,atom
));
1761 vatom
= expr2vbits( mce
, atom
, HuOth
);
1762 tl_assert(sameKindedAtoms(atom
, vatom
));
1764 tyS
= shadowTypeV(ty
);
1765 arrSize
= descr
->nElems
* sizeofIRType(ty
);
1766 tl_assert(ty
!= Ity_I1
);
1767 tl_assert(isOriginalAtom(mce
,ix
));
1768 complainIfUndefined(mce
, ix
, NULL
);
1769 if (isAlwaysDefd(mce
, descr
->base
, arrSize
)) {
1771 /* emit code to emit a complaint if any of the vbits are 1. */
1772 /* complainIfUndefined(mce, atom); */
1774 /* Do a cloned version of the Put that refers to the shadow
1776 IRRegArray
* new_descr
1777 = mkIRRegArray( descr
->base
+ mce
->layout
->total_sizeB
,
1778 tyS
, descr
->nElems
);
1779 stmt( 'V', mce
, IRStmt_PutI( mkIRPutI(new_descr
, ix
, bias
, vatom
) ));
1784 /* Return an expression which contains the V bits corresponding to the
1785 given GET (passed in in pieces).
1788 IRExpr
* shadow_GET ( MCEnv
* mce
, Int offset
, IRType ty
)
1790 IRType tyS
= shadowTypeV(ty
);
1791 tl_assert(ty
!= Ity_I1
);
1792 tl_assert(ty
!= Ity_I128
);
1793 if (isAlwaysDefd(mce
, offset
, sizeofIRType(ty
))) {
1794 /* Always defined, return all zeroes of the relevant type */
1795 return definedOfType(tyS
);
1797 /* return a cloned version of the Get that refers to the shadow
1799 /* FIXME: this isn't an atom! */
1800 return IRExpr_Get( offset
+ mce
->layout
->total_sizeB
, tyS
);
1805 /* Return an expression which contains the V bits corresponding to the
1806 given GETI (passed in in pieces).
1809 IRExpr
* shadow_GETI ( MCEnv
* mce
,
1810 IRRegArray
* descr
, IRAtom
* ix
, Int bias
)
1812 IRType ty
= descr
->elemTy
;
1813 IRType tyS
= shadowTypeV(ty
);
1814 Int arrSize
= descr
->nElems
* sizeofIRType(ty
);
1815 tl_assert(ty
!= Ity_I1
);
1816 tl_assert(isOriginalAtom(mce
,ix
));
1817 complainIfUndefined(mce
, ix
, NULL
);
1818 if (isAlwaysDefd(mce
, descr
->base
, arrSize
)) {
1819 /* Always defined, return all zeroes of the relevant type */
1820 return definedOfType(tyS
);
1822 /* return a cloned version of the Get that refers to the shadow
1824 IRRegArray
* new_descr
1825 = mkIRRegArray( descr
->base
+ mce
->layout
->total_sizeB
,
1826 tyS
, descr
->nElems
);
1827 return IRExpr_GetI( new_descr
, ix
, bias
);
1832 /*------------------------------------------------------------*/
1833 /*--- Generating approximations for unknown operations, ---*/
1834 /*--- using lazy-propagate semantics ---*/
1835 /*------------------------------------------------------------*/
1837 /* Lazy propagation of undefinedness from two values, resulting in the
1838 specified shadow type.
1841 IRAtom
* mkLazy2 ( MCEnv
* mce
, IRType finalVty
, IRAtom
* va1
, IRAtom
* va2
)
1844 IRType t1
= typeOfIRExpr(mce
->sb
->tyenv
, va1
);
1845 IRType t2
= typeOfIRExpr(mce
->sb
->tyenv
, va2
);
1846 tl_assert(isShadowAtom(mce
,va1
));
1847 tl_assert(isShadowAtom(mce
,va2
));
1849 /* The general case is inefficient because PCast is an expensive
1850 operation. Here are some special cases which use PCast only
1851 once rather than twice. */
1853 /* I64 x I64 -> I64 */
1854 if (t1
== Ity_I64
&& t2
== Ity_I64
&& finalVty
== Ity_I64
) {
1855 if (0) VG_(printf
)("mkLazy2: I64 x I64 -> I64\n");
1856 at
= mkUifU(mce
, Ity_I64
, va1
, va2
);
1857 at
= mkPCastTo(mce
, Ity_I64
, at
);
1861 /* I64 x I64 -> I32 */
1862 if (t1
== Ity_I64
&& t2
== Ity_I64
&& finalVty
== Ity_I32
) {
1863 if (0) VG_(printf
)("mkLazy2: I64 x I64 -> I32\n");
1864 at
= mkUifU(mce
, Ity_I64
, va1
, va2
);
1865 at
= mkPCastTo(mce
, Ity_I32
, at
);
1869 /* I32 x I32 -> I32 */
1870 if (t1
== Ity_I32
&& t2
== Ity_I32
&& finalVty
== Ity_I32
) {
1871 if (0) VG_(printf
)("mkLazy2: I32 x I32 -> I32\n");
1872 at
= mkUifU(mce
, Ity_I32
, va1
, va2
);
1873 at
= mkPCastTo(mce
, Ity_I32
, at
);
1878 VG_(printf
)("mkLazy2 ");
1887 /* General case: force everything via 32-bit intermediaries. */
1888 at
= mkPCastTo(mce
, Ity_I32
, va1
);
1889 at
= mkUifU(mce
, Ity_I32
, at
, mkPCastTo(mce
, Ity_I32
, va2
));
1890 at
= mkPCastTo(mce
, finalVty
, at
);
1895 /* 3-arg version of the above. */
1897 IRAtom
* mkLazy3 ( MCEnv
* mce
, IRType finalVty
,
1898 IRAtom
* va1
, IRAtom
* va2
, IRAtom
* va3
)
1901 IRType t1
= typeOfIRExpr(mce
->sb
->tyenv
, va1
);
1902 IRType t2
= typeOfIRExpr(mce
->sb
->tyenv
, va2
);
1903 IRType t3
= typeOfIRExpr(mce
->sb
->tyenv
, va3
);
1904 tl_assert(isShadowAtom(mce
,va1
));
1905 tl_assert(isShadowAtom(mce
,va2
));
1906 tl_assert(isShadowAtom(mce
,va3
));
1908 /* The general case is inefficient because PCast is an expensive
1909 operation. Here are some special cases which use PCast only
1910 twice rather than three times. */
1912 /* I32 x I64 x I64 -> I64 */
1913 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1914 if (t1
== Ity_I32
&& t2
== Ity_I64
&& t3
== Ity_I64
1915 && finalVty
== Ity_I64
) {
1916 if (0) VG_(printf
)("mkLazy3: I32 x I64 x I64 -> I64\n");
1917 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
1918 mode indication which is fully defined, this should get
1919 folded out later. */
1920 at
= mkPCastTo(mce
, Ity_I64
, va1
);
1921 /* Now fold in 2nd and 3rd args. */
1922 at
= mkUifU(mce
, Ity_I64
, at
, va2
);
1923 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
1924 /* and PCast once again. */
1925 at
= mkPCastTo(mce
, Ity_I64
, at
);
1929 /* I32 x I8 x I64 -> I64 */
1930 if (t1
== Ity_I32
&& t2
== Ity_I8
&& t3
== Ity_I64
1931 && finalVty
== Ity_I64
) {
1932 if (0) VG_(printf
)("mkLazy3: I32 x I8 x I64 -> I64\n");
1933 /* Widen 1st and 2nd args to I64. Since 1st arg is typically a
1934 * rounding mode indication which is fully defined, this should
1935 * get folded out later.
1937 IRAtom
* at1
= mkPCastTo(mce
, Ity_I64
, va1
);
1938 IRAtom
* at2
= mkPCastTo(mce
, Ity_I64
, va2
);
1939 at
= mkUifU(mce
, Ity_I64
, at1
, at2
); // UifU(PCast(va1), PCast(va2))
1940 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
1941 /* and PCast once again. */
1942 at
= mkPCastTo(mce
, Ity_I64
, at
);
1946 /* I32 x I64 x I64 -> I32 */
1947 if (t1
== Ity_I32
&& t2
== Ity_I64
&& t3
== Ity_I64
1948 && finalVty
== Ity_I32
) {
1949 if (0) VG_(printf
)("mkLazy3: I32 x I64 x I64 -> I32\n");
1950 at
= mkPCastTo(mce
, Ity_I64
, va1
);
1951 at
= mkUifU(mce
, Ity_I64
, at
, va2
);
1952 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
1953 at
= mkPCastTo(mce
, Ity_I32
, at
);
1957 /* I32 x I32 x I32 -> I32 */
1958 /* 32-bit FP idiom, as (eg) happens on ARM */
1959 if (t1
== Ity_I32
&& t2
== Ity_I32
&& t3
== Ity_I32
1960 && finalVty
== Ity_I32
) {
1961 if (0) VG_(printf
)("mkLazy3: I32 x I32 x I32 -> I32\n");
1963 at
= mkUifU(mce
, Ity_I32
, at
, va2
);
1964 at
= mkUifU(mce
, Ity_I32
, at
, va3
);
1965 at
= mkPCastTo(mce
, Ity_I32
, at
);
1969 /* I32 x I128 x I128 -> I128 */
1970 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1971 if (t1
== Ity_I32
&& t2
== Ity_I128
&& t3
== Ity_I128
1972 && finalVty
== Ity_I128
) {
1973 if (0) VG_(printf
)("mkLazy3: I32 x I128 x I128 -> I128\n");
1974 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
1975 mode indication which is fully defined, this should get
1976 folded out later. */
1977 at
= mkPCastTo(mce
, Ity_I128
, va1
);
1978 /* Now fold in 2nd and 3rd args. */
1979 at
= mkUifU(mce
, Ity_I128
, at
, va2
);
1980 at
= mkUifU(mce
, Ity_I128
, at
, va3
);
1981 /* and PCast once again. */
1982 at
= mkPCastTo(mce
, Ity_I128
, at
);
1986 /* I32 x I8 x I128 -> I128 */
1987 /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
1988 if (t1
== Ity_I32
&& t2
== Ity_I8
&& t3
== Ity_I128
1989 && finalVty
== Ity_I128
) {
1990 if (0) VG_(printf
)("mkLazy3: I32 x I8 x I128 -> I128\n");
1991 /* Use I64 as an intermediate type, which means PCasting all 3
1992 args to I64 to start with. 1st arg is typically a rounding
1993 mode indication which is fully defined, so we hope that it
1994 will get folded out later. */
1995 IRAtom
* at1
= mkPCastTo(mce
, Ity_I64
, va1
);
1996 IRAtom
* at2
= mkPCastTo(mce
, Ity_I64
, va2
);
1997 IRAtom
* at3
= mkPCastTo(mce
, Ity_I64
, va3
);
1998 /* Now UifU all three together. */
1999 at
= mkUifU(mce
, Ity_I64
, at1
, at2
); // UifU(PCast(va1), PCast(va2))
2000 at
= mkUifU(mce
, Ity_I64
, at
, at3
); // ... `UifU` PCast(va3)
2001 /* and PCast once again. */
2002 at
= mkPCastTo(mce
, Ity_I128
, at
);
2006 VG_(printf
)("mkLazy3: ");
2012 VG_(printf
)(" -> ");
2018 /* General case: force everything via 32-bit intermediaries. */
2020 at = mkPCastTo(mce, Ity_I32, va1);
2021 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
2022 at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
2023 at = mkPCastTo(mce, finalVty, at);
2029 /* 4-arg version of the above. */
2031 IRAtom
* mkLazy4 ( MCEnv
* mce
, IRType finalVty
,
2032 IRAtom
* va1
, IRAtom
* va2
, IRAtom
* va3
, IRAtom
* va4
)
2035 IRType t1
= typeOfIRExpr(mce
->sb
->tyenv
, va1
);
2036 IRType t2
= typeOfIRExpr(mce
->sb
->tyenv
, va2
);
2037 IRType t3
= typeOfIRExpr(mce
->sb
->tyenv
, va3
);
2038 IRType t4
= typeOfIRExpr(mce
->sb
->tyenv
, va4
);
2039 tl_assert(isShadowAtom(mce
,va1
));
2040 tl_assert(isShadowAtom(mce
,va2
));
2041 tl_assert(isShadowAtom(mce
,va3
));
2042 tl_assert(isShadowAtom(mce
,va4
));
2044 /* The general case is inefficient because PCast is an expensive
2045 operation. Here are some special cases which use PCast only
2046 twice rather than three times. */
2048 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2050 if (t1
== Ity_I32
&& t2
== Ity_I128
&& t3
== Ity_I128
&& t4
== Ity_I128
2051 && finalVty
== Ity_I128
) {
2052 if (0) VG_(printf
)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
2053 /* Widen 1st arg to I128. Since 1st arg is typically a rounding
2054 mode indication which is fully defined, this should get
2055 folded out later. */
2056 at
= mkPCastTo(mce
, Ity_I128
, va1
);
2057 /* Now fold in 2nd, 3rd, 4th args. */
2058 at
= mkUifU(mce
, Ity_I128
, at
, va2
);
2059 at
= mkUifU(mce
, Ity_I128
, at
, va3
);
2060 at
= mkUifU(mce
, Ity_I128
, at
, va4
);
2061 /* and PCast once again. */
2062 at
= mkPCastTo(mce
, Ity_I128
, at
);
2066 /* I32 x I64 x I64 x I64 -> I64 */
2067 if (t1
== Ity_I32
&& t2
== Ity_I64
&& t3
== Ity_I64
&& t4
== Ity_I64
2068 && finalVty
== Ity_I64
) {
2069 if (0) VG_(printf
)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
2070 /* Widen 1st arg to I64. Since 1st arg is typically a rounding
2071 mode indication which is fully defined, this should get
2072 folded out later. */
2073 at
= mkPCastTo(mce
, Ity_I64
, va1
);
2074 /* Now fold in 2nd, 3rd, 4th args. */
2075 at
= mkUifU(mce
, Ity_I64
, at
, va2
);
2076 at
= mkUifU(mce
, Ity_I64
, at
, va3
);
2077 at
= mkUifU(mce
, Ity_I64
, at
, va4
);
2078 /* and PCast once again. */
2079 at
= mkPCastTo(mce
, Ity_I64
, at
);
2082 /* I32 x I32 x I32 x I32 -> I32 */
2083 /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
2084 if (t1
== Ity_I32
&& t2
== Ity_I32
&& t3
== Ity_I32
&& t4
== Ity_I32
2085 && finalVty
== Ity_I32
) {
2086 if (0) VG_(printf
)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
2088 /* Now fold in 2nd, 3rd, 4th args. */
2089 at
= mkUifU(mce
, Ity_I32
, at
, va2
);
2090 at
= mkUifU(mce
, Ity_I32
, at
, va3
);
2091 at
= mkUifU(mce
, Ity_I32
, at
, va4
);
2092 at
= mkPCastTo(mce
, Ity_I32
, at
);
2096 if (t1
== Ity_I32
&& t2
== Ity_I8
&& t3
== Ity_I8
&& t4
== Ity_I8
2097 && finalVty
== Ity_I32
) {
2098 if (0) VG_(printf
)("mkLazy4: I32 x I8 x I8 x I8 -> I32\n");
2099 at
= mkPCastTo(mce
, Ity_I8
, va1
);
2100 /* Now fold in 2nd, 3rd, 4th args. */
2101 at
= mkUifU(mce
, Ity_I8
, at
, va2
);
2102 at
= mkUifU(mce
, Ity_I8
, at
, va3
);
2103 at
= mkUifU(mce
, Ity_I8
, at
, va4
);
2104 at
= mkPCastTo(mce
, Ity_I32
, at
);
2108 if (t1
== Ity_I64
&& t2
== Ity_I8
&& t3
== Ity_I8
&& t4
== Ity_I8
2109 && finalVty
== Ity_I64
) {
2110 if (0) VG_(printf
)("mkLazy4: I64 x I8 x I8 x I8 -> I64\n");
2111 at
= mkPCastTo(mce
, Ity_I8
, va1
);
2112 /* Now fold in 2nd, 3rd, 4th args. */
2113 at
= mkUifU(mce
, Ity_I8
, at
, va2
);
2114 at
= mkUifU(mce
, Ity_I8
, at
, va3
);
2115 at
= mkUifU(mce
, Ity_I8
, at
, va4
);
2116 at
= mkPCastTo(mce
, Ity_I64
, at
);
2121 VG_(printf
)("mkLazy4: ");
2129 VG_(printf
)(" -> ");
2138 /* Do the lazy propagation game from a null-terminated vector of
2139 atoms. This is presumably the arguments to a helper call, so the
2140 IRCallee info is also supplied in order that we can know which
2141 arguments should be ignored (via the .mcx_mask field).
2144 IRAtom
* mkLazyN ( MCEnv
* mce
,
2145 IRAtom
** exprvec
, IRType finalVtype
, IRCallee
* cee
)
2151 Bool mergeTy64
= True
;
2153 /* Decide on the type of the merge intermediary. If all relevant
2154 args are I64, then it's I64. In all other circumstances, use
2156 for (i
= 0; exprvec
[i
]; i
++) {
2158 tl_assert(isOriginalAtom(mce
, exprvec
[i
]));
2159 if (cee
->mcx_mask
& (1<<i
))
2161 if (typeOfIRExpr(mce
->sb
->tyenv
, exprvec
[i
]) != Ity_I64
)
2165 mergeTy
= mergeTy64
? Ity_I64
: Ity_I32
;
2166 curr
= definedOfType(mergeTy
);
2168 for (i
= 0; exprvec
[i
]; i
++) {
2170 tl_assert(isOriginalAtom(mce
, exprvec
[i
]));
2171 /* Only take notice of this arg if the callee's mc-exclusion
2172 mask does not say it is to be excluded. */
2173 if (cee
->mcx_mask
& (1<<i
)) {
2174 /* the arg is to be excluded from definedness checking. Do
2176 if (0) VG_(printf
)("excluding %s(%d)\n", cee
->name
, i
);
2178 /* calculate the arg's definedness, and pessimistically merge
2180 here
= mkPCastTo( mce
, mergeTy
, expr2vbits(mce
, exprvec
[i
], HuOth
) );
2182 ? mkUifU64(mce
, here
, curr
)
2183 : mkUifU32(mce
, here
, curr
);
2186 return mkPCastTo(mce
, finalVtype
, curr
);
2190 /*------------------------------------------------------------*/
2191 /*--- Generating expensive sequences for exact carry-chain ---*/
2192 /*--- propagation in add/sub and related operations. ---*/
2193 /*------------------------------------------------------------*/
2196 IRAtom
* expensiveAddSub ( MCEnv
* mce
,
2199 IRAtom
* qaa
, IRAtom
* qbb
,
2200 IRAtom
* aa
, IRAtom
* bb
)
2202 IRAtom
*a_min
, *b_min
, *a_max
, *b_max
;
2203 IROp opAND
, opOR
, opXOR
, opNOT
, opADD
, opSUB
;
2205 tl_assert(isShadowAtom(mce
,qaa
));
2206 tl_assert(isShadowAtom(mce
,qbb
));
2207 tl_assert(isOriginalAtom(mce
,aa
));
2208 tl_assert(isOriginalAtom(mce
,bb
));
2209 tl_assert(sameKindedAtoms(qaa
,aa
));
2210 tl_assert(sameKindedAtoms(qbb
,bb
));
2230 VG_(tool_panic
)("expensiveAddSub");
2233 // a_min = aa & ~qaa
2234 a_min
= assignNew('V', mce
,ty
,
2236 assignNew('V', mce
,ty
, unop(opNOT
, qaa
))));
2238 // b_min = bb & ~qbb
2239 b_min
= assignNew('V', mce
,ty
,
2241 assignNew('V', mce
,ty
, unop(opNOT
, qbb
))));
2244 a_max
= assignNew('V', mce
,ty
, binop(opOR
, aa
, qaa
));
2247 b_max
= assignNew('V', mce
,ty
, binop(opOR
, bb
, qbb
));
2250 // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
2252 assignNew('V', mce
,ty
,
2254 assignNew('V', mce
,ty
, binop(opOR
, qaa
, qbb
)),
2255 assignNew('V', mce
,ty
,
2257 assignNew('V', mce
,ty
, binop(opADD
, a_min
, b_min
)),
2258 assignNew('V', mce
,ty
, binop(opADD
, a_max
, b_max
))
2264 // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
2266 assignNew('V', mce
,ty
,
2268 assignNew('V', mce
,ty
, binop(opOR
, qaa
, qbb
)),
2269 assignNew('V', mce
,ty
,
2271 assignNew('V', mce
,ty
, binop(opSUB
, a_min
, b_max
)),
2272 assignNew('V', mce
,ty
, binop(opSUB
, a_max
, b_min
))
2283 IRAtom
* expensiveCountTrailingZeroes ( MCEnv
* mce
, IROp czop
,
2284 IRAtom
* atom
, IRAtom
* vatom
)
2287 IROp xorOp
, subOp
, andOp
;
2289 IRAtom
*improver
, *improved
;
2290 tl_assert(isShadowAtom(mce
,vatom
));
2291 tl_assert(isOriginalAtom(mce
,atom
));
2292 tl_assert(sameKindedAtoms(atom
,vatom
));
2295 case Iop_Ctz32
: case Iop_CtzNat32
:
2302 case Iop_Ctz64
: case Iop_CtzNat64
:
2311 VG_(tool_panic
)("memcheck:expensiveCountTrailingZeroes");
2314 // improver = atom ^ (atom - 1)
2316 // That is, improver has its low ctz(atom)+1 bits equal to one;
2317 // higher bits (if any) equal to zero. So it's exactly the right
2318 // mask to use to remove the irrelevant undefined input bits.
2319 /* Here are some examples:
2320 atom = U...U 1 0...0
2321 atom-1 = U...U 0 1...1
2322 ^ed = 0...0 1 11111, which correctly describes which bits of |atom|
2323 actually influence the result
2327 ^ed = 11111, also a correct mask for the input: all input bits
2329 Another boundary case
2332 ^ed = 0..0 1, also a correct mask: only the rightmost input bit
2334 Now with misc U bits interspersed:
2335 atom = U...U 1 0 U...U 0 1 0...0
2336 atom-1 = U...U 1 0 U...U 0 0 1...1
2337 ^ed = 0...0 0 0 0...0 0 1 1...1, also correct
2338 (Per re-check/analysis of 14 Nov 2018)
2340 improver
= assignNew('V', mce
,ty
,
2343 assignNew('V', mce
, ty
,
2344 binop(subOp
, atom
, one
))));
2346 // improved = vatom & improver
2348 // That is, treat any V bits to the left of the rightmost ctz(atom)+1
2349 // bits as "defined".
2350 improved
= assignNew('V', mce
, ty
,
2351 binop(andOp
, vatom
, improver
));
2353 // Return pessimizing cast of improved.
2354 return mkPCastTo(mce
, ty
, improved
);
2358 IRAtom
* expensiveCountLeadingZeroes ( MCEnv
* mce
, IROp czop
,
2359 IRAtom
* atom
, IRAtom
* vatom
)
2362 IROp shrOp
, notOp
, andOp
;
2363 IRAtom
* (*mkRight
)(MCEnv
*, IRAtom
*);
2364 IRAtom
*improver
, *improved
;
2365 tl_assert(isShadowAtom(mce
,vatom
));
2366 tl_assert(isOriginalAtom(mce
,atom
));
2367 tl_assert(sameKindedAtoms(atom
,vatom
));
2370 case Iop_Clz32
: case Iop_ClzNat32
:
2375 mkRight
= mkRight32
;
2377 case Iop_Clz64
: case Iop_ClzNat64
:
2382 mkRight
= mkRight64
;
2386 VG_(tool_panic
)("memcheck:expensiveCountLeadingZeroes");
2389 // This is in principle very similar to how expensiveCountTrailingZeroes
2390 // works. That function computed an "improver", which it used to mask
2391 // off all but the rightmost 1-bit and the zeroes to the right of it,
2392 // hence removing irrelevant bits from the input. Here, we play the
2393 // exact same game but with the left-vs-right roles interchanged.
2394 // Unfortunately calculation of the improver in this case is
2395 // significantly more expensive.
2397 // improver = ~(RIGHT(atom) >>u 1)
2399 // That is, improver has its upper clz(atom)+1 bits equal to one;
2400 // lower bits (if any) equal to zero. So it's exactly the right
2401 // mask to use to remove the irrelevant undefined input bits.
2402 /* Here are some examples:
2403 atom = 0...0 1 U...U
2404 R(atom) = 0...0 1 1...1
2405 R(atom) >>u 1 = 0...0 0 1...1
2406 ~(R(atom) >>u 1) = 1...1 1 0...0
2407 which correctly describes which bits of |atom|
2408 actually influence the result
2412 R(atom) >>u 1 = 0...0
2413 ~(R(atom) >>u 1) = 1...1
2414 also a correct mask for the input: all input bits
2416 Another boundary case
2419 R(atom) >>u 1 = 0 1..1
2420 ~(R(atom) >>u 1) = 1 0..0
2421 also a correct mask: only the leftmost input bit
2423 Now with misc U bits interspersed:
2424 atom = 0...0 1 U...U 0 1 U...U
2425 R(atom) = 0...0 1 1...1 1 1 1...1
2426 R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1
2427 ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct
2428 (Per initial implementation of 15 Nov 2018)
2430 improver
= mkRight(mce
, atom
);
2431 improver
= assignNew('V', mce
, ty
, binop(shrOp
, improver
, mkU8(1)));
2432 improver
= assignNew('V', mce
, ty
, unop(notOp
, improver
));
2434 // improved = vatom & improver
2436 // That is, treat any V bits to the right of the leftmost clz(atom)+1
2437 // bits as "defined".
2438 improved
= assignNew('V', mce
, ty
,
2439 binop(andOp
, vatom
, improver
));
2441 // Return pessimizing cast of improved.
2442 return mkPCastTo(mce
, ty
, improved
);
2446 /*------------------------------------------------------------*/
2447 /*--- Scalar shifts. ---*/
2448 /*------------------------------------------------------------*/
2450 /* Produce an interpretation for (aa << bb) (or >>s, >>u). The basic
2451 idea is to shift the definedness bits by the original shift amount.
2452 This introduces 0s ("defined") in new positions for left shifts and
2453 unsigned right shifts, and copies the top definedness bit for
2454 signed right shifts. So, conveniently, applying the original shift
2455 operator to the definedness bits for the left arg is exactly the
2460 However if the shift amount is undefined then the whole result
2461 is undefined. Hence need:
2463 (qaa << bb) `UifU` PCast(qbb)
2465 If the shift amount bb is a literal than qbb will say 'all defined'
2466 and the UifU and PCast will get folded out by post-instrumentation
2469 static IRAtom
* scalarShift ( MCEnv
* mce
,
2472 IRAtom
* qaa
, IRAtom
* qbb
,
2473 IRAtom
* aa
, IRAtom
* bb
)
2475 tl_assert(isShadowAtom(mce
,qaa
));
2476 tl_assert(isShadowAtom(mce
,qbb
));
2477 tl_assert(isOriginalAtom(mce
,aa
));
2478 tl_assert(isOriginalAtom(mce
,bb
));
2479 tl_assert(sameKindedAtoms(qaa
,aa
));
2480 tl_assert(sameKindedAtoms(qbb
,bb
));
2485 assignNew('V', mce
, ty
, binop(original_op
, qaa
, bb
)),
2486 mkPCastTo(mce
, ty
, qbb
)
2492 /*------------------------------------------------------------*/
2493 /*--- Helpers for dealing with vector primops. ---*/
2494 /*------------------------------------------------------------*/
2496 /* Vector pessimisation -- pessimise within each lane individually. */
2498 static IRAtom
* mkPCast8x16 ( MCEnv
* mce
, IRAtom
* at
)
2500 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ8x16
, at
));
2503 static IRAtom
* mkPCast16x8 ( MCEnv
* mce
, IRAtom
* at
)
2505 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ16x8
, at
));
2508 static IRAtom
* mkPCast32x4 ( MCEnv
* mce
, IRAtom
* at
)
2510 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ32x4
, at
));
2513 static IRAtom
* mkPCast64x2 ( MCEnv
* mce
, IRAtom
* at
)
2515 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ64x2
, at
));
2518 static IRAtom
* mkPCast128x1 ( MCEnv
* mce
, IRAtom
* at
)
2520 return assignNew('V', mce
, Ity_V128
, unop(Iop_CmpNEZ128x1
, at
));
2523 static IRAtom
* mkPCast64x4 ( MCEnv
* mce
, IRAtom
* at
)
2525 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ64x4
, at
));
2528 static IRAtom
* mkPCast32x8 ( MCEnv
* mce
, IRAtom
* at
)
2530 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ32x8
, at
));
2533 static IRAtom
* mkPCast32x2 ( MCEnv
* mce
, IRAtom
* at
)
2535 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpNEZ32x2
, at
));
2538 static IRAtom
* mkPCast16x16 ( MCEnv
* mce
, IRAtom
* at
)
2540 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ16x16
, at
));
2543 static IRAtom
* mkPCast16x4 ( MCEnv
* mce
, IRAtom
* at
)
2545 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpNEZ16x4
, at
));
2548 static IRAtom
* mkPCast8x32 ( MCEnv
* mce
, IRAtom
* at
)
2550 return assignNew('V', mce
, Ity_V256
, unop(Iop_CmpNEZ8x32
, at
));
2553 static IRAtom
* mkPCast8x8 ( MCEnv
* mce
, IRAtom
* at
)
2555 return assignNew('V', mce
, Ity_I64
, unop(Iop_CmpNEZ8x8
, at
));
2558 static IRAtom
* mkPCast16x2 ( MCEnv
* mce
, IRAtom
* at
)
2560 return assignNew('V', mce
, Ity_I32
, unop(Iop_CmpNEZ16x2
, at
));
2563 static IRAtom
* mkPCast8x4 ( MCEnv
* mce
, IRAtom
* at
)
2565 return assignNew('V', mce
, Ity_I32
, unop(Iop_CmpNEZ8x4
, at
));
2569 /* Here's a simple scheme capable of handling ops derived from SSE1
2570 code and while only generating ops that can be efficiently
2571 implemented in SSE1. */
2573 /* All-lanes versions are straightforward:
2575 binary32Fx4(x,y) ==> PCast32x4(UifUV128(x#,y#))
2577 unary32Fx4(x,y) ==> PCast32x4(x#)
2579 Lowest-lane-only versions are more complex:
2581 binary32F0x4(x,y) ==> SetV128lo32(
2583 PCast32(V128to32(UifUV128(x#,y#)))
2586 This is perhaps not so obvious. In particular, it's faster to
2587 do a V128-bit UifU and then take the bottom 32 bits than the more
2588 obvious scheme of taking the bottom 32 bits of each operand
2589 and doing a 32-bit UifU. Basically since UifU is fast and
2590 chopping lanes off vector values is slow.
2594 unary32F0x4(x) ==> SetV128lo32(
2596 PCast32(V128to32(x#))
2601 PCast32(v#) = 1Sto32(CmpNE32(v#,0))
2602 PCast32x4(v#) = CmpNEZ32x4(v#)
2606 IRAtom
* binary32Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2609 tl_assert(isShadowAtom(mce
, vatomX
));
2610 tl_assert(isShadowAtom(mce
, vatomY
));
2611 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2612 at
= assignNew('V', mce
, Ity_V128
, mkPCast32x4(mce
, at
));
2617 IRAtom
* unary32Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
)
2620 tl_assert(isShadowAtom(mce
, vatomX
));
2621 at
= assignNew('V', mce
, Ity_V128
, mkPCast32x4(mce
, vatomX
));
2626 IRAtom
* binary32F0x4 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2629 tl_assert(isShadowAtom(mce
, vatomX
));
2630 tl_assert(isShadowAtom(mce
, vatomY
));
2631 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2632 at
= assignNew('V', mce
, Ity_I32
, unop(Iop_V128to32
, at
));
2633 at
= mkPCastTo(mce
, Ity_I32
, at
);
2634 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo32
, vatomX
, at
));
2639 IRAtom
* unary32F0x4 ( MCEnv
* mce
, IRAtom
* vatomX
)
2642 tl_assert(isShadowAtom(mce
, vatomX
));
2643 at
= assignNew('V', mce
, Ity_I32
, unop(Iop_V128to32
, vatomX
));
2644 at
= mkPCastTo(mce
, Ity_I32
, at
);
2645 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo32
, vatomX
, at
));
2649 /* --- ... and ... 64Fx2 versions of the same ... --- */
2652 IRAtom
* binary64Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2655 tl_assert(isShadowAtom(mce
, vatomX
));
2656 tl_assert(isShadowAtom(mce
, vatomY
));
2657 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2658 at
= assignNew('V', mce
, Ity_V128
, mkPCast64x2(mce
, at
));
2663 IRAtom
* unary64Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
)
2666 tl_assert(isShadowAtom(mce
, vatomX
));
2667 at
= assignNew('V', mce
, Ity_V128
, mkPCast64x2(mce
, vatomX
));
2672 IRAtom
* binary64F0x2 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2675 tl_assert(isShadowAtom(mce
, vatomX
));
2676 tl_assert(isShadowAtom(mce
, vatomY
));
2677 at
= mkUifUV128(mce
, vatomX
, vatomY
);
2678 at
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, at
));
2679 at
= mkPCastTo(mce
, Ity_I64
, at
);
2680 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo64
, vatomX
, at
));
2685 IRAtom
* unary64F0x2 ( MCEnv
* mce
, IRAtom
* vatomX
)
2688 tl_assert(isShadowAtom(mce
, vatomX
));
2689 at
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, vatomX
));
2690 at
= mkPCastTo(mce
, Ity_I64
, at
);
2691 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SetV128lo64
, vatomX
, at
));
2695 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
2698 IRAtom
* binary32Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2701 tl_assert(isShadowAtom(mce
, vatomX
));
2702 tl_assert(isShadowAtom(mce
, vatomY
));
2703 at
= mkUifU64(mce
, vatomX
, vatomY
);
2704 at
= assignNew('V', mce
, Ity_I64
, mkPCast32x2(mce
, at
));
2709 IRAtom
* unary32Fx2 ( MCEnv
* mce
, IRAtom
* vatomX
)
2712 tl_assert(isShadowAtom(mce
, vatomX
));
2713 at
= assignNew('V', mce
, Ity_I64
, mkPCast32x2(mce
, vatomX
));
2717 /* --- ... and ... 64Fx4 versions of the same ... --- */
2720 IRAtom
* binary64Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2723 tl_assert(isShadowAtom(mce
, vatomX
));
2724 tl_assert(isShadowAtom(mce
, vatomY
));
2725 at
= mkUifUV256(mce
, vatomX
, vatomY
);
2726 at
= assignNew('V', mce
, Ity_V256
, mkPCast64x4(mce
, at
));
2731 IRAtom
* unary64Fx4 ( MCEnv
* mce
, IRAtom
* vatomX
)
2734 tl_assert(isShadowAtom(mce
, vatomX
));
2735 at
= assignNew('V', mce
, Ity_V256
, mkPCast64x4(mce
, vatomX
));
2739 /* --- ... and ... 32Fx8 versions of the same ... --- */
2742 IRAtom
* binary32Fx8 ( MCEnv
* mce
, IRAtom
* vatomX
, IRAtom
* vatomY
)
2745 tl_assert(isShadowAtom(mce
, vatomX
));
2746 tl_assert(isShadowAtom(mce
, vatomY
));
2747 at
= mkUifUV256(mce
, vatomX
, vatomY
);
2748 at
= assignNew('V', mce
, Ity_V256
, mkPCast32x8(mce
, at
));
2753 IRAtom
* unary32Fx8 ( MCEnv
* mce
, IRAtom
* vatomX
)
2756 tl_assert(isShadowAtom(mce
, vatomX
));
2757 at
= assignNew('V', mce
, Ity_V256
, mkPCast32x8(mce
, vatomX
));
2761 /* --- 64Fx2 binary FP ops, with rounding mode --- */
2764 IRAtom
* binary64Fx2_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2765 IRAtom
* vatomX
, IRAtom
* vatomY
)
2767 /* This is the same as binary64Fx2, except that we subsequently
2768 pessimise vRM (definedness of the rounding mode), widen to 128
2769 bits and UifU it into the result. As with the scalar cases, if
2770 the RM is a constant then it is defined and so this extra bit
2771 will get constant-folded out later. */
2772 // "do" the vector args
2773 IRAtom
* t1
= binary64Fx2(mce
, vatomX
, vatomY
);
2774 // PCast the RM, and widen it to 128 bits
2775 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2776 // Roll it into the result
2777 t1
= mkUifUV128(mce
, t1
, t2
);
2781 /* --- ... and ... 32Fx4 versions of the same --- */
2784 IRAtom
* binary32Fx4_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2785 IRAtom
* vatomX
, IRAtom
* vatomY
)
2787 IRAtom
* t1
= binary32Fx4(mce
, vatomX
, vatomY
);
2788 // PCast the RM, and widen it to 128 bits
2789 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2790 // Roll it into the result
2791 t1
= mkUifUV128(mce
, t1
, t2
);
2795 /* --- ... and ... 64Fx4 versions of the same --- */
2798 IRAtom
* binary64Fx4_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2799 IRAtom
* vatomX
, IRAtom
* vatomY
)
2801 IRAtom
* t1
= binary64Fx4(mce
, vatomX
, vatomY
);
2802 // PCast the RM, and widen it to 256 bits
2803 IRAtom
* t2
= mkPCastTo(mce
, Ity_V256
, vRM
);
2804 // Roll it into the result
2805 t1
= mkUifUV256(mce
, t1
, t2
);
2809 /* --- ... and ... 32Fx8 versions of the same --- */
2812 IRAtom
* binary32Fx8_w_rm ( MCEnv
* mce
, IRAtom
* vRM
,
2813 IRAtom
* vatomX
, IRAtom
* vatomY
)
2815 IRAtom
* t1
= binary32Fx8(mce
, vatomX
, vatomY
);
2816 // PCast the RM, and widen it to 256 bits
2817 IRAtom
* t2
= mkPCastTo(mce
, Ity_V256
, vRM
);
2818 // Roll it into the result
2819 t1
= mkUifUV256(mce
, t1
, t2
);
2823 /* --- 64Fx2 unary FP ops, with rounding mode --- */
2826 IRAtom
* unary64Fx2_w_rm ( MCEnv
* mce
, IRAtom
* vRM
, IRAtom
* vatomX
)
2828 /* Same scheme as binary64Fx2_w_rm. */
2829 // "do" the vector arg
2830 IRAtom
* t1
= unary64Fx2(mce
, vatomX
);
2831 // PCast the RM, and widen it to 128 bits
2832 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2833 // Roll it into the result
2834 t1
= mkUifUV128(mce
, t1
, t2
);
2838 /* --- ... and ... 32Fx4 versions of the same --- */
2841 IRAtom
* unary32Fx4_w_rm ( MCEnv
* mce
, IRAtom
* vRM
, IRAtom
* vatomX
)
2843 /* Same scheme as binaryFx4_w_rm. */
2844 IRAtom
* t1
= unary32Fx4(mce
, vatomX
);
2845 // PCast the RM, and widen it to 128 bits
2846 IRAtom
* t2
= mkPCastTo(mce
, Ity_V128
, vRM
);
2847 // Roll it into the result
2848 t1
= mkUifUV128(mce
, t1
, t2
);
2852 /* --- ... and ... 32Fx8 versions of the same --- */
2855 IRAtom
* unary32Fx8_w_rm ( MCEnv
* mce
, IRAtom
* vRM
, IRAtom
* vatomX
)
2857 /* Same scheme as unary32Fx8_w_rm. */
2858 IRAtom
* t1
= unary32Fx8(mce
, vatomX
);
2859 // PCast the RM, and widen it to 256 bits
2860 IRAtom
* t2
= mkPCastTo(mce
, Ity_V256
, vRM
);
2861 // Roll it into the result
2862 t1
= mkUifUV256(mce
, t1
, t2
);
2867 /* --- --- Vector saturated narrowing --- --- */
2869 /* We used to do something very clever here, but on closer inspection
2870 (2011-Jun-15), and in particular bug #279698, it turns out to be
2871 wrong. Part of the problem came from the fact that for a long
2872 time, the IR primops to do with saturated narrowing were
2873 underspecified and managed to confuse multiple cases which needed
2874 to be separate: the op names had a signedness qualifier, but in
2875 fact the source and destination signednesses needed to be specified
2876 independently, so the op names really need two independent
2877 signedness specifiers.
2879 As of 2011-Jun-15 (ish) the underspecification was sorted out
2880 properly. The incorrect instrumentation remained, though. That
2881 has now (2011-Oct-22) been fixed.
2883 What we now do is simple:
2885 Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
2886 number of lanes, X is the source lane width and signedness, and Y
2887 is the destination lane width and signedness. In all cases the
2888 destination lane width is half the source lane width, so the names
2889 have a bit of redundancy, but are at least easy to read.
2891 For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
2894 Let Vanilla(OP) be a function that takes OP, one of these
2895 saturating narrowing ops, and produces the same "shaped" narrowing
2896 op which is not saturating, but merely dumps the most significant
2897 bits. "same shape" means that the lane numbers and widths are the
2900 For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
2901 = Iop_NarrowBin32to16x8,
2902 that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
2903 dumping the top half of each lane.
2905 So, with that in place, the scheme is simple, and it is simple to
2906 pessimise each lane individually and then apply Vanilla(OP) so as
2907 to get the result in the right "shape". If the original OP is
2908 QNarrowBinXtoYxZ then we produce
2910 Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
2912 or for the case when OP is unary (Iop_QNarrowUn*)
2914 Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
2917 IROp
vanillaNarrowingOpOfShape ( IROp qnarrowOp
)
2919 switch (qnarrowOp
) {
2920 /* Binary: (128, 128) -> 128 */
2921 case Iop_QNarrowBin16Sto8Ux16
:
2922 case Iop_QNarrowBin16Sto8Sx16
:
2923 case Iop_QNarrowBin16Uto8Ux16
:
2924 case Iop_QNarrowBin64Sto32Sx4
:
2925 case Iop_QNarrowBin64Uto32Ux4
:
2926 return Iop_NarrowBin16to8x16
;
2927 case Iop_QNarrowBin32Sto16Ux8
:
2928 case Iop_QNarrowBin32Sto16Sx8
:
2929 case Iop_QNarrowBin32Uto16Ux8
:
2930 return Iop_NarrowBin32to16x8
;
2931 /* Binary: (64, 64) -> 64 */
2932 case Iop_QNarrowBin32Sto16Sx4
:
2933 return Iop_NarrowBin32to16x4
;
2934 case Iop_QNarrowBin16Sto8Ux8
:
2935 case Iop_QNarrowBin16Sto8Sx8
:
2936 return Iop_NarrowBin16to8x8
;
2937 /* Unary: 128 -> 64 */
2938 case Iop_QNarrowUn64Uto32Ux2
:
2939 case Iop_QNarrowUn64Sto32Sx2
:
2940 case Iop_QNarrowUn64Sto32Ux2
:
2941 return Iop_NarrowUn64to32x2
;
2942 case Iop_QNarrowUn32Uto16Ux4
:
2943 case Iop_QNarrowUn32Sto16Sx4
:
2944 case Iop_QNarrowUn32Sto16Ux4
:
2945 case Iop_F32toF16x4_DEP
:
2946 return Iop_NarrowUn32to16x4
;
2947 case Iop_QNarrowUn16Uto8Ux8
:
2948 case Iop_QNarrowUn16Sto8Sx8
:
2949 case Iop_QNarrowUn16Sto8Ux8
:
2950 return Iop_NarrowUn16to8x8
;
2953 VG_(tool_panic
)("vanillaNarrowOpOfShape");
2958 IRAtom
* vectorNarrowBinV128 ( MCEnv
* mce
, IROp narrow_op
,
2959 IRAtom
* vatom1
, IRAtom
* vatom2
)
2961 IRAtom
*at1
, *at2
, *at3
;
2962 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
2963 switch (narrow_op
) {
2964 case Iop_QNarrowBin64Sto32Sx4
: pcast
= mkPCast32x4
; break;
2965 case Iop_QNarrowBin64Uto32Ux4
: pcast
= mkPCast32x4
; break;
2966 case Iop_QNarrowBin32Sto16Sx8
: pcast
= mkPCast32x4
; break;
2967 case Iop_QNarrowBin32Uto16Ux8
: pcast
= mkPCast32x4
; break;
2968 case Iop_QNarrowBin32Sto16Ux8
: pcast
= mkPCast32x4
; break;
2969 case Iop_QNarrowBin16Sto8Sx16
: pcast
= mkPCast16x8
; break;
2970 case Iop_QNarrowBin16Uto8Ux16
: pcast
= mkPCast16x8
; break;
2971 case Iop_QNarrowBin16Sto8Ux16
: pcast
= mkPCast16x8
; break;
2972 default: VG_(tool_panic
)("vectorNarrowBinV128");
2974 IROp vanilla_narrow
= vanillaNarrowingOpOfShape(narrow_op
);
2975 tl_assert(isShadowAtom(mce
,vatom1
));
2976 tl_assert(isShadowAtom(mce
,vatom2
));
2977 at1
= assignNew('V', mce
, Ity_V128
, pcast(mce
, vatom1
));
2978 at2
= assignNew('V', mce
, Ity_V128
, pcast(mce
, vatom2
));
2979 at3
= assignNew('V', mce
, Ity_V128
, binop(vanilla_narrow
, at1
, at2
));
2984 IRAtom
* vectorNarrowBin64 ( MCEnv
* mce
, IROp narrow_op
,
2985 IRAtom
* vatom1
, IRAtom
* vatom2
)
2987 IRAtom
*at1
, *at2
, *at3
;
2988 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
2989 switch (narrow_op
) {
2990 case Iop_QNarrowBin32Sto16Sx4
: pcast
= mkPCast32x2
; break;
2991 case Iop_QNarrowBin16Sto8Sx8
: pcast
= mkPCast16x4
; break;
2992 case Iop_QNarrowBin16Sto8Ux8
: pcast
= mkPCast16x4
; break;
2993 default: VG_(tool_panic
)("vectorNarrowBin64");
2995 IROp vanilla_narrow
= vanillaNarrowingOpOfShape(narrow_op
);
2996 tl_assert(isShadowAtom(mce
,vatom1
));
2997 tl_assert(isShadowAtom(mce
,vatom2
));
2998 at1
= assignNew('V', mce
, Ity_I64
, pcast(mce
, vatom1
));
2999 at2
= assignNew('V', mce
, Ity_I64
, pcast(mce
, vatom2
));
3000 at3
= assignNew('V', mce
, Ity_I64
, binop(vanilla_narrow
, at1
, at2
));
3005 IRAtom
* vectorNarrowUnV128 ( MCEnv
* mce
, IROp narrow_op
,
3009 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
3010 tl_assert(isShadowAtom(mce
,vatom1
));
3011 /* For vanilla narrowing (non-saturating), we can just apply
3012 the op directly to the V bits. */
3013 switch (narrow_op
) {
3014 case Iop_NarrowUn16to8x8
:
3015 case Iop_NarrowUn32to16x4
:
3016 case Iop_NarrowUn64to32x2
:
3017 case Iop_F32toF16x4_DEP
:
3018 at1
= assignNew('V', mce
, Ity_I64
, unop(narrow_op
, vatom1
));
3021 break; /* Do Plan B */
3023 /* Plan B: for ops that involve a saturation operation on the args,
3024 we must PCast before the vanilla narrow. */
3025 switch (narrow_op
) {
3026 case Iop_QNarrowUn16Sto8Sx8
: pcast
= mkPCast16x8
; break;
3027 case Iop_QNarrowUn16Sto8Ux8
: pcast
= mkPCast16x8
; break;
3028 case Iop_QNarrowUn16Uto8Ux8
: pcast
= mkPCast16x8
; break;
3029 case Iop_QNarrowUn32Sto16Sx4
: pcast
= mkPCast32x4
; break;
3030 case Iop_QNarrowUn32Sto16Ux4
: pcast
= mkPCast32x4
; break;
3031 case Iop_QNarrowUn32Uto16Ux4
: pcast
= mkPCast32x4
; break;
3032 case Iop_QNarrowUn64Sto32Sx2
: pcast
= mkPCast64x2
; break;
3033 case Iop_QNarrowUn64Sto32Ux2
: pcast
= mkPCast64x2
; break;
3034 case Iop_QNarrowUn64Uto32Ux2
: pcast
= mkPCast64x2
; break;
3035 default: VG_(tool_panic
)("vectorNarrowUnV128");
3037 IROp vanilla_narrow
= vanillaNarrowingOpOfShape(narrow_op
);
3038 at1
= assignNew('V', mce
, Ity_V128
, pcast(mce
, vatom1
));
3039 at2
= assignNew('V', mce
, Ity_I64
, unop(vanilla_narrow
, at1
));
3044 IRAtom
* vectorWidenI64 ( MCEnv
* mce
, IROp longen_op
,
3048 IRAtom
* (*pcast
)( MCEnv
*, IRAtom
* );
3049 switch (longen_op
) {
3050 case Iop_Widen8Uto16x8
: pcast
= mkPCast16x8
; break;
3051 case Iop_Widen8Sto16x8
: pcast
= mkPCast16x8
; break;
3052 case Iop_Widen16Uto32x4
: pcast
= mkPCast32x4
; break;
3053 case Iop_Widen16Sto32x4
: pcast
= mkPCast32x4
; break;
3054 case Iop_Widen32Uto64x2
: pcast
= mkPCast64x2
; break;
3055 case Iop_Widen32Sto64x2
: pcast
= mkPCast64x2
; break;
3056 case Iop_F16toF32x4
: pcast
= mkPCast32x4
; break;
3057 default: VG_(tool_panic
)("vectorWidenI64");
3059 tl_assert(isShadowAtom(mce
,vatom1
));
3060 at1
= assignNew('V', mce
, Ity_V128
, unop(longen_op
, vatom1
));
3061 at2
= assignNew('V', mce
, Ity_V128
, pcast(mce
, at1
));
3066 /* --- --- Vector integer arithmetic --- --- */
3068 /* Simple ... UifU the args and per-lane pessimise the results. */
3070 /* --- V256-bit versions --- */
3073 IRAtom
* binary8Ix32 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3076 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3077 at
= mkPCast8x32(mce
, at
);
3082 IRAtom
* binary16Ix16 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3085 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3086 at
= mkPCast16x16(mce
, at
);
3091 IRAtom
* binary32Ix8 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3094 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3095 at
= mkPCast32x8(mce
, at
);
3100 IRAtom
* binary64Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3103 at
= mkUifUV256(mce
, vatom1
, vatom2
);
3104 at
= mkPCast64x4(mce
, at
);
3108 /* --- V128-bit versions --- */
3111 IRAtom
* binary8Ix16 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3114 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3115 at
= mkPCast8x16(mce
, at
);
3120 IRAtom
* binary16Ix8 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3123 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3124 at
= mkPCast16x8(mce
, at
);
3129 IRAtom
* binary32Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3132 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3133 at
= mkPCast32x4(mce
, at
);
3138 IRAtom
* binary64Ix2 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3141 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3142 at
= mkPCast64x2(mce
, at
);
3147 IRAtom
* binary128Ix1 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3150 at
= mkUifUV128(mce
, vatom1
, vatom2
);
3151 at
= mkPCast128x1(mce
, at
);
3155 /* --- 64-bit versions --- */
3158 IRAtom
* binary8Ix8 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3161 at
= mkUifU64(mce
, vatom1
, vatom2
);
3162 at
= mkPCast8x8(mce
, at
);
3167 IRAtom
* binary16Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3170 at
= mkUifU64(mce
, vatom1
, vatom2
);
3171 at
= mkPCast16x4(mce
, at
);
3176 IRAtom
* binary32Ix2 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3179 at
= mkUifU64(mce
, vatom1
, vatom2
);
3180 at
= mkPCast32x2(mce
, at
);
3185 IRAtom
* binary64Ix1 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3188 at
= mkUifU64(mce
, vatom1
, vatom2
);
3189 at
= mkPCastTo(mce
, Ity_I64
, at
);
3193 /* --- 32-bit versions --- */
3196 IRAtom
* binary8Ix4 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3199 at
= mkUifU32(mce
, vatom1
, vatom2
);
3200 at
= mkPCast8x4(mce
, at
);
3205 IRAtom
* binary16Ix2 ( MCEnv
* mce
, IRAtom
* vatom1
, IRAtom
* vatom2
)
3208 at
= mkUifU32(mce
, vatom1
, vatom2
);
3209 at
= mkPCast16x2(mce
, at
);
3214 /*------------------------------------------------------------*/
3215 /*--- Generate shadow values from all kinds of IRExprs. ---*/
3216 /*------------------------------------------------------------*/
3219 IRAtom
* expr2vbits_Qop ( MCEnv
* mce
,
3221 IRAtom
* atom1
, IRAtom
* atom2
,
3222 IRAtom
* atom3
, IRAtom
* atom4
)
3224 IRAtom
* vatom1
= expr2vbits( mce
, atom1
, HuOth
);
3225 IRAtom
* vatom2
= expr2vbits( mce
, atom2
, HuOth
);
3226 IRAtom
* vatom3
= expr2vbits( mce
, atom3
, HuOth
);
3227 IRAtom
* vatom4
= expr2vbits( mce
, atom4
, HuOth
);
3229 tl_assert(isOriginalAtom(mce
,atom1
));
3230 tl_assert(isOriginalAtom(mce
,atom2
));
3231 tl_assert(isOriginalAtom(mce
,atom3
));
3232 tl_assert(isOriginalAtom(mce
,atom4
));
3233 tl_assert(isShadowAtom(mce
,vatom1
));
3234 tl_assert(isShadowAtom(mce
,vatom2
));
3235 tl_assert(isShadowAtom(mce
,vatom3
));
3236 tl_assert(isShadowAtom(mce
,vatom4
));
3237 tl_assert(sameKindedAtoms(atom1
,vatom1
));
3238 tl_assert(sameKindedAtoms(atom2
,vatom2
));
3239 tl_assert(sameKindedAtoms(atom3
,vatom3
));
3240 tl_assert(sameKindedAtoms(atom4
,vatom4
));
3243 case Iop_MAddF64r32
:
3245 case Iop_MSubF64r32
:
3246 /* I32(rm) x F64 x F64 x F64 -> F64 */
3247 return mkLazy4(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
, vatom4
);
3251 /* I32(rm) x F32 x F32 x F32 -> F32 */
3252 return mkLazy4(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
, vatom4
);
3256 case Iop_NegMAddF128
:
3257 case Iop_NegMSubF128
:
3258 /* I32(rm) x F128 x F128 x F128 -> F128 */
3259 return mkLazy4(mce
, Ity_I128
, vatom1
, vatom2
, vatom3
, vatom4
);
3261 /* V256-bit data-steering */
3262 case Iop_64x4toV256
:
3263 return assignNew('V', mce
, Ity_V256
,
3264 IRExpr_Qop(op
, vatom1
, vatom2
, vatom3
, vatom4
));
3266 /* I32/I64 x I8 x I8 x I8 -> I32/I64 */
3268 return mkLazy4(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
, vatom4
);
3270 return mkLazy4(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
, vatom4
);
3273 VG_(tool_panic
)("memcheck:expr2vbits_Qop");
3279 IRAtom
* expr2vbits_Triop ( MCEnv
* mce
,
3281 IRAtom
* atom1
, IRAtom
* atom2
, IRAtom
* atom3
)
3283 IRAtom
* vatom1
= expr2vbits( mce
, atom1
, HuOth
);
3284 IRAtom
* vatom2
= expr2vbits( mce
, atom2
, HuOth
);
3285 IRAtom
* vatom3
= expr2vbits( mce
, atom3
, HuOth
);
3287 tl_assert(isOriginalAtom(mce
,atom1
));
3288 tl_assert(isOriginalAtom(mce
,atom2
));
3289 tl_assert(isOriginalAtom(mce
,atom3
));
3290 tl_assert(isShadowAtom(mce
,vatom1
));
3291 tl_assert(isShadowAtom(mce
,vatom2
));
3292 tl_assert(isShadowAtom(mce
,vatom3
));
3293 tl_assert(sameKindedAtoms(atom1
,vatom1
));
3294 tl_assert(sameKindedAtoms(atom2
,vatom2
));
3295 tl_assert(sameKindedAtoms(atom3
,vatom3
));
3305 case Iop_QuantizeD128
:
3306 /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
3307 return mkLazy3(mce
, Ity_I128
, vatom1
, vatom2
, vatom3
);
3326 case Iop_QuantizeD64
:
3327 /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
3328 return mkLazy3(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
);
3329 case Iop_PRemC3210F64
:
3330 case Iop_PRem1C3210F64
:
3331 /* I32(rm) x F64 x F64 -> I32 */
3332 return mkLazy3(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
);
3337 /* I32(rm) x F32 x F32 -> I32 */
3338 return mkLazy3(mce
, Ity_I32
, vatom1
, vatom2
, vatom3
);
3339 case Iop_SignificanceRoundD64
:
3340 /* IRRoundingMode(I32) x I8 x D64 -> D64 */
3341 return mkLazy3(mce
, Ity_I64
, vatom1
, vatom2
, vatom3
);
3342 case Iop_SignificanceRoundD128
:
3343 /* IRRoundingMode(I32) x I8 x D128 -> D128 */
3344 return mkLazy3(mce
, Ity_I128
, vatom1
, vatom2
, vatom3
);
3346 /* (V128, V128, I8) -> V128 */
3347 complainIfUndefined(mce
, atom3
, NULL
);
3348 return assignNew('V', mce
, Ity_V128
, triop(op
, vatom1
, vatom2
, atom3
));
3350 /* (I64, I64, I8) -> I64 */
3351 complainIfUndefined(mce
, atom3
, NULL
);
3352 return assignNew('V', mce
, Ity_I64
, triop(op
, vatom1
, vatom2
, atom3
));
3353 case Iop_SetElem8x8
:
3354 case Iop_SetElem16x4
:
3355 case Iop_SetElem32x2
:
3356 complainIfUndefined(mce
, atom2
, NULL
);
3357 return assignNew('V', mce
, Ity_I64
, triop(op
, vatom1
, atom2
, vatom3
));
3359 case Iop_SetElem8x16
:
3360 case Iop_SetElem16x8
:
3361 case Iop_SetElem32x4
:
3362 case Iop_SetElem64x2
:
3363 complainIfUndefined(mce
, atom2
, NULL
);
3364 return assignNew('V', mce
, Ity_V128
, triop(op
, vatom1
, atom2
, vatom3
));
3366 case Iop_Perm8x16x2
:
3367 /* (V128, V128, V128) -> V128 */
3368 complainIfUndefined(mce
, atom3
, NULL
);
3371 assignNew('V', mce
, Ity_V128
, triop(op
, vatom1
, vatom2
, atom3
)),
3372 mkPCast8x16(mce
, vatom3
)
3375 /* Vector FP with rounding mode as the first arg */
3380 case Iop_Scale2_64Fx2
:
3381 return binary64Fx2_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3387 case Iop_Scale2_32Fx4
:
3388 return binary32Fx4_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3394 return binary64Fx4_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3400 return binary32Fx8_w_rm(mce
, vatom1
, vatom2
, vatom3
);
3402 case Iop_F32x4_2toQ16x8
:
3403 return assignNew('V', mce
, Ity_V128
,
3404 binop(Iop_PackEvenLanes16x8
,
3405 unary32Fx4_w_rm(mce
, vatom1
, vatom2
),
3406 unary32Fx4_w_rm(mce
, vatom1
, vatom3
)));
3407 case Iop_F64x2_2toQ32x4
:
3408 return assignNew('V', mce
, Ity_V128
,
3409 binop(Iop_PackEvenLanes32x4
,
3410 unary64Fx2_w_rm(mce
, vatom1
, vatom2
),
3411 unary64Fx2_w_rm(mce
, vatom1
, vatom3
)));
3416 VG_(tool_panic
)("memcheck:expr2vbits_Triop");
3422 IRAtom
* expr2vbits_Binop ( MCEnv
* mce
,
3424 IRAtom
* atom1
, IRAtom
* atom2
,
3425 HowUsed hu
/*use HuOth if unknown*/ )
3427 IRType and_or_ty
= Ity_INVALID
;
3428 IRAtom
* (*uifu
) (MCEnv
*, IRAtom
*, IRAtom
*) = NULL
;
3429 IRAtom
* (*difd
) (MCEnv
*, IRAtom
*, IRAtom
*) = NULL
;
3430 IRAtom
* (*improve
) (MCEnv
*, IRAtom
*, IRAtom
*) = NULL
;
3432 IRAtom
* vatom1
= expr2vbits( mce
, atom1
, HuOth
);
3433 IRAtom
* vatom2
= expr2vbits( mce
, atom2
, HuOth
);
3435 tl_assert(isOriginalAtom(mce
,atom1
));
3436 tl_assert(isOriginalAtom(mce
,atom2
));
3437 tl_assert(isShadowAtom(mce
,vatom1
));
3438 tl_assert(isShadowAtom(mce
,vatom2
));
3439 tl_assert(sameKindedAtoms(atom1
,vatom1
));
3440 tl_assert(sameKindedAtoms(atom2
,vatom2
));
3455 return binary16Ix2(mce
, vatom1
, vatom2
);
3467 return binary8Ix4(mce
, vatom1
, vatom2
);
3480 /* Same scheme as with all other shifts. */
3481 complainIfUndefined(mce
, atom2
, NULL
);
3482 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
));
3484 case Iop_QNarrowBin32Sto16Sx4
:
3485 case Iop_QNarrowBin16Sto8Sx8
:
3486 case Iop_QNarrowBin16Sto8Ux8
:
3487 return vectorNarrowBin64(mce
, op
, vatom1
, vatom2
);
3506 case Iop_PolynomialMul8x8
:
3507 return binary8Ix8(mce
, vatom1
, vatom2
);
3518 case Iop_MulHi16Sx4
:
3519 case Iop_MulHi16Ux4
:
3520 case Iop_CmpGT16Sx4
:
3521 case Iop_CmpGT16Ux4
:
3528 case Iop_QDMulHi16Sx4
:
3529 case Iop_QRDMulHi16Sx4
:
3530 return binary16Ix4(mce
, vatom1
, vatom2
);
3538 case Iop_CmpGT32Sx2
:
3539 case Iop_CmpGT32Ux2
:
3548 case Iop_QDMulHi32Sx2
:
3549 case Iop_QRDMulHi32Sx2
:
3550 return binary32Ix2(mce
, vatom1
, vatom2
);
3559 return binary64Ix1(mce
, vatom1
, vatom2
);
3561 case Iop_QShlNsatSU8x8
:
3562 case Iop_QShlNsatUU8x8
:
3563 case Iop_QShlNsatSS8x8
:
3564 complainIfUndefined(mce
, atom2
, NULL
);
3565 return mkPCast8x8(mce
, vatom1
);
3567 case Iop_QShlNsatSU16x4
:
3568 case Iop_QShlNsatUU16x4
:
3569 case Iop_QShlNsatSS16x4
:
3570 complainIfUndefined(mce
, atom2
, NULL
);
3571 return mkPCast16x4(mce
, vatom1
);
3573 case Iop_QShlNsatSU32x2
:
3574 case Iop_QShlNsatUU32x2
:
3575 case Iop_QShlNsatSS32x2
:
3576 complainIfUndefined(mce
, atom2
, NULL
);
3577 return mkPCast32x2(mce
, vatom1
);
3579 case Iop_QShlNsatSU64x1
:
3580 case Iop_QShlNsatUU64x1
:
3581 case Iop_QShlNsatSS64x1
:
3582 complainIfUndefined(mce
, atom2
, NULL
);
3583 return mkPCast32x2(mce
, vatom1
);
3585 case Iop_PwMax32Sx2
:
3586 case Iop_PwMax32Ux2
:
3587 case Iop_PwMin32Sx2
:
3588 case Iop_PwMin32Ux2
:
3589 case Iop_PwMax32Fx2
:
3590 case Iop_PwMin32Fx2
:
3591 return assignNew('V', mce
, Ity_I64
,
3592 binop(Iop_PwMax32Ux2
,
3593 mkPCast32x2(mce
, vatom1
),
3594 mkPCast32x2(mce
, vatom2
)));
3596 case Iop_PwMax16Sx4
:
3597 case Iop_PwMax16Ux4
:
3598 case Iop_PwMin16Sx4
:
3599 case Iop_PwMin16Ux4
:
3600 return assignNew('V', mce
, Ity_I64
,
3601 binop(Iop_PwMax16Ux4
,
3602 mkPCast16x4(mce
, vatom1
),
3603 mkPCast16x4(mce
, vatom2
)));
3609 return assignNew('V', mce
, Ity_I64
,
3610 binop(Iop_PwMax8Ux8
,
3611 mkPCast8x8(mce
, vatom1
),
3612 mkPCast8x8(mce
, vatom2
)));
3615 case Iop_PwAdd32Fx2
:
3616 return mkPCast32x2(mce
,
3617 assignNew('V', mce
, Ity_I64
,
3618 binop(Iop_PwAdd32x2
,
3619 mkPCast32x2(mce
, vatom1
),
3620 mkPCast32x2(mce
, vatom2
))));
3623 return mkPCast16x4(mce
,
3624 assignNew('V', mce
, Ity_I64
,
3625 binop(op
, mkPCast16x4(mce
, vatom1
),
3626 mkPCast16x4(mce
, vatom2
))));
3629 return mkPCast8x8(mce
,
3630 assignNew('V', mce
, Ity_I64
,
3631 binop(op
, mkPCast8x8(mce
, vatom1
),
3632 mkPCast8x8(mce
, vatom2
))));
3638 return mkUifU64(mce
,
3639 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3640 mkPCast8x8(mce
,vatom2
)
3647 return mkUifU64(mce
,
3648 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3649 mkPCast16x4(mce
,vatom2
)
3656 return mkUifU64(mce
,
3657 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3658 mkPCast32x2(mce
,vatom2
)
3661 /* 64-bit data-steering */
3662 case Iop_InterleaveLO32x2
:
3663 case Iop_InterleaveLO16x4
:
3664 case Iop_InterleaveLO8x8
:
3665 case Iop_InterleaveHI32x2
:
3666 case Iop_InterleaveHI16x4
:
3667 case Iop_InterleaveHI8x8
:
3668 case Iop_CatOddLanes8x8
:
3669 case Iop_CatEvenLanes8x8
:
3670 case Iop_CatOddLanes16x4
:
3671 case Iop_CatEvenLanes16x4
:
3672 case Iop_InterleaveOddLanes8x8
:
3673 case Iop_InterleaveEvenLanes8x8
:
3674 case Iop_InterleaveOddLanes16x4
:
3675 case Iop_InterleaveEvenLanes16x4
:
3676 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, vatom2
));
3678 case Iop_GetElem8x8
:
3679 complainIfUndefined(mce
, atom2
, NULL
);
3680 return assignNew('V', mce
, Ity_I8
, binop(op
, vatom1
, atom2
));
3681 case Iop_GetElem16x4
:
3682 complainIfUndefined(mce
, atom2
, NULL
);
3683 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, atom2
));
3684 case Iop_GetElem32x2
:
3685 complainIfUndefined(mce
, atom2
, NULL
);
3686 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, atom2
));
3688 /* Perm8x8: rearrange values in left arg using steering values from
3689 right arg. So rearrange the vbits in the same way but pessimise wrt
3690 steering values. We assume that unused bits in the steering value
3691 are defined zeros, so we can safely PCast within each lane of the the
3692 steering value without having to take precautions to avoid a
3693 dependency on those unused bits.
3695 This is also correct for PermOrZero8x8, but it is a bit subtle. For
3696 each lane, if bit 7 of the steering value is zero, then we'll steer
3697 the shadow value exactly as per Perm8x8. If that bit is one, then
3698 the operation will set the resulting (concrete) value to zero. That
3699 means it is defined, and should have a shadow value of zero. Hence
3700 in both cases (bit 7 is 0 or 1) we can self-shadow (in the same way
3701 as Perm8x8) and then pessimise against the steering values. */
3703 case Iop_PermOrZero8x8
:
3706 assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
)),
3707 mkPCast8x8(mce
, vatom2
)
3712 case Iop_I32StoF32x4
:
3713 case Iop_F32toI32Sx4
:
3715 return unary32Fx4_w_rm(mce
, vatom1
, vatom2
);
3717 return unary64Fx2_w_rm(mce
, vatom1
, vatom2
);
3731 /* Same scheme as with all other shifts. Note: 22 Oct 05:
3732 this is wrong now, scalar shifts are done properly lazily.
3733 Vector shifts should be fixed too. */
3734 complainIfUndefined(mce
, atom2
, NULL
);
3735 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
3737 /* V x V shifts/rotates are done using the standard lazy scheme. */
3738 /* For the non-rounding variants of bi-di vector x vector
3739 shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
3740 But note that this is overly pessimistic, because in fact only
3741 the bottom 8 bits of each lane of the second argument are taken
3742 into account when shifting. So really we ought to ignore
3743 undefinedness in bits 8 and above of each lane in the
3752 return mkUifUV128(mce
,
3753 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3754 mkPCast8x16(mce
,vatom2
)
3764 return mkUifUV128(mce
,
3765 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3766 mkPCast16x8(mce
,vatom2
)
3776 return mkUifUV128(mce
,
3777 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3778 mkPCast32x4(mce
,vatom2
)
3788 return mkUifUV128(mce
,
3789 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
3790 mkPCast64x2(mce
,vatom2
)
3793 /* For the rounding variants of bi-di vector x vector shifts, the
3794 rounding adjustment can cause undefinedness to propagate through
3795 the entire lane, in the worst case. Too complex to handle
3796 properly .. just UifU the arguments and then PCast them.
3797 Suboptimal but safe. */
3800 return binary8Ix16(mce
, vatom1
, vatom2
);
3803 return binary16Ix8(mce
, vatom1
, vatom2
);
3806 return binary32Ix4(mce
, vatom1
, vatom2
);
3809 return binary64Ix2(mce
, vatom1
, vatom2
);
3811 case Iop_F32ToFixed32Ux4_RZ
:
3812 case Iop_F32ToFixed32Sx4_RZ
:
3813 case Iop_Fixed32UToF32x4_RN
:
3814 case Iop_Fixed32SToF32x4_RN
:
3815 complainIfUndefined(mce
, atom2
, NULL
);
3816 return mkPCast32x4(mce
, vatom1
);
3818 case Iop_F32ToFixed32Ux2_RZ
:
3819 case Iop_F32ToFixed32Sx2_RZ
:
3820 case Iop_Fixed32UToF32x2_RN
:
3821 case Iop_Fixed32SToF32x2_RN
:
3822 complainIfUndefined(mce
, atom2
, NULL
);
3823 return mkPCast32x2(mce
, vatom1
);
3832 case Iop_CmpGT8Sx16
:
3833 case Iop_CmpGT8Ux16
:
3839 case Iop_QAddExtUSsatSS8x16
:
3840 case Iop_QAddExtSUsatUU8x16
:
3845 case Iop_MulHi8Sx16
:
3846 case Iop_MulHi8Ux16
:
3847 case Iop_PolynomialMul8x16
:
3848 case Iop_PolynomialMulAdd8x16
:
3849 return binary8Ix16(mce
, vatom1
, vatom2
);
3855 case Iop_MulHi16Sx8
:
3856 case Iop_MulHi16Ux8
:
3861 case Iop_CmpGT16Sx8
:
3862 case Iop_CmpGT16Ux8
:
3868 case Iop_QAddExtUSsatSS16x8
:
3869 case Iop_QAddExtSUsatUU16x8
:
3873 case Iop_QDMulHi16Sx8
:
3874 case Iop_QRDMulHi16Sx8
:
3875 case Iop_PolynomialMulAdd16x8
:
3876 /* PwExtUSMulQAdd8x16 is a bit subtle. The effect of it is that each
3877 16-bit chunk of the output is formed from corresponding 16-bit chunks
3878 of the input args, so we can treat it like an other binary 16x8
3879 operation. That's despite it having '8x16' in its name. */
3880 case Iop_PwExtUSMulQAdd8x16
:
3881 return binary16Ix8(mce
, vatom1
, vatom2
);
3884 case Iop_CmpGT32Sx4
:
3885 case Iop_CmpGT32Ux4
:
3891 case Iop_QAddExtUSsatSS32x4
:
3892 case Iop_QAddExtSUsatUU32x4
:
3903 case Iop_MulHi32Sx4
:
3904 case Iop_MulHi32Ux4
:
3905 case Iop_QDMulHi32Sx4
:
3906 case Iop_QRDMulHi32Sx4
:
3907 case Iop_PolynomialMulAdd32x4
:
3908 return binary32Ix4(mce
, vatom1
, vatom2
);
3919 case Iop_CmpGT64Sx2
:
3920 case Iop_CmpGT64Ux2
:
3927 case Iop_QAddExtUSsatSS64x2
:
3928 case Iop_QAddExtSUsatUU64x2
:
3929 case Iop_PolynomialMulAdd64x2
:
3930 case Iop_CipherV128
:
3931 case Iop_CipherLV128
:
3932 case Iop_NCipherV128
:
3933 case Iop_NCipherLV128
:
3934 case Iop_MulI128by10E
:
3935 case Iop_MulI128by10ECarry
:
3936 return binary64Ix2(mce
, vatom1
, vatom2
);
3940 case Iop_CmpNEZ128x1
:
3941 return binary128Ix1(mce
, vatom1
, vatom2
);
3943 case Iop_QNarrowBin64Sto32Sx4
:
3944 case Iop_QNarrowBin64Uto32Ux4
:
3945 case Iop_QNarrowBin32Sto16Sx8
:
3946 case Iop_QNarrowBin32Uto16Ux8
:
3947 case Iop_QNarrowBin32Sto16Ux8
:
3948 case Iop_QNarrowBin16Sto8Sx16
:
3949 case Iop_QNarrowBin16Uto8Ux16
:
3950 case Iop_QNarrowBin16Sto8Ux16
:
3951 return vectorNarrowBinV128(mce
, op
, vatom1
, vatom2
);
3955 case Iop_CmpLT64Fx2
:
3956 case Iop_CmpLE64Fx2
:
3957 case Iop_CmpEQ64Fx2
:
3958 case Iop_CmpUN64Fx2
:
3959 case Iop_RecipStep64Fx2
:
3960 case Iop_RSqrtStep64Fx2
:
3961 return binary64Fx2(mce
, vatom1
, vatom2
);
3968 case Iop_CmpLT64F0x2
:
3969 case Iop_CmpLE64F0x2
:
3970 case Iop_CmpEQ64F0x2
:
3971 case Iop_CmpUN64F0x2
:
3973 return binary64F0x2(mce
, vatom1
, vatom2
);
3977 case Iop_CmpLT32Fx4
:
3978 case Iop_CmpLE32Fx4
:
3979 case Iop_CmpEQ32Fx4
:
3980 case Iop_CmpUN32Fx4
:
3981 case Iop_CmpGT32Fx4
:
3982 case Iop_CmpGE32Fx4
:
3983 case Iop_RecipStep32Fx4
:
3984 case Iop_RSqrtStep32Fx4
:
3985 return binary32Fx4(mce
, vatom1
, vatom2
);
3991 case Iop_CmpEQ32Fx2
:
3992 case Iop_CmpGT32Fx2
:
3993 case Iop_CmpGE32Fx2
:
3995 case Iop_RecipStep32Fx2
:
3996 case Iop_RSqrtStep32Fx2
:
3997 return binary32Fx2(mce
, vatom1
, vatom2
);
4004 case Iop_CmpLT32F0x4
:
4005 case Iop_CmpLE32F0x4
:
4006 case Iop_CmpEQ32F0x4
:
4007 case Iop_CmpUN32F0x4
:
4009 return binary32F0x4(mce
, vatom1
, vatom2
);
4011 case Iop_QShlNsatSU8x16
:
4012 case Iop_QShlNsatUU8x16
:
4013 case Iop_QShlNsatSS8x16
:
4014 complainIfUndefined(mce
, atom2
, NULL
);
4015 return mkPCast8x16(mce
, vatom1
);
4017 case Iop_QShlNsatSU16x8
:
4018 case Iop_QShlNsatUU16x8
:
4019 case Iop_QShlNsatSS16x8
:
4020 complainIfUndefined(mce
, atom2
, NULL
);
4021 return mkPCast16x8(mce
, vatom1
);
4023 case Iop_QShlNsatSU32x4
:
4024 case Iop_QShlNsatUU32x4
:
4025 case Iop_QShlNsatSS32x4
:
4026 complainIfUndefined(mce
, atom2
, NULL
);
4027 return mkPCast32x4(mce
, vatom1
);
4029 case Iop_QShlNsatSU64x2
:
4030 case Iop_QShlNsatUU64x2
:
4031 case Iop_QShlNsatSS64x2
:
4032 complainIfUndefined(mce
, atom2
, NULL
);
4033 return mkPCast32x4(mce
, vatom1
);
4035 /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
4036 To make this simpler, do the following:
4037 * complain if the shift amount (the I8) is undefined
4038 * pcast each lane at the wide width
4039 * truncate each lane to half width
4040 * pcast the resulting 64-bit value to a single bit and use
4041 that as the least significant bit of the upper half of the
4043 case Iop_QandQShrNnarrow64Uto32Ux2
:
4044 case Iop_QandQSarNnarrow64Sto32Sx2
:
4045 case Iop_QandQSarNnarrow64Sto32Ux2
:
4046 case Iop_QandQRShrNnarrow64Uto32Ux2
:
4047 case Iop_QandQRSarNnarrow64Sto32Sx2
:
4048 case Iop_QandQRSarNnarrow64Sto32Ux2
:
4049 case Iop_QandQShrNnarrow32Uto16Ux4
:
4050 case Iop_QandQSarNnarrow32Sto16Sx4
:
4051 case Iop_QandQSarNnarrow32Sto16Ux4
:
4052 case Iop_QandQRShrNnarrow32Uto16Ux4
:
4053 case Iop_QandQRSarNnarrow32Sto16Sx4
:
4054 case Iop_QandQRSarNnarrow32Sto16Ux4
:
4055 case Iop_QandQShrNnarrow16Uto8Ux8
:
4056 case Iop_QandQSarNnarrow16Sto8Sx8
:
4057 case Iop_QandQSarNnarrow16Sto8Ux8
:
4058 case Iop_QandQRShrNnarrow16Uto8Ux8
:
4059 case Iop_QandQRSarNnarrow16Sto8Sx8
:
4060 case Iop_QandQRSarNnarrow16Sto8Ux8
:
4062 IRAtom
* (*fnPessim
) (MCEnv
*, IRAtom
*) = NULL
;
4063 IROp opNarrow
= Iop_INVALID
;
4065 case Iop_QandQShrNnarrow64Uto32Ux2
:
4066 case Iop_QandQSarNnarrow64Sto32Sx2
:
4067 case Iop_QandQSarNnarrow64Sto32Ux2
:
4068 case Iop_QandQRShrNnarrow64Uto32Ux2
:
4069 case Iop_QandQRSarNnarrow64Sto32Sx2
:
4070 case Iop_QandQRSarNnarrow64Sto32Ux2
:
4071 fnPessim
= mkPCast64x2
;
4072 opNarrow
= Iop_NarrowUn64to32x2
;
4074 case Iop_QandQShrNnarrow32Uto16Ux4
:
4075 case Iop_QandQSarNnarrow32Sto16Sx4
:
4076 case Iop_QandQSarNnarrow32Sto16Ux4
:
4077 case Iop_QandQRShrNnarrow32Uto16Ux4
:
4078 case Iop_QandQRSarNnarrow32Sto16Sx4
:
4079 case Iop_QandQRSarNnarrow32Sto16Ux4
:
4080 fnPessim
= mkPCast32x4
;
4081 opNarrow
= Iop_NarrowUn32to16x4
;
4083 case Iop_QandQShrNnarrow16Uto8Ux8
:
4084 case Iop_QandQSarNnarrow16Sto8Sx8
:
4085 case Iop_QandQSarNnarrow16Sto8Ux8
:
4086 case Iop_QandQRShrNnarrow16Uto8Ux8
:
4087 case Iop_QandQRSarNnarrow16Sto8Sx8
:
4088 case Iop_QandQRSarNnarrow16Sto8Ux8
:
4089 fnPessim
= mkPCast16x8
;
4090 opNarrow
= Iop_NarrowUn16to8x8
;
4095 complainIfUndefined(mce
, atom2
, NULL
);
4096 // Pessimised shift result
4098 = fnPessim(mce
, vatom1
);
4099 // Narrowed, pessimised shift result
4101 = assignNew('V', mce
, Ity_I64
, unop(opNarrow
, shV
));
4102 // Generates: Def--(63)--Def PCast-to-I1(narrowed)
4103 IRAtom
* qV
= mkPCastXXtoXXlsb(mce
, shVnarrowed
, Ity_I64
);
4104 // and assemble the result
4105 return assignNew('V', mce
, Ity_V128
,
4106 binop(Iop_64HLtoV128
, qV
, shVnarrowed
));
4111 case Iop_QDMull32Sx2
:
4112 return vectorWidenI64(mce
, Iop_Widen32Sto64x2
,
4113 mkUifU64(mce
, vatom1
, vatom2
));
4117 case Iop_QDMull16Sx4
:
4118 return vectorWidenI64(mce
, Iop_Widen16Sto32x4
,
4119 mkUifU64(mce
, vatom1
, vatom2
));
4123 case Iop_PolynomialMull8x8
:
4124 return vectorWidenI64(mce
, Iop_Widen8Sto16x8
,
4125 mkUifU64(mce
, vatom1
, vatom2
));
4128 return mkPCast32x4(mce
,
4129 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast32x4(mce
, vatom1
),
4130 mkPCast32x4(mce
, vatom2
))));
4133 return mkPCast16x8(mce
,
4134 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast16x8(mce
, vatom1
),
4135 mkPCast16x8(mce
, vatom2
))));
4138 return mkPCast8x16(mce
,
4139 assignNew('V', mce
, Ity_V128
, binop(op
, mkPCast8x16(mce
, vatom1
),
4140 mkPCast8x16(mce
, vatom2
))));
4142 /* V128-bit data-steering */
4143 case Iop_SetV128lo32
:
4144 case Iop_SetV128lo64
:
4145 case Iop_64HLtoV128
:
4146 case Iop_InterleaveLO64x2
:
4147 case Iop_InterleaveLO32x4
:
4148 case Iop_InterleaveLO16x8
:
4149 case Iop_InterleaveLO8x16
:
4150 case Iop_InterleaveHI64x2
:
4151 case Iop_InterleaveHI32x4
:
4152 case Iop_InterleaveHI16x8
:
4153 case Iop_InterleaveHI8x16
:
4154 case Iop_CatOddLanes8x16
:
4155 case Iop_CatOddLanes16x8
:
4156 case Iop_CatOddLanes32x4
:
4157 case Iop_CatEvenLanes8x16
:
4158 case Iop_CatEvenLanes16x8
:
4159 case Iop_CatEvenLanes32x4
:
4160 case Iop_InterleaveOddLanes8x16
:
4161 case Iop_InterleaveOddLanes16x8
:
4162 case Iop_InterleaveOddLanes32x4
:
4163 case Iop_InterleaveEvenLanes8x16
:
4164 case Iop_InterleaveEvenLanes16x8
:
4165 case Iop_InterleaveEvenLanes32x4
:
4166 case Iop_PackOddLanes8x16
:
4167 case Iop_PackOddLanes16x8
:
4168 case Iop_PackOddLanes32x4
:
4169 case Iop_PackEvenLanes8x16
:
4170 case Iop_PackEvenLanes16x8
:
4171 case Iop_PackEvenLanes32x4
:
4172 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, vatom2
));
4174 case Iop_GetElem8x16
:
4175 complainIfUndefined(mce
, atom2
, NULL
);
4176 return assignNew('V', mce
, Ity_I8
, binop(op
, vatom1
, atom2
));
4177 case Iop_GetElem16x8
:
4178 complainIfUndefined(mce
, atom2
, NULL
);
4179 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, atom2
));
4180 case Iop_GetElem32x4
:
4181 complainIfUndefined(mce
, atom2
, NULL
);
4182 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, atom2
));
4183 case Iop_GetElem64x2
:
4184 complainIfUndefined(mce
, atom2
, NULL
);
4185 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, atom2
));
4187 /* Perm8x16: rearrange values in left arg using steering values
4188 from right arg. So rearrange the vbits in the same way but
4189 pessimise wrt steering values. Perm32x4 ditto. */
4190 /* PermOrZero8x16: see comments above for PermOrZero8x8. */
4192 case Iop_PermOrZero8x16
:
4195 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
4196 mkPCast8x16(mce
, vatom2
)
4201 assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
)),
4202 mkPCast32x4(mce
, vatom2
)
4205 /* These two take the lower half of each 16-bit lane, sign/zero
4206 extend it to 32, and multiply together, producing a 32x4
4207 result (and implicitly ignoring half the operand bits). So
4208 treat it as a bunch of independent 16x8 operations, but then
4209 do 32-bit shifts left-right to copy the lower half results
4210 (which are all 0s or all 1s due to PCasting in binary16Ix8)
4211 into the upper half of each result lane. */
4212 case Iop_MullEven16Ux8
:
4213 case Iop_MullEven16Sx8
: {
4215 at
= binary16Ix8(mce
,vatom1
,vatom2
);
4216 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN32x4
, at
, mkU8(16)));
4217 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN32x4
, at
, mkU8(16)));
4221 /* Same deal as Iop_MullEven16{S,U}x8 */
4222 case Iop_MullEven8Ux16
:
4223 case Iop_MullEven8Sx16
: {
4225 at
= binary8Ix16(mce
,vatom1
,vatom2
);
4226 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN16x8
, at
, mkU8(8)));
4227 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN16x8
, at
, mkU8(8)));
4231 /* Same deal as Iop_MullEven16{S,U}x8 */
4232 case Iop_MullEven32Ux4
:
4233 case Iop_MullEven32Sx4
: {
4235 at
= binary32Ix4(mce
,vatom1
,vatom2
);
4236 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_ShlN64x2
, at
, mkU8(32)));
4237 at
= assignNew('V', mce
, Ity_V128
, binop(Iop_SarN64x2
, at
, mkU8(32)));
4241 /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
4242 32x4 -> 16x8 laneage, discarding the upper half of each lane.
4243 Simply apply same op to the V bits, since this really no more
4244 than a data steering operation. */
4245 case Iop_NarrowBin32to16x8
:
4246 case Iop_NarrowBin16to8x16
:
4247 case Iop_NarrowBin64to32x4
:
4248 return assignNew('V', mce
, Ity_V128
,
4249 binop(op
, vatom1
, vatom2
));
4254 case Iop_I128StoBCD128
:
4255 /* Same scheme as with all other shifts. Note: 10 Nov 05:
4256 this is wrong now, scalar shifts are done properly lazily.
4257 Vector shifts should be fixed too. */
4258 complainIfUndefined(mce
, atom2
, NULL
);
4259 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
4263 return mkLazy2(mce
, Ity_V128
, vatom1
, vatom2
);
4268 complainIfUndefined(mce
, atom2
, NULL
);
4269 return assignNew('V', mce
, Ity_V128
, binop(op
, vatom1
, atom2
));
4271 /* I128-bit data-steering */
4273 return assignNew('V', mce
, Ity_I128
, binop(op
, vatom1
, vatom2
));
4279 return binary64Fx4(mce
, vatom1
, vatom2
);
4283 return binary32Fx8(mce
, vatom1
, vatom2
);
4285 /* V256-bit data-steering */
4286 case Iop_V128HLtoV256
:
4287 return assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, vatom2
));
4289 /* Scalar floating point */
4293 /* I32(rm) x F32 -> I64 */
4294 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4297 /* I32(rm) x I64 -> F32 */
4298 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4300 case Iop_RoundF64toInt
:
4301 case Iop_RoundF64toF32
:
4311 case Iop_RecpExpF64
:
4312 /* I32(rm) x I64/F64 -> I64/F64 */
4313 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4317 case Iop_RoundD64toInt
:
4318 /* I32(rm) x D64 -> D64 */
4319 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4323 case Iop_RoundD128toInt
:
4324 /* I32(rm) x D128 -> D128 */
4325 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4327 case Iop_RoundF128toInt
:
4328 /* I32(rm) x F128 -> F128 */
4329 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4335 /* I32(rm) x I64/D64 -> D64/I64 */
4336 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4344 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
4345 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4353 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
4354 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4358 case Iop_F128toD128
:
4361 case Iop_D128toF128
:
4362 /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
4363 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4365 case Iop_RoundF32toInt
:
4367 case Iop_RecpExpF32
:
4368 /* I32(rm) x I32/F32 -> I32/F32 */
4369 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4372 /* I32(rm) x F128 -> F128 */
4373 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4379 /* First arg is I32 (rounding mode), second is F32/I32 (data). */
4380 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4384 /* First arg is I32 (rounding mode), second is F64/F32 (data). */
4385 return mkLazy2(mce
, Ity_I16
, vatom1
, vatom2
);
4387 case Iop_F128toI32S
: /* IRRoundingMode(I32) x F128 -> signed I32 */
4388 case Iop_F128toI32U
: /* IRRoundingMode(I32) x F128 -> unsigned I32 */
4389 case Iop_F128toF32
: /* IRRoundingMode(I32) x F128 -> F32 */
4390 case Iop_D128toI32S
: /* IRRoundingMode(I32) x D128 -> signed I32 */
4391 case Iop_D128toI32U
: /* IRRoundingMode(I32) x D128 -> unsigned I32 */
4392 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4394 case Iop_F128toI128S
: /* IRRoundingMode(I32) x F128 -> signed I128 */
4395 case Iop_RndF128
: /* IRRoundingMode(I32) x F128 -> F128 */
4396 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4398 case Iop_F128toI64S
: /* IRRoundingMode(I32) x F128 -> signed I64 */
4399 case Iop_F128toI64U
: /* IRRoundingMode(I32) x F128 -> unsigned I64 */
4400 case Iop_F128toF64
: /* IRRoundingMode(I32) x F128 -> F64 */
4401 case Iop_D128toD64
: /* IRRoundingMode(I64) x D128 -> D64 */
4402 case Iop_D128toI64S
: /* IRRoundingMode(I64) x D128 -> signed I64 */
4403 case Iop_D128toI64U
: /* IRRoundingMode(I32) x D128 -> unsigned I64 */
4404 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4406 case Iop_F64HLtoF128
:
4407 case Iop_D64HLtoD128
:
4408 return assignNew('V', mce
, Ity_I128
,
4409 binop(Iop_64HLto128
, vatom1
, vatom2
));
4417 /* First arg is I32 (rounding mode), second is F64/D64 (data). */
4418 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4421 /* First arg is I32 (rounding mode), second is D64 (data). */
4422 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4425 /* First arg is I32 (rounding mode), second is F64 (data). */
4426 return mkLazy2(mce
, Ity_I16
, vatom1
, vatom2
);
4428 case Iop_InsertExpD64
:
4429 /* I64 x I64 -> D64 */
4430 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4432 case Iop_InsertExpD128
:
4433 /* I64 x I128 -> D128 */
4434 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4442 case Iop_CmpExpD128
:
4443 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4447 /* F32 x F32 -> F32 */
4448 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4452 /* F64 x F64 -> F64 */
4453 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4455 /* non-FP after here */
4457 case Iop_DivModU64to32
:
4458 case Iop_DivModS64to32
:
4459 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4461 case Iop_DivModU128to64
:
4462 case Iop_DivModS128to64
:
4463 return mkLazy2(mce
, Ity_I128
, vatom1
, vatom2
);
4466 return assignNew('V', mce
, Ity_I16
, binop(op
, vatom1
, vatom2
));
4468 return assignNew('V', mce
, Ity_I32
, binop(op
, vatom1
, vatom2
));
4470 return assignNew('V', mce
, Ity_I64
, binop(op
, vatom1
, vatom2
));
4472 case Iop_DivModU64to64
:
4473 case Iop_DivModS64to64
: {
4474 IRAtom
* vTmp64
= mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4475 return assignNew('V', mce
, Ity_I128
,
4476 binop(Iop_64HLto128
, vTmp64
, vTmp64
));
4481 IRAtom
* vLo64
= mkLeft64(mce
, mkUifU64(mce
, vatom1
,vatom2
));
4482 IRAtom
* vHi64
= mkPCastTo(mce
, Ity_I64
, vLo64
);
4483 return assignNew('V', mce
, Ity_I128
,
4484 binop(Iop_64HLto128
, vHi64
, vLo64
));
4487 case Iop_DivModU32to32
:
4488 case Iop_DivModS32to32
: {
4489 IRAtom
* vTmp32
= mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4490 return assignNew('V', mce
, Ity_I64
,
4491 binop(Iop_32HLto64
, vTmp32
, vTmp32
));
4496 IRAtom
* vLo32
= mkLeft32(mce
, mkUifU32(mce
, vatom1
,vatom2
));
4497 IRAtom
* vHi32
= mkPCastTo(mce
, Ity_I32
, vLo32
);
4498 return assignNew('V', mce
, Ity_I64
,
4499 binop(Iop_32HLto64
, vHi32
, vLo32
));
4504 IRAtom
* vLo16
= mkLeft16(mce
, mkUifU16(mce
, vatom1
,vatom2
));
4505 IRAtom
* vHi16
= mkPCastTo(mce
, Ity_I16
, vLo16
);
4506 return assignNew('V', mce
, Ity_I32
,
4507 binop(Iop_16HLto32
, vHi16
, vLo16
));
4512 IRAtom
* vLo8
= mkLeft8(mce
, mkUifU8(mce
, vatom1
,vatom2
));
4513 IRAtom
* vHi8
= mkPCastTo(mce
, Ity_I8
, vLo8
);
4514 return assignNew('V', mce
, Ity_I16
, binop(Iop_8HLto16
, vHi8
, vLo8
));
4517 case Iop_Sad8Ux4
: /* maybe we could do better? ftm, do mkLazy2. */
4522 case Iop_QAdd32S
: /* could probably do better */
4523 case Iop_QSub32S
: /* could probably do better */
4524 return mkLazy2(mce
, Ity_I32
, vatom1
, vatom2
);
4530 return mkLazy2(mce
, Ity_I64
, vatom1
, vatom2
);
4533 if (mce
->dlbo
.dl_Add32
== DLexpensive
4534 || (mce
->dlbo
.dl_Add32
== DLauto
&& hu
== HuOth
)) {
4535 return expensiveAddSub(mce
,True
,Ity_I32
,
4536 vatom1
,vatom2
, atom1
,atom2
);
4538 goto cheap_AddSub32
;
4541 if (mce
->dlbo
.dl_Sub32
== DLexpensive
4542 || (mce
->dlbo
.dl_Sub32
== DLauto
&& hu
== HuOth
)) {
4543 return expensiveAddSub(mce
,False
,Ity_I32
,
4544 vatom1
,vatom2
, atom1
,atom2
);
4546 goto cheap_AddSub32
;
4551 return mkLeft32(mce
, mkUifU32(mce
, vatom1
,vatom2
));
4557 return doCmpORD(mce
, op
, vatom1
,vatom2
, atom1
,atom2
);
4560 if (mce
->dlbo
.dl_Add64
== DLexpensive
4561 || (mce
->dlbo
.dl_Add64
== DLauto
&& hu
== HuOth
)) {
4562 return expensiveAddSub(mce
,True
,Ity_I64
,
4563 vatom1
,vatom2
, atom1
,atom2
);
4565 goto cheap_AddSub64
;
4568 if (mce
->dlbo
.dl_Sub64
== DLexpensive
4569 || (mce
->dlbo
.dl_Sub64
== DLauto
&& hu
== HuOth
)) {
4570 return expensiveAddSub(mce
,False
,Ity_I64
,
4571 vatom1
,vatom2
, atom1
,atom2
);
4573 goto cheap_AddSub64
;
4578 return mkLeft64(mce
, mkUifU64(mce
, vatom1
,vatom2
));
4583 return mkLeft16(mce
, mkUifU16(mce
, vatom1
,vatom2
));
4588 return mkLeft8(mce
, mkUifU8(mce
, vatom1
,vatom2
));
4591 case Iop_CmpEQ64
: case Iop_CmpNE64
:
4592 if (mce
->dlbo
.dl_CmpEQ64_CmpNE64
== DLexpensive
)
4593 goto expensive_cmp64
;
4598 case Iop_ExpCmpNE64
:
4599 return expensiveCmpEQorNE(mce
,Ity_I64
, vatom1
,vatom2
, atom1
,atom2
);
4602 case Iop_CmpLE64S
: case Iop_CmpLE64U
:
4603 case Iop_CmpLT64U
: case Iop_CmpLT64S
:
4604 return mkPCastTo(mce
, Ity_I1
, mkUifU64(mce
, vatom1
,vatom2
));
4607 case Iop_CmpEQ32
: case Iop_CmpNE32
:
4608 if (mce
->dlbo
.dl_CmpEQ32_CmpNE32
== DLexpensive
)
4609 goto expensive_cmp32
;
4614 case Iop_ExpCmpNE32
:
4615 return expensiveCmpEQorNE(mce
,Ity_I32
, vatom1
,vatom2
, atom1
,atom2
);
4618 case Iop_CmpLE32S
: case Iop_CmpLE32U
:
4619 case Iop_CmpLT32U
: case Iop_CmpLT32S
:
4620 return mkPCastTo(mce
, Ity_I1
, mkUifU32(mce
, vatom1
,vatom2
));
4623 case Iop_CmpEQ16
: case Iop_CmpNE16
:
4624 if (mce
->dlbo
.dl_CmpEQ16_CmpNE16
== DLexpensive
)
4625 goto expensive_cmp16
;
4630 case Iop_ExpCmpNE16
:
4631 return expensiveCmpEQorNE(mce
,Ity_I16
, vatom1
,vatom2
, atom1
,atom2
);
4634 return mkPCastTo(mce
, Ity_I1
, mkUifU16(mce
, vatom1
,vatom2
));
4637 case Iop_CmpEQ8
: case Iop_CmpNE8
:
4638 if (mce
->dlbo
.dl_CmpEQ8_CmpNE8
== DLexpensive
)
4639 goto expensive_cmp8
;
4644 return expensiveCmpEQorNE(mce
,Ity_I8
, vatom1
,vatom2
, atom1
,atom2
);
4647 return mkPCastTo(mce
, Ity_I1
, mkUifU8(mce
, vatom1
,vatom2
));
4649 ////---- end CmpXX{64,32,16,8}
4651 case Iop_CasCmpEQ8
: case Iop_CasCmpNE8
:
4652 case Iop_CasCmpEQ16
: case Iop_CasCmpNE16
:
4653 case Iop_CasCmpEQ32
: case Iop_CasCmpNE32
:
4654 case Iop_CasCmpEQ64
: case Iop_CasCmpNE64
:
4655 /* Just say these all produce a defined result, regardless
4656 of their arguments. See COMMENT_ON_CasCmpEQ in this file. */
4657 return assignNew('V', mce
, Ity_I1
, definedOfType(Ity_I1
));
4659 case Iop_Shl64
: case Iop_Shr64
: case Iop_Sar64
:
4660 return scalarShift( mce
, Ity_I64
, op
, vatom1
,vatom2
, atom1
,atom2
);
4662 case Iop_Shl32
: case Iop_Shr32
: case Iop_Sar32
:
4663 return scalarShift( mce
, Ity_I32
, op
, vatom1
,vatom2
, atom1
,atom2
);
4665 case Iop_Shl16
: case Iop_Shr16
: case Iop_Sar16
:
4666 return scalarShift( mce
, Ity_I16
, op
, vatom1
,vatom2
, atom1
,atom2
);
4668 case Iop_Shl8
: case Iop_Shr8
: case Iop_Sar8
:
4669 return scalarShift( mce
, Ity_I8
, op
, vatom1
,vatom2
, atom1
,atom2
);
4672 uifu
= mkUifUV256
; difd
= mkDifDV256
;
4673 and_or_ty
= Ity_V256
; improve
= mkImproveANDV256
; goto do_And_Or
;
4675 uifu
= mkUifUV128
; difd
= mkDifDV128
;
4676 and_or_ty
= Ity_V128
; improve
= mkImproveANDV128
; goto do_And_Or
;
4678 uifu
= mkUifU64
; difd
= mkDifD64
;
4679 and_or_ty
= Ity_I64
; improve
= mkImproveAND64
; goto do_And_Or
;
4681 uifu
= mkUifU32
; difd
= mkDifD32
;
4682 and_or_ty
= Ity_I32
; improve
= mkImproveAND32
; goto do_And_Or
;
4684 uifu
= mkUifU16
; difd
= mkDifD16
;
4685 and_or_ty
= Ity_I16
; improve
= mkImproveAND16
; goto do_And_Or
;
4687 uifu
= mkUifU8
; difd
= mkDifD8
;
4688 and_or_ty
= Ity_I8
; improve
= mkImproveAND8
; goto do_And_Or
;
4690 uifu
= mkUifU1
; difd
= mkDifD1
;
4691 and_or_ty
= Ity_I1
; improve
= mkImproveAND1
; goto do_And_Or
;
4694 uifu
= mkUifUV256
; difd
= mkDifDV256
;
4695 and_or_ty
= Ity_V256
; improve
= mkImproveORV256
; goto do_And_Or
;
4697 uifu
= mkUifUV128
; difd
= mkDifDV128
;
4698 and_or_ty
= Ity_V128
; improve
= mkImproveORV128
; goto do_And_Or
;
4700 uifu
= mkUifU64
; difd
= mkDifD64
;
4701 and_or_ty
= Ity_I64
; improve
= mkImproveOR64
; goto do_And_Or
;
4703 uifu
= mkUifU32
; difd
= mkDifD32
;
4704 and_or_ty
= Ity_I32
; improve
= mkImproveOR32
; goto do_And_Or
;
4706 uifu
= mkUifU16
; difd
= mkDifD16
;
4707 and_or_ty
= Ity_I16
; improve
= mkImproveOR16
; goto do_And_Or
;
4709 uifu
= mkUifU8
; difd
= mkDifD8
;
4710 and_or_ty
= Ity_I8
; improve
= mkImproveOR8
; goto do_And_Or
;
4712 uifu
= mkUifU1
; difd
= mkDifD1
;
4713 and_or_ty
= Ity_I1
; improve
= mkImproveOR1
; goto do_And_Or
;
4720 difd(mce
, uifu(mce
, vatom1
, vatom2
),
4721 difd(mce
, improve(mce
, atom1
, vatom1
),
4722 improve(mce
, atom2
, vatom2
) ) ) );
4725 return mkUifU8(mce
, vatom1
, vatom2
);
4727 return mkUifU16(mce
, vatom1
, vatom2
);
4729 return mkUifU32(mce
, vatom1
, vatom2
);
4731 return mkUifU64(mce
, vatom1
, vatom2
);
4733 return mkUifUV128(mce
, vatom1
, vatom2
);
4735 return mkUifUV256(mce
, vatom1
, vatom2
);
4747 /* Same scheme as with all other shifts. Note: 22 Oct 05:
4748 this is wrong now, scalar shifts are done properly lazily.
4749 Vector shifts should be fixed too. */
4750 complainIfUndefined(mce
, atom2
, NULL
);
4751 return assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, atom2
));
4760 case Iop_CmpGT8Sx32
:
4766 return binary8Ix32(mce
, vatom1
, vatom2
);
4768 case Iop_QSub16Ux16
:
4769 case Iop_QSub16Sx16
:
4772 case Iop_MulHi16Sx16
:
4773 case Iop_MulHi16Ux16
:
4778 case Iop_CmpGT16Sx16
:
4779 case Iop_CmpEQ16x16
:
4781 case Iop_QAdd16Ux16
:
4782 case Iop_QAdd16Sx16
:
4784 return binary16Ix16(mce
, vatom1
, vatom2
);
4787 case Iop_CmpGT32Sx8
:
4795 return binary32Ix8(mce
, vatom1
, vatom2
);
4800 case Iop_CmpGT64Sx4
:
4801 return binary64Ix4(mce
, vatom1
, vatom2
);
4803 case Iop_I32StoF32x8
:
4804 case Iop_F32toI32Sx8
:
4805 return unary32Fx8_w_rm(mce
, vatom1
, vatom2
);
4807 /* Perm32x8: rearrange values in left arg using steering values
4808 from right arg. So rearrange the vbits in the same way but
4809 pessimise wrt steering values. */
4813 assignNew('V', mce
, Ity_V256
, binop(op
, vatom1
, atom2
)),
4814 mkPCast32x8(mce
, vatom2
)
4817 /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
4818 Handle the shifted results in the same way that other
4819 binary Q ops are handled, eg QSub: UifU the two args,
4820 then pessimise -- which is binaryNIxM. But for the upper
4821 V128, we require to generate just 1 bit which is the
4822 pessimised shift result, with 127 defined zeroes above it.
4824 Note that this overly pessimistic in that in fact only the
4825 bottom 8 bits of each lane of the second arg determine the shift
4826 amount. Really we ought to ignore any undefinedness in the
4827 rest of the lanes of the second arg. */
4828 case Iop_QandSQsh64x2
: case Iop_QandUQsh64x2
:
4829 case Iop_QandSQRsh64x2
: case Iop_QandUQRsh64x2
:
4830 case Iop_QandSQsh32x4
: case Iop_QandUQsh32x4
:
4831 case Iop_QandSQRsh32x4
: case Iop_QandUQRsh32x4
:
4832 case Iop_QandSQsh16x8
: case Iop_QandUQsh16x8
:
4833 case Iop_QandSQRsh16x8
: case Iop_QandUQRsh16x8
:
4834 case Iop_QandSQsh8x16
: case Iop_QandUQsh8x16
:
4835 case Iop_QandSQRsh8x16
: case Iop_QandUQRsh8x16
:
4837 // The function to generate the pessimised shift result
4838 IRAtom
* (*binaryNIxM
)(MCEnv
*,IRAtom
*,IRAtom
*) = NULL
;
4840 case Iop_QandSQsh64x2
:
4841 case Iop_QandUQsh64x2
:
4842 case Iop_QandSQRsh64x2
:
4843 case Iop_QandUQRsh64x2
:
4844 binaryNIxM
= binary64Ix2
;
4846 case Iop_QandSQsh32x4
:
4847 case Iop_QandUQsh32x4
:
4848 case Iop_QandSQRsh32x4
:
4849 case Iop_QandUQRsh32x4
:
4850 binaryNIxM
= binary32Ix4
;
4852 case Iop_QandSQsh16x8
:
4853 case Iop_QandUQsh16x8
:
4854 case Iop_QandSQRsh16x8
:
4855 case Iop_QandUQRsh16x8
:
4856 binaryNIxM
= binary16Ix8
;
4858 case Iop_QandSQsh8x16
:
4859 case Iop_QandUQsh8x16
:
4860 case Iop_QandSQRsh8x16
:
4861 case Iop_QandUQRsh8x16
:
4862 binaryNIxM
= binary8Ix16
;
4867 tl_assert(binaryNIxM
);
4868 // Pessimised shift result, shV[127:0]
4869 IRAtom
* shV
= binaryNIxM(mce
, vatom1
, vatom2
);
4870 // Generates: Def--(127)--Def PCast-to-I1(shV)
4871 IRAtom
* qV
= mkPCastXXtoXXlsb(mce
, shV
, Ity_V128
);
4872 // and assemble the result
4873 return assignNew('V', mce
, Ity_V256
,
4874 binop(Iop_V128HLtoV256
, qV
, shV
));
4877 case Iop_F32toF16x4
: {
4878 // First, PCast the input vector, retaining the 32x4 format.
4879 IRAtom
* pcasted
= mkPCast32x4(mce
, vatom2
); // :: 32x4
4880 // Now truncate each 32 bit lane to 16 bits. Since we already PCasted
4881 // the input, we're not going to lose any information.
4883 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, pcasted
));//32x2
4885 = assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, pcasted
)); // 32x2
4887 = assignNew('V', mce
, Ity_I64
, binop(Iop_NarrowBin32to16x4
,
4888 pcHI64
, pcLO64
)); // 16x4
4889 // Finally, roll in any badness from the rounding mode.
4890 IRAtom
* rmPCasted
= mkPCastTo(mce
, Ity_I64
, vatom1
);
4891 return mkUifU64(mce
, narrowed
, rmPCasted
);
4894 case Iop_F32toF16x8
: {
4895 // Same scheme as for Iop_F32toF16x4.
4896 IRAtom
* pcasted
= mkPCast32x8(mce
, vatom2
); // :: 32x8
4898 = assignNew('V', mce
, Ity_V128
, unop(Iop_V256toV128_1
,
4901 = assignNew('V', mce
, Ity_V128
, unop(Iop_V256toV128_0
,
4904 = assignNew('V', mce
, Ity_V128
, binop(Iop_NarrowBin32to16x8
,
4905 pcHI128
, pcLO128
)); // 16x8
4906 // Finally, roll in any badness from the rounding mode.
4907 IRAtom
* rmPCasted
= mkPCastTo(mce
, Ity_V128
, vatom1
);
4908 return mkUifUV128(mce
, narrowed
, rmPCasted
);
4913 VG_(tool_panic
)("memcheck:expr2vbits_Binop");
4919 IRExpr
* expr2vbits_Unop ( MCEnv
* mce
, IROp op
, IRAtom
* atom
)
4921 /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
4922 selection of shadow operation implicitly duplicates the logic in
4923 do_shadow_LoadG and should be kept in sync (in the very unlikely
4924 event that the interpretation of such widening ops changes in
4925 future). See comment in do_shadow_LoadG. */
4926 IRAtom
* vatom
= expr2vbits( mce
, atom
, HuOth
);
4927 tl_assert(isOriginalAtom(mce
,atom
));
4932 case Iop_RSqrtEst64Fx2
:
4933 case Iop_RecipEst64Fx2
:
4934 case Iop_Log2_64Fx2
:
4935 return unary64Fx2(mce
, vatom
);
4937 case Iop_Sqrt64F0x2
:
4938 return unary64F0x2(mce
, vatom
);
4941 case Iop_RSqrtEst32Fx8
:
4942 case Iop_RecipEst32Fx8
:
4943 return unary32Fx8(mce
, vatom
);
4946 return unary64Fx4(mce
, vatom
);
4948 case Iop_RecipEst32Fx4
:
4949 case Iop_I32UtoF32x4_DEP
:
4950 case Iop_I32StoF32x4_DEP
:
4951 case Iop_QF32toI32Ux4_RZ
:
4952 case Iop_QF32toI32Sx4_RZ
:
4953 case Iop_RoundF32x4_RM
:
4954 case Iop_RoundF32x4_RP
:
4955 case Iop_RoundF32x4_RN
:
4956 case Iop_RoundF32x4_RZ
:
4957 case Iop_RecipEst32Ux4
:
4960 case Iop_RSqrtEst32Fx4
:
4961 case Iop_Log2_32Fx4
:
4962 case Iop_Exp2_32Fx4
:
4963 return unary32Fx4(mce
, vatom
);
4965 case Iop_I32UtoF32x2_DEP
:
4966 case Iop_I32StoF32x2_DEP
:
4967 case Iop_RecipEst32Fx2
:
4968 case Iop_RecipEst32Ux2
:
4971 case Iop_RSqrtEst32Fx2
:
4972 return unary32Fx2(mce
, vatom
);
4974 case Iop_Sqrt32F0x4
:
4975 case Iop_RSqrtEst32F0x4
:
4976 case Iop_RecipEst32F0x4
:
4977 return unary32F0x4(mce
, vatom
);
4979 // These are self-shadowing.
4985 case Iop_Reverse1sIn8_x16
:
4986 case Iop_Reverse8sIn16_x8
:
4987 case Iop_Reverse8sIn32_x4
:
4988 case Iop_Reverse16sIn32_x4
:
4989 case Iop_Reverse8sIn64_x2
:
4990 case Iop_Reverse16sIn64_x2
:
4991 case Iop_Reverse32sIn64_x2
:
4992 case Iop_V256toV128_1
: case Iop_V256toV128_0
:
4993 case Iop_ZeroHI64ofV128
:
4994 case Iop_ZeroHI96ofV128
:
4995 case Iop_ZeroHI112ofV128
:
4996 case Iop_ZeroHI120ofV128
:
4997 return assignNew('V', mce
, Ity_V128
, unop(op
, vatom
));
4999 case Iop_F128HItoF64
: /* F128 -> high half of F128 */
5000 case Iop_D128HItoD64
: /* D128 -> high half of D128 */
5001 return assignNew('V', mce
, Ity_I64
, unop(Iop_128HIto64
, vatom
));
5002 case Iop_F128LOtoF64
: /* F128 -> low half of F128 */
5003 case Iop_D128LOtoD64
: /* D128 -> low half of D128 */
5004 return assignNew('V', mce
, Ity_I64
, unop(Iop_128to64
, vatom
));
5009 case Iop_TruncF128toI64S
: /* F128 -> I64S */
5010 case Iop_TruncF128toI32S
: /* F128 -> I32S (result stored in 64-bits) */
5011 case Iop_TruncF128toI64U
: /* F128 -> I64U */
5012 case Iop_TruncF128toI32U
: /* F128 -> I32U (result stored in 64-bits) */
5013 return mkPCastTo(mce
, Ity_I128
, vatom
);
5015 case Iop_BCD128toI128S
:
5016 case Iop_MulI128by10
:
5017 case Iop_MulI128by10Carry
:
5018 case Iop_F16toF64x2
:
5019 case Iop_F64toF16x2_DEP
:
5020 // FIXME JRS 2018-Nov-15. This is surely not correct!
5023 case Iop_I32StoF128
: /* signed I32 -> F128 */
5024 case Iop_I64StoF128
: /* signed I64 -> F128 */
5025 case Iop_I32UtoF128
: /* unsigned I32 -> F128 */
5026 case Iop_I64UtoF128
: /* unsigned I64 -> F128 */
5027 case Iop_F32toF128
: /* F32 -> F128 */
5028 case Iop_F64toF128
: /* F64 -> F128 */
5029 case Iop_I32StoD128
: /* signed I64 -> D128 */
5030 case Iop_I64StoD128
: /* signed I64 -> D128 */
5031 case Iop_I32UtoD128
: /* unsigned I32 -> D128 */
5032 case Iop_I64UtoD128
: /* unsigned I64 -> D128 */
5033 return mkPCastTo(mce
, Ity_I128
, vatom
);
5041 case Iop_RSqrtEst5GoodF64
:
5042 case Iop_RoundF64toF64_NEAREST
:
5043 case Iop_RoundF64toF64_NegINF
:
5044 case Iop_RoundF64toF64_PosINF
:
5045 case Iop_RoundF64toF64_ZERO
:
5049 case Iop_ExtractExpD64
: /* D64 -> I64 */
5050 case Iop_ExtractExpD128
: /* D128 -> I64 */
5051 case Iop_ExtractSigD64
: /* D64 -> I64 */
5052 case Iop_ExtractSigD128
: /* D128 -> I64 */
5055 return mkPCastTo(mce
, Ity_I64
, vatom
);
5058 return mkPCastTo(mce
, Ity_I128
, vatom
);
5060 case Iop_TruncF64asF32
:
5064 return mkPCastTo(mce
, Ity_I32
, vatom
);
5066 case Iop_Ctz32
: case Iop_CtzNat32
:
5067 case Iop_Ctz64
: case Iop_CtzNat64
:
5068 return expensiveCountTrailingZeroes(mce
, op
, atom
, vatom
);
5070 case Iop_Clz32
: case Iop_ClzNat32
:
5071 case Iop_Clz64
: case Iop_ClzNat64
:
5072 return expensiveCountLeadingZeroes(mce
, op
, atom
, vatom
);
5074 // PopCount32: this is slightly pessimistic. It is true that the
5075 // result depends on all input bits, so that aspect of the PCast is
5076 // correct. However, regardless of the input, only the lowest 5 bits
5077 // out of the output can ever be undefined. So we could actually
5078 // "improve" the results here by marking the top 27 bits of output as
5079 // defined. A similar comment applies for PopCount64.
5080 case Iop_PopCount32
:
5081 return mkPCastTo(mce
, Ity_I32
, vatom
);
5082 case Iop_PopCount64
:
5083 return mkPCastTo(mce
, Ity_I64
, vatom
);
5085 // These are self-shadowing.
5095 case Iop_V128HIto64
:
5101 case Iop_Reverse8sIn16_x4
:
5102 case Iop_Reverse8sIn32_x2
:
5103 case Iop_Reverse16sIn32_x2
:
5104 case Iop_Reverse8sIn64_x1
:
5105 case Iop_Reverse16sIn64_x1
:
5106 case Iop_Reverse32sIn64_x1
:
5107 case Iop_V256to64_0
: case Iop_V256to64_1
:
5108 case Iop_V256to64_2
: case Iop_V256to64_3
:
5109 return assignNew('V', mce
, Ity_I64
, unop(op
, vatom
));
5111 // These are self-shadowing.
5121 case Iop_Reverse8sIn32_x1
:
5122 return assignNew('V', mce
, Ity_I32
, unop(op
, vatom
));
5124 // These are self-shadowing.
5130 case Iop_GetMSBs8x16
:
5131 return assignNew('V', mce
, Ity_I16
, unop(op
, vatom
));
5133 // These are self-shadowing.
5140 case Iop_GetMSBs8x8
:
5141 return assignNew('V', mce
, Ity_I8
, unop(op
, vatom
));
5144 return assignNew('V', mce
, Ity_I1
, unop(Iop_32to1
, vatom
));
5147 return assignNew('V', mce
, Ity_I1
, unop(Iop_64to1
, vatom
));
5149 case Iop_ReinterpF64asI64
:
5150 case Iop_ReinterpI64asF64
:
5151 case Iop_ReinterpI32asF32
:
5152 case Iop_ReinterpF32asI32
:
5153 case Iop_ReinterpI64asD64
:
5154 case Iop_ReinterpD64asI64
:
5162 // FIXME JRS 2018-Nov-15. This is surely not correct!
5170 return mkPCast8x8(mce
, vatom
);
5172 case Iop_CmpNEZ8x16
:
5178 return mkPCast8x16(mce
, vatom
);
5180 case Iop_CmpNEZ16x4
:
5184 return mkPCast16x4(mce
, vatom
);
5186 case Iop_CmpNEZ16x8
:
5191 return mkPCast16x8(mce
, vatom
);
5193 case Iop_CmpNEZ32x2
:
5196 case Iop_F32toI32Ux2_RZ
:
5197 case Iop_F32toI32Sx2_RZ
:
5199 return mkPCast32x2(mce
, vatom
);
5201 case Iop_CmpNEZ32x4
:
5204 case Iop_F32toI32Ux4_RZ
:
5205 case Iop_F32toI32Sx4_RZ
:
5207 case Iop_RSqrtEst32Ux4
:
5209 return mkPCast32x4(mce
, vatom
);
5212 return mkPCastTo(mce
, Ity_I32
, vatom
);
5215 return mkPCastTo(mce
, Ity_I64
, vatom
);
5217 case Iop_CmpNEZ64x2
:
5218 case Iop_CipherSV128
:
5222 return mkPCast64x2(mce
, vatom
);
5224 // This is self-shadowing.
5225 case Iop_PwBitMtxXpose64x2
:
5226 return assignNew('V', mce
, Ity_V128
, unop(op
, vatom
));
5228 case Iop_NarrowUn16to8x8
:
5229 case Iop_NarrowUn32to16x4
:
5230 case Iop_NarrowUn64to32x2
:
5231 case Iop_QNarrowUn16Sto8Sx8
:
5232 case Iop_QNarrowUn16Sto8Ux8
:
5233 case Iop_QNarrowUn16Uto8Ux8
:
5234 case Iop_QNarrowUn32Sto16Sx4
:
5235 case Iop_QNarrowUn32Sto16Ux4
:
5236 case Iop_QNarrowUn32Uto16Ux4
:
5237 case Iop_QNarrowUn64Sto32Sx2
:
5238 case Iop_QNarrowUn64Sto32Ux2
:
5239 case Iop_QNarrowUn64Uto32Ux2
:
5240 return vectorNarrowUnV128(mce
, op
, vatom
);
5242 // JRS FIXME 2019 Mar 17: per comments on F16toF32x4, this is probably not
5244 case Iop_F32toF16x4_DEP
:
5245 return vectorNarrowUnV128(mce
, op
, vatom
);
5247 case Iop_Widen8Sto16x8
:
5248 case Iop_Widen8Uto16x8
:
5249 case Iop_Widen16Sto32x4
:
5250 case Iop_Widen16Uto32x4
:
5251 case Iop_Widen32Sto64x2
:
5252 case Iop_Widen32Uto64x2
:
5253 return vectorWidenI64(mce
, op
, vatom
);
5255 case Iop_F16toF32x4
:
5256 // JRS 2019 Mar 17: this definitely isn't right, but it probably works
5257 // OK by accident if -- as seems likely -- the F16 to F32 conversion
5258 // preserves will generate an output 32 bits with at least one 1 bit
5259 // set if there's one or more 1 bits set in the input 16 bits. More
5260 // correct code for this is just below, but commented out, so as to
5261 // avoid short-term backend failures on targets that can't do
5262 // Iop_Interleave{LO,HI}16x4.
5263 return vectorWidenI64(mce
, op
, vatom
);
5265 case Iop_F16toF32x8
: {
5266 // PCast the input at 16x8. This makes each lane hold either all
5267 // zeroes or all ones.
5268 IRAtom
* pcasted
= mkPCast16x8(mce
, vatom
); // :: I16x8
5269 // Now double the width of each lane to 32 bits. Because the lanes are
5270 // all zeroes or all ones, we can just copy the each lane twice into
5271 // the result. Here's the low half:
5272 IRAtom
* widenedLO
// :: I32x4
5273 = assignNew('V', mce
, Ity_V128
, binop(Iop_InterleaveLO16x8
,
5275 // And the high half:
5276 IRAtom
* widenedHI
// :: I32x4
5277 = assignNew('V', mce
, Ity_V128
, binop(Iop_InterleaveHI16x8
,
5279 // Glue them back together:
5280 return assignNew('V', mce
, Ity_V256
, binop(Iop_V128HLtoV256
,
5281 widenedHI
, widenedLO
));
5284 // See comment just above, for Iop_F16toF32x4
5285 //case Iop_F16toF32x4: {
5286 // // Same scheme as F16toF32x4
5287 // IRAtom* pcasted = mkPCast16x4(mce, vatom); // :: I16x4
5288 // IRAtom* widenedLO // :: I32x2
5289 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveLO16x4,
5290 // pcasted, pcasted));
5291 // IRAtom* widenedHI // :: I32x4
5292 // = assignNew('V', mce, Ity_I64, binop(Iop_InterleaveHI16x4,
5293 // pcasted, pcasted));
5294 // // Glue them back together:
5295 // return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
5296 // widenedHI, widenedLO));
5299 case Iop_PwAddL32Ux2
:
5300 case Iop_PwAddL32Sx2
:
5301 return mkPCastTo(mce
, Ity_I64
,
5302 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast32x2(mce
, vatom
))));
5304 case Iop_PwAddL16Ux4
:
5305 case Iop_PwAddL16Sx4
:
5306 return mkPCast32x2(mce
,
5307 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast16x4(mce
, vatom
))));
5309 case Iop_PwAddL8Ux8
:
5310 case Iop_PwAddL8Sx8
:
5311 return mkPCast16x4(mce
,
5312 assignNew('V', mce
, Ity_I64
, unop(op
, mkPCast8x8(mce
, vatom
))));
5314 case Iop_PwAddL32Ux4
:
5315 case Iop_PwAddL32Sx4
:
5316 return mkPCast64x2(mce
,
5317 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast32x4(mce
, vatom
))));
5319 case Iop_PwAddL64Ux2
:
5320 return mkPCast128x1(mce
,
5321 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast64x2(mce
, vatom
))));
5323 case Iop_PwAddL16Ux8
:
5324 case Iop_PwAddL16Sx8
:
5325 return mkPCast32x4(mce
,
5326 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast16x8(mce
, vatom
))));
5328 case Iop_PwAddL8Ux16
:
5329 case Iop_PwAddL8Sx16
:
5330 return mkPCast16x8(mce
,
5331 assignNew('V', mce
, Ity_V128
, unop(op
, mkPCast8x16(mce
, vatom
))));
5336 VG_(tool_panic
)("memcheck:expr2vbits_Unop");
5341 /* Worker function -- do not call directly. See comments on
5342 expr2vbits_Load for the meaning of |guard|.
5344 Generates IR to (1) perform a definedness test of |addr|, (2)
5345 perform a validity test of |addr|, and (3) return the Vbits for the
5346 location indicated by |addr|. All of this only happens when
5347 |guard| is NULL or |guard| evaluates to True at run time.
5349 If |guard| evaluates to False at run time, the returned value is
5350 the IR-mandated 0x55..55 value, and no checks nor shadow loads are
5353 The definedness of |guard| itself is not checked. That is assumed
5354 to have been done before this point, by the caller. */
5356 IRAtom
* expr2vbits_Load_WRK ( MCEnv
* mce
,
5357 IREndness end
, IRType ty
,
5358 IRAtom
* addr
, UInt bias
, IRAtom
* guard
)
5360 tl_assert(isOriginalAtom(mce
,addr
));
5361 tl_assert(end
== Iend_LE
|| end
== Iend_BE
);
5363 /* First, emit a definedness test for the address. This also sets
5364 the address (shadow) to 'defined' following the test. */
5365 complainIfUndefined( mce
, addr
, guard
);
5367 /* Now cook up a call to the relevant helper function, to read the
5368 data V bits from shadow memory. */
5369 ty
= shadowTypeV(ty
);
5371 void* helper
= NULL
;
5372 const HChar
* hname
= NULL
;
5373 Bool ret_via_outparam
= False
;
5375 if (end
== Iend_LE
) {
5377 case Ity_V256
: helper
= &MC_(helperc_LOADV256le
);
5378 hname
= "MC_(helperc_LOADV256le)";
5379 ret_via_outparam
= True
;
5381 case Ity_V128
: helper
= &MC_(helperc_LOADV128le
);
5382 hname
= "MC_(helperc_LOADV128le)";
5383 ret_via_outparam
= True
;
5385 case Ity_I64
: helper
= &MC_(helperc_LOADV64le
);
5386 hname
= "MC_(helperc_LOADV64le)";
5388 case Ity_I32
: helper
= &MC_(helperc_LOADV32le
);
5389 hname
= "MC_(helperc_LOADV32le)";
5391 case Ity_I16
: helper
= &MC_(helperc_LOADV16le
);
5392 hname
= "MC_(helperc_LOADV16le)";
5394 case Ity_I8
: helper
= &MC_(helperc_LOADV8
);
5395 hname
= "MC_(helperc_LOADV8)";
5397 default: ppIRType(ty
);
5398 VG_(tool_panic
)("memcheck:expr2vbits_Load_WRK(LE)");
5402 case Ity_V256
: helper
= &MC_(helperc_LOADV256be
);
5403 hname
= "MC_(helperc_LOADV256be)";
5404 ret_via_outparam
= True
;
5406 case Ity_V128
: helper
= &MC_(helperc_LOADV128be
);
5407 hname
= "MC_(helperc_LOADV128be)";
5408 ret_via_outparam
= True
;
5410 case Ity_I64
: helper
= &MC_(helperc_LOADV64be
);
5411 hname
= "MC_(helperc_LOADV64be)";
5413 case Ity_I32
: helper
= &MC_(helperc_LOADV32be
);
5414 hname
= "MC_(helperc_LOADV32be)";
5416 case Ity_I16
: helper
= &MC_(helperc_LOADV16be
);
5417 hname
= "MC_(helperc_LOADV16be)";
5419 case Ity_I8
: helper
= &MC_(helperc_LOADV8
);
5420 hname
= "MC_(helperc_LOADV8)";
5422 default: ppIRType(ty
);
5423 VG_(tool_panic
)("memcheck:expr2vbits_Load_WRK(BE)");
5430 /* Generate the actual address into addrAct. */
5437 IRType tyAddr
= mce
->hWordTy
;
5438 tl_assert( tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
5439 mkAdd
= tyAddr
==Ity_I32
? Iop_Add32
: Iop_Add64
;
5440 eBias
= tyAddr
==Ity_I32
? mkU32(bias
) : mkU64(bias
);
5441 addrAct
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBias
) );
5444 /* We need to have a place to park the V bits we're just about to
5446 IRTemp datavbits
= newTemp(mce
, ty
, VSh
);
5448 /* Here's the call. */
5450 if (ret_via_outparam
) {
5451 di
= unsafeIRDirty_1_N( datavbits
,
5453 hname
, VG_(fnptr_to_fnentry
)( helper
),
5454 mkIRExprVec_2( IRExpr_VECRET(), addrAct
) );
5456 di
= unsafeIRDirty_1_N( datavbits
,
5458 hname
, VG_(fnptr_to_fnentry
)( helper
),
5459 mkIRExprVec_1( addrAct
) );
5462 setHelperAnns( mce
, di
);
5465 /* Ideally the didn't-happen return value here would be all-ones
5466 (all-undefined), so it'd be obvious if it got used
5467 inadvertently. We can get by with the IR-mandated default
5468 value (0b01 repeating, 0x55 etc) as that'll still look pretty
5469 undefined if it ever leaks out. */
5471 stmt( 'V', mce
, IRStmt_Dirty(di
) );
5473 return mkexpr(datavbits
);
5477 /* Generate IR to do a shadow load. The helper is expected to check
5478 the validity of the address and return the V bits for that address.
5479 This can optionally be controlled by a guard, which is assumed to
5480 be True if NULL. In the case where the guard is False at runtime,
5481 the helper will return the didn't-do-the-call value of 0x55..55.
5482 Since that means "completely undefined result", the caller of
5483 this function will need to fix up the result somehow in that
5486 Caller of this function is also expected to have checked the
5487 definedness of |guard| before this point.
5490 IRAtom
* expr2vbits_Load ( MCEnv
* mce
,
5491 IREndness end
, IRType ty
,
5492 IRAtom
* addr
, UInt bias
,
5495 tl_assert(end
== Iend_LE
|| end
== Iend_BE
);
5496 switch (shadowTypeV(ty
)) {
5503 return expr2vbits_Load_WRK(mce
, end
, ty
, addr
, bias
, guard
);
5505 VG_(tool_panic
)("expr2vbits_Load");
5510 /* The most general handler for guarded loads. Assumes the
5511 definedness of GUARD has already been checked by the caller. A
5512 GUARD of NULL is assumed to mean "always True". Generates code to
5513 check the definedness and validity of ADDR.
5515 Generate IR to do a shadow load from ADDR and return the V bits.
5516 The loaded type is TY. The loaded data is then (shadow) widened by
5517 using VWIDEN, which can be Iop_INVALID to denote a no-op. If GUARD
5518 evaluates to False at run time then the returned Vbits are simply
5519 VALT instead. Note therefore that the argument type of VWIDEN must
5520 be TY and the result type of VWIDEN must equal the type of VALT.
5523 IRAtom
* expr2vbits_Load_guarded_General ( MCEnv
* mce
,
5524 IREndness end
, IRType ty
,
5525 IRAtom
* addr
, UInt bias
,
5527 IROp vwiden
, IRAtom
* valt
)
5529 /* Sanity check the conversion operation, and also set TYWIDE. */
5530 IRType tyWide
= Ity_INVALID
;
5535 case Iop_16Uto32
: case Iop_16Sto32
: case Iop_8Uto32
: case Iop_8Sto32
:
5539 VG_(tool_panic
)("memcheck:expr2vbits_Load_guarded_General");
5542 /* If the guard evaluates to True, this will hold the loaded V bits
5543 at TY. If the guard evaluates to False, this will be all
5544 ones, meaning "all undefined", in which case we will have to
5545 replace it using an ITE below. */
5547 = assignNew('V', mce
, ty
,
5548 expr2vbits_Load(mce
, end
, ty
, addr
, bias
, guard
));
5549 /* Now (shadow-) widen the loaded V bits to the desired width. In
5550 the guard-is-False case, the allowable widening operators will
5551 in the worst case (unsigned widening) at least leave the
5552 pre-widened part as being marked all-undefined, and in the best
5553 case (signed widening) mark the whole widened result as
5554 undefined. Anyway, it doesn't matter really, since in this case
5555 we will replace said value with the default value |valt| using an
5558 = vwiden
== Iop_INVALID
5560 : assignNew('V', mce
, tyWide
, unop(vwiden
, iftrue1
));
5561 /* These are the V bits we will return if the load doesn't take
5565 /* Prepare the cond for the ITE. Convert a NULL cond into
5566 something that iropt knows how to fold out later. */
5568 = guard
== NULL
? mkU1(1) : guard
;
5569 /* And assemble the final result. */
5570 return assignNew('V', mce
, tyWide
, IRExpr_ITE(cond
, iftrue2
, iffalse
));
5574 /* A simpler handler for guarded loads, in which there is no
5575 conversion operation, and the default V bit return (when the guard
5576 evaluates to False at runtime) is "all defined". If there is no
5577 guard expression or the guard is always TRUE this function behaves
5578 like expr2vbits_Load. It is assumed that definedness of GUARD has
5579 already been checked at the call site. */
5581 IRAtom
* expr2vbits_Load_guarded_Simple ( MCEnv
* mce
,
5582 IREndness end
, IRType ty
,
5583 IRAtom
* addr
, UInt bias
,
5586 return expr2vbits_Load_guarded_General(
5587 mce
, end
, ty
, addr
, bias
, guard
, Iop_INVALID
, definedOfType(ty
)
5593 IRAtom
* expr2vbits_ITE ( MCEnv
* mce
,
5594 IRAtom
* cond
, IRAtom
* iftrue
, IRAtom
* iffalse
)
5596 IRAtom
*vbitsC
, *vbits0
, *vbits1
;
5598 /* Given ITE(cond, iftrue, iffalse), generate
5599 ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
5600 That is, steer the V bits like the originals, but trash the
5601 result if the steering value is undefined. This gives
5602 lazy propagation. */
5603 tl_assert(isOriginalAtom(mce
, cond
));
5604 tl_assert(isOriginalAtom(mce
, iftrue
));
5605 tl_assert(isOriginalAtom(mce
, iffalse
));
5607 vbitsC
= expr2vbits(mce
, cond
, HuOth
); // could we use HuPCa here?
5608 vbits1
= expr2vbits(mce
, iftrue
, HuOth
);
5609 vbits0
= expr2vbits(mce
, iffalse
, HuOth
);
5610 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vbits0
);
5613 mkUifU(mce
, ty
, assignNew('V', mce
, ty
,
5614 IRExpr_ITE(cond
, vbits1
, vbits0
)),
5615 mkPCastTo(mce
, ty
, vbitsC
) );
5618 /* --------- This is the main expression-handling function. --------- */
5621 IRExpr
* expr2vbits ( MCEnv
* mce
, IRExpr
* e
,
5622 HowUsed hu
/*use HuOth if unknown*/ )
5627 return shadow_GET( mce
, e
->Iex
.Get
.offset
, e
->Iex
.Get
.ty
);
5630 return shadow_GETI( mce
, e
->Iex
.GetI
.descr
,
5631 e
->Iex
.GetI
.ix
, e
->Iex
.GetI
.bias
);
5634 return IRExpr_RdTmp( findShadowTmpV(mce
, e
->Iex
.RdTmp
.tmp
) );
5637 return definedOfType(shadowTypeV(typeOfIRExpr(mce
->sb
->tyenv
, e
)));
5640 return expr2vbits_Qop(
5642 e
->Iex
.Qop
.details
->op
,
5643 e
->Iex
.Qop
.details
->arg1
, e
->Iex
.Qop
.details
->arg2
,
5644 e
->Iex
.Qop
.details
->arg3
, e
->Iex
.Qop
.details
->arg4
5648 return expr2vbits_Triop(
5650 e
->Iex
.Triop
.details
->op
,
5651 e
->Iex
.Triop
.details
->arg1
, e
->Iex
.Triop
.details
->arg2
,
5652 e
->Iex
.Triop
.details
->arg3
5656 return expr2vbits_Binop(
5659 e
->Iex
.Binop
.arg1
, e
->Iex
.Binop
.arg2
,
5664 return expr2vbits_Unop( mce
, e
->Iex
.Unop
.op
, e
->Iex
.Unop
.arg
);
5667 return expr2vbits_Load( mce
, e
->Iex
.Load
.end
,
5669 e
->Iex
.Load
.addr
, 0/*addr bias*/,
5670 NULL
/* guard == "always True"*/ );
5673 return mkLazyN( mce
, e
->Iex
.CCall
.args
,
5678 return expr2vbits_ITE( mce
, e
->Iex
.ITE
.cond
, e
->Iex
.ITE
.iftrue
,
5679 e
->Iex
.ITE
.iffalse
);
5685 VG_(tool_panic
)("memcheck: expr2vbits");
5690 /*------------------------------------------------------------*/
5691 /*--- Generate shadow stmts from all kinds of IRStmts. ---*/
5692 /*------------------------------------------------------------*/
5694 /* Widen a value to the host word size. */
5697 IRExpr
* zwidenToHostWord ( MCEnv
* mce
, IRAtom
* vatom
)
5701 /* vatom is vbits-value and as such can only have a shadow type. */
5702 tl_assert(isShadowAtom(mce
,vatom
));
5704 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vatom
);
5707 if (tyH
== Ity_I32
) {
5712 return assignNew('V', mce
, tyH
, unop(Iop_16Uto32
, vatom
));
5714 return assignNew('V', mce
, tyH
, unop(Iop_8Uto32
, vatom
));
5719 if (tyH
== Ity_I64
) {
5722 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
, vatom
));
5724 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
,
5725 assignNew('V', mce
, Ity_I32
, unop(Iop_16Uto32
, vatom
))));
5727 return assignNew('V', mce
, tyH
, unop(Iop_32Uto64
,
5728 assignNew('V', mce
, Ity_I32
, unop(Iop_8Uto32
, vatom
))));
5736 VG_(printf
)("\nty = "); ppIRType(ty
); VG_(printf
)("\n");
5737 VG_(tool_panic
)("zwidenToHostWord");
5741 /* Generate a shadow store. |addr| is always the original address
5742 atom. You can pass in either originals or V-bits for the data
5743 atom, but obviously not both. This function generates a check for
5744 the definedness and (indirectly) the validity of |addr|, but only
5745 when |guard| evaluates to True at run time (or is NULL).
5747 |guard| :: Ity_I1 controls whether the store really happens; NULL
5748 means it unconditionally does. Note that |guard| itself is not
5749 checked for definedness; the caller of this function must do that
5753 void do_shadow_Store ( MCEnv
* mce
,
5755 IRAtom
* addr
, UInt bias
,
5756 IRAtom
* data
, IRAtom
* vdata
,
5761 void* helper
= NULL
;
5762 const HChar
* hname
= NULL
;
5765 tyAddr
= mce
->hWordTy
;
5766 mkAdd
= tyAddr
==Ity_I32
? Iop_Add32
: Iop_Add64
;
5767 tl_assert( tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
5768 tl_assert( end
== Iend_LE
|| end
== Iend_BE
);
5772 tl_assert(isOriginalAtom(mce
, data
));
5773 tl_assert(bias
== 0);
5774 vdata
= expr2vbits( mce
, data
, HuOth
);
5779 tl_assert(isOriginalAtom(mce
,addr
));
5780 tl_assert(isShadowAtom(mce
,vdata
));
5783 tl_assert(isOriginalAtom(mce
, guard
));
5784 tl_assert(typeOfIRExpr(mce
->sb
->tyenv
, guard
) == Ity_I1
);
5787 ty
= typeOfIRExpr(mce
->sb
->tyenv
, vdata
);
5789 // If we're not doing undefined value checking, pretend that this value
5790 // is "all valid". That lets Vex's optimiser remove some of the V bit
5791 // shadow computation ops that precede it.
5792 if (MC_(clo_mc_level
) == 1) {
5794 case Ity_V256
: // V256 weirdness -- used four times
5795 c
= IRConst_V256(V_BITS32_DEFINED
); break;
5796 case Ity_V128
: // V128 weirdness -- used twice
5797 c
= IRConst_V128(V_BITS16_DEFINED
); break;
5798 case Ity_I64
: c
= IRConst_U64 (V_BITS64_DEFINED
); break;
5799 case Ity_I32
: c
= IRConst_U32 (V_BITS32_DEFINED
); break;
5800 case Ity_I16
: c
= IRConst_U16 (V_BITS16_DEFINED
); break;
5801 case Ity_I8
: c
= IRConst_U8 (V_BITS8_DEFINED
); break;
5802 default: VG_(tool_panic
)("memcheck:do_shadow_Store(LE)");
5804 vdata
= IRExpr_Const( c
);
5807 /* First, emit a definedness test for the address. This also sets
5808 the address (shadow) to 'defined' following the test. Both of
5809 those actions are gated on |guard|. */
5810 complainIfUndefined( mce
, addr
, guard
);
5812 /* Now decide which helper function to call to write the data V
5813 bits into shadow memory. */
5814 if (end
== Iend_LE
) {
5816 case Ity_V256
: /* we'll use the helper four times */
5817 case Ity_V128
: /* we'll use the helper twice */
5818 case Ity_I64
: helper
= &MC_(helperc_STOREV64le
);
5819 hname
= "MC_(helperc_STOREV64le)";
5821 case Ity_I32
: helper
= &MC_(helperc_STOREV32le
);
5822 hname
= "MC_(helperc_STOREV32le)";
5824 case Ity_I16
: helper
= &MC_(helperc_STOREV16le
);
5825 hname
= "MC_(helperc_STOREV16le)";
5827 case Ity_I8
: helper
= &MC_(helperc_STOREV8
);
5828 hname
= "MC_(helperc_STOREV8)";
5830 default: VG_(tool_panic
)("memcheck:do_shadow_Store(LE)");
5834 case Ity_V128
: /* we'll use the helper twice */
5835 case Ity_I64
: helper
= &MC_(helperc_STOREV64be
);
5836 hname
= "MC_(helperc_STOREV64be)";
5838 case Ity_I32
: helper
= &MC_(helperc_STOREV32be
);
5839 hname
= "MC_(helperc_STOREV32be)";
5841 case Ity_I16
: helper
= &MC_(helperc_STOREV16be
);
5842 hname
= "MC_(helperc_STOREV16be)";
5844 case Ity_I8
: helper
= &MC_(helperc_STOREV8
);
5845 hname
= "MC_(helperc_STOREV8)";
5847 /* Note, no V256 case here, because no big-endian target that
5848 we support, has 256 vectors. */
5849 default: VG_(tool_panic
)("memcheck:do_shadow_Store(BE)");
5853 if (UNLIKELY(ty
== Ity_V256
)) {
5855 /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
5856 Q3 being the most significant lane. */
5857 /* These are the offsets of the Qs in memory. */
5858 Int offQ0
, offQ1
, offQ2
, offQ3
;
5860 /* Various bits for constructing the 4 lane helper calls */
5861 IRDirty
*diQ0
, *diQ1
, *diQ2
, *diQ3
;
5862 IRAtom
*addrQ0
, *addrQ1
, *addrQ2
, *addrQ3
;
5863 IRAtom
*vdataQ0
, *vdataQ1
, *vdataQ2
, *vdataQ3
;
5864 IRAtom
*eBiasQ0
, *eBiasQ1
, *eBiasQ2
, *eBiasQ3
;
5866 if (end
== Iend_LE
) {
5867 offQ0
= 0; offQ1
= 8; offQ2
= 16; offQ3
= 24;
5869 offQ3
= 0; offQ2
= 8; offQ1
= 16; offQ0
= 24;
5872 eBiasQ0
= tyAddr
==Ity_I32
? mkU32(bias
+offQ0
) : mkU64(bias
+offQ0
);
5873 addrQ0
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ0
) );
5874 vdataQ0
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_0
, vdata
));
5875 diQ0
= unsafeIRDirty_0_N(
5877 hname
, VG_(fnptr_to_fnentry
)( helper
),
5878 mkIRExprVec_2( addrQ0
, vdataQ0
)
5881 eBiasQ1
= tyAddr
==Ity_I32
? mkU32(bias
+offQ1
) : mkU64(bias
+offQ1
);
5882 addrQ1
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ1
) );
5883 vdataQ1
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_1
, vdata
));
5884 diQ1
= unsafeIRDirty_0_N(
5886 hname
, VG_(fnptr_to_fnentry
)( helper
),
5887 mkIRExprVec_2( addrQ1
, vdataQ1
)
5890 eBiasQ2
= tyAddr
==Ity_I32
? mkU32(bias
+offQ2
) : mkU64(bias
+offQ2
);
5891 addrQ2
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ2
) );
5892 vdataQ2
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_2
, vdata
));
5893 diQ2
= unsafeIRDirty_0_N(
5895 hname
, VG_(fnptr_to_fnentry
)( helper
),
5896 mkIRExprVec_2( addrQ2
, vdataQ2
)
5899 eBiasQ3
= tyAddr
==Ity_I32
? mkU32(bias
+offQ3
) : mkU64(bias
+offQ3
);
5900 addrQ3
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasQ3
) );
5901 vdataQ3
= assignNew('V', mce
, Ity_I64
, unop(Iop_V256to64_3
, vdata
));
5902 diQ3
= unsafeIRDirty_0_N(
5904 hname
, VG_(fnptr_to_fnentry
)( helper
),
5905 mkIRExprVec_2( addrQ3
, vdataQ3
)
5909 diQ0
->guard
= diQ1
->guard
= diQ2
->guard
= diQ3
->guard
= guard
;
5911 setHelperAnns( mce
, diQ0
);
5912 setHelperAnns( mce
, diQ1
);
5913 setHelperAnns( mce
, diQ2
);
5914 setHelperAnns( mce
, diQ3
);
5915 stmt( 'V', mce
, IRStmt_Dirty(diQ0
) );
5916 stmt( 'V', mce
, IRStmt_Dirty(diQ1
) );
5917 stmt( 'V', mce
, IRStmt_Dirty(diQ2
) );
5918 stmt( 'V', mce
, IRStmt_Dirty(diQ3
) );
5921 else if (UNLIKELY(ty
== Ity_V128
)) {
5924 /* See comment in next clause re 64-bit regparms */
5925 /* also, need to be careful about endianness */
5927 Int offLo64
, offHi64
;
5928 IRDirty
*diLo64
, *diHi64
;
5929 IRAtom
*addrLo64
, *addrHi64
;
5930 IRAtom
*vdataLo64
, *vdataHi64
;
5931 IRAtom
*eBiasLo64
, *eBiasHi64
;
5933 if (end
== Iend_LE
) {
5941 eBiasLo64
= tyAddr
==Ity_I32
? mkU32(bias
+offLo64
) : mkU64(bias
+offLo64
);
5942 addrLo64
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasLo64
) );
5943 vdataLo64
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128to64
, vdata
));
5944 diLo64
= unsafeIRDirty_0_N(
5946 hname
, VG_(fnptr_to_fnentry
)( helper
),
5947 mkIRExprVec_2( addrLo64
, vdataLo64
)
5949 eBiasHi64
= tyAddr
==Ity_I32
? mkU32(bias
+offHi64
) : mkU64(bias
+offHi64
);
5950 addrHi64
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBiasHi64
) );
5951 vdataHi64
= assignNew('V', mce
, Ity_I64
, unop(Iop_V128HIto64
, vdata
));
5952 diHi64
= unsafeIRDirty_0_N(
5954 hname
, VG_(fnptr_to_fnentry
)( helper
),
5955 mkIRExprVec_2( addrHi64
, vdataHi64
)
5957 if (guard
) diLo64
->guard
= guard
;
5958 if (guard
) diHi64
->guard
= guard
;
5959 setHelperAnns( mce
, diLo64
);
5960 setHelperAnns( mce
, diHi64
);
5961 stmt( 'V', mce
, IRStmt_Dirty(diLo64
) );
5962 stmt( 'V', mce
, IRStmt_Dirty(diHi64
) );
5969 /* 8/16/32/64-bit cases */
5970 /* Generate the actual address into addrAct. */
5974 IRAtom
* eBias
= tyAddr
==Ity_I32
? mkU32(bias
) : mkU64(bias
);
5975 addrAct
= assignNew('V', mce
, tyAddr
, binop(mkAdd
, addr
, eBias
));
5978 if (ty
== Ity_I64
) {
5979 /* We can't do this with regparm 2 on 32-bit platforms, since
5980 the back ends aren't clever enough to handle 64-bit
5981 regparm args. Therefore be different. */
5982 di
= unsafeIRDirty_0_N(
5984 hname
, VG_(fnptr_to_fnentry
)( helper
),
5985 mkIRExprVec_2( addrAct
, vdata
)
5988 di
= unsafeIRDirty_0_N(
5990 hname
, VG_(fnptr_to_fnentry
)( helper
),
5991 mkIRExprVec_2( addrAct
,
5992 zwidenToHostWord( mce
, vdata
))
5995 if (guard
) di
->guard
= guard
;
5996 setHelperAnns( mce
, di
);
5997 stmt( 'V', mce
, IRStmt_Dirty(di
) );
6003 /* Do lazy pessimistic propagation through a dirty helper call, by
6004 looking at the annotations on it. This is the most complex part of
6007 static IRType
szToITy ( Int n
)
6010 case 1: return Ity_I8
;
6011 case 2: return Ity_I16
;
6012 case 4: return Ity_I32
;
6013 case 8: return Ity_I64
;
6014 default: VG_(tool_panic
)("szToITy(memcheck)");
6019 void do_shadow_Dirty ( MCEnv
* mce
, IRDirty
* d
)
6021 Int i
, k
, n
, toDo
, gSz
, gOff
;
6022 IRAtom
*src
, *here
, *curr
;
6023 IRType tySrc
, tyDst
;
6027 /* What's the native endianness? We need to know this. */
6028 # if defined(VG_BIGENDIAN)
6030 # elif defined(VG_LITTLEENDIAN)
6033 # error "Unknown endianness"
6036 /* First check the guard. */
6037 complainIfUndefined(mce
, d
->guard
, NULL
);
6039 /* Now round up all inputs and PCast over them. */
6040 curr
= definedOfType(Ity_I32
);
6042 /* Inputs: unmasked args
6043 Note: arguments are evaluated REGARDLESS of the guard expression */
6044 for (i
= 0; d
->args
[i
]; i
++) {
6045 IRAtom
* arg
= d
->args
[i
];
6046 if ( (d
->cee
->mcx_mask
& (1<<i
))
6047 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg
)) ) {
6048 /* ignore this arg */
6050 here
= mkPCastTo( mce
, Ity_I32
, expr2vbits(mce
, arg
, HuOth
) );
6051 curr
= mkUifU32(mce
, here
, curr
);
6055 /* Inputs: guest state that we read. */
6056 for (i
= 0; i
< d
->nFxState
; i
++) {
6057 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
6058 if (d
->fxState
[i
].fx
== Ifx_Write
)
6061 /* Enumerate the described state segments */
6062 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
6063 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
6064 gSz
= d
->fxState
[i
].size
;
6066 /* Ignore any sections marked as 'always defined'. */
6067 if (isAlwaysDefd(mce
, gOff
, gSz
)) {
6069 VG_(printf
)("memcheck: Dirty gst: ignored off %d, sz %d\n",
6074 /* This state element is read or modified. So we need to
6075 consider it. If larger than 8 bytes, deal with it in
6078 tl_assert(gSz
>= 0);
6079 if (gSz
== 0) break;
6080 n
= gSz
<= 8 ? gSz
: 8;
6081 /* update 'curr' with UifU of the state slice
6083 tySrc
= szToITy( n
);
6085 /* Observe the guard expression. If it is false use an
6086 all-bits-defined bit pattern */
6087 IRAtom
*cond
, *iffalse
, *iftrue
;
6089 cond
= assignNew('V', mce
, Ity_I1
, d
->guard
);
6090 iftrue
= assignNew('V', mce
, tySrc
, shadow_GET(mce
, gOff
, tySrc
));
6091 iffalse
= assignNew('V', mce
, tySrc
, definedOfType(tySrc
));
6092 src
= assignNew('V', mce
, tySrc
,
6093 IRExpr_ITE(cond
, iftrue
, iffalse
));
6095 here
= mkPCastTo( mce
, Ity_I32
, src
);
6096 curr
= mkUifU32(mce
, here
, curr
);
6103 /* Inputs: memory. First set up some info needed regardless of
6104 whether we're doing reads or writes. */
6106 if (d
->mFx
!= Ifx_None
) {
6107 /* Because we may do multiple shadow loads/stores from the same
6108 base address, it's best to do a single test of its
6109 definedness right now. Post-instrumentation optimisation
6110 should remove all but this test. */
6112 tl_assert(d
->mAddr
);
6113 complainIfUndefined(mce
, d
->mAddr
, d
->guard
);
6115 tyAddr
= typeOfIRExpr(mce
->sb
->tyenv
, d
->mAddr
);
6116 tl_assert(tyAddr
== Ity_I32
|| tyAddr
== Ity_I64
);
6117 tl_assert(tyAddr
== mce
->hWordTy
); /* not really right */
6120 /* Deal with memory inputs (reads or modifies) */
6121 if (d
->mFx
== Ifx_Read
|| d
->mFx
== Ifx_Modify
) {
6123 /* chew off 32-bit chunks. We don't care about the endianness
6124 since it's all going to be condensed down to a single bit,
6125 but nevertheless choose an endianness which is hopefully
6126 native to the platform. */
6130 expr2vbits_Load_guarded_Simple(
6131 mce
, end
, Ity_I32
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6133 curr
= mkUifU32(mce
, here
, curr
);
6136 /* chew off 16-bit chunks */
6140 expr2vbits_Load_guarded_Simple(
6141 mce
, end
, Ity_I16
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6143 curr
= mkUifU32(mce
, here
, curr
);
6146 /* chew off the remaining 8-bit chunk, if any */
6150 expr2vbits_Load_guarded_Simple(
6151 mce
, end
, Ity_I8
, d
->mAddr
, d
->mSize
- toDo
, d
->guard
)
6153 curr
= mkUifU32(mce
, here
, curr
);
6156 tl_assert(toDo
== 0);
6159 /* Whew! So curr is a 32-bit V-value summarising pessimistically
6160 all the inputs to the helper. Now we need to re-distribute the
6161 results to all destinations. */
6163 /* Outputs: the destination temporary, if there is one. */
6164 if (d
->tmp
!= IRTemp_INVALID
) {
6165 dst
= findShadowTmpV(mce
, d
->tmp
);
6166 tyDst
= typeOfIRTemp(mce
->sb
->tyenv
, d
->tmp
);
6167 assign( 'V', mce
, dst
, mkPCastTo( mce
, tyDst
, curr
) );
6170 /* Outputs: guest state that we write or modify. */
6171 for (i
= 0; i
< d
->nFxState
; i
++) {
6172 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
6173 if (d
->fxState
[i
].fx
== Ifx_Read
)
6176 /* Enumerate the described state segments */
6177 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
6178 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
6179 gSz
= d
->fxState
[i
].size
;
6181 /* Ignore any sections marked as 'always defined'. */
6182 if (isAlwaysDefd(mce
, gOff
, gSz
))
6185 /* This state element is written or modified. So we need to
6186 consider it. If larger than 8 bytes, deal with it in
6189 tl_assert(gSz
>= 0);
6190 if (gSz
== 0) break;
6191 n
= gSz
<= 8 ? gSz
: 8;
6192 /* Write suitably-casted 'curr' to the state slice
6194 tyDst
= szToITy( n
);
6195 do_shadow_PUT( mce
, gOff
,
6196 NULL
, /* original atom */
6197 mkPCastTo( mce
, tyDst
, curr
), d
->guard
);
6204 /* Outputs: memory that we write or modify. Same comments about
6205 endianness as above apply. */
6206 if (d
->mFx
== Ifx_Write
|| d
->mFx
== Ifx_Modify
) {
6208 /* chew off 32-bit chunks */
6210 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6211 NULL
, /* original data */
6212 mkPCastTo( mce
, Ity_I32
, curr
),
6216 /* chew off 16-bit chunks */
6218 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6219 NULL
, /* original data */
6220 mkPCastTo( mce
, Ity_I16
, curr
),
6224 /* chew off the remaining 8-bit chunk, if any */
6226 do_shadow_Store( mce
, end
, d
->mAddr
, d
->mSize
- toDo
,
6227 NULL
, /* original data */
6228 mkPCastTo( mce
, Ity_I8
, curr
),
6232 tl_assert(toDo
== 0);
6238 /* We have an ABI hint telling us that [base .. base+len-1] is to
6239 become undefined ("writable"). Generate code to call a helper to
6240 notify the A/V bit machinery of this fact.
6243 void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
6247 void do_AbiHint ( MCEnv
* mce
, IRExpr
* base
, Int len
, IRExpr
* nia
)
6251 if (MC_(clo_mc_level
) == 3) {
6252 di
= unsafeIRDirty_0_N(
6254 "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
6255 VG_(fnptr_to_fnentry
)( &MC_(helperc_MAKE_STACK_UNINIT_w_o
) ),
6256 mkIRExprVec_3( base
, mkIRExpr_HWord( (UInt
)len
), nia
)
6259 /* We ignore the supplied nia, since it is irrelevant. */
6260 tl_assert(MC_(clo_mc_level
) == 2 || MC_(clo_mc_level
) == 1);
6261 /* Special-case the len==128 case, since that is for amd64-ELF,
6262 which is a very common target. */
6264 di
= unsafeIRDirty_0_N(
6266 "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
6267 VG_(fnptr_to_fnentry
)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o
)),
6268 mkIRExprVec_1( base
)
6271 di
= unsafeIRDirty_0_N(
6273 "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
6274 VG_(fnptr_to_fnentry
)( &MC_(helperc_MAKE_STACK_UNINIT_no_o
) ),
6275 mkIRExprVec_2( base
, mkIRExpr_HWord( (UInt
)len
) )
6280 stmt( 'V', mce
, IRStmt_Dirty(di
) );
6284 /* ------ Dealing with IRCAS (big and complex) ------ */
6287 static IRAtom
* gen_load_b ( MCEnv
* mce
, Int szB
,
6288 IRAtom
* baseaddr
, Int offset
);
6289 static IRAtom
* gen_maxU32 ( MCEnv
* mce
, IRAtom
* b1
, IRAtom
* b2
);
6290 static void gen_store_b ( MCEnv
* mce
, Int szB
,
6291 IRAtom
* baseaddr
, Int offset
, IRAtom
* dataB
,
6294 static void do_shadow_CAS_single ( MCEnv
* mce
, IRCAS
* cas
);
6295 static void do_shadow_CAS_double ( MCEnv
* mce
, IRCAS
* cas
);
6298 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
6299 IRExpr.Consts, else this asserts. If they are both Consts, it
6300 doesn't do anything. So that just leaves the RdTmp case.
6302 In which case: this assigns the shadow value SHADOW to the IR
6303 shadow temporary associated with ORIG. That is, ORIG, being an
6304 original temporary, will have a shadow temporary associated with
6305 it. However, in the case envisaged here, there will so far have
6306 been no IR emitted to actually write a shadow value into that
6307 temporary. What this routine does is to (emit IR to) copy the
6308 value in SHADOW into said temporary, so that after this call,
6309 IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
6312 Point is to allow callers to compute "by hand" a shadow value for
6313 ORIG, and force it to be associated with ORIG.
6315 How do we know that that shadow associated with ORIG has not so far
6316 been assigned to? Well, we don't per se know that, but supposing
6317 it had. Then this routine would create a second assignment to it,
6318 and later the IR sanity checker would barf. But that never
6321 static void bind_shadow_tmp_to_orig ( UChar how
,
6323 IRAtom
* orig
, IRAtom
* shadow
)
6325 tl_assert(isOriginalAtom(mce
, orig
));
6326 tl_assert(isShadowAtom(mce
, shadow
));
6327 switch (orig
->tag
) {
6329 tl_assert(shadow
->tag
== Iex_Const
);
6332 tl_assert(shadow
->tag
== Iex_RdTmp
);
6334 assign('V', mce
, findShadowTmpV(mce
,orig
->Iex
.RdTmp
.tmp
),
6337 tl_assert(how
== 'B');
6338 assign('B', mce
, findShadowTmpB(mce
,orig
->Iex
.RdTmp
.tmp
),
6349 void do_shadow_CAS ( MCEnv
* mce
, IRCAS
* cas
)
6351 /* Scheme is (both single- and double- cases):
6353 1. fetch data#,dataB (the proposed new value)
6355 2. fetch expd#,expdB (what we expect to see at the address)
6357 3. check definedness of address
6359 4. load old#,oldB from shadow memory; this also checks
6360 addressibility of the address
6364 6. compute "expected == old". See COMMENT_ON_CasCmpEQ below.
6366 7. if "expected == old" (as computed by (6))
6367 store data#,dataB to shadow memory
6369 Note that 5 reads 'old' but 4 reads 'old#'. Similarly, 5 stores
6370 'data' but 7 stores 'data#'. Hence it is possible for the
6371 shadow data to be incorrectly checked and/or updated:
6373 * 7 is at least gated correctly, since the 'expected == old'
6374 condition is derived from outputs of 5. However, the shadow
6375 write could happen too late: imagine after 5 we are
6376 descheduled, a different thread runs, writes a different
6377 (shadow) value at the address, and then we resume, hence
6378 overwriting the shadow value written by the other thread.
6380 Because the original memory access is atomic, there's no way to
6381 make both the original and shadow accesses into a single atomic
6382 thing, hence this is unavoidable.
6384 At least as Valgrind stands, I don't think it's a problem, since
6385 we're single threaded *and* we guarantee that there are no
6386 context switches during the execution of any specific superblock
6387 -- context switches can only happen at superblock boundaries.
6389 If Valgrind ever becomes MT in the future, then it might be more
6390 of a problem. A possible kludge would be to artificially
6391 associate with the location, a lock, which we must acquire and
6392 release around the transaction as a whole. Hmm, that probably
6393 would't work properly since it only guards us against other
6394 threads doing CASs on the same location, not against other
6395 threads doing normal reads and writes.
6397 ------------------------------------------------------------
6399 COMMENT_ON_CasCmpEQ:
6401 Note two things. Firstly, in the sequence above, we compute
6402 "expected == old", but we don't check definedness of it. Why
6403 not? Also, the x86 and amd64 front ends use
6404 Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
6405 determination (expected == old ?) for themselves, and we also
6406 don't check definedness for those primops; we just say that the
6407 result is defined. Why? Details follow.
6409 x86/amd64 contains various forms of locked insns:
6410 * lock prefix before all basic arithmetic insn;
6411 eg lock xorl %reg1,(%reg2)
6412 * atomic exchange reg-mem
6415 Rather than attempt to represent them all, which would be a
6416 royal PITA, I used a result from Maurice Herlihy
6417 (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
6418 demonstrates that compare-and-swap is a primitive more general
6419 than the other two, and so can be used to represent all of them.
6420 So the translation scheme for (eg) lock incl (%reg) is as
6426 atomically { if (* %reg == old) { * %reg = new } else { goto again } }
6428 The "atomically" is the CAS bit. The scheme is always the same:
6429 get old value from memory, compute new value, atomically stuff
6430 new value back in memory iff the old value has not changed (iow,
6431 no other thread modified it in the meantime). If it has changed
6432 then we've been out-raced and we have to start over.
6434 Now that's all very neat, but it has the bad side effect of
6435 introducing an explicit equality test into the translation.
6436 Consider the behaviour of said code on a memory location which
6437 is uninitialised. We will wind up doing a comparison on
6438 uninitialised data, and mc duly complains.
6440 What's difficult about this is, the common case is that the
6441 location is uncontended, and so we're usually comparing the same
6442 value (* %reg) with itself. So we shouldn't complain even if it
6443 is undefined. But mc doesn't know that.
6445 My solution is to mark the == in the IR specially, so as to tell
6446 mc that it almost certainly compares a value with itself, and we
6447 should just regard the result as always defined. Rather than
6448 add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
6449 Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
6451 So there's always the question of, can this give a false
6452 negative? eg, imagine that initially, * %reg is defined; and we
6453 read that; but then in the gap between the read and the CAS, a
6454 different thread writes an undefined (and different) value at
6455 the location. Then the CAS in this thread will fail and we will
6456 go back to "again:", but without knowing that the trip back
6457 there was based on an undefined comparison. No matter; at least
6458 the other thread won the race and the location is correctly
6459 marked as undefined. What if it wrote an uninitialised version
6460 of the same value that was there originally, though?
6462 etc etc. Seems like there's a small corner case in which we
6463 might lose the fact that something's defined -- we're out-raced
6464 in between the "old = * reg" and the "atomically {", _and_ the
6465 other thread is writing in an undefined version of what's
6466 already there. Well, that seems pretty unlikely.
6470 If we ever need to reinstate it .. code which generates a
6471 definedness test for "expected == old" was removed at r10432 of
6474 if (cas
->oldHi
== IRTemp_INVALID
) {
6475 do_shadow_CAS_single( mce
, cas
);
6477 do_shadow_CAS_double( mce
, cas
);
6482 static void do_shadow_CAS_single ( MCEnv
* mce
, IRCAS
* cas
)
6484 IRAtom
*vdataLo
= NULL
, *bdataLo
= NULL
;
6485 IRAtom
*vexpdLo
= NULL
, *bexpdLo
= NULL
;
6486 IRAtom
*voldLo
= NULL
, *boldLo
= NULL
;
6487 IRAtom
*expd_eq_old
= NULL
;
6491 Bool otrak
= MC_(clo_mc_level
) >= 3; /* a shorthand */
6494 tl_assert(cas
->oldHi
== IRTemp_INVALID
);
6495 tl_assert(cas
->expdHi
== NULL
);
6496 tl_assert(cas
->dataHi
== NULL
);
6498 elemTy
= typeOfIRExpr(mce
->sb
->tyenv
, cas
->expdLo
);
6500 case Ity_I8
: elemSzB
= 1; opCasCmpEQ
= Iop_CasCmpEQ8
; break;
6501 case Ity_I16
: elemSzB
= 2; opCasCmpEQ
= Iop_CasCmpEQ16
; break;
6502 case Ity_I32
: elemSzB
= 4; opCasCmpEQ
= Iop_CasCmpEQ32
; break;
6503 case Ity_I64
: elemSzB
= 8; opCasCmpEQ
= Iop_CasCmpEQ64
; break;
6504 default: tl_assert(0); /* IR defn disallows any other types */
6507 /* 1. fetch data# (the proposed new value) */
6508 tl_assert(isOriginalAtom(mce
, cas
->dataLo
));
6510 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataLo
, HuOth
));
6511 tl_assert(isShadowAtom(mce
, vdataLo
));
6514 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataLo
));
6515 tl_assert(isShadowAtom(mce
, bdataLo
));
6518 /* 2. fetch expected# (what we expect to see at the address) */
6519 tl_assert(isOriginalAtom(mce
, cas
->expdLo
));
6521 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdLo
, HuOth
));
6522 tl_assert(isShadowAtom(mce
, vexpdLo
));
6525 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdLo
));
6526 tl_assert(isShadowAtom(mce
, bexpdLo
));
6529 /* 3. check definedness of address */
6530 /* 4. fetch old# from shadow memory; this also checks
6531 addressibility of the address */
6537 cas
->end
, elemTy
, cas
->addr
, 0/*Addr bias*/,
6538 NULL
/*always happens*/
6540 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldLo
), voldLo
);
6543 = assignNew('B', mce
, Ity_I32
,
6544 gen_load_b(mce
, elemSzB
, cas
->addr
, 0/*addr bias*/));
6545 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldLo
), boldLo
);
6548 /* 5. the CAS itself */
6549 stmt( 'C', mce
, IRStmt_CAS(cas
) );
6551 /* 6. compute "expected == old" */
6552 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6553 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6554 tree, but it's not copied from the input block. */
6556 = assignNew('C', mce
, Ity_I1
,
6557 binop(opCasCmpEQ
, cas
->expdLo
, mkexpr(cas
->oldLo
)));
6559 /* 7. if "expected == old"
6560 store data# to shadow memory */
6561 do_shadow_Store( mce
, cas
->end
, cas
->addr
, 0/*bias*/,
6562 NULL
/*data*/, vdataLo
/*vdata*/,
6563 expd_eq_old
/*guard for store*/ );
6565 gen_store_b( mce
, elemSzB
, cas
->addr
, 0/*offset*/,
6567 expd_eq_old
/*guard for store*/ );
6572 static void do_shadow_CAS_double ( MCEnv
* mce
, IRCAS
* cas
)
6574 IRAtom
*vdataHi
= NULL
, *bdataHi
= NULL
;
6575 IRAtom
*vdataLo
= NULL
, *bdataLo
= NULL
;
6576 IRAtom
*vexpdHi
= NULL
, *bexpdHi
= NULL
;
6577 IRAtom
*vexpdLo
= NULL
, *bexpdLo
= NULL
;
6578 IRAtom
*voldHi
= NULL
, *boldHi
= NULL
;
6579 IRAtom
*voldLo
= NULL
, *boldLo
= NULL
;
6580 IRAtom
*xHi
= NULL
, *xLo
= NULL
, *xHL
= NULL
;
6581 IRAtom
*expd_eq_old
= NULL
, *zero
= NULL
;
6582 IROp opCasCmpEQ
, opOr
, opXor
;
6583 Int elemSzB
, memOffsLo
, memOffsHi
;
6585 Bool otrak
= MC_(clo_mc_level
) >= 3; /* a shorthand */
6588 tl_assert(cas
->oldHi
!= IRTemp_INVALID
);
6589 tl_assert(cas
->expdHi
!= NULL
);
6590 tl_assert(cas
->dataHi
!= NULL
);
6592 elemTy
= typeOfIRExpr(mce
->sb
->tyenv
, cas
->expdLo
);
6595 opCasCmpEQ
= Iop_CasCmpEQ8
; opOr
= Iop_Or8
; opXor
= Iop_Xor8
;
6596 elemSzB
= 1; zero
= mkU8(0);
6599 opCasCmpEQ
= Iop_CasCmpEQ16
; opOr
= Iop_Or16
; opXor
= Iop_Xor16
;
6600 elemSzB
= 2; zero
= mkU16(0);
6603 opCasCmpEQ
= Iop_CasCmpEQ32
; opOr
= Iop_Or32
; opXor
= Iop_Xor32
;
6604 elemSzB
= 4; zero
= mkU32(0);
6607 opCasCmpEQ
= Iop_CasCmpEQ64
; opOr
= Iop_Or64
; opXor
= Iop_Xor64
;
6608 elemSzB
= 8; zero
= mkU64(0);
6611 tl_assert(0); /* IR defn disallows any other types */
6614 /* 1. fetch data# (the proposed new value) */
6615 tl_assert(isOriginalAtom(mce
, cas
->dataHi
));
6616 tl_assert(isOriginalAtom(mce
, cas
->dataLo
));
6618 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataHi
, HuOth
));
6620 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->dataLo
, HuOth
));
6621 tl_assert(isShadowAtom(mce
, vdataHi
));
6622 tl_assert(isShadowAtom(mce
, vdataLo
));
6625 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataHi
));
6627 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->dataLo
));
6628 tl_assert(isShadowAtom(mce
, bdataHi
));
6629 tl_assert(isShadowAtom(mce
, bdataLo
));
6632 /* 2. fetch expected# (what we expect to see at the address) */
6633 tl_assert(isOriginalAtom(mce
, cas
->expdHi
));
6634 tl_assert(isOriginalAtom(mce
, cas
->expdLo
));
6636 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdHi
, HuOth
));
6638 = assignNew('V', mce
, elemTy
, expr2vbits(mce
, cas
->expdLo
, HuOth
));
6639 tl_assert(isShadowAtom(mce
, vexpdHi
));
6640 tl_assert(isShadowAtom(mce
, vexpdLo
));
6643 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdHi
));
6645 = assignNew('B', mce
, Ity_I32
, schemeE(mce
, cas
->expdLo
));
6646 tl_assert(isShadowAtom(mce
, bexpdHi
));
6647 tl_assert(isShadowAtom(mce
, bexpdLo
));
6650 /* 3. check definedness of address */
6651 /* 4. fetch old# from shadow memory; this also checks
6652 addressibility of the address */
6653 if (cas
->end
== Iend_LE
) {
6655 memOffsHi
= elemSzB
;
6657 tl_assert(cas
->end
== Iend_BE
);
6658 memOffsLo
= elemSzB
;
6666 cas
->end
, elemTy
, cas
->addr
, memOffsHi
/*Addr bias*/,
6667 NULL
/*always happens*/
6674 cas
->end
, elemTy
, cas
->addr
, memOffsLo
/*Addr bias*/,
6675 NULL
/*always happens*/
6677 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldHi
), voldHi
);
6678 bind_shadow_tmp_to_orig('V', mce
, mkexpr(cas
->oldLo
), voldLo
);
6681 = assignNew('B', mce
, Ity_I32
,
6682 gen_load_b(mce
, elemSzB
, cas
->addr
,
6683 memOffsHi
/*addr bias*/));
6685 = assignNew('B', mce
, Ity_I32
,
6686 gen_load_b(mce
, elemSzB
, cas
->addr
,
6687 memOffsLo
/*addr bias*/));
6688 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldHi
), boldHi
);
6689 bind_shadow_tmp_to_orig('B', mce
, mkexpr(cas
->oldLo
), boldLo
);
6692 /* 5. the CAS itself */
6693 stmt( 'C', mce
, IRStmt_CAS(cas
) );
6695 /* 6. compute "expected == old" */
6696 /* See COMMENT_ON_CasCmpEQ in this file background/rationale. */
6697 /* Note that 'C' is kinda faking it; it is indeed a non-shadow
6698 tree, but it's not copied from the input block. */
6700 xHi = oldHi ^ expdHi;
6701 xLo = oldLo ^ expdLo;
6703 expd_eq_old = xHL == 0;
6705 xHi
= assignNew('C', mce
, elemTy
,
6706 binop(opXor
, cas
->expdHi
, mkexpr(cas
->oldHi
)));
6707 xLo
= assignNew('C', mce
, elemTy
,
6708 binop(opXor
, cas
->expdLo
, mkexpr(cas
->oldLo
)));
6709 xHL
= assignNew('C', mce
, elemTy
,
6710 binop(opOr
, xHi
, xLo
));
6712 = assignNew('C', mce
, Ity_I1
,
6713 binop(opCasCmpEQ
, xHL
, zero
));
6715 /* 7. if "expected == old"
6716 store data# to shadow memory */
6717 do_shadow_Store( mce
, cas
->end
, cas
->addr
, memOffsHi
/*bias*/,
6718 NULL
/*data*/, vdataHi
/*vdata*/,
6719 expd_eq_old
/*guard for store*/ );
6720 do_shadow_Store( mce
, cas
->end
, cas
->addr
, memOffsLo
/*bias*/,
6721 NULL
/*data*/, vdataLo
/*vdata*/,
6722 expd_eq_old
/*guard for store*/ );
6724 gen_store_b( mce
, elemSzB
, cas
->addr
, memOffsHi
/*offset*/,
6726 expd_eq_old
/*guard for store*/ );
6727 gen_store_b( mce
, elemSzB
, cas
->addr
, memOffsLo
/*offset*/,
6729 expd_eq_old
/*guard for store*/ );
6734 /* ------ Dealing with LL/SC (not difficult) ------ */
6736 static void do_shadow_LLSC ( MCEnv
* mce
,
6740 IRExpr
* stStoredata
)
6742 /* In short: treat a load-linked like a normal load followed by an
6743 assignment of the loaded (shadow) data to the result temporary.
6744 Treat a store-conditional like a normal store, and mark the
6745 result temporary as defined. */
6746 IRType resTy
= typeOfIRTemp(mce
->sb
->tyenv
, stResult
);
6747 IRTemp resTmp
= findShadowTmpV(mce
, stResult
);
6749 tl_assert(isIRAtom(stAddr
));
6751 tl_assert(isIRAtom(stStoredata
));
6753 if (stStoredata
== NULL
) {
6755 /* Just treat this as a normal load, followed by an assignment of
6756 the value to .result. */
6758 tl_assert(resTy
== Ity_I64
|| resTy
== Ity_I32
6759 || resTy
== Ity_I16
|| resTy
== Ity_I8
);
6760 assign( 'V', mce
, resTmp
,
6762 mce
, stEnd
, resTy
, stAddr
, 0/*addr bias*/,
6763 NULL
/*always happens*/) );
6765 /* Store Conditional */
6767 IRType dataTy
= typeOfIRExpr(mce
->sb
->tyenv
,
6769 tl_assert(dataTy
== Ity_I64
|| dataTy
== Ity_I32
6770 || dataTy
== Ity_I16
|| dataTy
== Ity_I8
);
6771 do_shadow_Store( mce
, stEnd
,
6772 stAddr
, 0/* addr bias */,
6774 NULL
/* shadow data */,
6776 /* This is a store conditional, so it writes to .result a value
6777 indicating whether or not the store succeeded. Just claim
6778 this value is always defined. In the PowerPC interpretation
6779 of store-conditional, definedness of the success indication
6780 depends on whether the address of the store matches the
6781 reservation address. But we can't tell that here (and
6782 anyway, we're not being PowerPC-specific). At least we are
6783 guaranteed that the definedness of the store address, and its
6784 addressibility, will be checked as per normal. So it seems
6785 pretty safe to just say that the success indication is always
6788 In schemeS, for origin tracking, we must correspondingly set
6789 a no-origin value for the origin shadow of .result.
6791 tl_assert(resTy
== Ity_I1
);
6792 assign( 'V', mce
, resTmp
, definedOfType(resTy
) );
6797 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
6799 static void do_shadow_StoreG ( MCEnv
* mce
, IRStoreG
* sg
)
6801 complainIfUndefined(mce
, sg
->guard
, NULL
);
6802 /* do_shadow_Store will generate code to check the definedness and
6803 validity of sg->addr, in the case where sg->guard evaluates to
6804 True at run-time. */
6805 do_shadow_Store( mce
, sg
->end
,
6806 sg
->addr
, 0/* addr bias */,
6808 NULL
/* shadow data */,
6812 static void do_shadow_LoadG ( MCEnv
* mce
, IRLoadG
* lg
)
6814 complainIfUndefined(mce
, lg
->guard
, NULL
);
6815 /* expr2vbits_Load_guarded_General will generate code to check the
6816 definedness and validity of lg->addr, in the case where
6817 lg->guard evaluates to True at run-time. */
6819 /* Look at the LoadG's built-in conversion operation, to determine
6820 the source (actual loaded data) type, and the equivalent IROp.
6821 NOTE that implicitly we are taking a widening operation to be
6822 applied to original atoms and producing one that applies to V
6823 bits. Since signed and unsigned widening are self-shadowing,
6824 this is a straight copy of the op (modulo swapping from the
6825 IRLoadGOp form to the IROp form). Note also therefore that this
6826 implicitly duplicates the logic to do with said widening ops in
6827 expr2vbits_Unop. See comment at the start of expr2vbits_Unop. */
6828 IROp vwiden
= Iop_INVALID
;
6829 IRType loadedTy
= Ity_INVALID
;
6831 case ILGop_IdentV128
: loadedTy
= Ity_V128
; vwiden
= Iop_INVALID
; break;
6832 case ILGop_Ident64
: loadedTy
= Ity_I64
; vwiden
= Iop_INVALID
; break;
6833 case ILGop_Ident32
: loadedTy
= Ity_I32
; vwiden
= Iop_INVALID
; break;
6834 case ILGop_16Uto32
: loadedTy
= Ity_I16
; vwiden
= Iop_16Uto32
; break;
6835 case ILGop_16Sto32
: loadedTy
= Ity_I16
; vwiden
= Iop_16Sto32
; break;
6836 case ILGop_8Uto32
: loadedTy
= Ity_I8
; vwiden
= Iop_8Uto32
; break;
6837 case ILGop_8Sto32
: loadedTy
= Ity_I8
; vwiden
= Iop_8Sto32
; break;
6838 default: VG_(tool_panic
)("do_shadow_LoadG");
6842 = expr2vbits( mce
, lg
->alt
, HuOth
);
6844 = expr2vbits_Load_guarded_General(mce
, lg
->end
, loadedTy
,
6845 lg
->addr
, 0/*addr bias*/,
6846 lg
->guard
, vwiden
, vbits_alt
);
6847 /* And finally, bind the V bits to the destination temporary. */
6848 assign( 'V', mce
, findShadowTmpV(mce
, lg
->dst
), vbits_final
);
6852 /*------------------------------------------------------------*/
6853 /*--- Origin tracking stuff ---*/
6854 /*------------------------------------------------------------*/
6856 /* Almost identical to findShadowTmpV. */
6857 static IRTemp
findShadowTmpB ( MCEnv
* mce
, IRTemp orig
)
6860 /* VG_(indexXA) range-checks 'orig', hence no need to check
6862 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
6863 tl_assert(ent
->kind
== Orig
);
6864 if (ent
->shadowB
== IRTemp_INVALID
) {
6866 = newTemp( mce
, Ity_I32
, BSh
);
6867 /* newTemp may cause mce->tmpMap to resize, hence previous results
6868 from VG_(indexXA) are invalid. */
6869 ent
= (TempMapEnt
*)VG_(indexXA
)( mce
->tmpMap
, (Word
)orig
);
6870 tl_assert(ent
->kind
== Orig
);
6871 tl_assert(ent
->shadowB
== IRTemp_INVALID
);
6872 ent
->shadowB
= tmpB
;
6874 return ent
->shadowB
;
6877 static IRAtom
* gen_maxU32 ( MCEnv
* mce
, IRAtom
* b1
, IRAtom
* b2
)
6879 return assignNew( 'B', mce
, Ity_I32
, binop(Iop_Max32U
, b1
, b2
) );
6883 /* Make a guarded origin load, with no special handling in the
6884 didn't-happen case. A GUARD of NULL is assumed to mean "always
6887 Generate IR to do a shadow origins load from BASEADDR+OFFSET and
6888 return the otag. The loaded size is SZB. If GUARD evaluates to
6889 False at run time then the returned otag is zero.
6891 static IRAtom
* gen_guarded_load_b ( MCEnv
* mce
, Int szB
,
6893 Int offset
, IRExpr
* guard
)
6899 IRType aTy
= typeOfIRExpr( mce
->sb
->tyenv
, baseaddr
);
6900 IROp opAdd
= aTy
== Ity_I32
? Iop_Add32
: Iop_Add64
;
6901 IRAtom
* ea
= baseaddr
;
6903 IRAtom
* off
= aTy
== Ity_I32
? mkU32( offset
)
6904 : mkU64( (Long
)(Int
)offset
);
6905 ea
= assignNew( 'B', mce
, aTy
, binop(opAdd
, ea
, off
));
6907 bTmp
= newTemp(mce
, mce
->hWordTy
, BSh
);
6910 case 1: hFun
= (void*)&MC_(helperc_b_load1
);
6911 hName
= "MC_(helperc_b_load1)";
6913 case 2: hFun
= (void*)&MC_(helperc_b_load2
);
6914 hName
= "MC_(helperc_b_load2)";
6916 case 4: hFun
= (void*)&MC_(helperc_b_load4
);
6917 hName
= "MC_(helperc_b_load4)";
6919 case 8: hFun
= (void*)&MC_(helperc_b_load8
);
6920 hName
= "MC_(helperc_b_load8)";
6922 case 16: hFun
= (void*)&MC_(helperc_b_load16
);
6923 hName
= "MC_(helperc_b_load16)";
6925 case 32: hFun
= (void*)&MC_(helperc_b_load32
);
6926 hName
= "MC_(helperc_b_load32)";
6929 VG_(printf
)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB
);
6932 di
= unsafeIRDirty_1_N(
6933 bTmp
, 1/*regparms*/, hName
, VG_(fnptr_to_fnentry
)( hFun
),
6938 /* Ideally the didn't-happen return value here would be
6939 all-zeroes (unknown-origin), so it'd be harmless if it got
6940 used inadvertently. We slum it out with the IR-mandated
6941 default value (0b01 repeating, 0x55 etc) as that'll probably
6942 trump all legitimate otags via Max32, and it's pretty
6945 /* no need to mess with any annotations. This call accesses
6946 neither guest state nor guest memory. */
6947 stmt( 'B', mce
, IRStmt_Dirty(di
) );
6948 if (mce
->hWordTy
== Ity_I64
) {
6950 IRTemp bTmp32
= newTemp(mce
, Ity_I32
, BSh
);
6951 assign( 'B', mce
, bTmp32
, unop(Iop_64to32
, mkexpr(bTmp
)) );
6952 return mkexpr(bTmp32
);
6955 return mkexpr(bTmp
);
6960 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET. The
6961 loaded size is SZB. The load is regarded as unconditional (always
6964 static IRAtom
* gen_load_b ( MCEnv
* mce
, Int szB
, IRAtom
* baseaddr
,
6967 return gen_guarded_load_b(mce
, szB
, baseaddr
, offset
, NULL
/*guard*/);
6971 /* The most general handler for guarded origin loads. A GUARD of NULL
6972 is assumed to mean "always True".
6974 Generate IR to do a shadow origin load from ADDR+BIAS and return
6975 the B bits. The loaded type is TY. If GUARD evaluates to False at
6976 run time then the returned B bits are simply BALT instead.
6979 IRAtom
* expr2ori_Load_guarded_General ( MCEnv
* mce
,
6981 IRAtom
* addr
, UInt bias
,
6982 IRAtom
* guard
, IRAtom
* balt
)
6984 /* If the guard evaluates to True, this will hold the loaded
6985 origin. If the guard evaluates to False, this will be zero,
6986 meaning "unknown origin", in which case we will have to replace
6987 it using an ITE below. */
6989 = assignNew('B', mce
, Ity_I32
,
6990 gen_guarded_load_b(mce
, sizeofIRType(ty
),
6991 addr
, bias
, guard
));
6992 /* These are the bits we will return if the load doesn't take
6996 /* Prepare the cond for the ITE. Convert a NULL cond into
6997 something that iropt knows how to fold out later. */
6999 = guard
== NULL
? mkU1(1) : guard
;
7000 /* And assemble the final result. */
7001 return assignNew('B', mce
, Ity_I32
, IRExpr_ITE(cond
, iftrue
, iffalse
));
7005 /* Generate a shadow origins store. guard :: Ity_I1 controls whether
7006 the store really happens; NULL means it unconditionally does. */
7007 static void gen_store_b ( MCEnv
* mce
, Int szB
,
7008 IRAtom
* baseaddr
, Int offset
, IRAtom
* dataB
,
7014 IRType aTy
= typeOfIRExpr( mce
->sb
->tyenv
, baseaddr
);
7015 IROp opAdd
= aTy
== Ity_I32
? Iop_Add32
: Iop_Add64
;
7016 IRAtom
* ea
= baseaddr
;
7018 tl_assert(isOriginalAtom(mce
, guard
));
7019 tl_assert(typeOfIRExpr(mce
->sb
->tyenv
, guard
) == Ity_I1
);
7022 IRAtom
* off
= aTy
== Ity_I32
? mkU32( offset
)
7023 : mkU64( (Long
)(Int
)offset
);
7024 ea
= assignNew( 'B', mce
, aTy
, binop(opAdd
, ea
, off
));
7026 if (mce
->hWordTy
== Ity_I64
)
7027 dataB
= assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, dataB
));
7030 case 1: hFun
= (void*)&MC_(helperc_b_store1
);
7031 hName
= "MC_(helperc_b_store1)";
7033 case 2: hFun
= (void*)&MC_(helperc_b_store2
);
7034 hName
= "MC_(helperc_b_store2)";
7036 case 4: hFun
= (void*)&MC_(helperc_b_store4
);
7037 hName
= "MC_(helperc_b_store4)";
7039 case 8: hFun
= (void*)&MC_(helperc_b_store8
);
7040 hName
= "MC_(helperc_b_store8)";
7042 case 16: hFun
= (void*)&MC_(helperc_b_store16
);
7043 hName
= "MC_(helperc_b_store16)";
7045 case 32: hFun
= (void*)&MC_(helperc_b_store32
);
7046 hName
= "MC_(helperc_b_store32)";
7051 di
= unsafeIRDirty_0_N( 2/*regparms*/,
7052 hName
, VG_(fnptr_to_fnentry
)( hFun
),
7053 mkIRExprVec_2( ea
, dataB
)
7055 /* no need to mess with any annotations. This call accesses
7056 neither guest state nor guest memory. */
7057 if (guard
) di
->guard
= guard
;
7058 stmt( 'B', mce
, IRStmt_Dirty(di
) );
7061 static IRAtom
* narrowTo32 ( MCEnv
* mce
, IRAtom
* e
) {
7062 IRType eTy
= typeOfIRExpr(mce
->sb
->tyenv
, e
);
7064 return assignNew( 'B', mce
, Ity_I32
, unop(Iop_64to32
, e
) );
7070 static IRAtom
* zWidenFrom32 ( MCEnv
* mce
, IRType dstTy
, IRAtom
* e
) {
7071 IRType eTy
= typeOfIRExpr(mce
->sb
->tyenv
, e
);
7072 tl_assert(eTy
== Ity_I32
);
7073 if (dstTy
== Ity_I64
)
7074 return assignNew( 'B', mce
, Ity_I64
, unop(Iop_32Uto64
, e
) );
7079 static IRAtom
* schemeE ( MCEnv
* mce
, IRExpr
* e
)
7081 tl_assert(MC_(clo_mc_level
) == 3);
7086 IRRegArray
* descr_b
;
7087 IRAtom
*t1
, *t2
, *t3
, *t4
;
7088 IRRegArray
* descr
= e
->Iex
.GetI
.descr
;
7090 = MC_(get_otrack_reg_array_equiv_int_type
)(descr
);
7091 /* If this array is unshadowable for whatever reason, use the
7092 usual approximation. */
7093 if (equivIntTy
== Ity_INVALID
)
7095 tl_assert(sizeofIRType(equivIntTy
) >= 4);
7096 tl_assert(sizeofIRType(equivIntTy
) == sizeofIRType(descr
->elemTy
));
7097 descr_b
= mkIRRegArray( descr
->base
+ 2*mce
->layout
->total_sizeB
,
7098 equivIntTy
, descr
->nElems
);
7099 /* Do a shadow indexed get of the same size, giving t1. Take
7100 the bottom 32 bits of it, giving t2. Compute into t3 the
7101 origin for the index (almost certainly zero, but there's
7102 no harm in being completely general here, since iropt will
7103 remove any useless code), and fold it in, giving a final
7105 t1
= assignNew( 'B', mce
, equivIntTy
,
7106 IRExpr_GetI( descr_b
, e
->Iex
.GetI
.ix
,
7107 e
->Iex
.GetI
.bias
));
7108 t2
= narrowTo32( mce
, t1
);
7109 t3
= schemeE( mce
, e
->Iex
.GetI
.ix
);
7110 t4
= gen_maxU32( mce
, t2
, t3
);
7116 IRExpr
** args
= e
->Iex
.CCall
.args
;
7117 IRAtom
* curr
= mkU32(0);
7118 for (i
= 0; args
[i
]; i
++) {
7120 tl_assert(isOriginalAtom(mce
, args
[i
]));
7121 /* Only take notice of this arg if the callee's
7122 mc-exclusion mask does not say it is to be excluded. */
7123 if (e
->Iex
.CCall
.cee
->mcx_mask
& (1<<i
)) {
7124 /* the arg is to be excluded from definedness checking.
7126 if (0) VG_(printf
)("excluding %s(%d)\n",
7127 e
->Iex
.CCall
.cee
->name
, i
);
7129 /* calculate the arg's definedness, and pessimistically
7131 here
= schemeE( mce
, args
[i
] );
7132 curr
= gen_maxU32( mce
, curr
, here
);
7139 dszB
= sizeofIRType(e
->Iex
.Load
.ty
);
7140 /* assert that the B value for the address is already
7141 available (somewhere) */
7142 tl_assert(isIRAtom(e
->Iex
.Load
.addr
));
7143 tl_assert(mce
->hWordTy
== Ity_I32
|| mce
->hWordTy
== Ity_I64
);
7144 return gen_load_b( mce
, dszB
, e
->Iex
.Load
.addr
, 0 );
7147 IRAtom
* b1
= schemeE( mce
, e
->Iex
.ITE
.cond
);
7148 IRAtom
* b3
= schemeE( mce
, e
->Iex
.ITE
.iftrue
);
7149 IRAtom
* b2
= schemeE( mce
, e
->Iex
.ITE
.iffalse
);
7150 return gen_maxU32( mce
, b1
, gen_maxU32( mce
, b2
, b3
));
7153 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Qop
.details
->arg1
);
7154 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Qop
.details
->arg2
);
7155 IRAtom
* b3
= schemeE( mce
, e
->Iex
.Qop
.details
->arg3
);
7156 IRAtom
* b4
= schemeE( mce
, e
->Iex
.Qop
.details
->arg4
);
7157 return gen_maxU32( mce
, gen_maxU32( mce
, b1
, b2
),
7158 gen_maxU32( mce
, b3
, b4
) );
7161 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Triop
.details
->arg1
);
7162 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Triop
.details
->arg2
);
7163 IRAtom
* b3
= schemeE( mce
, e
->Iex
.Triop
.details
->arg3
);
7164 return gen_maxU32( mce
, b1
, gen_maxU32( mce
, b2
, b3
) );
7167 switch (e
->Iex
.Binop
.op
) {
7168 case Iop_CasCmpEQ8
: case Iop_CasCmpNE8
:
7169 case Iop_CasCmpEQ16
: case Iop_CasCmpNE16
:
7170 case Iop_CasCmpEQ32
: case Iop_CasCmpNE32
:
7171 case Iop_CasCmpEQ64
: case Iop_CasCmpNE64
:
7172 /* Just say these all produce a defined result,
7173 regardless of their arguments. See
7174 COMMENT_ON_CasCmpEQ in this file. */
7177 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Binop
.arg1
);
7178 IRAtom
* b2
= schemeE( mce
, e
->Iex
.Binop
.arg2
);
7179 return gen_maxU32( mce
, b1
, b2
);
7186 IRAtom
* b1
= schemeE( mce
, e
->Iex
.Unop
.arg
);
7192 return mkexpr( findShadowTmpB( mce
, e
->Iex
.RdTmp
.tmp
));
7194 Int b_offset
= MC_(get_otrack_shadow_offset
)(
7196 sizeofIRType(e
->Iex
.Get
.ty
)
7198 tl_assert(b_offset
>= -1
7199 && b_offset
<= mce
->layout
->total_sizeB
-4);
7200 if (b_offset
>= 0) {
7201 /* FIXME: this isn't an atom! */
7202 return IRExpr_Get( b_offset
+ 2*mce
->layout
->total_sizeB
,
7208 VG_(printf
)("mc_translate.c: schemeE: unhandled: ");
7210 VG_(tool_panic
)("memcheck:schemeE");
7215 static void do_origins_Dirty ( MCEnv
* mce
, IRDirty
* d
)
7217 // This is a hacked version of do_shadow_Dirty
7218 Int i
, k
, n
, toDo
, gSz
, gOff
;
7219 IRAtom
*here
, *curr
;
7222 /* First check the guard. */
7223 curr
= schemeE( mce
, d
->guard
);
7225 /* Now round up all inputs and maxU32 over them. */
7227 /* Inputs: unmasked args
7228 Note: arguments are evaluated REGARDLESS of the guard expression */
7229 for (i
= 0; d
->args
[i
]; i
++) {
7230 IRAtom
* arg
= d
->args
[i
];
7231 if ( (d
->cee
->mcx_mask
& (1<<i
))
7232 || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg
)) ) {
7233 /* ignore this arg */
7235 here
= schemeE( mce
, arg
);
7236 curr
= gen_maxU32( mce
, curr
, here
);
7240 /* Inputs: guest state that we read. */
7241 for (i
= 0; i
< d
->nFxState
; i
++) {
7242 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
7243 if (d
->fxState
[i
].fx
== Ifx_Write
)
7246 /* Enumerate the described state segments */
7247 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
7248 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
7249 gSz
= d
->fxState
[i
].size
;
7251 /* Ignore any sections marked as 'always defined'. */
7252 if (isAlwaysDefd(mce
, gOff
, gSz
)) {
7254 VG_(printf
)("memcheck: Dirty gst: ignored off %d, sz %d\n",
7259 /* This state element is read or modified. So we need to
7260 consider it. If larger than 4 bytes, deal with it in
7264 tl_assert(gSz
>= 0);
7265 if (gSz
== 0) break;
7266 n
= gSz
<= 4 ? gSz
: 4;
7267 /* update 'curr' with maxU32 of the state slice
7269 b_offset
= MC_(get_otrack_shadow_offset
)(gOff
, 4);
7270 if (b_offset
!= -1) {
7271 /* Observe the guard expression. If it is false use 0, i.e.
7272 nothing is known about the origin */
7273 IRAtom
*cond
, *iffalse
, *iftrue
;
7275 cond
= assignNew( 'B', mce
, Ity_I1
, d
->guard
);
7277 iftrue
= assignNew( 'B', mce
, Ity_I32
,
7279 + 2*mce
->layout
->total_sizeB
,
7281 here
= assignNew( 'B', mce
, Ity_I32
,
7282 IRExpr_ITE(cond
, iftrue
, iffalse
));
7283 curr
= gen_maxU32( mce
, curr
, here
);
7291 /* Inputs: memory */
7293 if (d
->mFx
!= Ifx_None
) {
7294 /* Because we may do multiple shadow loads/stores from the same
7295 base address, it's best to do a single test of its
7296 definedness right now. Post-instrumentation optimisation
7297 should remove all but this test. */
7298 tl_assert(d
->mAddr
);
7299 here
= schemeE( mce
, d
->mAddr
);
7300 curr
= gen_maxU32( mce
, curr
, here
);
7303 /* Deal with memory inputs (reads or modifies) */
7304 if (d
->mFx
== Ifx_Read
|| d
->mFx
== Ifx_Modify
) {
7306 /* chew off 32-bit chunks. We don't care about the endianness
7307 since it's all going to be condensed down to a single bit,
7308 but nevertheless choose an endianness which is hopefully
7309 native to the platform. */
7311 here
= gen_guarded_load_b( mce
, 4, d
->mAddr
, d
->mSize
- toDo
,
7313 curr
= gen_maxU32( mce
, curr
, here
);
7316 /* handle possible 16-bit excess */
7318 here
= gen_guarded_load_b( mce
, 2, d
->mAddr
, d
->mSize
- toDo
,
7320 curr
= gen_maxU32( mce
, curr
, here
);
7323 /* chew off the remaining 8-bit chunk, if any */
7325 here
= gen_guarded_load_b( mce
, 1, d
->mAddr
, d
->mSize
- toDo
,
7327 curr
= gen_maxU32( mce
, curr
, here
);
7330 tl_assert(toDo
== 0);
7333 /* Whew! So curr is a 32-bit B-value which should give an origin
7334 of some use if any of the inputs to the helper are undefined.
7335 Now we need to re-distribute the results to all destinations. */
7337 /* Outputs: the destination temporary, if there is one. */
7338 if (d
->tmp
!= IRTemp_INVALID
) {
7339 dst
= findShadowTmpB(mce
, d
->tmp
);
7340 assign( 'V', mce
, dst
, curr
);
7343 /* Outputs: guest state that we write or modify. */
7344 for (i
= 0; i
< d
->nFxState
; i
++) {
7345 tl_assert(d
->fxState
[i
].fx
!= Ifx_None
);
7346 if (d
->fxState
[i
].fx
== Ifx_Read
)
7349 /* Enumerate the described state segments */
7350 for (k
= 0; k
< 1 + d
->fxState
[i
].nRepeats
; k
++) {
7351 gOff
= d
->fxState
[i
].offset
+ k
* d
->fxState
[i
].repeatLen
;
7352 gSz
= d
->fxState
[i
].size
;
7354 /* Ignore any sections marked as 'always defined'. */
7355 if (isAlwaysDefd(mce
, gOff
, gSz
))
7358 /* This state element is written or modified. So we need to
7359 consider it. If larger than 4 bytes, deal with it in
7363 tl_assert(gSz
>= 0);
7364 if (gSz
== 0) break;
7365 n
= gSz
<= 4 ? gSz
: 4;
7366 /* Write 'curr' to the state slice gOff .. gOff+n-1 */
7367 b_offset
= MC_(get_otrack_shadow_offset
)(gOff
, 4);
7368 if (b_offset
!= -1) {
7370 /* If the guard expression evaluates to false we simply Put
7371 the value that is already stored in the guest state slot */
7372 IRAtom
*cond
, *iffalse
;
7374 cond
= assignNew('B', mce
, Ity_I1
,
7376 iffalse
= assignNew('B', mce
, Ity_I32
,
7377 IRExpr_Get(b_offset
+
7378 2*mce
->layout
->total_sizeB
,
7380 curr
= assignNew('V', mce
, Ity_I32
,
7381 IRExpr_ITE(cond
, curr
, iffalse
));
7383 stmt( 'B', mce
, IRStmt_Put(b_offset
7384 + 2*mce
->layout
->total_sizeB
,
7393 /* Outputs: memory that we write or modify. Same comments about
7394 endianness as above apply. */
7395 if (d
->mFx
== Ifx_Write
|| d
->mFx
== Ifx_Modify
) {
7397 /* chew off 32-bit chunks */
7399 gen_store_b( mce
, 4, d
->mAddr
, d
->mSize
- toDo
, curr
,
7403 /* handle possible 16-bit excess */
7405 gen_store_b( mce
, 2, d
->mAddr
, d
->mSize
- toDo
, curr
,
7409 /* chew off the remaining 8-bit chunk, if any */
7411 gen_store_b( mce
, 1, d
->mAddr
, d
->mSize
- toDo
, curr
,
7415 tl_assert(toDo
== 0);
7420 /* Generate IR for origin shadowing for a general guarded store. */
7421 static void do_origins_Store_guarded ( MCEnv
* mce
,
7429 /* assert that the B value for the address is already available
7430 (somewhere), since the call to schemeE will want to see it.
7431 XXXX how does this actually ensure that?? */
7432 tl_assert(isIRAtom(stAddr
));
7433 tl_assert(isIRAtom(stData
));
7434 dszB
= sizeofIRType( typeOfIRExpr(mce
->sb
->tyenv
, stData
) );
7435 dataB
= schemeE( mce
, stData
);
7436 gen_store_b( mce
, dszB
, stAddr
, 0/*offset*/, dataB
, guard
);
7440 /* Generate IR for origin shadowing for a plain store. */
7441 static void do_origins_Store_plain ( MCEnv
* mce
,
7446 do_origins_Store_guarded ( mce
, stEnd
, stAddr
, stData
,
7451 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
7453 static void do_origins_StoreG ( MCEnv
* mce
, IRStoreG
* sg
)
7455 do_origins_Store_guarded( mce
, sg
->end
, sg
->addr
,
7456 sg
->data
, sg
->guard
);
7459 static void do_origins_LoadG ( MCEnv
* mce
, IRLoadG
* lg
)
7461 IRType loadedTy
= Ity_INVALID
;
7463 case ILGop_IdentV128
: loadedTy
= Ity_V128
; break;
7464 case ILGop_Ident64
: loadedTy
= Ity_I64
; break;
7465 case ILGop_Ident32
: loadedTy
= Ity_I32
; break;
7466 case ILGop_16Uto32
: loadedTy
= Ity_I16
; break;
7467 case ILGop_16Sto32
: loadedTy
= Ity_I16
; break;
7468 case ILGop_8Uto32
: loadedTy
= Ity_I8
; break;
7469 case ILGop_8Sto32
: loadedTy
= Ity_I8
; break;
7470 default: VG_(tool_panic
)("schemeS.IRLoadG");
7473 = schemeE( mce
,lg
->alt
);
7475 = expr2ori_Load_guarded_General(mce
, loadedTy
,
7476 lg
->addr
, 0/*addr bias*/,
7477 lg
->guard
, ori_alt
);
7478 /* And finally, bind the origin to the destination temporary. */
7479 assign( 'B', mce
, findShadowTmpB(mce
, lg
->dst
), ori_final
);
7483 static void schemeS ( MCEnv
* mce
, IRStmt
* st
)
7485 tl_assert(MC_(clo_mc_level
) == 3);
7490 /* The value-check instrumenter handles this - by arranging
7491 to pass the address of the next instruction to
7492 MC_(helperc_MAKE_STACK_UNINIT). This is all that needs to
7493 happen for origin tracking w.r.t. AbiHints. So there is
7494 nothing to do here. */
7498 IRPutI
*puti
= st
->Ist
.PutI
.details
;
7499 IRRegArray
* descr_b
;
7500 IRAtom
*t1
, *t2
, *t3
, *t4
;
7501 IRRegArray
* descr
= puti
->descr
;
7503 = MC_(get_otrack_reg_array_equiv_int_type
)(descr
);
7504 /* If this array is unshadowable for whatever reason,
7505 generate no code. */
7506 if (equivIntTy
== Ity_INVALID
)
7508 tl_assert(sizeofIRType(equivIntTy
) >= 4);
7509 tl_assert(sizeofIRType(equivIntTy
) == sizeofIRType(descr
->elemTy
));
7511 = mkIRRegArray( descr
->base
+ 2*mce
->layout
->total_sizeB
,
7512 equivIntTy
, descr
->nElems
);
7513 /* Compute a value to Put - the conjoinment of the origin for
7514 the data to be Put-ted (obviously) and of the index value
7515 (not so obviously). */
7516 t1
= schemeE( mce
, puti
->data
);
7517 t2
= schemeE( mce
, puti
->ix
);
7518 t3
= gen_maxU32( mce
, t1
, t2
);
7519 t4
= zWidenFrom32( mce
, equivIntTy
, t3
);
7520 stmt( 'B', mce
, IRStmt_PutI( mkIRPutI(descr_b
, puti
->ix
,
7526 do_origins_Dirty( mce
, st
->Ist
.Dirty
.details
);
7530 do_origins_Store_plain( mce
, st
->Ist
.Store
.end
,
7532 st
->Ist
.Store
.data
);
7536 do_origins_StoreG( mce
, st
->Ist
.StoreG
.details
);
7540 do_origins_LoadG( mce
, st
->Ist
.LoadG
.details
);
7544 /* In short: treat a load-linked like a normal load followed
7545 by an assignment of the loaded (shadow) data the result
7546 temporary. Treat a store-conditional like a normal store,
7547 and mark the result temporary as defined. */
7548 if (st
->Ist
.LLSC
.storedata
== NULL
) {
7551 = typeOfIRTemp(mce
->sb
->tyenv
, st
->Ist
.LLSC
.result
);
7553 = IRExpr_Load(st
->Ist
.LLSC
.end
, resTy
, st
->Ist
.LLSC
.addr
);
7554 tl_assert(resTy
== Ity_I64
|| resTy
== Ity_I32
7555 || resTy
== Ity_I16
|| resTy
== Ity_I8
);
7556 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.LLSC
.result
),
7557 schemeE(mce
, vanillaLoad
));
7559 /* Store conditional */
7560 do_origins_Store_plain( mce
, st
->Ist
.LLSC
.end
,
7562 st
->Ist
.LLSC
.storedata
);
7563 /* For the rationale behind this, see comments at the
7564 place where the V-shadow for .result is constructed, in
7565 do_shadow_LLSC. In short, we regard .result as
7567 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.LLSC
.result
),
7575 = MC_(get_otrack_shadow_offset
)(
7577 sizeofIRType(typeOfIRExpr(mce
->sb
->tyenv
, st
->Ist
.Put
.data
))
7579 if (b_offset
>= 0) {
7580 /* FIXME: this isn't an atom! */
7581 stmt( 'B', mce
, IRStmt_Put(b_offset
+ 2*mce
->layout
->total_sizeB
,
7582 schemeE( mce
, st
->Ist
.Put
.data
)) );
7588 assign( 'B', mce
, findShadowTmpB(mce
, st
->Ist
.WrTmp
.tmp
),
7589 schemeE(mce
, st
->Ist
.WrTmp
.data
) );
7599 VG_(printf
)("mc_translate.c: schemeS: unhandled: ");
7601 VG_(tool_panic
)("memcheck:schemeS");
7606 /*------------------------------------------------------------*/
7607 /*--- Post-tree-build final tidying ---*/
7608 /*------------------------------------------------------------*/
7610 /* This exploits the observation that Memcheck often produces
7611 repeated conditional calls of the form
7613 Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
7615 with the same guard expression G guarding the same helper call.
7616 The second and subsequent calls are redundant. This usually
7617 results from instrumentation of guest code containing multiple
7618 memory references at different constant offsets from the same base
7619 register. After optimisation of the instrumentation, you get a
7620 test for the definedness of the base register for each memory
7621 reference, which is kinda pointless. MC_(final_tidy) therefore
7622 looks for such repeated calls and removes all but the first. */
7625 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
7626 gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
7627 get almost all the benefits of this transformation whilst causing
7628 the slide-back case to happen just often enough to be verifiably
7629 correct. For posterity, the numbers are:
7633 1 4,336 (112,212 -> 1,709,473; ratio 15.2)
7634 2 4,336 (112,194 -> 1,669,895; ratio 14.9)
7635 3 4,336 (112,194 -> 1,660,713; ratio 14.8)
7636 4 4,336 (112,194 -> 1,658,555; ratio 14.8)
7637 5 4,336 (112,194 -> 1,655,447; ratio 14.8)
7638 6 4,336 (112,194 -> 1,655,101; ratio 14.8)
7639 7 4,336 (112,194 -> 1,654,858; ratio 14.7)
7640 8 4,336 (112,194 -> 1,654,810; ratio 14.7)
7641 10 4,336 (112,194 -> 1,654,621; ratio 14.7)
7642 12 4,336 (112,194 -> 1,654,678; ratio 14.7)
7643 16 4,336 (112,194 -> 1,654,494; ratio 14.7)
7644 32 4,336 (112,194 -> 1,654,602; ratio 14.7)
7645 inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
7649 1 4,113 (107,329 -> 1,822,171; ratio 17.0)
7650 2 4,113 (107,329 -> 1,806,443; ratio 16.8)
7651 3 4,113 (107,329 -> 1,803,967; ratio 16.8)
7652 4 4,113 (107,329 -> 1,802,785; ratio 16.8)
7653 5 4,113 (107,329 -> 1,802,412; ratio 16.8)
7654 6 4,113 (107,329 -> 1,802,062; ratio 16.8)
7655 7 4,113 (107,329 -> 1,801,976; ratio 16.8)
7656 8 4,113 (107,329 -> 1,801,886; ratio 16.8)
7657 10 4,113 (107,329 -> 1,801,653; ratio 16.8)
7658 12 4,113 (107,329 -> 1,801,526; ratio 16.8)
7659 16 4,113 (107,329 -> 1,801,298; ratio 16.8)
7660 32 4,113 (107,329 -> 1,800,827; ratio 16.8)
7661 inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
7664 /* Structs for recording which (helper, guard) pairs we have already
7667 #define N_TIDYING_PAIRS 16
7670 struct { void* entry
; IRExpr
* guard
; }
7675 Pair pairs
[N_TIDYING_PAIRS
+1/*for bounds checking*/];
7681 /* Return True if e1 and e2 definitely denote the same value (used to
7682 compare guards). Return False if unknown; False is the safe
7683 answer. Since guest registers and guest memory do not have the
7684 SSA property we must return False if any Gets or Loads appear in
7685 the expression. This implicitly assumes that e1 and e2 have the
7686 same IR type, which is always true here -- the type is Ity_I1. */
7688 static Bool
sameIRValue ( IRExpr
* e1
, IRExpr
* e2
)
7690 if (e1
->tag
!= e2
->tag
)
7694 return eqIRConst( e1
->Iex
.Const
.con
, e2
->Iex
.Const
.con
);
7696 return e1
->Iex
.Binop
.op
== e2
->Iex
.Binop
.op
7697 && sameIRValue(e1
->Iex
.Binop
.arg1
, e2
->Iex
.Binop
.arg1
)
7698 && sameIRValue(e1
->Iex
.Binop
.arg2
, e2
->Iex
.Binop
.arg2
);
7700 return e1
->Iex
.Unop
.op
== e2
->Iex
.Unop
.op
7701 && sameIRValue(e1
->Iex
.Unop
.arg
, e2
->Iex
.Unop
.arg
);
7703 return e1
->Iex
.RdTmp
.tmp
== e2
->Iex
.RdTmp
.tmp
;
7705 return sameIRValue( e1
->Iex
.ITE
.cond
, e2
->Iex
.ITE
.cond
)
7706 && sameIRValue( e1
->Iex
.ITE
.iftrue
, e2
->Iex
.ITE
.iftrue
)
7707 && sameIRValue( e1
->Iex
.ITE
.iffalse
, e2
->Iex
.ITE
.iffalse
);
7711 /* be lazy. Could define equality for these, but they never
7712 appear to be used. */
7717 /* be conservative - these may not give the same value each
7721 /* should never see this */
7724 VG_(printf
)("mc_translate.c: sameIRValue: unhandled: ");
7726 VG_(tool_panic
)("memcheck:sameIRValue");
7731 /* See if 'pairs' already has an entry for (entry, guard). Return
7732 True if so. If not, add an entry. */
7735 Bool
check_or_add ( Pairs
* tidyingEnv
, IRExpr
* guard
, void* entry
)
7737 UInt i
, n
= tidyingEnv
->pairsUsed
;
7738 tl_assert(n
<= N_TIDYING_PAIRS
);
7739 for (i
= 0; i
< n
; i
++) {
7740 if (tidyingEnv
->pairs
[i
].entry
== entry
7741 && sameIRValue(tidyingEnv
->pairs
[i
].guard
, guard
))
7744 /* (guard, entry) wasn't found in the array. Add it at the end.
7745 If the array is already full, slide the entries one slot
7746 backwards. This means we will lose to ability to detect
7747 duplicates from the pair in slot zero, but that happens so
7748 rarely that it's unlikely to have much effect on overall code
7749 quality. Also, this strategy loses the check for the oldest
7750 tracked exit (memory reference, basically) and so that is (I'd
7751 guess) least likely to be re-used after this point. */
7753 if (n
== N_TIDYING_PAIRS
) {
7754 for (i
= 1; i
< N_TIDYING_PAIRS
; i
++) {
7755 tidyingEnv
->pairs
[i
-1] = tidyingEnv
->pairs
[i
];
7757 tidyingEnv
->pairs
[N_TIDYING_PAIRS
-1].entry
= entry
;
7758 tidyingEnv
->pairs
[N_TIDYING_PAIRS
-1].guard
= guard
;
7760 tl_assert(n
< N_TIDYING_PAIRS
);
7761 tidyingEnv
->pairs
[n
].entry
= entry
;
7762 tidyingEnv
->pairs
[n
].guard
= guard
;
7764 tidyingEnv
->pairsUsed
= n
;
7769 static Bool
is_helperc_value_checkN_fail ( const HChar
* name
)
7771 /* This is expensive because it happens a lot. We are checking to
7772 see whether |name| is one of the following 8 strings:
7774 MC_(helperc_value_check8_fail_no_o)
7775 MC_(helperc_value_check4_fail_no_o)
7776 MC_(helperc_value_check0_fail_no_o)
7777 MC_(helperc_value_check1_fail_no_o)
7778 MC_(helperc_value_check8_fail_w_o)
7779 MC_(helperc_value_check0_fail_w_o)
7780 MC_(helperc_value_check1_fail_w_o)
7781 MC_(helperc_value_check4_fail_w_o)
7783 To speed it up, check the common prefix just once, rather than
7786 const HChar
* prefix
= "MC_(helperc_value_check";
7792 if (p
== 0) break; /* ran off the end of the prefix */
7793 /* We still have some prefix to use */
7794 if (n
== 0) return False
; /* have prefix, but name ran out */
7795 if (n
!= p
) return False
; /* have both pfx and name, but no match */
7800 /* Check the part after the prefix. */
7801 tl_assert(*prefix
== 0 && *name
!= 0);
7802 return 0==VG_(strcmp
)(name
, "8_fail_no_o)")
7803 || 0==VG_(strcmp
)(name
, "4_fail_no_o)")
7804 || 0==VG_(strcmp
)(name
, "0_fail_no_o)")
7805 || 0==VG_(strcmp
)(name
, "1_fail_no_o)")
7806 || 0==VG_(strcmp
)(name
, "8_fail_w_o)")
7807 || 0==VG_(strcmp
)(name
, "4_fail_w_o)")
7808 || 0==VG_(strcmp
)(name
, "0_fail_w_o)")
7809 || 0==VG_(strcmp
)(name
, "1_fail_w_o)");
7812 IRSB
* MC_(final_tidy
) ( IRSB
* sb_in
)
7819 Bool alreadyPresent
;
7822 pairs
.pairsUsed
= 0;
7824 pairs
.pairs
[N_TIDYING_PAIRS
].entry
= (void*)0x123;
7825 pairs
.pairs
[N_TIDYING_PAIRS
].guard
= (IRExpr
*)0x456;
7827 /* Scan forwards through the statements. Each time a call to one
7828 of the relevant helpers is seen, check if we have made a
7829 previous call to the same helper using the same guard
7830 expression, and if so, delete the call. */
7831 for (i
= 0; i
< sb_in
->stmts_used
; i
++) {
7832 st
= sb_in
->stmts
[i
];
7834 if (st
->tag
!= Ist_Dirty
)
7836 di
= st
->Ist
.Dirty
.details
;
7839 if (0) { ppIRExpr(guard
); VG_(printf
)("\n"); }
7841 if (!is_helperc_value_checkN_fail( cee
->name
))
7843 /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
7844 guard 'guard'. Check if we have already seen a call to this
7845 function with the same guard. If so, delete it. If not,
7846 add it to the set of calls we do know about. */
7847 alreadyPresent
= check_or_add( &pairs
, guard
, cee
->addr
);
7848 if (alreadyPresent
) {
7849 sb_in
->stmts
[i
] = IRStmt_NoOp();
7850 if (0) VG_(printf
)("XX\n");
7854 tl_assert(pairs
.pairs
[N_TIDYING_PAIRS
].entry
== (void*)0x123);
7855 tl_assert(pairs
.pairs
[N_TIDYING_PAIRS
].guard
== (IRExpr
*)0x456);
7860 #undef N_TIDYING_PAIRS
7863 /*------------------------------------------------------------*/
7864 /*--- Startup assertion checking ---*/
7865 /*------------------------------------------------------------*/
7867 void MC_(do_instrumentation_startup_checks
)( void )
7869 /* Make a best-effort check to see that is_helperc_value_checkN_fail
7870 is working as we expect. */
7872 # define CHECK(_expected, _string) \
7873 tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
7875 /* It should identify these 8, and no others, as targets. */
7876 CHECK(True
, "MC_(helperc_value_check8_fail_no_o)");
7877 CHECK(True
, "MC_(helperc_value_check4_fail_no_o)");
7878 CHECK(True
, "MC_(helperc_value_check0_fail_no_o)");
7879 CHECK(True
, "MC_(helperc_value_check1_fail_no_o)");
7880 CHECK(True
, "MC_(helperc_value_check8_fail_w_o)");
7881 CHECK(True
, "MC_(helperc_value_check0_fail_w_o)");
7882 CHECK(True
, "MC_(helperc_value_check1_fail_w_o)");
7883 CHECK(True
, "MC_(helperc_value_check4_fail_w_o)");
7885 /* Ad-hoc selection of other strings gathered via a quick test. */
7886 CHECK(False
, "amd64g_dirtyhelper_CPUID_avx2");
7887 CHECK(False
, "amd64g_dirtyhelper_RDTSC");
7888 CHECK(False
, "MC_(helperc_b_load1)");
7889 CHECK(False
, "MC_(helperc_b_load2)");
7890 CHECK(False
, "MC_(helperc_b_load4)");
7891 CHECK(False
, "MC_(helperc_b_load8)");
7892 CHECK(False
, "MC_(helperc_b_load16)");
7893 CHECK(False
, "MC_(helperc_b_load32)");
7894 CHECK(False
, "MC_(helperc_b_store1)");
7895 CHECK(False
, "MC_(helperc_b_store2)");
7896 CHECK(False
, "MC_(helperc_b_store4)");
7897 CHECK(False
, "MC_(helperc_b_store8)");
7898 CHECK(False
, "MC_(helperc_b_store16)");
7899 CHECK(False
, "MC_(helperc_b_store32)");
7900 CHECK(False
, "MC_(helperc_LOADV8)");
7901 CHECK(False
, "MC_(helperc_LOADV16le)");
7902 CHECK(False
, "MC_(helperc_LOADV32le)");
7903 CHECK(False
, "MC_(helperc_LOADV64le)");
7904 CHECK(False
, "MC_(helperc_LOADV128le)");
7905 CHECK(False
, "MC_(helperc_LOADV256le)");
7906 CHECK(False
, "MC_(helperc_STOREV16le)");
7907 CHECK(False
, "MC_(helperc_STOREV32le)");
7908 CHECK(False
, "MC_(helperc_STOREV64le)");
7909 CHECK(False
, "MC_(helperc_STOREV8)");
7910 CHECK(False
, "track_die_mem_stack_8");
7911 CHECK(False
, "track_new_mem_stack_8_w_ECU");
7912 CHECK(False
, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
7913 CHECK(False
, "VG_(unknown_SP_update_w_ECU)");
7919 /*------------------------------------------------------------*/
7920 /*--- Memcheck main ---*/
7921 /*------------------------------------------------------------*/
7923 static Bool
isBogusAtom ( IRAtom
* at
)
7925 if (at
->tag
== Iex_RdTmp
)
7927 tl_assert(at
->tag
== Iex_Const
);
7930 IRConst
* con
= at
->Iex
.Const
.con
;
7932 case Ico_U1
: return False
;
7933 case Ico_U8
: n
= (ULong
)con
->Ico
.U8
; break;
7934 case Ico_U16
: n
= (ULong
)con
->Ico
.U16
; break;
7935 case Ico_U32
: n
= (ULong
)con
->Ico
.U32
; break;
7936 case Ico_U64
: n
= (ULong
)con
->Ico
.U64
; break;
7937 case Ico_F32
: return False
;
7938 case Ico_F64
: return False
;
7939 case Ico_F32i
: return False
;
7940 case Ico_F64i
: return False
;
7941 case Ico_V128
: return False
;
7942 case Ico_V256
: return False
;
7943 default: ppIRExpr(at
); tl_assert(0);
7945 /* VG_(printf)("%llx\n", n); */
7947 if (LIKELY(n
<= 0x0000000000001000ULL
)) return False
;
7948 if (LIKELY(n
>= 0xFFFFFFFFFFFFF000ULL
)) return False
;
7949 /* The list of bogus atoms is: */
7950 return (/*32*/ n
== 0xFEFEFEFFULL
7951 /*32*/ || n
== 0x80808080ULL
7952 /*32*/ || n
== 0x7F7F7F7FULL
7953 /*32*/ || n
== 0x7EFEFEFFULL
7954 /*32*/ || n
== 0x81010100ULL
7955 /*64*/ || n
== 0xFFFFFFFFFEFEFEFFULL
7956 /*64*/ || n
== 0xFEFEFEFEFEFEFEFFULL
7957 /*64*/ || n
== 0x0000000000008080ULL
7958 /*64*/ || n
== 0x8080808080808080ULL
7959 /*64*/ || n
== 0x0101010101010101ULL
7964 /* Does 'st' mention any of the literals identified/listed in
7966 static inline Bool
containsBogusLiterals ( /*FLAT*/ IRStmt
* st
)
7974 e
= st
->Ist
.WrTmp
.data
;
7980 return isBogusAtom(e
);
7982 return isBogusAtom(e
->Iex
.Unop
.arg
)
7983 || e
->Iex
.Unop
.op
== Iop_GetMSBs8x16
;
7985 return isBogusAtom(e
->Iex
.GetI
.ix
);
7987 return isBogusAtom(e
->Iex
.Binop
.arg1
)
7988 || isBogusAtom(e
->Iex
.Binop
.arg2
);
7990 return isBogusAtom(e
->Iex
.Triop
.details
->arg1
)
7991 || isBogusAtom(e
->Iex
.Triop
.details
->arg2
)
7992 || isBogusAtom(e
->Iex
.Triop
.details
->arg3
);
7994 return isBogusAtom(e
->Iex
.Qop
.details
->arg1
)
7995 || isBogusAtom(e
->Iex
.Qop
.details
->arg2
)
7996 || isBogusAtom(e
->Iex
.Qop
.details
->arg3
)
7997 || isBogusAtom(e
->Iex
.Qop
.details
->arg4
);
7999 return isBogusAtom(e
->Iex
.ITE
.cond
)
8000 || isBogusAtom(e
->Iex
.ITE
.iftrue
)
8001 || isBogusAtom(e
->Iex
.ITE
.iffalse
);
8003 return isBogusAtom(e
->Iex
.Load
.addr
);
8005 for (i
= 0; e
->Iex
.CCall
.args
[i
]; i
++)
8006 if (isBogusAtom(e
->Iex
.CCall
.args
[i
]))
8013 d
= st
->Ist
.Dirty
.details
;
8014 for (i
= 0; d
->args
[i
]; i
++) {
8015 IRAtom
* atom
= d
->args
[i
];
8016 if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom
))) {
8017 if (isBogusAtom(atom
))
8021 if (isBogusAtom(d
->guard
))
8023 if (d
->mAddr
&& isBogusAtom(d
->mAddr
))
8027 return isBogusAtom(st
->Ist
.Put
.data
);
8029 return isBogusAtom(st
->Ist
.PutI
.details
->ix
)
8030 || isBogusAtom(st
->Ist
.PutI
.details
->data
);
8032 return isBogusAtom(st
->Ist
.Store
.addr
)
8033 || isBogusAtom(st
->Ist
.Store
.data
);
8035 IRStoreG
* sg
= st
->Ist
.StoreG
.details
;
8036 return isBogusAtom(sg
->addr
) || isBogusAtom(sg
->data
)
8037 || isBogusAtom(sg
->guard
);
8040 IRLoadG
* lg
= st
->Ist
.LoadG
.details
;
8041 return isBogusAtom(lg
->addr
) || isBogusAtom(lg
->alt
)
8042 || isBogusAtom(lg
->guard
);
8045 return isBogusAtom(st
->Ist
.Exit
.guard
);
8047 return isBogusAtom(st
->Ist
.AbiHint
.base
)
8048 || isBogusAtom(st
->Ist
.AbiHint
.nia
);
8054 cas
= st
->Ist
.CAS
.details
;
8055 return isBogusAtom(cas
->addr
)
8056 || (cas
->expdHi
? isBogusAtom(cas
->expdHi
) : False
)
8057 || isBogusAtom(cas
->expdLo
)
8058 || (cas
->dataHi
? isBogusAtom(cas
->dataHi
) : False
)
8059 || isBogusAtom(cas
->dataLo
);
8061 return isBogusAtom(st
->Ist
.LLSC
.addr
)
8062 || (st
->Ist
.LLSC
.storedata
8063 ? isBogusAtom(st
->Ist
.LLSC
.storedata
)
8068 VG_(tool_panic
)("hasBogusLiterals");
8073 /* This is the pre-instrumentation analysis. It does a backwards pass over
8074 the stmts in |sb_in| to determine a HowUsed value for each tmp defined in
8077 Unrelatedly, it also checks all literals in the block with |isBogusAtom|,
8078 as a positive result from that is a strong indication that we need to
8079 expensively instrument add/sub in the block. We do both analyses in one
8080 pass, even though they are independent, so as to avoid the overhead of
8081 having to traverse the whole block twice.
8083 The usage pass proceeds as follows. Let max= be the max operation in the
8084 HowUsed lattice, hence
8086 X max= Y means X = max(X, Y)
8090 for t in original tmps . useEnv[t] = HuUnU
8092 for t used in the block's . next field
8093 useEnv[t] max= HuPCa // because jmp targets are PCast-tested
8095 for st iterating *backwards* in the block
8099 case "t1 = load(t2)" // case 1
8100 useEnv[t2] max= HuPCa
8102 case "t1 = add(t2, t3)" // case 2
8103 useEnv[t2] max= useEnv[t1]
8104 useEnv[t3] max= useEnv[t1]
8107 for t in st.usedTmps // case 3
8108 useEnv[t] max= HuOth
8109 // same as useEnv[t] = HuOth
8111 The general idea is that we accumulate, in useEnv[], information about
8112 how each tmp is used. That can be updated as we work further back
8113 through the block and find more uses of it, but its HowUsed value can
8114 only ascend the lattice, not descend.
8116 Initially we mark all tmps as unused. In case (1), if a tmp is seen to
8117 be used as a memory address, then its use is at least HuPCa. The point
8118 is that for a memory address we will add instrumentation to check if any
8119 bit of the address is undefined, which means that we won't need expensive
8120 V-bit propagation through an add expression that computed the address --
8121 cheap add instrumentation will be equivalent.
8123 Note in case (1) that if we have previously seen a non-memory-address use
8124 of the tmp, then its use will already be HuOth and will be unchanged by
8125 the max= operation. And if it turns out that the source of the tmp was
8126 an add, then we'll have to expensively instrument the add, because we
8127 can't prove that, for the previous non-memory-address use of the tmp,
8128 cheap and expensive instrumentation will be equivalent.
8130 In case 2, we propagate the usage-mode of the result of an add back
8131 through to its operands. Again, we use max= so as to take account of the
8132 fact that t2 or t3 might later in the block (viz, earlier in the
8133 iteration) have been used in a way that requires expensive add
8136 In case 3, we deal with all other tmp uses. We assume that we'll need a
8137 result that is as accurate as possible, so we max= HuOth into its use
8138 mode. Since HuOth is the top of the lattice, that's equivalent to just
8139 setting its use to HuOth.
8141 The net result of all this is that:
8143 tmps that are used either
8144 - only as a memory address, or
8145 - only as part of a tree of adds that computes a memory address,
8146 and has no other use
8147 are marked as HuPCa, and so we can instrument their generating Add
8148 nodes cheaply, which is the whole point of this analysis
8150 tmps that are used any other way at all are marked as HuOth
8152 tmps that are unused are marked as HuUnU. We don't expect to see any
8153 since we expect that the incoming IR has had all dead assignments
8154 removed by previous optimisation passes. Nevertheless the analysis is
8155 correct even in the presence of dead tmps.
8157 A final comment on dead tmps. In case 1 and case 2, we could actually
8158 conditionalise the updates thusly:
8160 if (useEnv[t1] > HuUnU) { useEnv[t2] max= HuPCa } // case 1
8162 if (useEnv[t1] > HuUnU) { useEnv[t2] max= useEnv[t1] } // case 2
8163 if (useEnv[t1] > HuUnU) { useEnv[t3] max= useEnv[t1] } // case 2
8165 In other words, if the assigned-to tmp |t1| is never used, then there's
8166 no point in propagating any use through to its operands. That won't
8167 change the final HuPCa-vs-HuOth results, which is what we care about.
8168 Given that we expect to get dead-code-free inputs, there's no point in
8169 adding this extra refinement.
8172 /* Helper for |preInstrumentationAnalysis|. */
8173 static inline void noteTmpUsesIn ( /*MOD*/HowUsed
* useEnv
,
8175 HowUsed newUse
, IRAtom
* at
)
8177 /* For the atom |at|, declare that for any tmp |t| in |at|, we will have
8178 seen a use of |newUse|. So, merge that info into |t|'s accumulated
8186 IRTemp t
= at
->Iex
.RdTmp
.tmp
;
8187 tl_assert(t
< tyenvUsed
); // "is an original tmp"
8188 // The "max" operation in the lattice
8189 if (newUse
> useEnv
[t
]) useEnv
[t
] = newUse
;
8193 // We should never get here -- it implies non-flat IR
8195 VG_(tool_panic
)("noteTmpUsesIn");
8202 static void preInstrumentationAnalysis ( /*OUT*/HowUsed
** useEnvP
,
8203 /*OUT*/Bool
* hasBogusLiteralsP
,
8206 const UInt nOrigTmps
= (UInt
)sb_in
->tyenv
->types_used
;
8208 // We've seen no bogus literals so far.
8211 // This is calloc'd, so implicitly all entries are initialised to HuUnU.
8212 HowUsed
* useEnv
= VG_(calloc
)("mc.preInstrumentationAnalysis.1",
8213 nOrigTmps
, sizeof(HowUsed
));
8215 // Firstly, roll in contributions from the final dst address.
8216 bogus
= isBogusAtom(sb_in
->next
);
8217 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, sb_in
->next
);
8219 // Now work backwards through the stmts.
8220 for (Int i
= sb_in
->stmts_used
-1; i
>= 0; i
--) {
8221 IRStmt
* st
= sb_in
->stmts
[i
];
8223 // Deal with literals.
8224 if (LIKELY(!bogus
)) {
8225 bogus
= containsBogusLiterals(st
);
8228 // Deal with tmp uses.
8231 IRTemp dst
= st
->Ist
.WrTmp
.tmp
;
8232 IRExpr
* rhs
= st
->Ist
.WrTmp
.data
;
8233 // This is the one place where we have to consider all possible
8234 // tags for |rhs|, and can't just assume it is a tmp or a const.
8237 // just propagate demand for |dst| into this tmp use.
8238 noteTmpUsesIn(useEnv
, nOrigTmps
, useEnv
[dst
], rhs
);
8241 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, rhs
->Iex
.Unop
.arg
);
8244 if (rhs
->Iex
.Binop
.op
== Iop_Add64
8245 || rhs
->Iex
.Binop
.op
== Iop_Add32
) {
8246 // propagate demand for |dst| through to the operands.
8247 noteTmpUsesIn(useEnv
, nOrigTmps
,
8248 useEnv
[dst
], rhs
->Iex
.Binop
.arg1
);
8249 noteTmpUsesIn(useEnv
, nOrigTmps
,
8250 useEnv
[dst
], rhs
->Iex
.Binop
.arg2
);
8252 // just say that the operands are used in some unknown way.
8253 noteTmpUsesIn(useEnv
, nOrigTmps
,
8254 HuOth
, rhs
->Iex
.Binop
.arg1
);
8255 noteTmpUsesIn(useEnv
, nOrigTmps
,
8256 HuOth
, rhs
->Iex
.Binop
.arg2
);
8260 // All operands are used in some unknown way.
8261 IRTriop
* tri
= rhs
->Iex
.Triop
.details
;
8262 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, tri
->arg1
);
8263 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, tri
->arg2
);
8264 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, tri
->arg3
);
8268 // All operands are used in some unknown way.
8269 IRQop
* qop
= rhs
->Iex
.Qop
.details
;
8270 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg1
);
8271 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg2
);
8272 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg3
);
8273 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, qop
->arg4
);
8277 // The address will be checked (== PCasted).
8278 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, rhs
->Iex
.Load
.addr
);
8281 // The condition is PCasted, the then- and else-values
8283 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, rhs
->Iex
.ITE
.cond
);
8284 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, rhs
->Iex
.ITE
.iftrue
);
8285 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, rhs
->Iex
.ITE
.iffalse
);
8288 // The args are used in unknown ways.
8289 for (IRExpr
** args
= rhs
->Iex
.CCall
.args
; *args
; args
++) {
8290 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, *args
);
8294 // The index will be checked/PCasted (see do_shadow_GETI)
8295 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, rhs
->Iex
.GetI
.ix
);
8303 VG_(tool_panic
)("preInstrumentationAnalysis:"
8304 " unhandled IRExpr");
8309 // The address will be checked (== PCasted). The data will be
8310 // used in some unknown way.
8311 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, st
->Ist
.Store
.addr
);
8312 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.Store
.data
);
8315 // The guard will be checked (== PCasted)
8316 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, st
->Ist
.Exit
.guard
);
8319 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.Put
.data
);
8322 IRPutI
* putI
= st
->Ist
.PutI
.details
;
8323 // The index will be checked/PCasted (see do_shadow_PUTI). The
8324 // data will be used in an unknown way.
8325 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, putI
->ix
);
8326 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, putI
->data
);
8330 IRDirty
* d
= st
->Ist
.Dirty
.details
;
8331 // The guard will be checked (== PCasted)
8332 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, d
->guard
);
8333 // The args will be used in unknown ways.
8334 for (IRExpr
** args
= d
->args
; *args
; args
++) {
8335 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, *args
);
8340 IRCAS
* cas
= st
->Ist
.CAS
.details
;
8341 // Address will be pcasted, everything else used as unknown
8342 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, cas
->addr
);
8343 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->expdLo
);
8344 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->dataLo
);
8346 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->expdHi
);
8348 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, cas
->dataHi
);
8352 // Both exprs are used in unknown ways. TODO: can we safely
8353 // just ignore AbiHints?
8354 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.AbiHint
.base
);
8355 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.AbiHint
.nia
);
8358 // We might be able to do better, and use HuPCa for the addr.
8359 // It's not immediately obvious that we can, because the address
8360 // is regarded as "used" only when the guard is true.
8361 IRStoreG
* sg
= st
->Ist
.StoreG
.details
;
8362 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, sg
->addr
);
8363 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, sg
->data
);
8364 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, sg
->guard
);
8368 // Per similar comments to Ist_StoreG .. not sure whether this
8369 // is really optimal.
8370 IRLoadG
* lg
= st
->Ist
.LoadG
.details
;
8371 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, lg
->addr
);
8372 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, lg
->alt
);
8373 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, lg
->guard
);
8377 noteTmpUsesIn(useEnv
, nOrigTmps
, HuPCa
, st
->Ist
.LLSC
.addr
);
8378 if (st
->Ist
.LLSC
.storedata
)
8379 noteTmpUsesIn(useEnv
, nOrigTmps
, HuOth
, st
->Ist
.LLSC
.storedata
);
8388 VG_(tool_panic
)("preInstrumentationAnalysis: unhandled IRStmt");
8391 } // Now work backwards through the stmts.
8393 // Return the computed use env and the bogus-atom flag.
8394 tl_assert(*useEnvP
== NULL
);
8397 tl_assert(*hasBogusLiteralsP
== False
);
8398 *hasBogusLiteralsP
= bogus
;
8402 IRSB
* MC_(instrument
) ( VgCallbackClosure
* closure
,
8404 const VexGuestLayout
* layout
,
8405 const VexGuestExtents
* vge
,
8406 const VexArchInfo
* archinfo_host
,
8407 IRType gWordTy
, IRType hWordTy
)
8409 Bool verboze
= 0||False
;
8410 Int i
, j
, first_stmt
;
8415 if (gWordTy
!= hWordTy
) {
8416 /* We don't currently support this case. */
8417 VG_(tool_panic
)("host/guest word size mismatch");
8420 /* Check we're not completely nuts */
8421 tl_assert(sizeof(UWord
) == sizeof(void*));
8422 tl_assert(sizeof(Word
) == sizeof(void*));
8423 tl_assert(sizeof(Addr
) == sizeof(void*));
8424 tl_assert(sizeof(ULong
) == 8);
8425 tl_assert(sizeof(Long
) == 8);
8426 tl_assert(sizeof(UInt
) == 4);
8427 tl_assert(sizeof(Int
) == 4);
8429 tl_assert(MC_(clo_mc_level
) >= 1 && MC_(clo_mc_level
) <= 3);
8432 sb_out
= deepCopyIRSBExceptStmts(sb_in
);
8434 /* Set up the running environment. Both .sb and .tmpMap are
8435 modified as we go along. Note that tmps are added to both
8436 .sb->tyenv and .tmpMap together, so the valid index-set for
8437 those two arrays should always be identical. */
8438 VG_(memset
)(&mce
, 0, sizeof(mce
));
8440 mce
.trace
= verboze
;
8441 mce
.layout
= layout
;
8442 mce
.hWordTy
= hWordTy
;
8443 mce
.tmpHowUsed
= NULL
;
8445 /* BEGIN decide on expense levels for instrumentation. */
8447 /* Initially, select the cheap version of everything for which we have an
8449 DetailLevelByOp__set_all( &mce
.dlbo
, DLcheap
);
8451 /* Take account of the --expensive-definedness-checks= flag. */
8452 if (MC_(clo_expensive_definedness_checks
) == EdcNO
) {
8453 /* We just selected 'cheap for everything', so we don't need to do
8454 anything here. mce.tmpHowUsed remains NULL. */
8456 else if (MC_(clo_expensive_definedness_checks
) == EdcYES
) {
8457 /* Select 'expensive for everything'. mce.tmpHowUsed remains NULL. */
8458 DetailLevelByOp__set_all( &mce
.dlbo
, DLexpensive
);
8461 tl_assert(MC_(clo_expensive_definedness_checks
) == EdcAUTO
);
8462 /* We'll make our own selection, based on known per-target constraints
8463 and also on analysis of the block to be instrumented. First, set
8464 up default values for detail levels.
8466 On x86 and amd64, we'll routinely encounter code optimised by LLVM
8467 5 and above. Enable accurate interpretation of the following.
8468 LLVM uses adds for some bitfield inserts, and we get a lot of false
8469 errors if the cheap interpretation is used, alas. Could solve this
8470 much better if we knew which of such adds came from x86/amd64 LEA
8471 instructions, since these are the only ones really needing the
8472 expensive interpretation, but that would require some way to tag
8473 them in the _toIR.c front ends, which is a lot of faffing around.
8474 So for now we use preInstrumentationAnalysis() to detect adds which
8475 are used only to construct memory addresses, which is an
8476 approximation to the above, and is self-contained.*/
8477 # if defined(VGA_x86)
8478 mce
.dlbo
.dl_Add32
= DLauto
;
8479 mce
.dlbo
.dl_CmpEQ32_CmpNE32
= DLexpensive
;
8480 # elif defined(VGA_amd64)
8481 mce
.dlbo
.dl_Add32
= DLexpensive
;
8482 mce
.dlbo
.dl_Add64
= DLauto
;
8483 mce
.dlbo
.dl_CmpEQ32_CmpNE32
= DLexpensive
;
8484 mce
.dlbo
.dl_CmpEQ64_CmpNE64
= DLexpensive
;
8485 # elif defined(VGA_ppc64le)
8486 // Needed by (at least) set_AV_CR6() in the front end.
8487 mce
.dlbo
.dl_CmpEQ64_CmpNE64
= DLexpensive
;
8490 /* preInstrumentationAnalysis() will allocate &mce.tmpHowUsed and then
8492 Bool hasBogusLiterals
= False
;
8493 preInstrumentationAnalysis( &mce
.tmpHowUsed
, &hasBogusLiterals
, sb_in
);
8495 if (hasBogusLiterals
) {
8496 /* This happens very rarely. In this case just select expensive
8497 for everything, and throw away the tmp-use analysis results. */
8498 DetailLevelByOp__set_all( &mce
.dlbo
, DLexpensive
);
8499 VG_(free
)( mce
.tmpHowUsed
);
8500 mce
.tmpHowUsed
= NULL
;
8502 /* Nothing. mce.tmpHowUsed contains tmp-use analysis results,
8503 which will be used for some subset of Iop_{Add,Sub}{32,64},
8504 based on which ones are set to DLauto for this target. */
8508 DetailLevelByOp__check_sanity( &mce
.dlbo
);
8511 // Debug printing: which tmps have been identified as PCast-only use
8512 if (mce
.tmpHowUsed
) {
8513 VG_(printf
)("Cheapies: ");
8514 for (UInt q
= 0; q
< sb_in
->tyenv
->types_used
; q
++) {
8515 if (mce
.tmpHowUsed
[q
] == HuPCa
) {
8516 VG_(printf
)("t%u ", q
);
8522 // Debug printing: number of ops by detail level
8523 UChar nCheap
= DetailLevelByOp__count( &mce
.dlbo
, DLcheap
);
8524 UChar nAuto
= DetailLevelByOp__count( &mce
.dlbo
, DLauto
);
8525 UChar nExpensive
= DetailLevelByOp__count( &mce
.dlbo
, DLexpensive
);
8526 tl_assert(nCheap
+ nAuto
+ nExpensive
== 8);
8528 VG_(printf
)("%u,%u,%u ", nCheap
, nAuto
, nExpensive
);
8530 /* END decide on expense levels for instrumentation. */
8532 /* Initialise the running the tmp environment. */
8534 mce
.tmpMap
= VG_(newXA
)( VG_(malloc
), "mc.MC_(instrument).1", VG_(free
),
8535 sizeof(TempMapEnt
));
8536 VG_(hintSizeXA
) (mce
.tmpMap
, sb_in
->tyenv
->types_used
);
8537 for (i
= 0; i
< sb_in
->tyenv
->types_used
; i
++) {
8540 ent
.shadowV
= IRTemp_INVALID
;
8541 ent
.shadowB
= IRTemp_INVALID
;
8542 VG_(addToXA
)( mce
.tmpMap
, &ent
);
8544 tl_assert( VG_(sizeXA
)( mce
.tmpMap
) == sb_in
->tyenv
->types_used
);
8546 /* Finally, begin instrumentation. */
8547 /* Copy verbatim any IR preamble preceding the first IMark */
8549 tl_assert(mce
.sb
== sb_out
);
8550 tl_assert(mce
.sb
!= sb_in
);
8553 while (i
< sb_in
->stmts_used
&& sb_in
->stmts
[i
]->tag
!= Ist_IMark
) {
8555 st
= sb_in
->stmts
[i
];
8557 tl_assert(isFlatIRStmt(st
));
8559 stmt( 'C', &mce
, sb_in
->stmts
[i
] );
8563 /* Nasty problem. IR optimisation of the pre-instrumented IR may
8564 cause the IR following the preamble to contain references to IR
8565 temporaries defined in the preamble. Because the preamble isn't
8566 instrumented, these temporaries don't have any shadows.
8567 Nevertheless uses of them following the preamble will cause
8568 memcheck to generate references to their shadows. End effect is
8569 to cause IR sanity check failures, due to references to
8570 non-existent shadows. This is only evident for the complex
8571 preambles used for function wrapping on TOC-afflicted platforms
8574 The following loop therefore scans the preamble looking for
8575 assignments to temporaries. For each one found it creates an
8576 assignment to the corresponding (V) shadow temp, marking it as
8577 'defined'. This is the same resulting IR as if the main
8578 instrumentation loop before had been applied to the statement
8581 Similarly, if origin tracking is enabled, we must generate an
8582 assignment for the corresponding origin (B) shadow, claiming
8583 no-origin, as appropriate for a defined value.
8585 for (j
= 0; j
< i
; j
++) {
8586 if (sb_in
->stmts
[j
]->tag
== Ist_WrTmp
) {
8587 /* findShadowTmpV checks its arg is an original tmp;
8588 no need to assert that here. */
8589 IRTemp tmp_o
= sb_in
->stmts
[j
]->Ist
.WrTmp
.tmp
;
8590 IRTemp tmp_v
= findShadowTmpV(&mce
, tmp_o
);
8591 IRType ty_v
= typeOfIRTemp(sb_out
->tyenv
, tmp_v
);
8592 assign( 'V', &mce
, tmp_v
, definedOfType( ty_v
) );
8593 if (MC_(clo_mc_level
) == 3) {
8594 IRTemp tmp_b
= findShadowTmpB(&mce
, tmp_o
);
8595 tl_assert(typeOfIRTemp(sb_out
->tyenv
, tmp_b
) == Ity_I32
);
8596 assign( 'B', &mce
, tmp_b
, mkU32(0)/* UNKNOWN ORIGIN */);
8599 VG_(printf
)("create shadow tmp(s) for preamble tmp [%d] ty ", j
);
8606 /* Iterate over the remaining stmts to generate instrumentation. */
8608 tl_assert(sb_in
->stmts_used
> 0);
8610 tl_assert(i
< sb_in
->stmts_used
);
8611 tl_assert(sb_in
->stmts
[i
]->tag
== Ist_IMark
);
8613 for (/* use current i*/; i
< sb_in
->stmts_used
; i
++) {
8615 st
= sb_in
->stmts
[i
];
8616 first_stmt
= sb_out
->stmts_used
;
8624 if (MC_(clo_mc_level
) == 3) {
8625 /* See comments on case Ist_CAS below. */
8626 if (st
->tag
!= Ist_CAS
)
8627 schemeS( &mce
, st
);
8630 /* Generate instrumentation code for each stmt ... */
8635 IRTemp dst
= st
->Ist
.WrTmp
.tmp
;
8636 tl_assert(dst
< (UInt
)sb_in
->tyenv
->types_used
);
8637 HowUsed hu
= mce
.tmpHowUsed
? mce
.tmpHowUsed
[dst
]
8638 : HuOth
/*we don't know, so play safe*/;
8639 assign( 'V', &mce
, findShadowTmpV(&mce
, st
->Ist
.WrTmp
.tmp
),
8640 expr2vbits( &mce
, st
->Ist
.WrTmp
.data
, hu
));
8645 do_shadow_PUT( &mce
,
8648 NULL
/* shadow atom */, NULL
/* guard */ );
8652 do_shadow_PUTI( &mce
, st
->Ist
.PutI
.details
);
8656 do_shadow_Store( &mce
, st
->Ist
.Store
.end
,
8657 st
->Ist
.Store
.addr
, 0/* addr bias */,
8659 NULL
/* shadow data */,
8664 do_shadow_StoreG( &mce
, st
->Ist
.StoreG
.details
);
8668 do_shadow_LoadG( &mce
, st
->Ist
.LoadG
.details
);
8672 complainIfUndefined( &mce
, st
->Ist
.Exit
.guard
, NULL
);
8683 do_shadow_Dirty( &mce
, st
->Ist
.Dirty
.details
);
8687 do_AbiHint( &mce
, st
->Ist
.AbiHint
.base
,
8688 st
->Ist
.AbiHint
.len
,
8689 st
->Ist
.AbiHint
.nia
);
8693 do_shadow_CAS( &mce
, st
->Ist
.CAS
.details
);
8694 /* Note, do_shadow_CAS copies the CAS itself to the output
8695 block, because it needs to add instrumentation both
8696 before and after it. Hence skip the copy below. Also
8697 skip the origin-tracking stuff (call to schemeS) above,
8698 since that's all tangled up with it too; do_shadow_CAS
8703 do_shadow_LLSC( &mce
,
8705 st
->Ist
.LLSC
.result
,
8707 st
->Ist
.LLSC
.storedata
);
8714 VG_(tool_panic
)("memcheck: unhandled IRStmt");
8716 } /* switch (st->tag) */
8719 for (j
= first_stmt
; j
< sb_out
->stmts_used
; j
++) {
8721 ppIRStmt(sb_out
->stmts
[j
]);
8727 /* ... and finally copy the stmt itself to the output. Except,
8728 skip the copy of IRCASs; see comments on case Ist_CAS
8730 if (st
->tag
!= Ist_CAS
)
8731 stmt('C', &mce
, st
);
8734 /* Now we need to complain if the jump target is undefined. */
8735 first_stmt
= sb_out
->stmts_used
;
8738 VG_(printf
)("sb_in->next = ");
8739 ppIRExpr(sb_in
->next
);
8740 VG_(printf
)("\n\n");
8743 complainIfUndefined( &mce
, sb_in
->next
, NULL
);
8746 for (j
= first_stmt
; j
< sb_out
->stmts_used
; j
++) {
8748 ppIRStmt(sb_out
->stmts
[j
]);
8754 /* If this fails, there's been some serious snafu with tmp management,
8755 that should be investigated. */
8756 tl_assert( VG_(sizeXA
)( mce
.tmpMap
) == mce
.sb
->tyenv
->types_used
);
8757 VG_(deleteXA
)( mce
.tmpMap
);
8759 if (mce
.tmpHowUsed
) {
8760 VG_(free
)( mce
.tmpHowUsed
);
8763 tl_assert(mce
.sb
== sb_out
);
8768 /*--------------------------------------------------------------------*/
8769 /*--- end mc_translate.c ---*/
8770 /*--------------------------------------------------------------------*/