/*--------------------------------------------------------------------*/
/*--- Cachegrind: cache configuration.                   cg-arch.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Cachegrind, a high-precision tracing profiler
   built with Valgrind.

   Copyright (C) 2011-2017 Nicholas Nethercote
      njn@valgrind.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.
*/

#include "pub_tool_basics.h"
#include "pub_tool_libcassert.h"
#include "pub_tool_libcbase.h"
#include "pub_tool_libcprint.h"
#include "pub_tool_options.h"
#include "pub_tool_machine.h"

#include "cg_arch.h"

static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                             Bool all_caches_clo_defined);

// Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
// string otherwise.
static const HChar* check_cache(cache_t* cache)
{
   if (cache->line_size == 0)
   {
      return "Cache line size is zero.\n";
   }

   if (cache->assoc == 0)
   {
      return "Cache associativity is zero.\n";
   }

   // Simulator requires set count to be a power of two.
   if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
   {
      return "Cache set count is not a power of two.\n";
   }

   // Simulator requires line size to be a power of two.
   if (-1 == VG_(log2)(cache->line_size)) {
      return "Cache line size is not a power of two.\n";
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      return "Cache line size is too small.\n";
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      return "Cache size <= line size.\n";
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      return "Cache associativity > (size / line size).\n";
   }

   return NULL;
}
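
/* Worked example (illustrative only, not part of the original source): a
   64 KB, 2-way cache with 64 B lines has 65536 / (64 * 2) = 512 sets, a
   power of two, so check_cache() returns NULL (OK).  A 48 KB, 2-way cache
   with 64 B lines gives 384 sets and is rejected with "Cache set count is
   not a power of two."; the same 48 KB arranged 12-way gives 64 sets and
   passes. */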

static void parse_cache_opt ( cache_t* cache, const HChar* opt,
                              const HChar* optval )
{
   Long i1, i2, i3;
   HChar* endptr;
   const HChar* checkRes;

   // Option argument looks like "65536,2,64".  Extract them.
   i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
   i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
   i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;

   // Check for overflow.
   cache->size      = (Int)i1;
   cache->assoc     = (Int)i2;
   cache->line_size = (Int)i3;
   if (cache->size      != i1) goto overflow;
   if (cache->assoc     != i2) goto overflow;
   if (cache->line_size != i3) goto overflow;

   checkRes = check_cache(cache);
   if (checkRes) {
      VG_(fmsg)("%s", checkRes);
      goto bad;
   }

   return;

  bad:
   VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval);

  overflow:
   VG_(fmsg_bad_option)(opt,
      "One of the cache parameters was too large and overflowed.\n");
}

Bool VG_(str_clo_cache_opt)(const HChar *arg,
                            cache_t* clo_I1c,
                            cache_t* clo_D1c,
                            cache_t* clo_LLc)
{
   const HChar* tmp_str;

   if      VG_STR_CLO(arg, "--I1", tmp_str) {
      parse_cache_opt(clo_I1c, arg, tmp_str);
      return True;
   } else if VG_STR_CLO(arg, "--D1", tmp_str) {
      parse_cache_opt(clo_D1c, arg, tmp_str);
      return True;
   } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
              VG_STR_CLO(arg, "--LL", tmp_str)) {
      parse_cache_opt(clo_LLc, arg, tmp_str);
      return True;
   } else
      return False;
}
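
/* Usage example (illustrative only, not part of the original source): the
   option string "--LL=8388608,16,64" arrives here as arg; VG_STR_CLO matches
   the "--LL" prefix, sets tmp_str to "8388608,16,64", and parse_cache_opt()
   fills clo_LLc with size 8388608, assoc 16, line_size 64 (8192 sets, which
   check_cache() accepts).  The older "--L2=" spelling is still recognised
   and maps onto the same LL cache. */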

static void umsg_cache_img(const HChar* desc, cache_t* c)
{
   VG_(umsg)("  %s: %'d B, %d-way, %d B lines\n", desc,
             c->size, c->assoc, c->line_size);
}

// Verifies if c is a valid cache.
// An invalid value causes an assert, unless clo_redefined is True.
static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
{
   const HChar* checkRes;

   checkRes = check_cache(c);
   if (checkRes) {
      VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
                desc, checkRes);
      umsg_cache_img(desc, c);
      if (!clo_redefined) {
         VG_(umsg)("As it probably should be supported, please report a bug!\n");
         VG_(umsg)("Bypass this message by using option --%s=...\n", desc);
         tl_assert(0);
      }
   }
}

/* If the LL cache config isn't something the simulation functions
   can handle, try to adjust it so it is.  Caches are characterised
   by (total size T, line size L, associativity A), and then we
   have

       number of sets S = T / (L * A)

   The required constraints are:

   * L must be a power of 2, but it always is in practice, so
     no problem there

   * A can be any value >= 1

   * T can be any value, but ..

   * S must be a power of 2.

   That sometimes gives a problem.  For example, some Core iX based
   Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
   sets.  Some AMD cpus have T = 5MB, A = 48, L = 64, which gives
   1706.667 sets (!).

   The "fix" is to force S down to the nearest power of two below its
   original value, and increase A proportionately, so as to keep the
   total cache size the same.  In fact to be safe we recalculate the
   cache size afterwards anyway, to guarantee that it divides exactly
   between the new number of sets.

   The "fix" is "justified" (cough, cough) by alleging that
   increases of associativity above about 4 have very little effect
   on the actual miss rate.  It would be far more inaccurate to
   fudge this by changing the size of the simulated cache --
   changing the associativity is a much better option.
*/
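
/* Worked example (illustrative only, not part of the original source): for
   the Core iX figures above, T = 12MB, A = 16, L = 64 gives
   S = 12582912 / (64 * 16) = 12288 sets.  The largest power of two below
   that is 8192, so the scale-down factor is 12288 / 8192 = 1.5, the
   associativity becomes round(1.5 * 16) = 24, and the recomputed size is
   64 * 24 * 8192 = 12582912 bytes -- in this case the total capacity
   happens to be preserved exactly. */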

/* (Helper function) Returns the largest power of 2 that is <= |x|.
   Even works when |x| == 0. */
static UInt floor_power_of_2 ( UInt x )
{
   x = x | (x >> 1);
   x = x | (x >> 2);
   x = x | (x >> 4);
   x = x | (x >> 8);
   x = x | (x >> 16);
   return x - (x >> 1);
}
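
/* Illustrative trace (not part of the original source): for x = 12288
   (0x3000) the successive ORs smear the top set bit downwards, giving
   0x3800, 0x3E00, 0x3FE0, 0x3FFF and 0x3FFF; the final x - (x >> 1) is
   0x3FFF - 0x1FFF = 0x2000 = 8192, the largest power of two <= 12288. */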

static void
maybe_tweak_LLc(cache_t *LLc)
{
   if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0)
      return;

   tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0);

   UInt old_size      = (UInt)LLc->size;
   UInt old_assoc     = (UInt)LLc->assoc;
   UInt old_line_size = (UInt)LLc->line_size;

   UInt new_size      = old_size;
   UInt new_assoc     = old_assoc;
   UInt new_line_size = old_line_size;

   UInt old_nSets = old_size / (old_assoc * old_line_size);
   if (old_nSets == 0) {
      /* This surely can't happen; but would cause chaos with the maths
       * below if it did.  Just give up if it does. */
      return;
   }

   if (-1 != VG_(log2_64)(old_nSets)) {
      /* The number of sets is already a power of 2.  Make sure that
         the size divides exactly between the sets.  Almost all of the
         time this will have no effect. */
      new_size = old_line_size * old_assoc * old_nSets;
   } else {
      /* The number of sets isn't a power of two.  Calculate some
         scale-down factor which causes the number of sets to become a
         power of two.  Then, increase the associativity by that
         factor.  Finally, re-calculate the total size so as to make
         sure it divides exactly between the sets. */
      UInt new_nSets = floor_power_of_2 ( old_nSets );
      tl_assert(new_nSets > 0 && new_nSets < old_nSets);
      Double factor = (Double)old_nSets / (Double)new_nSets;
      tl_assert(factor >= 1.0);

      new_assoc = (UInt)(0.5 + factor * (Double)old_assoc);
      tl_assert(new_assoc >= old_assoc);

      new_size = old_line_size * new_assoc * new_nSets;
   }

   tl_assert(new_line_size == old_line_size); /* we never change this */
   if (new_size == old_size && new_assoc == old_assoc)
      return;

   VG_(dmsg)("warning: "
             "specified LL cache: line_size %u  assoc %u  total_size %'u\n",
             old_line_size, old_assoc, old_size);
   VG_(dmsg)("warning: "
             "simulated LL cache: line_size %u  assoc %u  total_size %'u\n",
             new_line_size, new_assoc, new_size);

   LLc->size      = new_size;
   LLc->assoc     = new_assoc;
   LLc->line_size = new_line_size;
}

void VG_(post_clo_init_configure_caches)(cache_t* I1c,
                                         cache_t* D1c,
                                         cache_t* LLc,
                                         cache_t* clo_I1c,
                                         cache_t* clo_D1c,
                                         cache_t* clo_LLc)
{
#define DEFINED(L)   (-1 != L->size || -1 != L->assoc || -1 != L->line_size)

   // Check whether all three caches were defined on the command line.
   Bool all_caches_clo_defined =
      (DEFINED(clo_I1c) &&
       DEFINED(clo_D1c) &&
       DEFINED(clo_LLc));

   // Set the cache config (using auto-detection, if supported by the
   // architecture).
   configure_caches( I1c, D1c, LLc, all_caches_clo_defined );

   maybe_tweak_LLc( LLc );

   // Check the default/auto-detected values.
   // Allow the user to override invalid auto-detected caches
   // via the command line.
   check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
   check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
   check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));

   // Then replace with any defined on the command line.  (Already checked in
   // VG_(str_clo_cache_opt)().)
   if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; }
   if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; }
   if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; }

   if (VG_(clo_verbosity) >= 2) {
      VG_(umsg)("Cache configuration used:\n");
      umsg_cache_img ("I1", I1c);
      umsg_cache_img ("D1", D1c);
      umsg_cache_img ("LL", LLc);
   }
#undef DEFINED
}

void VG_(print_cache_clo_opts)(void)
{
   VG_(printf)(
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
   );
}

// Traverse the cache info and return a cache of the given kind and level.
// Return NULL if no such cache exists.
static const VexCache *
locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
{
   const VexCache *c;

   for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
      if (c->level == level && c->kind == kind) {
         return c;
      }
   }
   return NULL;   // not found
}

// Gives the auto-detected configuration of I1, D1 and LL caches.  They get
// overridden by any cache configurations specified on the command line.
static void
configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
                 Bool all_caches_clo_defined)
{
   VexArchInfo vai;
   const VexCacheInfo *ci;
   const VexCache *i1, *d1, *ll;

   VG_(machine_get_VexArchInfo)(NULL, &vai);
   ci = &vai.hwcache_info;

   // Extract what we need
   i1 = locate_cache(ci, INSN_CACHE,    1);
   d1 = locate_cache(ci, DATA_CACHE,    1);
   ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);

   if (ci->num_caches > 0 && ll == NULL) {
      VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
   }

   if (ll && ci->num_levels > 2) {
      VG_(dmsg)("warning: L%u cache found, using its data for the "
                "LL simulation.\n", ci->num_levels);
   }

   if (i1 && d1 && ll) {
      if (i1->is_trace_cache) {
         /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
          * Conversion to byte size is a total guess; treat the 12K and 16K
          * cases the same since the cache byte size must be a power of two for
          * everything to work!  Also guessing 32 bytes for the line size...
          */
         UInt adjusted_size, guessed_line_size = 32;

         if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) {
            adjusted_size = 16 * 1024;
         } else {
            adjusted_size = 32 * 1024;
         }
         VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n",
                   i1->sizeB / 1024);
         VG_(dmsg)("         Simulating a %u KB I-cache with %u B lines\n",
                   adjusted_size / 1024, guessed_line_size);

         *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size };
      } else {
         *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
      }
      *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
      *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };

      return;
   }
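
   /* Illustrative example of the hack above (the associativity figure is
      assumed here, not taken from this file): a Pentium 4 reporting a
      12K-micro-op trace cache that is, say, 8-way would be simulated as a
      16 KB, 8-way I1 cache with the guessed 32 B lines, giving
      16384 / (32 * 8) = 64 sets, which check_cache() accepts. */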

   // Cache information could not be queried; fall back to an
   // architecture-specific default setting.

#if defined(VGA_ppc32)

   // Default cache configuration
   *I1c = (cache_t) {  65536, 2, 64 };
   *D1c = (cache_t) {  65536, 2, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_ppc64be) || defined(VGA_ppc64le)

   // Default cache configuration
   *I1c = (cache_t) {  65536, 2, 64 };
   *D1c = (cache_t) {  65536, 2, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_arm)

   // Set caches to default (for Cortex-A8 ?)
   *I1c = (cache_t) {  16384, 4, 64 };
   *D1c = (cache_t) {  16384, 4, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_arm64)

   // Copy the 32-bit ARM version until such time as we have
   // some real hardware to run on
   *I1c = (cache_t) {  16384, 4, 64 };
   *D1c = (cache_t) {  16384, 4, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_s390x)

   // Here is the cache data from older machine models:
   //
   //           I1           D1       I/D L2
   // z900  256k/256/4   256k/256/4    16MB
   // z800  256k/256/4   256k/256/4     8MB
   // z990  256k/256/4   256k/256/4    32MB
   // z890  256k/256/4   256k/256/4    32MB
   // z9    256k/256/4   256k/256/4    40MB
   //
   // Sources:
   // (1) IBM System z9 109 Technical Introduction
   //     www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf
   // (2) The microarchitecture of the IBM eServer z900 processor
   //     IBM Journal of Research and Development
   //     Volume 46, Number 4/5, pp 381-395, July/September 2002
   // (3) The IBM eServer z990 microprocessor
   //     IBM Journal of Research and Development
   //     Volume 48, Number 3/4, pp 295-309, May/July 2004
   // (4) Charles Webb, IBM
   //
   // L2 data is unfortunately incomplete. Otherwise, we could support
   // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps).

   // Default cache configuration is z10-EC  (Source: ECAG insn)
   *I1c = (cache_t) {    65536,  4, 256 };
   *D1c = (cache_t) {   131072,  8, 256 };
   *LLc = (cache_t) { 50331648, 24, 256 };

#elif defined(VGA_mips32) || defined(VGA_nanomips)

   // Set caches to default (for MIPS32-r2 (mips 74kc))
   *I1c = (cache_t) {  32768, 4, 32 };
   *D1c = (cache_t) {  32768, 4, 32 };
   *LLc = (cache_t) { 524288, 8, 32 };

#elif defined(VGA_mips64)

   // Set caches to default (for MIPS64 - 5kc)
   *I1c = (cache_t) {  32768, 4, 32 };
   *D1c = (cache_t) {  32768, 4, 32 };
   *LLc = (cache_t) { 524288, 8, 32 };

#elif defined(VGA_x86) || defined(VGA_amd64)

   *I1c = (cache_t) {  65536, 2, 64 };
   *D1c = (cache_t) {  65536, 2, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#else

#error "Unknown arch"

#endif

   if (!all_caches_clo_defined) {
      const HChar warning[] =
        "Warning: Cannot auto-detect cache config, using defaults.\n"
        "         Run with -v to see.\n";
      VG_(dmsg)("%s", warning);
   }
}

/*--------------------------------------------------------------------*/
/*--- end                                                          ---*/
/*--------------------------------------------------------------------*/