Add bug 467036 Add time cost statistics for Regtest to NEWS
[valgrind.git] / callgrind / sim.c
blob104c63492e36021916238682bcd6870bdb9eceaa
1 /*--------------------------------------------------------------------*/
2 /*--- Cache simulation. ---*/
3 /*--- sim.c ---*/
4 /*--------------------------------------------------------------------*/
6 /*
7 This file is part of Callgrind, a Valgrind tool for call graph
8 profiling programs.
10 Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
12 This tool is derived from and contains code from Cachegrind
13 Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org)
15 This program is free software; you can redistribute it and/or
16 modify it under the terms of the GNU General Public License as
17 published by the Free Software Foundation; either version 2 of the
18 License, or (at your option) any later version.
20 This program is distributed in the hope that it will be useful, but
21 WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 General Public License for more details.
25 You should have received a copy of the GNU General Public License
26 along with this program; if not, see <http://www.gnu.org/licenses/>.
28 The GNU General Public License is contained in the file COPYING.
31 #include "global.h"
/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
44 /* Cache configuration */
45 #include "cg_arch.c"
/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
54 typedef struct {
55 UInt count;
56 UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */
57 } line_use;
59 typedef struct {
60 Addr memline, iaddr;
61 line_use* dep_use; /* point to higher-level cacheblock for this memline */
62 ULong* use_base;
63 } line_loaded;
65 /* Cache state */
66 typedef struct {
67 const HChar* name;
68 int size; /* bytes */
69 int assoc;
70 int line_size; /* bytes */
71 Bool sectored; /* prefetch nearside cacheline on read */
72 int sets;
73 int sets_min_1;
74 int line_size_bits;
75 int tag_shift;
76 UWord tag_mask;
77 HChar desc_line[128]; // large enough
78 UWord* tags;
80 /* for cache use */
81 int line_size_mask;
82 int* line_start_mask;
83 int* line_end_mask;
84 line_loaded* loaded;
85 line_use* use;
86 } cache_t2;
89 * States of flat caches in our model.
90 * We use a 2-level hierarchy,
92 static cache_t2 I1, D1, LL;
94 /* Lower bits of cache tags are used as flags for a cache line */
95 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
96 #define CACHELINE_DIRTY 1
99 /* Cache simulator Options */
100 static Bool clo_simulate_writeback = False;
101 static Bool clo_simulate_hwpref = False;
102 static Bool clo_simulate_sectors = False;
103 static Bool clo_collect_cacheuse = False;
105 /* Following global vars are setup before by setup_bbcc():
107 * - Addr CLG_(bb_base) (instruction start address of original BB)
108 * - ULong* CLG_(cost_base) (start of cost array for BB)
111 Addr CLG_(bb_base);
112 ULong* CLG_(cost_base);
114 static InstrInfo* current_ii;
116 /* Cache use offsets */
117 /* The offsets are only correct because all per-instruction event sets get
118 * the "Use" set added first !
120 static Int off_I1_AcCost = 0;
121 static Int off_I1_SpLoss = 1;
122 static Int off_D1_AcCost = 0;
123 static Int off_D1_SpLoss = 1;
124 static Int off_LL_AcCost = 2;
125 static Int off_LL_SpLoss = 3;
127 /* Cache access types */
128 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
130 /* Result of a reference into a flat cache */
131 typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
133 /* Result of a reference into a hierarchical cache model */
134 typedef enum {
135 L1_Hit,
136 LL_Hit,
137 MemAccess,
138 WriteBackMemAccess } CacheModelResult;
140 typedef CacheModelResult (*simcall_type)(Addr, UChar);
142 static struct {
143 simcall_type I1_Read;
144 simcall_type D1_Read;
145 simcall_type D1_Write;
146 } simulator;
148 /*------------------------------------------------------------*/
149 /*--- Cache Simulator Initialization ---*/
150 /*------------------------------------------------------------*/
152 static void cachesim_clearcache(cache_t2* c)
154 Int i;
156 for (i = 0; i < c->sets * c->assoc; i++)
157 c->tags[i] = 0;
158 if (c->use) {
159 for (i = 0; i < c->sets * c->assoc; i++) {
160 c->loaded[i].memline = 0;
161 c->loaded[i].use_base = 0;
162 c->loaded[i].dep_use = 0;
163 c->loaded[i].iaddr = 0;
164 c->use[i].mask = 0;
165 c->use[i].count = 0;
166 c->tags[i] = i % c->assoc; /* init lower bits as pointer */
171 static void cacheuse_initcache(cache_t2* c);
173 /* By this point, the size/assoc/line_size has been checked. */
174 static void cachesim_initcache(cache_t config, cache_t2* c)
176 c->size = config.size;
177 c->assoc = config.assoc;
178 c->line_size = config.line_size;
179 c->sectored = False; // FIXME
181 c->sets = (c->size / c->line_size) / c->assoc;
182 c->sets_min_1 = c->sets - 1;
183 c->line_size_bits = VG_(log2)(c->line_size);
184 c->tag_shift = c->line_size_bits + VG_(log2)(c->sets);
185 c->tag_mask = ~((1u<<c->tag_shift)-1);
187 /* Can bits in tag entries be used for flags?
188 * Should be always true as MIN_LINE_SIZE >= 16 */
189 CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
191 if (c->assoc == 1) {
192 VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
193 c->size, c->line_size,
194 c->sectored ? ", sectored":"");
195 } else {
196 VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
197 c->size, c->line_size, c->assoc,
198 c->sectored ? ", sectored":"");
201 c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
202 sizeof(UWord) * c->sets * c->assoc);
203 if (clo_collect_cacheuse)
204 cacheuse_initcache(c);
205 else
206 c->use = 0;
207 cachesim_clearcache(c);
#if 0
/* Debug helper (compiled out): dump all tags of 'c', one set per line. */
static void print_cache(cache_t2* c)
{
   Int set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         /* tags are UWord, so print them with a long-sized conversion */
         VG_(printf)("%8lx ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif
227 /*------------------------------------------------------------*/
228 /*--- Simple Cache Simulation ---*/
229 /*------------------------------------------------------------*/
232 * Model: single inclusive, 2-level cache hierarchy (L1/LL)
233 * with write-allocate
235 * For simple cache hit/miss counts, we do not have to
236 * maintain the dirty state of lines (no need to distinguish
237 * read/write references), and the resulting counts are the
238 * same for write-through and write-back caches.
240 * Simulator functions:
241 * CacheModelResult cachesim_I1_ref(Addr a, UChar size)
242 * CacheModelResult cachesim_D1_ref(Addr a, UChar size)
244 __attribute__((always_inline))
245 static __inline__
246 CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
248 int i, j;
249 UWord *set;
251 set = &(c->tags[set_no * c->assoc]);
253 /* This loop is unrolled for just the first case, which is the most */
254 /* common. We can't unroll any further because it would screw up */
255 /* if we have a direct-mapped (1-way) cache. */
256 if (tag == set[0])
257 return Hit;
259 /* If the tag is one other than the MRU, move it into the MRU spot */
260 /* and shuffle the rest down. */
261 for (i = 1; i < c->assoc; i++) {
262 if (tag == set[i]) {
263 for (j = i; j > 0; j--) {
264 set[j] = set[j - 1];
266 set[0] = tag;
267 return Hit;
271 /* A miss; install this tag as MRU, shuffle rest down. */
272 for (j = c->assoc - 1; j > 0; j--) {
273 set[j] = set[j - 1];
275 set[0] = tag;
277 return Miss;
280 __attribute__((always_inline))
281 static __inline__
282 CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
284 UWord block1 = a >> c->line_size_bits;
285 UWord block2 = (a+size-1) >> c->line_size_bits;
286 UInt set1 = block1 & c->sets_min_1;
287 /* the tag does not need to include bits specifying the set,
288 * but it can, and this saves instructions */
289 UWord tag1 = block1;
291 /* Access entirely within line. */
292 if (block1 == block2)
293 return cachesim_setref(c, set1, tag1);
295 /* Access straddles two lines. */
296 else if (block1 + 1 == block2) {
297 UInt set2 = block2 & c->sets_min_1;
298 UWord tag2 = block2;
300 /* the call updates cache structures as side effect */
301 CacheResult res1 = cachesim_setref(c, set1, tag1);
302 CacheResult res2 = cachesim_setref(c, set2, tag2);
303 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
305 } else {
306 VG_(printf)("addr: %lx size: %u blocks: %lu %lu",
307 a, size, block1, block2);
308 VG_(tool_panic)("item straddles more than two cache sets");
310 return Hit;
313 static
314 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
316 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
317 if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
318 return MemAccess;
321 static
322 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
324 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
325 if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
326 return MemAccess;
330 /*------------------------------------------------------------*/
331 /*--- Write Back Cache Simulation ---*/
332 /*------------------------------------------------------------*/
335 * More complex model: L1 Write-through, LL Write-back
336 * This needs to distinguish among read and write references.
338 * Simulator functions:
339 * CacheModelResult cachesim_I1_Read(Addr a, UChar size)
340 * CacheModelResult cachesim_D1_Read(Addr a, UChar size)
341 * CacheModelResult cachesim_D1_Write(Addr a, UChar size)
345 * With write-back, result can be a miss evicting a dirty line
346 * The dirty state of a cache line is stored in Bit0 of the tag for
347 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
348 * type (Read/Write), the line gets dirty on a write.
350 __attribute__((always_inline))
351 static __inline__
352 CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
354 int i, j;
355 UWord *set, tmp_tag;
357 set = &(c->tags[set_no * c->assoc]);
359 /* This loop is unrolled for just the first case, which is the most */
360 /* common. We can't unroll any further because it would screw up */
361 /* if we have a direct-mapped (1-way) cache. */
362 if (tag == (set[0] & ~CACHELINE_DIRTY)) {
363 set[0] |= ref;
364 return Hit;
366 /* If the tag is one other than the MRU, move it into the MRU spot */
367 /* and shuffle the rest down. */
368 for (i = 1; i < c->assoc; i++) {
369 if (tag == (set[i] & ~CACHELINE_DIRTY)) {
370 tmp_tag = set[i] | ref; // update dirty flag
371 for (j = i; j > 0; j--) {
372 set[j] = set[j - 1];
374 set[0] = tmp_tag;
375 return Hit;
379 /* A miss; install this tag as MRU, shuffle rest down. */
380 tmp_tag = set[c->assoc - 1];
381 for (j = c->assoc - 1; j > 0; j--) {
382 set[j] = set[j - 1];
384 set[0] = tag | ref;
386 return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
389 __attribute__((always_inline))
390 static __inline__
391 CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
393 UInt set1 = ( a >> c->line_size_bits) & (c->sets_min_1);
394 UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
395 UWord tag = a & c->tag_mask;
397 /* Access entirely within line. */
398 if (set1 == set2)
399 return cachesim_setref_wb(c, ref, set1, tag);
401 /* Access straddles two lines. */
402 /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
403 else if (((set1 + 1) & (c->sets_min_1)) == set2) {
404 UWord tag2 = (a+size-1) & c->tag_mask;
406 /* the call updates cache structures as side effect */
407 CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
408 CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);
410 if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
411 return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
413 } else {
414 VG_(printf)("addr: %lx size: %u sets: %u %u", a, size, set1, set2);
415 VG_(tool_panic)("item straddles more than two cache sets");
417 return Hit;
421 static
422 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
424 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
425 switch( cachesim_ref_wb( &LL, Read, a, size) ) {
426 case Hit: return LL_Hit;
427 case Miss: return MemAccess;
428 default: break;
430 return WriteBackMemAccess;
433 static
434 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
436 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
437 switch( cachesim_ref_wb( &LL, Read, a, size) ) {
438 case Hit: return LL_Hit;
439 case Miss: return MemAccess;
440 default: break;
442 return WriteBackMemAccess;
445 static
446 CacheModelResult cachesim_D1_Write(Addr a, UChar size)
448 if ( cachesim_ref( &D1, a, size) == Hit ) {
449 /* Even for a L1 hit, the write-trough L1 passes
450 * the write to the LL to make the LL line dirty.
451 * But this causes no latency, so return the hit.
453 cachesim_ref_wb( &LL, Write, a, size);
454 return L1_Hit;
456 switch( cachesim_ref_wb( &LL, Write, a, size) ) {
457 case Hit: return LL_Hit;
458 case Miss: return MemAccess;
459 default: break;
461 return WriteBackMemAccess;
465 /*------------------------------------------------------------*/
466 /*--- Hardware Prefetch Simulation ---*/
467 /*------------------------------------------------------------*/
469 static ULong prefetch_up = 0;
470 static ULong prefetch_down = 0;
472 #define PF_STREAMS 8
473 #define PF_PAGEBITS 12
475 static UInt pf_lastblock[PF_STREAMS];
476 static Int pf_seqblocks[PF_STREAMS];
478 static
479 void prefetch_clear(void)
481 int i;
482 for(i=0;i<PF_STREAMS;i++)
483 pf_lastblock[i] = pf_seqblocks[i] = 0;
487 * HW Prefetch emulation
488 * Start prefetching when detecting sequential access to 3 memory blocks.
489 * One stream can be detected per 4k page.
491 static __inline__
492 void prefetch_LL_doref(Addr a)
494 UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
495 UInt block = ( a >> LL.line_size_bits);
497 if (block != pf_lastblock[stream]) {
498 if (pf_seqblocks[stream] == 0) {
499 if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
500 else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
502 else if (pf_seqblocks[stream] >0) {
503 if (pf_lastblock[stream] +1 == block) {
504 pf_seqblocks[stream]++;
505 if (pf_seqblocks[stream] >= 2) {
506 prefetch_up++;
507 cachesim_ref(&LL, a + 5 * LL.line_size,1);
510 else pf_seqblocks[stream] = 0;
512 else if (pf_seqblocks[stream] <0) {
513 if (pf_lastblock[stream] -1 == block) {
514 pf_seqblocks[stream]--;
515 if (pf_seqblocks[stream] <= -2) {
516 prefetch_down++;
517 cachesim_ref(&LL, a - 5 * LL.line_size,1);
520 else pf_seqblocks[stream] = 0;
522 pf_lastblock[stream] = block;
526 /* simple model with hardware prefetch */
528 static
529 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
531 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
532 prefetch_LL_doref(a);
533 if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
534 return MemAccess;
537 static
538 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
540 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
541 prefetch_LL_doref(a);
542 if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
543 return MemAccess;
547 /* complex model with hardware prefetch */
549 static
550 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
552 if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
553 prefetch_LL_doref(a);
554 switch( cachesim_ref_wb( &LL, Read, a, size) ) {
555 case Hit: return LL_Hit;
556 case Miss: return MemAccess;
557 default: break;
559 return WriteBackMemAccess;
562 static
563 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
565 if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
566 prefetch_LL_doref(a);
567 switch( cachesim_ref_wb( &LL, Read, a, size) ) {
568 case Hit: return LL_Hit;
569 case Miss: return MemAccess;
570 default: break;
572 return WriteBackMemAccess;
575 static
576 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
578 prefetch_LL_doref(a);
579 if ( cachesim_ref( &D1, a, size) == Hit ) {
580 /* Even for a L1 hit, the write-trough L1 passes
581 * the write to the LL to make the LL line dirty.
582 * But this causes no latency, so return the hit.
584 cachesim_ref_wb( &LL, Write, a, size);
585 return L1_Hit;
587 switch( cachesim_ref_wb( &LL, Write, a, size) ) {
588 case Hit: return LL_Hit;
589 case Miss: return MemAccess;
590 default: break;
592 return WriteBackMemAccess;
596 /*------------------------------------------------------------*/
597 /*--- Cache Simulation with use metric collection ---*/
598 /*------------------------------------------------------------*/
600 /* can not be combined with write-back or prefetch */
602 static
603 void cacheuse_initcache(cache_t2* c)
605 int i;
606 unsigned int start_mask, start_val;
607 unsigned int end_mask, end_val;
609 c->use = CLG_MALLOC("cl.sim.cu_ic.1",
610 sizeof(line_use) * c->sets * c->assoc);
611 c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
612 sizeof(line_loaded) * c->sets * c->assoc);
613 c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
614 sizeof(int) * c->line_size);
615 c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
616 sizeof(int) * c->line_size);
618 c->line_size_mask = c->line_size-1;
620 /* Meaning of line_start_mask/line_end_mask
621 * Example: for a given cache line, you get an access starting at
622 * byte offset 5, length 4, byte 5 - 8 was touched. For a cache
623 * line size of 32, you have 1 bit per byte in the mask:
625 * bit31 bit8 bit5 bit 0
626 * | | | |
627 * 11..111111100000 line_start_mask[5]
628 * 00..000111111111 line_end_mask[(5+4)-1]
630 * use_mask |= line_start_mask[5] && line_end_mask[8]
633 start_val = end_val = ~0;
634 if (c->line_size < 32) {
635 int bits_per_byte = 32/c->line_size;
636 start_mask = (1<<bits_per_byte)-1;
637 end_mask = start_mask << (32-bits_per_byte);
638 for(i=0;i<c->line_size;i++) {
639 c->line_start_mask[i] = start_val;
640 start_val = start_val & ~start_mask;
641 start_mask = start_mask << bits_per_byte;
643 c->line_end_mask[c->line_size-i-1] = end_val;
644 end_val = end_val & ~end_mask;
645 end_mask = end_mask >> bits_per_byte;
648 else {
649 int bytes_per_bit = c->line_size/32;
650 start_mask = 1;
651 end_mask = 1u << 31;
652 for(i=0;i<c->line_size;i++) {
653 c->line_start_mask[i] = start_val;
654 c->line_end_mask[c->line_size-i-1] = end_val;
655 if ( ((i+1)%bytes_per_bit) == 0) {
656 start_val &= ~start_mask;
657 end_val &= ~end_mask;
658 start_mask <<= 1;
659 end_mask >>= 1;
664 CLG_DEBUG(6, "Config %s:\n", c->desc_line);
665 for(i=0;i<c->line_size;i++) {
666 CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
667 i, (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]);
670 /* We use lower tag bits as offset pointers to cache use info.
671 * I.e. some cache parameters don't work.
673 if ( (1<<c->tag_shift) < c->assoc) {
674 VG_(message)(Vg_DebugMsg,
675 "error: Use associativity < %d for cache use statistics!\n",
676 (1<<c->tag_shift) );
677 VG_(tool_panic)("Unsupported cache configuration");
682 /* for I1/D1 caches */
683 #define CACHEUSE(L) \
685 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
687 UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
688 UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
689 UWord tag = a & L.tag_mask; \
690 UWord tag2; \
691 int i, j, idx; \
692 UWord *set, tmp_tag; \
693 UInt use_mask; \
695 CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n", \
696 L.name, a, size, set1, set2); \
698 /* First case: word entirely within line. */ \
699 if (set1 == set2) { \
701 set = &(L.tags[set1 * L.assoc]); \
702 use_mask = L.line_start_mask[a & L.line_size_mask] & \
703 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
705 /* This loop is unrolled for just the first case, which is the most */\
706 /* common. We can't unroll any further because it would screw up */\
707 /* if we have a direct-mapped (1-way) cache. */\
708 if (tag == (set[0] & L.tag_mask)) { \
709 idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
710 L.use[idx].count ++; \
711 L.use[idx].mask |= use_mask; \
712 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
713 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
714 use_mask, L.use[idx].mask, L.use[idx].count); \
715 return L1_Hit; \
717 /* If the tag is one other than the MRU, move it into the MRU spot */\
718 /* and shuffle the rest down. */\
719 for (i = 1; i < L.assoc; i++) { \
720 if (tag == (set[i] & L.tag_mask)) { \
721 tmp_tag = set[i]; \
722 for (j = i; j > 0; j--) { \
723 set[j] = set[j - 1]; \
725 set[0] = tmp_tag; \
726 idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
727 L.use[idx].count ++; \
728 L.use[idx].mask |= use_mask; \
729 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
730 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
731 use_mask, L.use[idx].mask, L.use[idx].count); \
732 return L1_Hit; \
736 /* A miss; install this tag as MRU, shuffle rest down. */ \
737 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
738 for (j = L.assoc - 1; j > 0; j--) { \
739 set[j] = set[j - 1]; \
741 set[0] = tag | tmp_tag; \
742 idx = (set1 * L.assoc) + tmp_tag; \
743 return update_##L##_use(&L, idx, \
744 use_mask, a &~ L.line_size_mask); \
746 /* Second case: word straddles two lines. */ \
747 /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
748 } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \
749 Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \
750 set = &(L.tags[set1 * L.assoc]); \
751 use_mask = L.line_start_mask[a & L.line_size_mask]; \
752 if (tag == (set[0] & L.tag_mask)) { \
753 idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
754 L.use[idx].count ++; \
755 L.use[idx].mask |= use_mask; \
756 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
757 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
758 use_mask, L.use[idx].mask, L.use[idx].count); \
759 goto block2; \
761 for (i = 1; i < L.assoc; i++) { \
762 if (tag == (set[i] & L.tag_mask)) { \
763 tmp_tag = set[i]; \
764 for (j = i; j > 0; j--) { \
765 set[j] = set[j - 1]; \
767 set[0] = tmp_tag; \
768 idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
769 L.use[idx].count ++; \
770 L.use[idx].mask |= use_mask; \
771 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
772 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
773 use_mask, L.use[idx].mask, L.use[idx].count); \
774 goto block2; \
777 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
778 for (j = L.assoc - 1; j > 0; j--) { \
779 set[j] = set[j - 1]; \
781 set[0] = tag | tmp_tag; \
782 idx = (set1 * L.assoc) + tmp_tag; \
783 miss1 = update_##L##_use(&L, idx, \
784 use_mask, a &~ L.line_size_mask); \
785 block2: \
786 set = &(L.tags[set2 * L.assoc]); \
787 use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
788 tag2 = (a+size-1) & L.tag_mask; \
789 if (tag2 == (set[0] & L.tag_mask)) { \
790 idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
791 L.use[idx].count ++; \
792 L.use[idx].mask |= use_mask; \
793 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
794 idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
795 use_mask, L.use[idx].mask, L.use[idx].count); \
796 return miss1; \
798 for (i = 1; i < L.assoc; i++) { \
799 if (tag2 == (set[i] & L.tag_mask)) { \
800 tmp_tag = set[i]; \
801 for (j = i; j > 0; j--) { \
802 set[j] = set[j - 1]; \
804 set[0] = tmp_tag; \
805 idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
806 L.use[idx].count ++; \
807 L.use[idx].mask |= use_mask; \
808 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
809 i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
810 use_mask, L.use[idx].mask, L.use[idx].count); \
811 return miss1; \
814 tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
815 for (j = L.assoc - 1; j > 0; j--) { \
816 set[j] = set[j - 1]; \
818 set[0] = tag2 | tmp_tag; \
819 idx = (set2 * L.assoc) + tmp_tag; \
820 miss2 = update_##L##_use(&L, idx, \
821 use_mask, (a+size-1) &~ L.line_size_mask); \
822 return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit; \
824 } else { \
825 VG_(printf)("addr: %#lx size: %u sets: %u %u", a, size, set1, set2); \
826 VG_(tool_panic)("item straddles more than two cache sets"); \
828 return 0; \
/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 *
 * Returns the number of bits set in the low 32 bits of 'bits'.
 * Each step adds neighbouring bit fields pairwise, doubling the field
 * width (1, 2, 4, 8, 16 bits) until one 32-bit sum is left.
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
   unsigned int c = bits;  // store the total here

   c = ((c >> 1)  & 0x55555555u) + (c & 0x55555555u);
   c = ((c >> 2)  & 0x33333333u) + (c & 0x33333333u);
   c = ((c >> 4)  & 0x0F0F0F0Fu) + (c & 0x0F0F0F0Fu);
   c = ((c >> 8)  & 0x00FF00FFu) + (c & 0x00FF00FFu);
   c = ((c >> 16) & 0x0000FFFFu) + (c & 0x0000FFFFu);
   return c;
}
850 static void update_LL_use(int idx, Addr memline)
852 line_loaded* loaded = &(LL.loaded[idx]);
853 line_use* use = &(LL.use[idx]);
854 int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
856 CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
857 idx, CLG_(bb_base) + current_ii->instr_offset, memline);
858 if (use->count>0) {
859 CLG_DEBUG(2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",
860 use->count, i, use->mask, loaded->memline, loaded->iaddr);
861 CLG_DEBUG(2, " collect: %d, use_base %p\n",
862 CLG_(current_state).collect, loaded->use_base);
864 if (CLG_(current_state).collect && loaded->use_base) {
865 (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
866 (loaded->use_base)[off_LL_SpLoss] += i;
870 use->count = 0;
871 use->mask = 0;
873 loaded->memline = memline;
874 loaded->iaddr = CLG_(bb_base) + current_ii->instr_offset;
875 loaded->use_base = (CLG_(current_state).nonskipped) ?
876 CLG_(current_state).nonskipped->skipped :
877 CLG_(cost_base) + current_ii->cost_offset;
880 static
881 CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
883 UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
884 UWord* set = &(LL.tags[setNo * LL.assoc]);
885 UWord tag = memline & LL.tag_mask;
887 int i, j, idx;
888 UWord tmp_tag;
890 CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %u\n", memline, setNo);
892 if (tag == (set[0] & LL.tag_mask)) {
893 idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
894 l1_loaded->dep_use = &(LL.use[idx]);
896 CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
897 idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
898 LL.use[idx].mask, LL.use[idx].count);
899 return LL_Hit;
901 for (i = 1; i < LL.assoc; i++) {
902 if (tag == (set[i] & LL.tag_mask)) {
903 tmp_tag = set[i];
904 for (j = i; j > 0; j--) {
905 set[j] = set[j - 1];
907 set[0] = tmp_tag;
908 idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
909 l1_loaded->dep_use = &(LL.use[idx]);
911 CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
912 i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
913 LL.use[idx].mask, LL.use[idx].count);
914 return LL_Hit;
918 /* A miss; install this tag as MRU, shuffle rest down. */
919 tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
920 for (j = LL.assoc - 1; j > 0; j--) {
921 set[j] = set[j - 1];
923 set[0] = tag | tmp_tag;
924 idx = (setNo * LL.assoc) + tmp_tag;
925 l1_loaded->dep_use = &(LL.use[idx]);
927 update_LL_use(idx, memline);
929 return MemAccess;
/* UPDATE_USE(L) defines update_L_use(cache, idx, mask, memline): recycle
 * use record 'idx' of L1 cache 'cache' for a newly loaded 'memline'.
 *
 * If the old line was used and its loader's cost array is known (and
 * collection is on), that array is charged with access cost and spatial
 * loss, and the old line's usage is merged into the use record of the LL
 * line it depended on. The record is then restarted with one access and
 * 'mask'. Returns LL_Hit without touching LL when memline == 0 (as
 * passed by cacheuse_finish), else the result of the LL access.
 */
#define UPDATE_USE(L)                                                \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
                                           UInt mask, Addr memline)  \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
            cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",\
              use->count, c, use->mask, loaded->memline, loaded->iaddr); \
    CLG_DEBUG(2, " collect: %d, use_base %p\n",                      \
              CLG_(current_state).collect, loaded->use_base);        \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/LL line sizes must be equal ! */              \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask  = mask;                                                 \
  loaded->memline = memline;                                         \
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    CLG_(cost_base) + current_ii->cost_offset;                       \
                                                                     \
  if (memline == 0) return LL_Hit;                                   \
  return cacheuse_LL_access(memline, loaded);                        \
}
974 UPDATE_USE(I1);
975 UPDATE_USE(D1);
977 CACHEUSE(I1);
978 CACHEUSE(D1);
981 static
982 void cacheuse_finish(void)
984 int i;
985 InstrInfo ii = { 0,0,0,0 };
987 if (!CLG_(current_state).collect) return;
989 CLG_(bb_base) = 0;
990 current_ii = &ii; /* needs to be set for update_XX_use */
991 CLG_(cost_base) = 0;
993 /* update usage counters */
994 if (I1.use)
995 for (i = 0; i < I1.sets * I1.assoc; i++)
996 if (I1.loaded[i].use_base)
997 update_I1_use( &I1, i, 0,0);
999 if (D1.use)
1000 for (i = 0; i < D1.sets * D1.assoc; i++)
1001 if (D1.loaded[i].use_base)
1002 update_D1_use( &D1, i, 0,0);
1004 if (LL.use)
1005 for (i = 0; i < LL.sets * LL.assoc; i++)
1006 if (LL.loaded[i].use_base)
1007 update_LL_use(i, 0);
1009 current_ii = 0;
1014 /*------------------------------------------------------------*/
1015 /*--- Helper functions called by instrumented code ---*/
1016 /*------------------------------------------------------------*/
1019 static __inline__
1020 void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
1022 switch(r) {
1023 case WriteBackMemAccess:
1024 if (clo_simulate_writeback) {
1025 c1[3]++;
1026 c2[3]++;
1028 // fall through
1030 case MemAccess:
1031 c1[2]++;
1032 c2[2]++;
1033 // fall through
1035 case LL_Hit:
1036 c1[1]++;
1037 c2[1]++;
1038 // fall through
1040 default:
1041 c1[0]++;
1042 c2[0]++;
1046 static
1047 const HChar* cacheRes(CacheModelResult r)
1049 switch(r) {
1050 case L1_Hit: return "L1 Hit ";
1051 case LL_Hit: return "LL Hit ";
1052 case MemAccess: return "LL Miss";
1053 case WriteBackMemAccess: return "LL Miss (dirty)";
1054 default:
1055 tl_assert(0);
1057 return "??";
1060 VG_REGPARM(1)
1061 static void log_1I0D(InstrInfo* ii)
1063 CacheModelResult IrRes;
1065 current_ii = ii;
1066 IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1068 CLG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n",
1069 CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
1071 if (CLG_(current_state).collect) {
1072 ULong* cost_Ir;
1074 if (CLG_(current_state).nonskipped)
1075 cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1076 else
1077 cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1079 inc_costs(IrRes, cost_Ir,
1080 CLG_(current_state).cost + fullOffset(EG_IR) );
1084 VG_REGPARM(2)
1085 static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
1087 CacheModelResult Ir1Res, Ir2Res;
1088 ULong *global_cost_Ir;
1090 current_ii = ii1;
1091 Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1092 current_ii = ii2;
1093 Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1095 CLG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
1096 CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1097 CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
1099 if (!CLG_(current_state).collect) return;
1101 global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1102 if (CLG_(current_state).nonskipped) {
1103 ULong* skipped_cost_Ir =
1104 CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1106 inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1107 inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1108 return;
1111 inc_costs(Ir1Res, global_cost_Ir,
1112 CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1113 inc_costs(Ir2Res, global_cost_Ir,
1114 CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1117 VG_REGPARM(3)
1118 static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
1120 CacheModelResult Ir1Res, Ir2Res, Ir3Res;
1121 ULong *global_cost_Ir;
1123 current_ii = ii1;
1124 Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
1125 current_ii = ii2;
1126 Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
1127 current_ii = ii3;
1128 Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);
1130 CLG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
1131 CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
1132 CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
1133 CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
1135 if (!CLG_(current_state).collect) return;
1137 global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
1138 if (CLG_(current_state).nonskipped) {
1139 ULong* skipped_cost_Ir =
1140 CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1141 inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
1142 inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
1143 inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
1144 return;
1147 inc_costs(Ir1Res, global_cost_Ir,
1148 CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
1149 inc_costs(Ir2Res, global_cost_Ir,
1150 CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
1151 inc_costs(Ir3Res, global_cost_Ir,
1152 CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
1155 /* Instruction doing a read access */
1157 VG_REGPARM(3)
1158 static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1160 CacheModelResult IrRes, DrRes;
1162 current_ii = ii;
1163 IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1164 DrRes = (*simulator.D1_Read)(data_addr, data_size);
1166 CLG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%ld => %s\n",
1167 CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1168 data_addr, data_size, cacheRes(DrRes));
1170 if (CLG_(current_state).collect) {
1171 ULong *cost_Ir, *cost_Dr;
1173 if (CLG_(current_state).nonskipped) {
1174 cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1175 cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1177 else {
1178 cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1179 cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1182 inc_costs(IrRes, cost_Ir,
1183 CLG_(current_state).cost + fullOffset(EG_IR) );
1184 inc_costs(DrRes, cost_Dr,
1185 CLG_(current_state).cost + fullOffset(EG_DR) );
1190 /* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
1191 have exactly the same prototype. If you change them, you must
1192 change addEvent_D_guarded too. */
1193 VG_REGPARM(3)
1194 static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
1196 CacheModelResult DrRes;
1198 current_ii = ii;
1199 DrRes = (*simulator.D1_Read)(data_addr, data_size);
1201 CLG_DEBUG(6, "log_0I1Dr: Dr %#lx/%ld => %s\n",
1202 data_addr, data_size, cacheRes(DrRes));
1204 if (CLG_(current_state).collect) {
1205 ULong *cost_Dr;
1207 if (CLG_(current_state).nonskipped)
1208 cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
1209 else
1210 cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
1212 inc_costs(DrRes, cost_Dr,
1213 CLG_(current_state).cost + fullOffset(EG_DR) );
1218 /* Instruction doing a write access */
1220 VG_REGPARM(3)
1221 static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1223 CacheModelResult IrRes, DwRes;
1225 current_ii = ii;
1226 IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
1227 DwRes = (*simulator.D1_Write)(data_addr, data_size);
1229 CLG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%ld => %s\n",
1230 CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
1231 data_addr, data_size, cacheRes(DwRes));
1233 if (CLG_(current_state).collect) {
1234 ULong *cost_Ir, *cost_Dw;
1236 if (CLG_(current_state).nonskipped) {
1237 cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
1238 cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1240 else {
1241 cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
1242 cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1245 inc_costs(IrRes, cost_Ir,
1246 CLG_(current_state).cost + fullOffset(EG_IR) );
1247 inc_costs(DwRes, cost_Dw,
1248 CLG_(current_state).cost + fullOffset(EG_DW) );
1252 /* See comment on log_0I1Dr. */
1253 VG_REGPARM(3)
1254 static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
1256 CacheModelResult DwRes;
1258 current_ii = ii;
1259 DwRes = (*simulator.D1_Write)(data_addr, data_size);
1261 CLG_DEBUG(6, "log_0I1Dw: Dw %#lx/%ld => %s\n",
1262 data_addr, data_size, cacheRes(DwRes));
1264 if (CLG_(current_state).collect) {
1265 ULong *cost_Dw;
1267 if (CLG_(current_state).nonskipped)
1268 cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
1269 else
1270 cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
1272 inc_costs(DwRes, cost_Dw,
1273 CLG_(current_state).cost + fullOffset(EG_DW) );
1279 /*------------------------------------------------------------*/
1280 /*--- Cache configuration ---*/
1281 /*------------------------------------------------------------*/
/* Cache geometries requested on the command line, one per simulated
 * cache. They start out as UNDEFINED_CACHE, are filled in by
 * cachesim_parse_opt() via VG_(str_clo_cache_opt)(), and are handed to
 * VG_(post_clo_init_configure_caches)() in cachesim_post_clo_init(). */
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;
1287 /* Initialize and clear simulator state */
1288 static void cachesim_post_clo_init(void)
1290 /* Cache configurations. */
1291 cache_t I1c, D1c, LLc;
1293 /* Initialize access handlers */
1294 if (!CLG_(clo).simulate_cache) {
1295 CLG_(cachesim).log_1I0D = 0;
1296 CLG_(cachesim).log_1I0D_name = "(no function)";
1297 CLG_(cachesim).log_2I0D = 0;
1298 CLG_(cachesim).log_2I0D_name = "(no function)";
1299 CLG_(cachesim).log_3I0D = 0;
1300 CLG_(cachesim).log_3I0D_name = "(no function)";
1302 CLG_(cachesim).log_1I1Dr = 0;
1303 CLG_(cachesim).log_1I1Dr_name = "(no function)";
1304 CLG_(cachesim).log_1I1Dw = 0;
1305 CLG_(cachesim).log_1I1Dw_name = "(no function)";
1307 CLG_(cachesim).log_0I1Dr = 0;
1308 CLG_(cachesim).log_0I1Dr_name = "(no function)";
1309 CLG_(cachesim).log_0I1Dw = 0;
1310 CLG_(cachesim).log_0I1Dw_name = "(no function)";
1311 return;
1314 /* Configuration of caches only needed with real cache simulation */
1315 VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
1316 &clo_I1_cache,
1317 &clo_D1_cache,
1318 &clo_LL_cache);
1320 I1.name = "I1";
1321 D1.name = "D1";
1322 LL.name = "LL";
1324 // min_line_size is used to make sure that we never feed
1325 // accesses to the simulator straddling more than two
1326 // cache lines at any cache level
1327 CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
1328 ? I1c.line_size : D1c.line_size;
1329 CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
1330 ? LLc.line_size : CLG_(min_line_size);
1332 Int largest_load_or_store_size
1333 = VG_(machine_get_size_of_largest_guest_register)();
1334 if (CLG_(min_line_size) < largest_load_or_store_size) {
1335 /* We can't continue, because the cache simulation might
1336 straddle more than 2 lines, and it will assert. So let's
1337 just stop before we start. */
1338 VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
1339 (Int)CLG_(min_line_size));
1340 VG_(umsg)(" must be equal to or larger than the maximum register size (%d)\n",
1341 largest_load_or_store_size );
1342 VG_(umsg)(" but it is not. Exiting now.\n");
1343 VG_(exit)(1);
1346 cachesim_initcache(I1c, &I1);
1347 cachesim_initcache(D1c, &D1);
1348 cachesim_initcache(LLc, &LL);
1350 /* the other cache simulators use the standard helpers
1351 * with dispatching via simulator struct */
1353 CLG_(cachesim).log_1I0D = log_1I0D;
1354 CLG_(cachesim).log_1I0D_name = "log_1I0D";
1355 CLG_(cachesim).log_2I0D = log_2I0D;
1356 CLG_(cachesim).log_2I0D_name = "log_2I0D";
1357 CLG_(cachesim).log_3I0D = log_3I0D;
1358 CLG_(cachesim).log_3I0D_name = "log_3I0D";
1360 CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1361 CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1362 CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1363 CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1365 CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1366 CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1367 CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1368 CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1370 if (clo_collect_cacheuse) {
1372 /* Output warning for not supported option combinations */
1373 if (clo_simulate_hwpref) {
1374 VG_(message)(Vg_DebugMsg,
1375 "warning: prefetch simulation can not be "
1376 "used with cache usage\n");
1377 clo_simulate_hwpref = False;
1380 if (clo_simulate_writeback) {
1381 VG_(message)(Vg_DebugMsg,
1382 "warning: write-back simulation can not be "
1383 "used with cache usage\n");
1384 clo_simulate_writeback = False;
1387 simulator.I1_Read = cacheuse_I1_doRead;
1388 simulator.D1_Read = cacheuse_D1_doRead;
1389 simulator.D1_Write = cacheuse_D1_doRead;
1390 return;
1393 if (clo_simulate_hwpref) {
1394 prefetch_clear();
1396 if (clo_simulate_writeback) {
1397 simulator.I1_Read = prefetch_I1_Read;
1398 simulator.D1_Read = prefetch_D1_Read;
1399 simulator.D1_Write = prefetch_D1_Write;
1401 else {
1402 simulator.I1_Read = prefetch_I1_ref;
1403 simulator.D1_Read = prefetch_D1_ref;
1404 simulator.D1_Write = prefetch_D1_ref;
1407 return;
1410 if (clo_simulate_writeback) {
1411 simulator.I1_Read = cachesim_I1_Read;
1412 simulator.D1_Read = cachesim_D1_Read;
1413 simulator.D1_Write = cachesim_D1_Write;
1415 else {
1416 simulator.I1_Read = cachesim_I1_ref;
1417 simulator.D1_Read = cachesim_D1_ref;
1418 simulator.D1_Write = cachesim_D1_ref;
1423 /* Clear simulator state. Has to be initialized before */
1424 static
1425 void cachesim_clear(void)
1427 cachesim_clearcache(&I1);
1428 cachesim_clearcache(&D1);
1429 cachesim_clearcache(&LL);
1431 prefetch_clear();
1435 static void cachesim_dump_desc(VgFile *fp)
1437 VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line);
1438 VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line);
1439 VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line);
1442 static
1443 void cachesim_print_opts(void)
1445 VG_(printf)(
1446 "\n cache simulator options (does cache simulation if used):\n"
1447 " --simulate-wb=no|yes Count write-back events [no]\n"
1448 " --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n"
1449 #if CLG_EXPERIMENTAL
1450 " --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1451 #endif
1452 " --cacheuse=no|yes Collect cache block use [no]\n");
1453 VG_(print_cache_clo_opts)();
1456 /* Check for command line option for cache configuration.
1457 * Return False if unknown and not handled.
1459 * Called from CLG_(process_cmd_line_option)() in clo.c
1461 static Bool cachesim_parse_opt(const HChar* arg)
1463 if VG_BOOL_CLO(arg, "--simulate-wb", clo_simulate_writeback) {}
1464 else if VG_BOOL_CLO(arg, "--simulate-hwpref", clo_simulate_hwpref) {}
1465 else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors) {}
1467 else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
1468 if (clo_collect_cacheuse) {
1469 /* Use counters only make sense with fine dumping */
1470 CLG_(clo).dump_instr = True;
1474 else if (VG_(str_clo_cache_opt)(arg,
1475 &clo_I1_cache,
1476 &clo_D1_cache,
1477 &clo_LL_cache)) {}
1479 else
1480 return False;
1482 return True;
1485 static
1486 void cachesim_printstat(Int l1, Int l2, Int l3)
1488 FullCost total = CLG_(total_cost), D_total = 0;
1489 ULong LL_total_m, LL_total_mr, LL_total_mw,
1490 LL_total, LL_total_r, LL_total_w;
1492 if ((VG_(clo_verbosity) >1) && clo_simulate_hwpref) {
1493 VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu\n",
1494 prefetch_up);
1495 VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n",
1496 prefetch_down);
1497 VG_(message)(Vg_DebugMsg, "\n");
1500 VG_(message)(Vg_UserMsg, "I1 misses: %'*llu\n", l1,
1501 total[fullOffset(EG_IR) +1]);
1503 VG_(message)(Vg_UserMsg, "LLi misses: %'*llu\n", l1,
1504 total[fullOffset(EG_IR) +2]);
1506 if (0 == total[fullOffset(EG_IR)])
1507 total[fullOffset(EG_IR)] = 1;
1509 VG_(message)(Vg_UserMsg, "I1 miss rate: %*.2f%%\n", l1,
1510 total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]);
1512 VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1,
1513 total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]);
1515 VG_(message)(Vg_UserMsg, "\n");
1517 /* D cache results.
1518 Use the D_refs.rd and D_refs.wr values to determine the
1519 * width of columns 2 & 3. */
1521 D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
1522 CLG_(init_cost)( CLG_(sets).full, D_total);
1523 // we only use the first 3 values of D_total, adding up Dr and Dw costs
1524 CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
1525 CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
1527 VG_(message)(Vg_UserMsg, "D refs: %'*llu (%'*llu rd + %'*llu wr)\n",
1528 l1, D_total[0],
1529 l2, total[fullOffset(EG_DR)],
1530 l3, total[fullOffset(EG_DW)]);
1532 VG_(message)(Vg_UserMsg, "D1 misses: %'*llu (%'*llu rd + %'*llu wr)\n",
1533 l1, D_total[1],
1534 l2, total[fullOffset(EG_DR)+1],
1535 l3, total[fullOffset(EG_DW)+1]);
1537 VG_(message)(Vg_UserMsg, "LLd misses: %'*llu (%'*llu rd + %'*llu wr)\n",
1538 l1, D_total[2],
1539 l2, total[fullOffset(EG_DR)+2],
1540 l3, total[fullOffset(EG_DW)+2]);
1542 if (0 == D_total[0]) D_total[0] = 1;
1543 if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
1544 if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;
1546 VG_(message)(Vg_UserMsg, "D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1547 l1, D_total[1] * 100.0 / D_total[0],
1548 l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)],
1549 l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]);
1551 VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1552 l1, D_total[2] * 100.0 / D_total[0],
1553 l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)],
1554 l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]);
1555 VG_(message)(Vg_UserMsg, "\n");
1559 /* LL overall results */
1561 LL_total =
1562 total[fullOffset(EG_DR) +1] +
1563 total[fullOffset(EG_DW) +1] +
1564 total[fullOffset(EG_IR) +1];
1565 LL_total_r =
1566 total[fullOffset(EG_DR) +1] +
1567 total[fullOffset(EG_IR) +1];
1568 LL_total_w = total[fullOffset(EG_DW) +1];
1569 VG_(message)(Vg_UserMsg, "LL refs: %'*llu (%'*llu rd + %'*llu wr)\n",
1570 l1, LL_total, l2, LL_total_r, l3, LL_total_w);
1572 LL_total_m =
1573 total[fullOffset(EG_DR) +2] +
1574 total[fullOffset(EG_DW) +2] +
1575 total[fullOffset(EG_IR) +2];
1576 LL_total_mr =
1577 total[fullOffset(EG_DR) +2] +
1578 total[fullOffset(EG_IR) +2];
1579 LL_total_mw = total[fullOffset(EG_DW) +2];
1580 VG_(message)(Vg_UserMsg, "LL misses: %'*llu (%'*llu rd + %'*llu wr)\n",
1581 l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw);
1583 VG_(message)(Vg_UserMsg, "LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n",
1584 l1, LL_total_m * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]),
1585 l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
1586 l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]);
1590 /*------------------------------------------------------------*/
1591 /*--- Setup for Event set. ---*/
1592 /*------------------------------------------------------------*/
/* The event sets used by Callgrind; the 'base' and 'full' members are
 * filled in by CLG_(init_eventsets)(). */
struct event_sets CLG_(sets);
1596 void CLG_(init_eventsets)()
1598 // Event groups from which the event sets are composed
1599 // the "Use" group only is used with "cacheuse" simulation
1600 if (clo_collect_cacheuse)
1601 CLG_(register_event_group4)(EG_USE,
1602 "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
1604 if (!CLG_(clo).simulate_cache)
1605 CLG_(register_event_group)(EG_IR, "Ir");
1606 else if (!clo_simulate_writeback) {
1607 CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
1608 CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
1609 CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
1611 else { // clo_simulate_writeback
1612 CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
1613 CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
1614 CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
1617 if (CLG_(clo).simulate_branch) {
1618 CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
1619 CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
1622 if (CLG_(clo).collect_bus)
1623 CLG_(register_event_group)(EG_BUS, "Ge");
1625 if (CLG_(clo).collect_alloc)
1626 CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
1628 if (CLG_(clo).collect_systime != systime_no) {
1629 if (CLG_(clo).collect_systime == systime_nsec)
1630 CLG_(register_event_group3)(EG_SYS, "sysCount", "sysTime", "sysCpuTime");
1631 else
1632 CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
1635 // event set used as base for instruction self cost
1636 CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);
1638 // event set comprising all event groups, used for inclusive cost
1639 CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
1640 CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
1641 CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
1642 CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
1644 CLG_DEBUGIF(1) {
1645 CLG_DEBUG(1, "EventSets:\n");
1646 CLG_(print_eventset)(-2, CLG_(sets).base);
1647 CLG_(print_eventset)(-2, CLG_(sets).full);
1650 /* Not-existing events are silently ignored */
1651 CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
1652 CLG_(append_event)(CLG_(dumpmap), "Ir");
1653 CLG_(append_event)(CLG_(dumpmap), "Dr");
1654 CLG_(append_event)(CLG_(dumpmap), "Dw");
1655 CLG_(append_event)(CLG_(dumpmap), "I1mr");
1656 CLG_(append_event)(CLG_(dumpmap), "D1mr");
1657 CLG_(append_event)(CLG_(dumpmap), "D1mw");
1658 CLG_(append_event)(CLG_(dumpmap), "ILmr");
1659 CLG_(append_event)(CLG_(dumpmap), "DLmr");
1660 CLG_(append_event)(CLG_(dumpmap), "DLmw");
1661 CLG_(append_event)(CLG_(dumpmap), "ILdmr");
1662 CLG_(append_event)(CLG_(dumpmap), "DLdmr");
1663 CLG_(append_event)(CLG_(dumpmap), "DLdmw");
1664 CLG_(append_event)(CLG_(dumpmap), "Bc");
1665 CLG_(append_event)(CLG_(dumpmap), "Bcm");
1666 CLG_(append_event)(CLG_(dumpmap), "Bi");
1667 CLG_(append_event)(CLG_(dumpmap), "Bim");
1668 CLG_(append_event)(CLG_(dumpmap), "AcCost1");
1669 CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
1670 CLG_(append_event)(CLG_(dumpmap), "AcCost2");
1671 CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
1672 CLG_(append_event)(CLG_(dumpmap), "Ge");
1673 CLG_(append_event)(CLG_(dumpmap), "allocCount");
1674 CLG_(append_event)(CLG_(dumpmap), "allocSize");
1675 CLG_(append_event)(CLG_(dumpmap), "sysCount");
1676 CLG_(append_event)(CLG_(dumpmap), "sysTime");
1677 CLG_(append_event)(CLG_(dumpmap), "sysCpuTime");
1681 /* this is called at dump time for every instruction executed */
1682 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
1683 InstrInfo* ii, ULong exe_count)
1685 if (!CLG_(clo).simulate_cache)
1686 cost[ fullOffset(EG_IR) ] += exe_count;
1688 if (ii->eventset)
1689 CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
1690 ii->eventset, bbcc->cost + ii->cost_offset);
1693 static
1694 void cachesim_finish(void)
1696 if (clo_collect_cacheuse)
1697 cacheuse_finish();
1700 /*------------------------------------------------------------*/
1701 /*--- The simulator defined in this file ---*/
1702 /*------------------------------------------------------------*/
1704 struct cachesim_if CLG_(cachesim) = {
1705 .print_opts = cachesim_print_opts,
1706 .parse_opt = cachesim_parse_opt,
1707 .post_clo_init = cachesim_post_clo_init,
1708 .clear = cachesim_clear,
1709 .dump_desc = cachesim_dump_desc,
1710 .printstat = cachesim_printstat,
1711 .add_icost = cachesim_add_icost,
1712 .finish = cachesim_finish,
1714 /* these will be set by cachesim_post_clo_init */
1715 .log_1I0D = 0,
1716 .log_2I0D = 0,
1717 .log_3I0D = 0,
1719 .log_1I1Dr = 0,
1720 .log_1I1Dw = 0,
1722 .log_0I1Dr = 0,
1723 .log_0I1Dw = 0,
1725 .log_1I0D_name = "(no function)",
1726 .log_2I0D_name = "(no function)",
1727 .log_3I0D_name = "(no function)",
1729 .log_1I1Dr_name = "(no function)",
1730 .log_1I1Dw_name = "(no function)",
1732 .log_0I1Dr_name = "(no function)",
1733 .log_0I1Dw_name = "(no function)",
1737 /*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
1739 /*--------------------------------------------------------------------*/