cachegrind/cg_arch.c

   1 /*--------------------------------------------------------------------*/
   2 /*--- Cachegrind: cache configuration.                   cg-arch.c ---*/
   3 /*--------------------------------------------------------------------*/
   4
   5 /*
   6    This file is part of Cachegrind, a high-precision tracing profiler
   7    built with Valgrind.
   8
   9    Copyright (C) 2011-2017 Nicholas Nethercote
  10       njn@valgrind.org
  11
  12    This program is free software; you can redistribute it and/or
  13    modify it under the terms of the GNU General Public License as
  14    published by the Free Software Foundation; either version 2 of the
  15    License, or (at your option) any later version.
  16
  17    This program is distributed in the hope that it will be useful, but
  18    WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20    General Public License for more details.
  21
  22    You should have received a copy of the GNU General Public License
  23    along with this program; if not, see <http://www.gnu.org/licenses/>.
  24
  25    The GNU General Public License is contained in the file COPYING.
  26 */
  27
  28 #include "pub_tool_basics.h"
  29 #include "pub_tool_libcassert.h"
  30 #include "pub_tool_libcbase.h"
  31 #include "pub_tool_libcprint.h"
  32 #include "pub_tool_options.h"
  33 #include "pub_tool_machine.h"
  34
  35 #include "cg_arch.h"
  36
/* Forward declaration; the definition is near the end of this file.
   Fills in *I1c, *D1c and *LLc with an auto-detected or default cache
   configuration. */
static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                             Bool all_caches_clo_defined);
  39
  40 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
  41 // string otherwise.
  42 static const HChar* check_cache(cache_t* cache)
  43 {
  44    // Simulator requires set count to be a power of two.
  45    if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
  46        (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
  47    {
  48       return "Cache set count is not a power of two.\n";
  49    }
  50
  51    // Simulator requires line size to be a power of two.
  52    if (-1 == VG_(log2)(cache->line_size)) {
  53       return "Cache line size is not a power of two.\n";
  54    }
  55
  56    // Then check line size >= 16 -- any smaller and a single instruction could
  57    // straddle three cache lines, which breaks a simulation assertion and is
  58    // stupid anyway.
  59    if (cache->line_size < MIN_LINE_SIZE) {
  60       return "Cache line size is too small.\n";
  61    }
  62
  63    /* Then check cache size > line size (causes seg faults if not). */
  64    if (cache->size <= cache->line_size) {
  65       return "Cache size <= line size.\n";
  66    }
  67
  68    /* Then check assoc <= (size / line size) (seg faults otherwise). */
  69    if (cache->assoc > (cache->size / cache->line_size)) {
  70       return "Cache associativity > (size / line size).\n";
  71    }
  72
  73    return NULL;
  74 }
  75
  76
  77 static void parse_cache_opt ( cache_t* cache, const HChar* opt,
  78                               const HChar* optval )
  79 {
  80    Long i1, i2, i3;
  81    HChar* endptr;
  82    const HChar* checkRes;
  83
  84    // Option argument looks like "65536,2,64".  Extract them.
  85    i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
  86    i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
  87    i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
  88
  89    // Check for overflow.
  90    cache->size      = (Int)i1;
  91    cache->assoc     = (Int)i2;
  92    cache->line_size = (Int)i3;
  93    if (cache->size      != i1) goto overflow;
  94    if (cache->assoc     != i2) goto overflow;
  95    if (cache->line_size != i3) goto overflow;
  96
  97    checkRes = check_cache(cache);
  98    if (checkRes) {
  99       VG_(fmsg)("%s", checkRes);
 100       goto bad;
 101    }
 102
 103    return;
 104
 105   bad:
 106    VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval);
 107
 108   overflow:
 109    VG_(fmsg_bad_option)(opt,
 110       "One of the cache parameters was too large and overflowed.\n");
 111 }
 112
 113
 114 Bool VG_(str_clo_cache_opt)(const HChar *arg,
 115                             cache_t* clo_I1c,
 116                             cache_t* clo_D1c,
 117                             cache_t* clo_LLc)
 118 {
 119    const HChar* tmp_str;
 120
 121    if      VG_STR_CLO(arg, "--I1", tmp_str) {
 122       parse_cache_opt(clo_I1c, arg, tmp_str);
 123       return True;
 124    } else if VG_STR_CLO(arg, "--D1", tmp_str) {
 125       parse_cache_opt(clo_D1c, arg, tmp_str);
 126       return True;
 127    } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
 128               VG_STR_CLO(arg, "--LL", tmp_str)) {
 129       parse_cache_opt(clo_LLc, arg, tmp_str);
 130       return True;
 131    } else
 132       return False;
 133 }
 134
 135 static void umsg_cache_img(const HChar* desc, cache_t* c)
 136 {
 137    VG_(umsg)("  %s: %'d B, %d-way, %d B lines\n", desc,
 138              c->size, c->assoc, c->line_size);
 139 }
 140
 141 // Verifies if c is a valid cache.
 142 // An invalid value causes an assert, unless clo_redefined is True.
 143 static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
 144 {
 145    const HChar* checkRes;
 146
 147    checkRes = check_cache(c);
 148    if (checkRes) {
 149       VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
 150                 desc, checkRes);
 151       umsg_cache_img(desc, c);
 152       if (!clo_redefined) {
 153          VG_(umsg)("As it probably should be supported, please report a bug!\n");
 154          VG_(umsg)("Bypass this message by using option --%s=...\n", desc);
 155          tl_assert(0);
 156       }
 157    }
 158 }
 159
 160
 161 /* If the LL cache config isn't something the simulation functions
 162    can handle, try to adjust it so it is.  Caches are characterised
 163    by (total size T, line size L, associativity A), and then we
 164    have
 165
 166      number of sets S = T / (L * A)
 167
 168    The required constraints are:
 169
 170    * L must be a power of 2, but it always is in practice, so
 171      no problem there
 172
 173    * A can be any value >= 1
 174
 175    * T can be any value, but ..
 176
 177    * S must be a power of 2.
 178
 179    That sometimes gives a problem.  For example, some Core iX based
 180    Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
 181    sets.  Some AMD cpus have T = 5MB, A = 48, L = 64, which gives
 182    1706.667 sets (!).
 183
 184    The "fix" is to force S down to the nearest power of two below its
 185    original value, and increase A proportionately, so as to keep the
 186    total cache size the same.  In fact to be safe we recalculate the
 187    cache size afterwards anyway, to guarantee that it divides exactly
 188    between the new number of sets.
 189
 190    The "fix" is "justified" (cough, cough) by alleging that
 191    increases of associativity above about 4 have very little effect
 192    on the actual miss rate.  It would be far more inaccurate to
 193    fudge this by changing the size of the simulated cache --
 194    changing the associativity is a much better option.
 195 */
 196
 197 /* (Helper function) Returns the largest power of 2 that is <= |x|.
 198    Even works when |x| == 0. */
 199 static UInt floor_power_of_2 ( UInt x )
 200 {
 201    x = x | (x >> 1);
 202    x = x | (x >> 2);
 203    x = x | (x >> 4);
 204    x = x | (x >> 8);
 205    x = x | (x >> 16);
 206    return x - (x >> 1);
 207 }
 208
 209 static void
 210 maybe_tweak_LLc(cache_t *LLc)
 211 {
 212   if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0)
 213      return;
 214
 215   tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0);
 216
 217   UInt old_size      = (UInt)LLc->size;
 218   UInt old_assoc     = (UInt)LLc->assoc;
 219   UInt old_line_size = (UInt)LLc->line_size;
 220
 221   UInt new_size      = old_size;
 222   UInt new_assoc     = old_assoc;
 223   UInt new_line_size = old_line_size;
 224
 225   UInt old_nSets = old_size / (old_assoc * old_line_size);
 226   if (old_nSets == 0) {
 227      /* This surely can't happen; but would cause chaos with the maths
 228       * below if it did.  Just give up if it does. */
 229      return;
 230   }
 231
 232   if (-1 != VG_(log2_64)(old_nSets)) {
 233      /* The number of sets is already a power of 2.  Make sure that
 234         the size divides exactly between the sets.  Almost all of the
 235         time this will have no effect. */
 236      new_size = old_line_size * old_assoc * old_nSets;
 237   } else {
 238      /* The number of sets isn't a power of two.  Calculate some
 239         scale-down factor which causes the number of sets to become a
 240         power of two.  Then, increase the associativity by that
 241         factor.  Finally, re-calculate the total size so as to make
 242         sure it divides exactly between the sets. */
 243      tl_assert(old_nSets >= 0);
 244      UInt new_nSets = floor_power_of_2 ( old_nSets );
 245      tl_assert(new_nSets > 0 && new_nSets < old_nSets);
 246      Double factor = (Double)old_nSets / (Double)new_nSets;
 247      tl_assert(factor >= 1.0);
 248
 249      new_assoc = (UInt)(0.5 + factor * (Double)old_assoc);
 250      tl_assert(new_assoc >= old_assoc);
 251
 252      new_size = old_line_size * new_assoc * new_nSets;
 253   }
 254
 255   tl_assert(new_line_size == old_line_size); /* we never change this */
 256   if (new_size == old_size && new_assoc == old_assoc)
 257      return;
 258
 259   VG_(dmsg)("warning: "
 260             "specified LL cache: line_size %u  assoc %u  total_size %'u\n",
 261             old_line_size, old_assoc, old_size);
 262   VG_(dmsg)("warning: "
 263             "simulated LL cache: line_size %u  assoc %u  total_size %'u\n",\
 264             new_line_size, new_assoc, new_size);
 265
 266   LLc->size      = new_size;
 267   LLc->assoc     = new_assoc;
 268   LLc->line_size = new_line_size;
 269 }
 270
 271 void VG_(post_clo_init_configure_caches)(cache_t* I1c,
 272                                          cache_t* D1c,
 273                                          cache_t* LLc,
 274                                          cache_t* clo_I1c,
 275                                          cache_t* clo_D1c,
 276                                          cache_t* clo_LLc)
 277 {
 278 #define DEFINED(L)   (-1 != L->size  || -1 != L->assoc || -1 != L->line_size)
 279
 280    // Count how many were defined on the command line.
 281    Bool all_caches_clo_defined =
 282       (DEFINED(clo_I1c) &&
 283        DEFINED(clo_D1c) &&
 284        DEFINED(clo_LLc));
 285
 286    // Set the cache config (using auto-detection, if supported by the
 287    // architecture).
 288    configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
 289
 290    maybe_tweak_LLc( LLc );
 291
 292    // Check the default/auto-detected values.
 293    // Allow the user to override invalid auto-detected caches
 294    // with command line.
 295    check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
 296    check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
 297    check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));
 298
 299    // Then replace with any defined on the command line.  (Already checked in
 300    // VG(parse_clo_cache_opt)().)
 301    if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; }
 302    if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; }
 303    if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; }
 304
 305    if (VG_(clo_verbosity) >= 2) {
 306       VG_(umsg)("Cache configuration used:\n");
 307       umsg_cache_img ("I1", I1c);
 308       umsg_cache_img ("D1", D1c);
 309       umsg_cache_img ("LL", LLc);
 310    }
 311 #undef DEFINED
 312 }
 313
 314 void VG_(print_cache_clo_opts)(void)
 315 {
 316    VG_(printf)(
 317 "    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
 318 "    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
 319 "    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
 320    );
 321 }
 322
 323
 324 // Traverse the cache info and return a cache of the given kind and level.
 325 // Return NULL if no such cache exists.
 326 static const VexCache *
 327 locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
 328 {
 329    const VexCache *c;
 330
 331    for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
 332       if (c->level == level && c->kind == kind) {
 333          return c;
 334       }
 335    }
 336    return NULL;  // not found
 337 }
 338
 339
// Gives the auto-detected configuration of I1, D1 and LL caches.  They get
// overridden by any cache configurations specified on the command line.
//
//   I1c, D1c, LLc          - outputs; each is overwritten with a
//                            (size, assoc, line_size) triple
//   all_caches_clo_defined - when True, the "cannot auto-detect" warning is
//                            not printed
//
// If the cache information obtained via VG_(machine_get_VexArchInfo) contains
// a level-1 instruction cache, a level-1 data cache and a unified cache at
// the last level, those are used.  Otherwise architecture-specific built-in
// defaults are used.
static void
configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
                 Bool all_caches_clo_defined)
{
   VexArchInfo vai;
   const VexCacheInfo *ci;
   const VexCache *i1, *d1, *ll;

   VG_(machine_get_VexArchInfo)(NULL, &vai);
   ci = &vai.hwcache_info;

   // Extract what we need.  The LL cache is the unified cache at the
   // highest reported level.
   i1 = locate_cache(ci, INSN_CACHE, 1);
   d1 = locate_cache(ci, DATA_CACHE, 1);
   ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);

   // Caches were reported, but none of them is a unified last-level cache.
   if (ci->num_caches > 0 && ll == NULL) {
      VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
   }

   // The last level is deeper than L2; say which level stands in for LL.
   if (ll && ci->num_levels > 2) {
      VG_(dmsg)("warning: L%u cache found, using its data for the "
                "LL simulation.\n", ci->num_levels);
   }

   // Use the detected values only if all three caches were found.
   if (i1 && d1 && ll) {
      if (i1->is_trace_cache) {
         /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
          * conversion to byte size is a total guess;  treat the 12K and 16K
          * cases the same since the cache byte size must be a power of two for
          * everything to work!.  Also guessing 32 bytes for the line size...
          */
         UInt adjusted_size, guessed_line_size = 32;

         if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) {
            adjusted_size = 16 * 1024;
         } else {
            adjusted_size = 32 * 1024;
         }
         VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n",
                   i1->sizeB / 1024);
         VG_(dmsg)("         Simulating a %u KB I-cache with %u B lines\n",
                   adjusted_size / 1024, guessed_line_size);

         *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size };
      } else {
         *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
      }
      *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
      *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };

      return;
   }

   // Cache information could not be queried; choose some default
   // architecture specific default setting.
   // Each initializer below is { size, assoc, line_size }.

#if defined(VGA_ppc32)

   // Default cache configuration
   *I1c = (cache_t) {  65536, 2, 64 };
   *D1c = (cache_t) {  65536, 2, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_ppc64be) || defined(VGA_ppc64le)

   // Default cache configuration
   *I1c = (cache_t) {  65536, 2, 64 };
   *D1c = (cache_t) {  65536, 2, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_arm)

   // Set caches to default (for Cortex-A8 ?)
   *I1c = (cache_t) {  16384, 4, 64 };
   *D1c = (cache_t) {  16384, 4, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_arm64)

   // Copy the 32-bit ARM version until such time as we have
   // some real hardware to run on
   *I1c = (cache_t) {  16384, 4, 64 };
   *D1c = (cache_t) {  16384, 4, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#elif defined(VGA_s390x)
   //
   // Here is the cache data from older machine models:
   //
   //           I1            D1      I/D L2
   // z900  256k/256/4    256k/256/4   16MB
   // z800  256k/256/4    256k/256/4    8MB
   // z990  256k/256/4    256k/256/4   32MB
   // z890  256k/256/4    256k/256/4   32MB
   // z9    256k/256/4    256k/256/4   40MB
   //
   // Sources:
   // (1) IBM System z9 109 Technical Introduction
   //     www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf
   // (2) The microarchitecture of the IBM eServer z900 processor
   //     IBM Journal of Research and Development
   //     Volume 46, Number 4/5, pp 381-395, July/September 2002
   // (3) The IBM eServer z990 microprocessor
   //     IBM Journal of Research and Development
   //     Volume 48, Number 3/4, pp 295-309, May/July 2004
   // (4) Charles Webb, IBM
   //
   // L2 data is unfortunately incomplete. Otherwise, we could support
   // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps).

   // Default cache configuration is z10-EC  (Source: ECAG insn)
   *I1c = (cache_t) {    65536,  4, 256 };
   *D1c = (cache_t) {   131072,  8, 256 };
   *LLc = (cache_t) { 50331648, 24, 256 };

#elif defined(VGA_mips32) || defined(VGA_nanomips)

   // Set caches to default (for MIPS32-r2(mips 74kc))
   *I1c = (cache_t) {  32768, 4, 32 };
   *D1c = (cache_t) {  32768, 4, 32 };
   *LLc = (cache_t) { 524288, 8, 32 };

#elif defined(VGA_mips64)

   // Set caches to default (for MIPS64 - 5kc)
   *I1c = (cache_t) {  32768, 4, 32 };
   *D1c = (cache_t) {  32768, 4, 32 };
   *LLc = (cache_t) { 524288, 8, 32 };

#elif defined(VGA_x86) || defined(VGA_amd64)

   // Default cache configuration
   *I1c = (cache_t) {  65536, 2, 64 };
   *D1c = (cache_t) {  65536, 2, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

#else

#error "Unknown arch"

#endif

   // Only warn about falling back to defaults when at least one cache was
   // not given on the command line.
   if (!all_caches_clo_defined) {
      const HChar warning[] =
        "Warning: Cannot auto-detect cache config, using defaults.\n"
        "         Run with -v to see.\n";
      VG_(dmsg)("%s", warning);
   }
}
 491
 492 /*--------------------------------------------------------------------*/
 493 /*--- end                                                          ---*/
 494 /*--------------------------------------------------------------------*/