/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#include "hphp/util/hfsort.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <unordered_map>
#include <unordered_set>

#include <folly/Format.h>

#include "hphp/util/hash.h"
#include "hphp/util/trace.h"

namespace HPHP { namespace hfsort {

#define HFTRACE(LEVEL, ...)                                    \
  if (Trace::moduleEnabled(Trace::hfsort, LEVEL)) {            \
    Trace::traceRelease(__VA_ARGS__);                          \
  }

// The size of a cache page
// Since we optimize both for iTLB cache (2MB pages) and i-cache (64b pages),
// using a value that fits both
constexpr uint32_t kPageSize = uint32_t(1) << 12;
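// Note: 1 << 12 = 4 KB, which divides a 2MB huge page evenly and is a
// multiple of the 64-byte cache-line size.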

// Capacity of the iTLB cache: larger values yield a more iTLB-friendly result,
// while smaller values result in better i-cache performance
constexpr uint32_t kITLBEntries = 16;

constexpr size_t kInvalidAddr = -1;

// A cache of precomputed results for a pair of clusters
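// Keys are ordered (pred, succ) pointer pairs. Instead of erasing entries when
// a cluster is merged away, the cluster is added to invalidKeys and contains()
// treats any pair involving it as missing, so the gain is recomputed lazily.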
class PrecomputedResults {
 public:
  PrecomputedResults() {}

  bool contains(Cluster* first, Cluster* second) const {
    if (invalidKeys.count(first) || invalidKeys.count(second)) {
      return false;
    }
    auto key = std::make_pair(first, second);
    return cache.find(key) != cache.end();
  }

  double get(Cluster* first, Cluster* second) const {
    auto key = std::make_pair(first, second);
    auto it = cache.find(key);
    assert(it != cache.end());
    return it->second;
  }

  void set(Cluster* first, Cluster* second, double value) {
    auto key = std::make_pair(first, second);
    cache[key] = value;
  }

  void invalidate(Cluster* cluster) {
    invalidKeys.insert(cluster);
  }

 private:
  std::unordered_map<std::pair<Cluster*, Cluster*>, double> cache;
  std::unordered_set<Cluster*> invalidKeys;
};

// A wrapper for algorithm-wide variables
struct AlgoState {
  const TargetGraph* cg;
  // the total number of samples in the graph
  double totalSamples;
  // target_id => cluster
  std::vector<Cluster*> funcCluster;
  // current address of the function from the beginning of its cluster
  std::vector<size_t> addr;
};

/*
 * Sorting clusters by their density in decreasing order
 */
void sortByDensity(std::vector<Cluster*>& clusters) {
  std::sort(
    clusters.begin(),
    clusters.end(),
    [&] (const Cluster* c1, const Cluster* c2) {
      double d1 = c1->density();
      double d2 = c2->density();
      // making sure the sorting is deterministic
      if (d1 != d2) return d1 > d2;
      if (c1->size != c2->size) return c1->size < c2->size;
      if (c1->samples != c2->samples) return c1->samples > c2->samples;
      return c1->targets[0] < c2->targets[0];
    }
  );
}

/*
 * Density of a cluster formed by merging a given pair of clusters
 */
double density(Cluster* clusterPred, Cluster* clusterSucc) {
  double combinedSamples = clusterPred->samples + clusterSucc->samples;
  double combinedSize = clusterPred->size + clusterSucc->size;
  return combinedSamples / combinedSize;
}

/*
 * The probability that a page with a given weight is not present in the cache.
 *
 * Assume that the hot functions are called in a random order; then the
 * probability of a TLB page being accessed after a function call is
 * p = pageSamples / totalSamples. The probability that the page is not
 * accessed is (1-p), and the probability that it is not in the cache (i.e. not
 * accessed during the last kITLBEntries function calls) is (1-p)^kITLBEntries.
 */
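// Worked example (illustrative numbers, not from a real profile): with
// kITLBEntries = 16 and p = 0.01, the miss probability is
// (1 - 0.01)^16 ≈ 0.85.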
double missProbability(const AlgoState& state, double pageSamples) {
  double p = pageSamples / state.totalSamples;
  double x = kITLBEntries;
  // avoiding precision issues for small values: use the second-order
  // expansion of (1-p)^x, i.e. 1 - x*p + x*(x-1)*p^2/2
  if (p < 0.0001) return (1.0 - x * p + x * (x - 1.0) * p * p / 2.0);
  return pow(1.0 - p, x);
}

/*
 * Expected hit ratio of the iTLB cache under the given order of clusters
 *
 * Given an ordering of hot functions (and hence, their assignment to the
 * iTLB pages), we can divide all function calls into two categories:
 * - 'short' ones that have a caller-callee distance less than a page;
 * - 'long' ones where the distance exceeds a page.
 * The short calls are likely to result in an iTLB cache hit. For the long
 * ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
 * often the page is accessed). Assuming that functions are sent to the iTLB
 * cache in a random order, the probability that a page is present in the
 * cache is proportional to the number of samples corresponding to the
 * functions on the page. The following procedure detects short and long
 * calls, and estimates the expected number of cache misses for the long ones.
 */
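// For instance (illustrative, assuming the 4KB kPageSize above): a call whose
// caller and callee end up 1KB apart is 'short' and counted as a hit, whereas
// a call spanning 100KB is 'long' and hits only with the page-residency
// probability computed by missProbability().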
double expectedCacheHitRatio(const AlgoState& state,
                             const std::vector<Cluster*>& clusters_) {
  // copy and sort by density
  std::vector<Cluster*> clusters(clusters_);
  sortByDensity(clusters);

  // generate function addresses with an alignment
  std::vector<size_t> addr(state.cg->targets.size(), kInvalidAddr);
  size_t curAddr = 0;
  // 'hotness' of the pages
  std::vector<double> pageSamples;
  for (auto cluster : clusters) {
    for (auto targetId : cluster->targets) {
      if (curAddr & 0xf) curAddr = (curAddr & ~0xf) + 16;
      addr[targetId] = curAddr;
      curAddr += state.cg->targets[targetId].size;
      // update page weight
      size_t page = addr[targetId] / kPageSize;
      while (pageSamples.size() <= page) pageSamples.push_back(0.0);
      pageSamples[page] += state.cg->targets[targetId].samples;
    }
  }

  // computing expected number of misses for every function
  double misses = 0;
  for (auto cluster : clusters) {
    for (auto targetId : cluster->targets) {
      size_t page = addr[targetId] / kPageSize;
      double samples = state.cg->targets[targetId].samples;
      // probability that the page is not present in the cache
      double missProb = missProbability(state, pageSamples[page]);

      // calls coming in from profiled callers
      for (auto pred : state.cg->targets[targetId].preds) {
        if (state.cg->targets[pred].samples == 0) continue;
        auto arc = state.cg->arcs.find(Arc(pred, targetId));

        size_t srcPage = (addr[pred] + (size_t)arc->avgCallOffset) / kPageSize;
        if (page != srcPage) {
          // a long call: count an expected miss
          misses += arc->weight * missProb;
        }
        samples -= arc->weight;
      }

      // the remaining samples come from the jitted code
      misses += samples * missProb;
    }
  }

  return 100.0 * (1.0 - misses / state.totalSamples);
}

/*
 * Get adjacent clusters (the ones that share an arc) with the given one
 */
std::unordered_set<Cluster*> adjacentClusters(const AlgoState& state,
                                              Cluster* cluster) {
  std::unordered_set<Cluster*> result;
  for (auto targetId : cluster->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      auto succCluster = state.funcCluster[succ];
      if (succCluster != nullptr && succCluster != cluster) {
        result.insert(succCluster);
      }
    }
    for (auto pred : state.cg->targets[targetId].preds) {
      auto predCluster = state.funcCluster[pred];
      if (predCluster != nullptr && predCluster != cluster) {
        result.insert(predCluster);
      }
    }
  }
  return result;
}

/*
 * The expected number of calls for an edge within the same TLB page
 */
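// One way to read the linear scaling below (an interpretation, not spelled out
// in the original comment): if the page boundary falls uniformly at random,
// two addresses d bytes apart (d <= kPageSize) land on the same page with
// probability (kPageSize - d) / kPageSize, so the edge contributes that
// fraction of its weight as same-page calls.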
double expectedCalls(int src_addr, int dst_addr, double edgeWeight) {
  int dist = std::abs(src_addr - dst_addr);
  if (dist > kPageSize) {
    return 0;
  }
  return (double(kPageSize - dist) / kPageSize) * edgeWeight;
}

/*
 * The expected number of calls within a given cluster with both endpoints on
 * the same TLB cache page
 */
double shortCalls(const AlgoState& state, Cluster* cluster) {
  double calls = 0;
  for (auto targetId : cluster->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      if (state.funcCluster[succ] == cluster) {
        auto arc = state.cg->arcs.find(Arc(targetId, succ));

        int src_addr = state.addr[targetId] + arc->avgCallOffset;
        int dst_addr = state.addr[succ];

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }
  return calls;
}

/*
 * The number of calls between the two clusters with both endpoints on
 * the same TLB page, assuming that a given pair of clusters gets merged
 */
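// The merged layout is simulated by keeping clusterPred's addresses as they
// are and shifting clusterSucc's addresses by clusterPred->size, since
// clusterSucc would be appended right after clusterPred.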
double shortCalls(const AlgoState& state,
                  Cluster* clusterPred,
                  Cluster* clusterSucc) {
  double calls = 0;
  // calls from clusterPred into clusterSucc
  for (auto targetId : clusterPred->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      if (state.funcCluster[succ] == clusterSucc) {
        auto arc = state.cg->arcs.find(Arc(targetId, succ));

        int src_addr = state.addr[targetId] + arc->avgCallOffset;
        int dst_addr = state.addr[succ] + clusterPred->size;

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }

  // calls from clusterSucc into clusterPred
  for (auto targetId : clusterPred->targets) {
    for (auto pred : state.cg->targets[targetId].preds) {
      if (state.funcCluster[pred] == clusterSucc) {
        auto arc = state.cg->arcs.find(Arc(pred, targetId));

        int src_addr = state.addr[pred] + arc->avgCallOffset +
          clusterPred->size;
        int dst_addr = state.addr[targetId];

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }

  return calls;
}

/*
 * The gain of merging two clusters.
 *
 * We assume that the final clusters are sorted by their density, and hence
 * every cluster is likely to be adjacent with clusters of the same density.
 * Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
 * which is used to compute the probability of cache misses for long calls
 * of a given cluster.
 * The result is also scaled by the size of the resulting cluster in order to
 * increase the chance of merging short clusters, which is helpful for
 * the i-cache performance.
 */
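// In short, the value computed below is
//   gain = (expectedMissesPred + expectedMissesSucc - missesNew)
//          / (clusterPred->size + clusterSucc->size)
// i.e. the expected number of iTLB misses saved by the merge, per byte of the
// merged cluster.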
double mergeGain(const AlgoState& state,
                 Cluster* clusterPred,
                 Cluster* clusterSucc) {
  // cache misses on the first cluster
  double longCallsPred = clusterPred->samples - shortCalls(state, clusterPred);
  double probPred = missProbability(state, clusterPred->density() * kPageSize);
  double expectedMissesPred = longCallsPred * probPred;

  // cache misses on the second cluster
  double longCallsSucc = clusterSucc->samples - shortCalls(state, clusterSucc);
  double probSucc = missProbability(state, clusterSucc->density() * kPageSize);
  double expectedMissesSucc = longCallsSucc * probSucc;

  // cache misses on the merged cluster
  double longCallsNew = longCallsPred + longCallsSucc -
                        shortCalls(state, clusterPred, clusterSucc);
  double newDensity = density(clusterPred, clusterSucc);
  double probNew = missProbability(state, newDensity * kPageSize);
  double missesNew = longCallsNew * probNew;

  double gain = expectedMissesPred + expectedMissesSucc - missesNew;
  // scaling the result to increase the importance of merging short clusters
  return gain / (clusterPred->size + clusterSucc->size);
}

/*
 * Merge cluster 'other' into cluster 'into'
 */
void mergeInto(AlgoState& state, Cluster* into, Cluster* other) {
  auto& targets = other->targets;
  into->targets.insert(into->targets.end(), targets.begin(), targets.end());
  into->size += other->size;
  into->samples += other->samples;

  // reassign functions to the merged cluster and recompute their addresses
  size_t curAddr = 0;
  for (auto targetId : into->targets) {
    state.funcCluster[targetId] = into;
    state.addr[targetId] = curAddr;
    curAddr += state.cg->targets[targetId].size;
  }

  other->targets.clear();
}

/*
 * HFSortPlus - layout of hot functions with iTLB cache optimization
 */
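// High-level outline of the greedy algorithm below: start with one cluster per
// hot function, repeatedly merge the pair of adjacent clusters with the best
// (cached) mergeGain while the gain stays positive, then emit the surviving
// clusters sorted by density.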
std::vector<Cluster> hfsortPlus(const TargetGraph& cg) {
  // create a cluster for every function
  std::vector<Cluster> allClusters;
  allClusters.reserve(cg.targets.size());
  for (size_t f = 0; f < cg.targets.size(); f++) {
    allClusters.emplace_back(f, cg.targets[f]);
  }

  // initialize objects used by the algorithm
  std::vector<Cluster*> clusters;
  clusters.reserve(cg.targets.size());
  AlgoState state;
  state.cg = &cg;
  state.totalSamples = 0;
  state.funcCluster = std::vector<Cluster*>(cg.targets.size(), nullptr);
  state.addr = std::vector<size_t>(cg.targets.size(), kInvalidAddr);
  for (size_t f = 0; f < cg.targets.size(); f++) {
    if (cg.targets[f].samples == 0) continue;

    clusters.push_back(&allClusters[f]);
    state.funcCluster[f] = &allClusters[f];
    state.addr[f] = 0;
    state.totalSamples += cg.targets[f].samples;
  }

  HFTRACE(1, "Starting hfsort+ for %lu clusters\n", clusters.size());
  HFTRACE(1, "Initial expected iTLB cache hit ratio: %.4lf\n",
          expectedCacheHitRatio(state, clusters));

  // the cache keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing x or y (and recompute them on the next iteration)
  PrecomputedResults cache;

  int steps = 0;
  // merge pairs of clusters while there is an improvement
  while (clusters.size() > 1) {
    if (steps % 500 == 0) {
      HFTRACE(1, "step = %d clusters = %lu expected_hit_rate = %.4lf\n",
              steps,
              clusters.size(),
              expectedCacheHitRatio(state, clusters));
    }
    steps++;

    Cluster* bestClusterPred = nullptr;
    Cluster* bestClusterSucc = nullptr;
    double bestGain = -1;
    for (auto clusterPred : clusters) {
      // get candidates for merging with the current cluster
      auto candidateClusters = adjacentClusters(state, clusterPred);

      // find the best candidate
      for (auto clusterSucc : candidateClusters) {
        // get the cost of merging the two clusters
        if (!cache.contains(clusterPred, clusterSucc)) {
          double value = mergeGain(state, clusterPred, clusterSucc);
          cache.set(clusterPred, clusterSucc, value);
        }

        double gain = cache.get(clusterPred, clusterSucc);
        // break ties by density so that the hottest clusters are merged first
        if (gain > bestGain || (std::abs(gain - bestGain) < 1e-8 &&
                                density(clusterPred, clusterSucc) >
                                density(bestClusterPred, bestClusterSucc))) {
          bestGain = gain;
          bestClusterPred = clusterPred;
          bestClusterSucc = clusterSucc;
        }
      }
    }

    if (bestGain <= 0.0) break;

    cache.invalidate(bestClusterPred);
    cache.invalidate(bestClusterSucc);

    // merge the best pair of clusters
    mergeInto(state, bestClusterPred, bestClusterSucc);
    // remove bestClusterSucc from the list of active clusters
    auto iter = std::remove(clusters.begin(), clusters.end(), bestClusterSucc);
    clusters.erase(iter, clusters.end());
  }

  HFTRACE(1, "Completed hfsort+ with %lu clusters\n", clusters.size());
  HFTRACE(1, "Final expected iTLB cache hit ratio: %.4lf\n",
          expectedCacheHitRatio(state, clusters));

  // Return the set of clusters that are left, which are the ones that
  // didn't get merged (so their first func is its original func).
  sortByDensity(clusters);
  std::vector<Cluster> result;
  for (auto cluster : clusters) {
    result.emplace_back(std::move(*cluster));