[hiphop-php.git] hphp/util/hfsort-plus.cpp
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#include "hphp/util/hfsort.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <folly/Format.h>

#include "hphp/util/hash.h"
#include "hphp/util/trace.h"

namespace HPHP { namespace hfsort {
namespace {

#define HFTRACE(LEVEL, ...)                                 \
  if (Trace::moduleEnabled(Trace::hfsort, LEVEL)) {         \
    Trace::traceRelease(__VA_ARGS__);                       \
  }

// The size of a cache page.
// Since we optimize both for the iTLB cache (2MB pages) and the i-cache
// (64-byte lines), we use a value that fits both.
constexpr uint32_t kPageSize = uint32_t(1) << 12;

// Capacity of the iTLB cache: larger values yield a more iTLB-friendly
// result, while smaller values result in better i-cache performance.
constexpr uint32_t kITLBEntries = 16;

constexpr size_t kInvalidAddr = -1;
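
// Note: with the values above, the cache model used below approximates an
// iTLB holding the 16 most recently used 4KB pages, i.e. roughly 64KB of
// hot code.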

// A cache of precomputed results for a pair of clusters
class PrecomputedResults {
 public:
  PrecomputedResults() {}

  bool contains(Cluster* first, Cluster* second) const {
    if (invalidKeys.count(first) || invalidKeys.count(second)) {
      return false;
    }
    auto key = std::make_pair(first, second);
    return cache.find(key) != cache.end();
  }

  double get(Cluster* first, Cluster* second) const {
    auto key = std::make_pair(first, second);
    auto it = cache.find(key);
    assert(it != cache.end());
    return it->second;
  }

  void set(Cluster* first, Cluster* second, double value) {
    auto key = std::make_pair(first, second);
    cache[key] = value;
  }

  void validateAll() {
    invalidKeys.clear();
  }

  void invalidate(Cluster* cluster) {
    invalidKeys.insert(cluster);
  }

 private:
  std::unordered_map<std::pair<Cluster*, Cluster*>, double> cache;
  std::unordered_set<Cluster*> invalidKeys;
};

// A wrapper for algorithm-wide variables
struct AlgoState {
  // the call graph
  const TargetGraph* cg;
  // the total number of samples in the graph
  double totalSamples;
  // target_id => cluster
  std::vector<Cluster*> funcCluster;
  // current address of the function from the beginning of its cluster
  std::vector<size_t> addr;
};

/*
 * Sorting clusters by their density in decreasing order
 */
void sortByDensity(std::vector<Cluster*>& clusters) {
  std::sort(
    clusters.begin(),
    clusters.end(),
    [&] (const Cluster* c1, const Cluster* c2) {
      double d1 = c1->density();
      double d2 = c2->density();
      // making sure the sorting is deterministic
      if (d1 != d2) return d1 > d2;
      if (c1->size != c2->size) return c1->size < c2->size;
      if (c1->samples != c2->samples) return c1->samples > c2->samples;
      return c1->targets[0] < c2->targets[0];
    }
  );
}

/*
 * Density of a cluster formed by merging a given pair of clusters
 */
double density(Cluster* clusterPred, Cluster* clusterSucc) {
  double combinedSamples = clusterPred->samples + clusterSucc->samples;
  double combinedSize = clusterPred->size + clusterSucc->size;
  return combinedSamples / combinedSize;
}
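
// For example, merging a cluster with 300 samples over 1,000 bytes and one
// with 100 samples over 3,000 bytes yields a combined density of
// (300 + 100) / (1000 + 3000) = 0.1 samples per byte.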

/*
 * The probability that a page with a given weight is not present in the cache.
 *
 * Assume that the hot functions are called in a random order; then the
 * probability of a TLB page being accessed after a function call is
 * p = pageSamples / totalSamples. The probability that the page is not
 * accessed is (1 - p), and the probability that it is not in the cache (i.e.,
 * not accessed during the last kITLBEntries function calls) is
 * (1 - p)^kITLBEntries.
 */
double missProbability(const AlgoState& state, double pageSamples) {
  double p = pageSamples / state.totalSamples;
  double x = kITLBEntries;
  // avoiding precision issues for small values
  if (p < 0.0001) return (1.0 - x * p + x * (x - 1.0) * p * p / 2.0);
  return pow(1.0 - p, x);
}
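
// For example, with kITLBEntries = 16, a page holding 5% of all samples
// (p = 0.05) misses with probability 0.95^16 ~= 0.44. For very small p the
// second-order expansion 1 - 16*p + 120*p^2 is used, which avoids the
// precision issues of pow(1 - p, 16) when p is tiny.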

/*
 * Expected hit ratio of the iTLB cache under the given order of clusters.
 *
 * Given an ordering of hot functions (and hence, their assignment to the
 * iTLB pages), we can divide all function calls into two categories:
 * - 'short' ones that have a caller-callee distance less than a page;
 * - 'long' ones where the distance exceeds a page.
 * The short calls are likely to result in an iTLB cache hit. For the long
 * ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
 * often the page is accessed). Assuming that functions are sent to the iTLB
 * cache in a random order, the probability that a page is present in the
 * cache is proportional to the number of samples corresponding to the
 * functions on the page. The following procedure detects short and long
 * calls, and estimates the expected number of cache misses for the long ones.
 */
double expectedCacheHitRatio(const AlgoState& state,
                             const std::vector<Cluster*>& clusters_) {
  // copy and sort by density
  std::vector<Cluster*> clusters(clusters_);
  sortByDensity(clusters);

  // generate function addresses with an alignment
  std::vector<size_t> addr(state.cg->targets.size(), kInvalidAddr);
  size_t curAddr = 0;
  // 'hotness' of the pages
  std::vector<double> pageSamples;
  for (auto cluster : clusters) {
    for (auto targetId : cluster->targets) {
      if (curAddr & 0xf) curAddr = (curAddr & ~0xf) + 16;
      addr[targetId] = curAddr;
      curAddr += state.cg->targets[targetId].size;
      // update page weight
      size_t page = addr[targetId] / kPageSize;
      while (pageSamples.size() <= page) pageSamples.push_back(0.0);
      pageSamples[page] += state.cg->targets[targetId].samples;
    }
  }

  // compute the expected number of misses for every function
  double misses = 0;
  for (auto cluster : clusters) {
    for (auto targetId : cluster->targets) {
      size_t page = addr[targetId] / kPageSize;
      double samples = state.cg->targets[targetId].samples;
      // probability that the page is not present in the cache
      double missProb = missProbability(state, pageSamples[page]);

      for (auto pred : state.cg->targets[targetId].preds) {
        if (state.cg->targets[pred].samples == 0) continue;
        auto arc = state.cg->arcs.find(Arc(pred, targetId));

        // the source page
        size_t srcPage = (addr[pred] + (size_t)arc->avgCallOffset) / kPageSize;
        if (page != srcPage) {
          // this is a long call; count it as a miss with probability missProb
          misses += arc->weight * missProb;
        }
        samples -= arc->weight;
      }

      // the remaining samples come from the jitted code
      misses += samples * missProb;
    }
  }

  return 100.0 * (1.0 - misses / state.totalSamples);
}
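
// E.g., if the loops above attribute 2,000 expected misses out of
// totalSamples = 10,000, the reported expected hit ratio is
// 100 * (1 - 0.2) = 80.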

/*
 * Get the clusters adjacent to the given one, i.e., those sharing an arc
 * with it
 */
std::unordered_set<Cluster*> adjacentClusters(const AlgoState& state,
                                              Cluster* cluster) {
  std::unordered_set<Cluster*> result;
  for (auto targetId : cluster->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      auto succCluster = state.funcCluster[succ];
      if (succCluster != nullptr && succCluster != cluster) {
        result.insert(succCluster);
      }
    }
    for (auto pred : state.cg->targets[targetId].preds) {
      auto predCluster = state.funcCluster[pred];
      if (predCluster != nullptr && predCluster != cluster) {
        result.insert(predCluster);
      }
    }
  }
  return result;
}

/*
 * The expected number of calls for an edge within the same TLB page
 */
double expectedCalls(int src_addr, int dst_addr, double edgeWeight) {
  int dist = std::abs(src_addr - dst_addr);
  if (dist > kPageSize) {
    return 0;
  }
  return (double(kPageSize - dist) / kPageSize) * edgeWeight;
}
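
// E.g., with kPageSize = 4096, a call whose endpoints are 1,024 bytes apart
// keeps (4096 - 1024) / 4096 = 75% of its weight; a call spanning a full
// page or more contributes nothing.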

/*
 * The expected number of calls within a given cluster with both endpoints on
 * the same TLB cache page
 */
double shortCalls(const AlgoState& state, Cluster* cluster) {
  double calls = 0;
  for (auto targetId : cluster->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      if (state.funcCluster[succ] == cluster) {
        auto arc = state.cg->arcs.find(Arc(targetId, succ));

        int src_addr = state.addr[targetId] + arc->avgCallOffset;
        int dst_addr = state.addr[succ];

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }
  return calls;
}

/*
 * The number of calls between the two clusters with both endpoints on the
 * same TLB page, assuming that the given pair of clusters gets merged
 */
double shortCalls(const AlgoState& state,
                  Cluster* clusterPred,
                  Cluster* clusterSucc) {
  double calls = 0;
  // calls from clusterPred into clusterSucc; clusterSucc is assumed to be
  // laid out immediately after clusterPred, hence the size offset
  for (auto targetId : clusterPred->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      if (state.funcCluster[succ] == clusterSucc) {
        auto arc = state.cg->arcs.find(Arc(targetId, succ));

        int src_addr = state.addr[targetId] + arc->avgCallOffset;
        int dst_addr = state.addr[succ] + clusterPred->size;

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }

  // calls from clusterSucc back into clusterPred
  for (auto targetId : clusterPred->targets) {
    for (auto pred : state.cg->targets[targetId].preds) {
      if (state.funcCluster[pred] == clusterSucc) {
        auto arc = state.cg->arcs.find(Arc(pred, targetId));

        int src_addr = state.addr[pred] + arc->avgCallOffset +
          clusterPred->size;
        int dst_addr = state.addr[targetId];

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }

  return calls;
}

/*
 * The gain of merging two clusters.
 *
 * We assume that the final clusters are sorted by their density, and hence
 * every cluster is likely to be adjacent to clusters of a similar density.
 * Thus, the 'hotness' of every cluster can be estimated by density * pageSize,
 * which is used to compute the probability of cache misses for the long calls
 * of a given cluster.
 * The result is also scaled by the size of the resulting cluster in order to
 * increase the chance of merging short clusters, which is helpful for
 * i-cache performance.
 */
double mergeGain(const AlgoState& state,
                 Cluster* clusterPred,
                 Cluster* clusterSucc) {
  // cache misses on the first cluster
  double longCallsPred = clusterPred->samples - shortCalls(state, clusterPred);
  double probPred = missProbability(state, clusterPred->density() * kPageSize);
  double expectedMissesPred = longCallsPred * probPred;

  // cache misses on the second cluster
  double longCallsSucc = clusterSucc->samples - shortCalls(state, clusterSucc);
  double probSucc = missProbability(state, clusterSucc->density() * kPageSize);
  double expectedMissesSucc = longCallsSucc * probSucc;

  // cache misses on the merged cluster
  double longCallsNew = longCallsPred + longCallsSucc -
                        shortCalls(state, clusterPred, clusterSucc);
  double newDensity = density(clusterPred, clusterSucc);
  double probNew = missProbability(state, newDensity * kPageSize);
  double missesNew = longCallsNew * probNew;

  double gain = expectedMissesPred + expectedMissesSucc - missesNew;
  // scaling the result to increase the importance of merging short clusters
  return gain / (clusterPred->size + clusterSucc->size);
}
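
// For example, if the predecessor accounts for 270 expected misses, the
// successor for 150, and the merged cluster for 350, the raw gain is
// 270 + 150 - 350 = 70; with a combined size of 7,000 bytes the returned
// value is 70 / 7000 = 0.01.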

/*
 * Merge two clusters
 */
void mergeInto(AlgoState& state, Cluster* into, Cluster* other) {
  auto& targets = other->targets;
  into->targets.insert(into->targets.end(), targets.begin(), targets.end());
  into->size += other->size;
  into->samples += other->samples;

  size_t curAddr = 0;
  for (auto targetId : into->targets) {
    state.funcCluster[targetId] = into;
    state.addr[targetId] = curAddr;
    curAddr += state.cg->targets[targetId].size;
  }

  other->size = 0;
  other->samples = 0;
  other->targets.clear();
}

} // anonymous namespace

/*
 * HFSortPlus - layout of hot functions with iTLB cache optimization
 */
std::vector<Cluster> hfsortPlus(const TargetGraph& cg) {
  // create a cluster for every function
  std::vector<Cluster> allClusters;
  allClusters.reserve(cg.targets.size());
  for (size_t f = 0; f < cg.targets.size(); f++) {
    allClusters.emplace_back(f, cg.targets[f]);
  }

  // initialize objects used by the algorithm
  std::vector<Cluster*> clusters;
  clusters.reserve(cg.targets.size());
  AlgoState state;
  state.cg = &cg;
  state.totalSamples = 0;
  state.funcCluster = std::vector<Cluster*>(cg.targets.size(), nullptr);
  state.addr = std::vector<size_t>(cg.targets.size(), kInvalidAddr);
  for (size_t f = 0; f < cg.targets.size(); f++) {
    if (cg.targets[f].samples == 0) continue;

    clusters.push_back(&allClusters[f]);
    state.funcCluster[f] = &allClusters[f];
    state.addr[f] = 0;
    state.totalSamples += cg.targets[f].samples;
  }

  HFTRACE(1, "Starting hfsort+ for %lu clusters\n", clusters.size());
  HFTRACE(1, "Initial expected iTLB cache hit ratio: %.4lf\n",
          expectedCacheHitRatio(state, clusters));

  // the cache keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x, y) gets merged, we need to invalidate the
  // pairs containing x or y (and recompute them on the next iteration)
  PrecomputedResults cache;

  int steps = 0;
  // merge pairs of clusters while there is an improvement
  while (clusters.size() > 1) {
    if (steps % 500 == 0) {
      HFTRACE(1, "step = %d clusters = %lu expected_hit_rate = %.4lf\n",
              steps,
              clusters.size(),
              expectedCacheHitRatio(state, clusters));
    }
    steps++;

    Cluster* bestClusterPred = nullptr;
    Cluster* bestClusterSucc = nullptr;
    double bestGain = -1;
    for (auto clusterPred : clusters) {
      // get candidates for merging with the current cluster
      auto candidateClusters = adjacentClusters(state, clusterPred);

      // find the best candidate
      for (auto clusterSucc : candidateClusters) {
        // get the cost of merging the two clusters
        if (!cache.contains(clusterPred, clusterSucc)) {
          double value = mergeGain(state, clusterPred, clusterSucc);
          cache.set(clusterPred, clusterSucc, value);
        }

        double gain = cache.get(clusterPred, clusterSucc);
        // break ties by density so that the hottest clusters are merged first
        if (gain > bestGain || (std::abs(gain - bestGain) < 1e-8 &&
                                density(clusterPred, clusterSucc) >
                                density(bestClusterPred, bestClusterSucc))) {
          bestGain = gain;
          bestClusterPred = clusterPred;
          bestClusterSucc = clusterSucc;
        }
      }
    }
    cache.validateAll();

    if (bestGain <= 0.0) break;

    cache.invalidate(bestClusterPred);
    cache.invalidate(bestClusterSucc);

    // merge the best pair of clusters
    mergeInto(state, bestClusterPred, bestClusterSucc);
    // remove bestClusterSucc from the list of active clusters
    auto iter = std::remove(clusters.begin(), clusters.end(), bestClusterSucc);
    clusters.erase(iter, clusters.end());
  }

  HFTRACE(1, "Completed hfsort+ with %lu clusters\n", clusters.size());
  HFTRACE(1, "Final expected iTLB cache hit ratio: %.4lf\n",
          expectedCacheHitRatio(state, clusters));

  // Return the set of clusters that are left, which are the ones that
  // didn't get merged (so their first func is its original func).
  sortByDensity(clusters);
  std::vector<Cluster> result;
  for (auto cluster : clusters) {
    result.emplace_back(std::move(*cluster));
  }
  return result;
}

}}