/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#include "hphp/util/hfsort.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <unordered_map>
#include <unordered_set>

#include <folly/Format.h>

#include "hphp/util/hash.h"
#include "hphp/util/trace.h"

namespace HPHP { namespace hfsort {

#define HFTRACE(LEVEL, ...)                                    \
  if (Trace::moduleEnabled(Trace::hfsort, LEVEL)) {            \
    Trace::traceRelease(__VA_ARGS__);                          \
  }

// The size of a cache page
// Since we optimize both for iTLB cache (2MB pages) and i-cache (64b pages),
// using a value that fits both
constexpr uint32_t kPageSize = uint32_t(1) << 12;
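// Note: 1 << 12 = 4 KB, which divides a 2MB huge page evenly and is a
// multiple of the 64-byte cache-line size.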

// Capacity of the iTLB cache: larger values yield a more iTLB-friendly result,
// while smaller values result in better i-cache performance
constexpr uint32_t kITLBEntries = 16;

constexpr size_t kInvalidAddr = -1;

// A cache of precomputed results for a pair of clusters
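// Keys are ordered (pred, succ) pointer pairs. Instead of erasing entries when
// a cluster is merged away, the cluster is added to invalidKeys and contains()
// treats any pair involving it as missing, so the gain is recomputed lazily.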
class PrecomputedResults {
 public:
  PrecomputedResults() {}

  bool contains(Cluster* first, Cluster* second) const {
    if (invalidKeys.count(first) || invalidKeys.count(second)) {
      return false;
    }
    auto key = std::make_pair(first, second);
    return cache.find(key) != cache.end();
  }

  double get(Cluster* first, Cluster* second) const {
    auto key = std::make_pair(first, second);
    auto it = cache.find(key);
    assert(it != cache.end());
    return it->second;
  }

  void set(Cluster* first, Cluster* second, double value) {
    auto key = std::make_pair(first, second);
    cache[key] = value;
  }

  void invalidate(Cluster* cluster) {
    invalidKeys.insert(cluster);
  }

 private:
  std::unordered_map<std::pair<Cluster*, Cluster*>, double> cache;
  std::unordered_set<Cluster*> invalidKeys;
};

// A wrapper for algorithm-wide variables
struct AlgoState {
  const TargetGraph* cg;
  // the total number of samples in the graph
  double totalSamples;
  // target_id => cluster
  std::vector<Cluster*> funcCluster;
  // current address of the function from the beginning of its cluster
  std::vector<size_t> addr;
};

/*
 * Sorting clusters by their density in decreasing order
 */
void sortByDensity(std::vector<Cluster*>& clusters) {
  std::sort(
    clusters.begin(),
    clusters.end(),
    [&] (const Cluster* c1, const Cluster* c2) {
      double d1 = c1->density();
      double d2 = c2->density();
      // making sure the sorting is deterministic
      if (d1 != d2) return d1 > d2;
      if (c1->size != c2->size) return c1->size < c2->size;
      if (c1->samples != c2->samples) return c1->samples > c2->samples;
      return c1->targets[0] < c2->targets[0];
    }
  );
}

/*
 * Density of a cluster formed by merging a given pair of clusters
 */
double density(Cluster* clusterPred, Cluster* clusterSucc) {
  double combinedSamples = clusterPred->samples + clusterSucc->samples;
  double combinedSize = clusterPred->size + clusterSucc->size;
  return combinedSamples / combinedSize;
}

/*
 * The probability that a page with a given weight is not present in the cache.
 *
 * Assume that the hot functions are called in a random order; then the
 * probability of a TLB page being accessed after a function call is
 * p = pageSamples / totalSamples. The probability that the page is not
 * accessed is (1-p), and the probability that it is not in the cache (i.e. not
 * accessed during the last kITLBEntries function calls) is (1-p)^kITLBEntries.
 */
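// Worked example (illustrative numbers, not from a real profile): with
// kITLBEntries = 16 and p = 0.01, the miss probability is
// (1 - 0.01)^16 ≈ 0.85.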
double missProbability(const AlgoState& state, double pageSamples) {
  double p = pageSamples / state.totalSamples;
  double x = kITLBEntries;
  // avoiding precision issues for small values: use the second-order
  // expansion of (1-p)^x, i.e. 1 - x*p + x*(x-1)*p^2/2
  if (p < 0.0001) return (1.0 - x * p + x * (x - 1.0) * p * p / 2.0);
  return pow(1.0 - p, x);
}

/*
 * Expected hit ratio of the iTLB cache under the given order of clusters
 *
 * Given an ordering of hot functions (and hence, their assignment to the
 * iTLB pages), we can divide all function calls into two categories:
 * - 'short' ones that have a caller-callee distance less than a page;
 * - 'long' ones where the distance exceeds a page.
 * The short calls are likely to result in an iTLB cache hit. For the long
 * ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
 * often the page is accessed). Assuming that functions are sent to the iTLB
 * cache in a random order, the probability that a page is present in the
 * cache is proportional to the number of samples corresponding to the
 * functions on the page. The following procedure detects short and long
 * calls, and estimates the expected number of cache misses for the long ones.
 */
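// For instance (illustrative, assuming the 4KB kPageSize above): a call whose
// caller and callee end up 1KB apart is 'short' and counted as a hit, whereas
// a call spanning 100KB is 'long' and hits only with the page-residency
// probability computed by missProbability().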
double expectedCacheHitRatio(const AlgoState& state,
                             const std::vector<Cluster*>& clusters_) {
  // copy and sort by density
  std::vector<Cluster*> clusters(clusters_);
  sortByDensity(clusters);

  // generate function addresses with an alignment
  std::vector<size_t> addr(state.cg->targets.size(), kInvalidAddr);
  size_t curAddr = 0;
  // 'hotness' of the pages
  std::vector<double> pageSamples;
  for (auto cluster : clusters) {
    for (auto targetId : cluster->targets) {
      if (curAddr & 0xf) curAddr = (curAddr & ~0xf) + 16;
      addr[targetId] = curAddr;
      curAddr += state.cg->targets[targetId].size;
      // update page weight
      size_t page = addr[targetId] / kPageSize;
      while (pageSamples.size() <= page) pageSamples.push_back(0.0);
      pageSamples[page] += state.cg->targets[targetId].samples;
    }
  }

  // computing expected number of misses for every function
  double misses = 0;
  for (auto cluster : clusters) {
    for (auto targetId : cluster->targets) {
      size_t page = addr[targetId] / kPageSize;
      double samples = state.cg->targets[targetId].samples;
      // probability that the page is not present in the cache
      double missProb = missProbability(state, pageSamples[page]);

      // calls coming in from profiled callers
      for (auto pred : state.cg->targets[targetId].preds) {
        if (state.cg->targets[pred].samples == 0) continue;
        auto arc = state.cg->arcs.find(Arc(pred, targetId));

        size_t srcPage = (addr[pred] + (size_t)arc->avgCallOffset) / kPageSize;
        if (page != srcPage) {
          // a long call: count an expected miss
          misses += arc->weight * missProb;
        }
        samples -= arc->weight;
      }

      // the remaining samples come from the jitted code
      misses += samples * missProb;
    }
  }

  return 100.0 * (1.0 - misses / state.totalSamples);
}

/*
 * Get adjacent clusters (the ones that share an arc) with the given one
 */
std::unordered_set<Cluster*> adjacentClusters(const AlgoState& state,
                                              Cluster* cluster) {
  std::unordered_set<Cluster*> result;
  for (auto targetId : cluster->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      auto succCluster = state.funcCluster[succ];
      if (succCluster != nullptr && succCluster != cluster) {
        result.insert(succCluster);
      }
    }
    for (auto pred : state.cg->targets[targetId].preds) {
      auto predCluster = state.funcCluster[pred];
      if (predCluster != nullptr && predCluster != cluster) {
        result.insert(predCluster);
      }
    }
  }
  return result;
}

/*
 * The expected number of calls for an edge within the same TLB page
 */
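// One way to read the linear scaling below (an interpretation, not spelled out
// in the original comment): if the page boundary falls uniformly at random,
// two addresses d bytes apart (d <= kPageSize) land on the same page with
// probability (kPageSize - d) / kPageSize, so the edge contributes that
// fraction of its weight as same-page calls.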
double expectedCalls(int src_addr, int dst_addr, double edgeWeight) {
  int dist = std::abs(src_addr - dst_addr);
  if (dist > kPageSize) {
    return 0;
  }
  return (double(kPageSize - dist) / kPageSize) * edgeWeight;
}

/*
 * The expected number of calls within a given cluster with both endpoints on
 * the same TLB cache page
 */
double shortCalls(const AlgoState& state, Cluster* cluster) {
  double calls = 0;
  for (auto targetId : cluster->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      if (state.funcCluster[succ] == cluster) {
        auto arc = state.cg->arcs.find(Arc(targetId, succ));

        int src_addr = state.addr[targetId] + arc->avgCallOffset;
        int dst_addr = state.addr[succ];

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }
  return calls;
}

/*
 * The number of calls between the two clusters with both endpoints on
 * the same TLB page, assuming that a given pair of clusters gets merged
 */
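// The merged layout is simulated by keeping clusterPred's addresses as they
// are and shifting clusterSucc's addresses by clusterPred->size, since
// clusterSucc would be appended right after clusterPred.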
double shortCalls(const AlgoState& state,
                  Cluster* clusterPred,
                  Cluster* clusterSucc) {
  double calls = 0;
  // calls from clusterPred into clusterSucc
  for (auto targetId : clusterPred->targets) {
    for (auto succ : state.cg->targets[targetId].succs) {
      if (state.funcCluster[succ] == clusterSucc) {
        auto arc = state.cg->arcs.find(Arc(targetId, succ));

        int src_addr = state.addr[targetId] + arc->avgCallOffset;
        int dst_addr = state.addr[succ] + clusterPred->size;

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }

  // calls from clusterSucc into clusterPred
  for (auto targetId : clusterPred->targets) {
    for (auto pred : state.cg->targets[targetId].preds) {
      if (state.funcCluster[pred] == clusterSucc) {
        auto arc = state.cg->arcs.find(Arc(pred, targetId));

        int src_addr = state.addr[pred] + arc->avgCallOffset +
          clusterPred->size;
        int dst_addr = state.addr[targetId];

        calls += expectedCalls(src_addr, dst_addr, arc->weight);
      }
    }
  }

  return calls;
}

/*
 * The gain of merging two clusters.
 *
 * We assume that the final clusters are sorted by their density, and hence
 * every cluster is likely to be adjacent with clusters of the same density.
 * Thus, the 'hotness' of every cluster can be estimated by density*pageSize,
 * which is used to compute the probability of cache misses for long calls
 * of a given cluster.
 * The result is also scaled by the size of the resulting cluster in order to
 * increase the chance of merging short clusters, which is helpful for
 * the i-cache performance.
 */
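// In short, the value computed below is
//   gain = (expectedMissesPred + expectedMissesSucc - missesNew)
//          / (clusterPred->size + clusterSucc->size)
// i.e. the expected number of iTLB misses saved by the merge, per byte of the
// merged cluster.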
double mergeGain(const AlgoState& state,
                 Cluster* clusterPred,
                 Cluster* clusterSucc) {
  // cache misses on the first cluster
  double longCallsPred = clusterPred->samples - shortCalls(state, clusterPred);
  double probPred = missProbability(state, clusterPred->density() * kPageSize);
  double expectedMissesPred = longCallsPred * probPred;

  // cache misses on the second cluster
  double longCallsSucc = clusterSucc->samples - shortCalls(state, clusterSucc);
  double probSucc = missProbability(state, clusterSucc->density() * kPageSize);
  double expectedMissesSucc = longCallsSucc * probSucc;

  // cache misses on the merged cluster
  double longCallsNew = longCallsPred + longCallsSucc -
                        shortCalls(state, clusterPred, clusterSucc);
  double newDensity = density(clusterPred, clusterSucc);
  double probNew = missProbability(state, newDensity * kPageSize);
  double missesNew = longCallsNew * probNew;

  double gain = expectedMissesPred + expectedMissesSucc - missesNew;
  // scaling the result to increase the importance of merging short clusters
  return gain / (clusterPred->size + clusterSucc->size);
}

/*
 * Merge cluster 'other' into cluster 'into'
 */
void mergeInto(AlgoState& state, Cluster* into, Cluster* other) {
  auto& targets = other->targets;
  into->targets.insert(into->targets.end(), targets.begin(), targets.end());
  into->size += other->size;
  into->samples += other->samples;

  // reassign functions to the merged cluster and recompute their addresses
  size_t curAddr = 0;
  for (auto targetId : into->targets) {
    state.funcCluster[targetId] = into;
    state.addr[targetId] = curAddr;
    curAddr += state.cg->targets[targetId].size;
  }

  other->targets.clear();
}

/*
 * HFSortPlus - layout of hot functions with iTLB cache optimization
 */
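// High-level outline of the greedy algorithm below: start with one cluster per
// hot function, repeatedly merge the pair of adjacent clusters with the best
// (cached) mergeGain while the gain stays positive, then emit the surviving
// clusters sorted by density.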
std::vector<Cluster> hfsortPlus(const TargetGraph& cg) {
  // create a cluster for every function
  std::vector<Cluster> allClusters;
  allClusters.reserve(cg.targets.size());
  for (size_t f = 0; f < cg.targets.size(); f++) {
    allClusters.emplace_back(f, cg.targets[f]);
  }

  // initialize objects used by the algorithm
  std::vector<Cluster*> clusters;
  clusters.reserve(cg.targets.size());
  AlgoState state;
  state.cg = &cg;
  state.totalSamples = 0;
  state.funcCluster = std::vector<Cluster*>(cg.targets.size(), nullptr);
  state.addr = std::vector<size_t>(cg.targets.size(), kInvalidAddr);
  for (size_t f = 0; f < cg.targets.size(); f++) {
    if (cg.targets[f].samples == 0) continue;

    clusters.push_back(&allClusters[f]);
    state.funcCluster[f] = &allClusters[f];
    state.addr[f] = 0;
    state.totalSamples += cg.targets[f].samples;
  }

  HFTRACE(1, "Starting hfsort+ for %lu clusters\n", clusters.size());
  HFTRACE(1, "Initial expected iTLB cache hit ratio: %.4lf\n",
          expectedCacheHitRatio(state, clusters));

  // the cache keeps precomputed values of mergeGain for pairs of clusters;
  // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
  // containing x or y (and recompute them on the next iteration)
  PrecomputedResults cache;

  int steps = 0;
  // merge pairs of clusters while there is an improvement
  while (clusters.size() > 1) {
    if (steps % 500 == 0) {
      HFTRACE(1, "step = %d clusters = %lu expected_hit_rate = %.4lf\n",
              steps,
              clusters.size(),
              expectedCacheHitRatio(state, clusters));
    }
    steps++;

    Cluster* bestClusterPred = nullptr;
    Cluster* bestClusterSucc = nullptr;
    double bestGain = -1;
    for (auto clusterPred : clusters) {
      // get candidates for merging with the current cluster
      auto candidateClusters = adjacentClusters(state, clusterPred);

      // find the best candidate
      for (auto clusterSucc : candidateClusters) {
        // get the cost of merging the two clusters
        if (!cache.contains(clusterPred, clusterSucc)) {
          double value = mergeGain(state, clusterPred, clusterSucc);
          cache.set(clusterPred, clusterSucc, value);
        }

        double gain = cache.get(clusterPred, clusterSucc);
        // break ties by density so that the hottest clusters are merged first
        if (gain > bestGain || (std::abs(gain - bestGain) < 1e-8 &&
                                density(clusterPred, clusterSucc) >
                                density(bestClusterPred, bestClusterSucc))) {
          bestGain = gain;
          bestClusterPred = clusterPred;
          bestClusterSucc = clusterSucc;
        }
      }
    }

    if (bestGain <= 0.0) break;

    cache.invalidate(bestClusterPred);
    cache.invalidate(bestClusterSucc);

    // merge the best pair of clusters
    mergeInto(state, bestClusterPred, bestClusterSucc);
    // remove bestClusterSucc from the list of active clusters
    auto iter = std::remove(clusters.begin(), clusters.end(), bestClusterSucc);
    clusters.erase(iter, clusters.end());
  }

  HFTRACE(1, "Completed hfsort+ with %lu clusters\n", clusters.size());
  HFTRACE(1, "Final expected iTLB cache hit ratio: %.4lf\n",
          expectedCacheHitRatio(state, clusters));

  // Return the set of clusters that are left, which are the ones that
  // didn't get merged (so their first func is its original func).
  sortByDensity(clusters);
  std::vector<Cluster> result;
  for (auto cluster : clusters) {
    result.emplace_back(std::move(*cluster));