/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com)     |
   | Copyright (c) 1997-2010 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
#include "hphp/runtime/ext/ext_hotprofiler.h"

#include "hphp/runtime/base/builtin-functions.h"
#include "hphp/runtime/base/memory-manager.h"
#include "hphp/runtime/base/request-local.h"
#include "hphp/runtime/base/zend-math.h"
#include "hphp/runtime/base/ini-setting.h"
#include "hphp/runtime/vm/event-hook.h"
#include "hphp/runtime/vm/jit/translator-inline.h"
#include "hphp/util/alloc.h"
#include "hphp/util/vdso.h"
#include "hphp/util/cycles.h"
#include "hphp/runtime/base/variable-serializer.h"
#include "hphp/runtime/ext/std/ext_std_function.h"
#include "hphp/runtime/ext/ext_system_profiler.h"
#include "hphp/runtime/ext/xdebug/xdebug_profiler.h"
#include "hphp/runtime/ext/extension-registry.h"
#include "hphp/runtime/base/request-event-handler.h"

#include <sys/time.h>
#include <sys/resource.h>
#include <iostream>
#include <fstream>
#include <zlib.h>
#include <algorithm>
#include <map>
#include <new>
#include <utility>
#include <vector>
// Delimiter appended between frame names when rendering a call stack
#define HP_STACK_DELIM        "==>"
#define HP_STACK_DELIM_LEN    (sizeof(HP_STACK_DELIM) - 1)
namespace HPHP {

IMPLEMENT_DEFAULT_EXTENSION_VERSION(hotprofiler, NO_VERSION_YET);

using std::vector;
using std::string;

const StaticString s_hotprofiler("hotprofiler");

///////////////////////////////////////////////////////////////////////////////
// helpers
/**
 * Get time delta in microseconds.
 */
static long get_us_interval(struct timeval *start, struct timeval *end) {
  return (((end->tv_sec - start->tv_sec) * 1000000)
          + (end->tv_usec - start->tv_usec));
}
/**
 * Advance a timeval by the given number of microseconds.
 */
static void incr_us_interval(struct timeval *start, uint64_t incr) {
  incr += (start->tv_sec * 1000000 + start->tv_usec);
  start->tv_sec = incr / 1000000;
  start->tv_usec = incr % 1000000;
}
/**
 * Truncate the given timeval, in place, to the beginning of its slot,
 * where the slot width is determined by intr.
 *
 * @param tv    Input timeval to be truncated in place.
 * @param intr  Slot width (time interval) in microseconds.
 * @author veeve
 */
static void hp_trunc_time(struct timeval *tv, uint64_t intr) {
  uint64_t time_in_micro;

  // Convert to microseconds and truncate that first.
  time_in_micro = (tv->tv_sec * 1000000) + tv->tv_usec;
  time_in_micro /= intr;
  time_in_micro *= intr;

  // Update tv.
  tv->tv_sec = (time_in_micro / 1000000);
  tv->tv_usec = (time_in_micro % 1000000);
}
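// Worked example (illustrative): with intr = 100000 (the 100ms slot width
// SampleProfiler uses below), tv = {12s, 345678us} is first flattened to
// 12345678us, truncated to 12300000us, and written back as {12s, 300000us};
// i.e. the timestamp is rounded down to the start of its 100ms slot.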
///////////////////////////////////////////////////////////////////////////////
// High precision timer related functions.

/**
 * This is a microbenchmark to get cpu frequency the process is running on. The
 * returned value is used to convert TSC counter values to microseconds.
 *
 * @return int64.
 * @author cjiang
 */
static int64_t get_cpu_frequency() {
  struct timeval start;
  struct timeval end;

  if (gettimeofday(&start, 0)) {
    perror("gettimeofday");
    return 0;
  }
  uint64_t tsc_start = cpuCycles();
  uint64_t tsc_end;
  volatile int i;
  // Busy loop for 5 milliseconds. Don't use usleep() here since it causes the
  // CPU to halt, which would generate meaningless results.
  do {
    for (i = 0; i < 1000000; i++);
    if (gettimeofday(&end, 0)) {
      perror("gettimeofday");
      return 0;
    }
    tsc_end = cpuCycles();
  } while (get_us_interval(&start, &end) < 5000);

  return nearbyint((tsc_end - tsc_start) * 1.0
                   / (get_us_interval(&start, &end)));
}
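// Usage sketch (illustrative): the return value is a frequency in MHz, i.e.
// cycles per microsecond, so a TSC delta converts to wall time like so:
//
//   int64_t MHz = get_cpu_frequency(); // e.g. ~2400 on a 2.4GHz cpu
//   uint64_t t0 = cpuCycles();
//   // ... work ...
//   uint64_t us = (cpuCycles() - t0) / MHz; // see also to_usec() below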
#define MAX_LINELENGTH 1024

static int64_t* get_cpu_frequency_from_file(const char *file, int ncpus)
{
  std::ifstream cpuinfo(file);
  if (cpuinfo.fail()) {
    return nullptr;
  }
  char line[MAX_LINELENGTH];
  int64_t* freqs = new int64_t[ncpus];
  for (int i = 0; i < ncpus; ++i) {
    freqs[i] = 0;
  }
  int processor = -1;

  while (cpuinfo.getline(line, sizeof(line))) {
    if (sscanf(line, "processor : %d", &processor) == 1) {
      continue;
    }
    float freq;
    if (sscanf(line, "cpu MHz : %f", &freq) == 1) {
      if (processor != -1 && processor < ncpus) {
        freqs[processor] = nearbyint(freq);
        processor = -1;
      }
    }
  }

  // Bail out unless we found a frequency for every cpu.
  for (int i = 0; i < ncpus; ++i) {
    if (freqs[i] == 0) {
      delete[] freqs;
      return nullptr;
    }
  }
  return freqs;
}
///////////////////////////////////////////////////////////////////////////////
// Machine information that we collect just once.

class MachineInfo {
public:
  /**
   * Bind the current process to a specified CPU. This function is to ensure
   * that the OS won't schedule the process to different processors, which
   * would make values read by rdtsc unreliable.
   *
   * @param uint32 cpu_id, the id of the logical cpu to be bound to.
   *
   * @author cjiang
   */
  static void BindToCPU(uint32_t cpu_id) {
    cpu_set_t new_mask;
    CPU_ZERO(&new_mask);
    CPU_SET(cpu_id, &new_mask);
    SET_AFFINITY(0, sizeof(cpu_set_t), &new_mask);
  }

public:
  // The number of logical CPUs this machine has.
  int m_cpu_num;
  // Store the cpu frequency. Get it from /proc/cpuinfo if we can.
  int64_t* m_cpu_frequencies;

  MachineInfo() {
    m_cpu_num = sysconf(_SC_NPROCESSORS_CONF);
    m_cpu_frequencies = get_cpu_frequency_from_file("/proc/cpuinfo", m_cpu_num);
    if (m_cpu_frequencies)
      return;

    // Fall back to measuring each cpu's frequency ourselves.
    m_cpu_frequencies = new int64_t[m_cpu_num];
    for (int i = 0; i < m_cpu_num; i++) {
      cpu_set_t prev_mask;
      GET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask);
      BindToCPU(i);
      // Make sure the current process gets scheduled to the target cpu. This
      // might not be necessary though.
      usleep(0);
      m_cpu_frequencies[i] = get_cpu_frequency();
      SET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask);
    }
  }

  ~MachineInfo() {
    delete[] m_cpu_frequencies;
  }
};
static MachineInfo s_machine;
static inline uint64_t
tv_to_cycles(const struct timeval& tv, int64_t MHz)
{
  return (((uint64_t)tv.tv_sec * 1000000) + tv.tv_usec) * MHz;
}

static inline uint64_t
to_usec(int64_t cycles, int64_t MHz, bool cpu_time = false)
{
#ifdef CLOCK_THREAD_CPUTIME_ID
  static int64_t vdso_usable = Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID);
#else
  static int64_t vdso_usable = -1;
#endif

  if (cpu_time && vdso_usable >= 0)
    return cycles / 1000;
  return (cycles + MHz/2) / MHz;
}
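// Notes (illustrative): when the thread-cputime vdso clock is usable, cpu
// "cycles" are really nanoseconds from cpuTime() below, hence the divide by
// 1000. The (cycles + MHz/2) / MHz form rounds to the nearest microsecond
// rather than truncating: with MHz = 2000, 4999 cycles (2.4995us) yields
// (4999 + 1000) / 2000 = 2, while 5001 cycles (2.5005us) yields 3.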
static inline uint64_t cpuTime(int64_t MHz) {
#ifdef CLOCK_THREAD_CPUTIME_ID
  int64_t rval = Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID);
  if (rval >= 0) {
    return rval;
  }
#endif
  struct rusage usage;
  getrusage(RUSAGE_SELF, &usage);
  return
    tv_to_cycles(usage.ru_utime, MHz) + tv_to_cycles(usage.ru_stime, MHz);
}
uint64_t
get_allocs()
{
#ifdef USE_JEMALLOC
  auto& mm = MM();
  return mm.getAllocated();
#endif
#ifdef USE_TCMALLOC
  if (MallocExtensionInstance) {
    size_t stat;
    MallocExtensionInstance()->GetNumericProperty(
      "generic.thread_bytes_allocated", &stat);
    return stat;
  }
#endif
  return 0;
}

uint64_t
get_frees()
{
#ifdef USE_JEMALLOC
  auto& mm = MM();
  return mm.getDeallocated();
#endif
#ifdef USE_TCMALLOC
  if (MallocExtensionInstance) {
    size_t stat;
    MallocExtensionInstance()->GetNumericProperty(
      "generic.thread_bytes_freed", &stat);
    return stat;
  }
#endif
  return 0;
}
size_t Frame::getName(char *result_buf, size_t result_len) {
  if (result_len <= 1) {
    return 0; // Insufficient result_buf. Bail!
  }

  // Add '@recurse_level' if required
  // NOTE: Don't use snprintf's return val as it is compiler dependent
  if (m_recursion) {
    snprintf(result_buf, result_len, "%s@%d", m_name, m_recursion);
  } else {
    snprintf(result_buf, result_len, "%s", m_name);
  }

  // Force null-termination at MAX
  result_buf[result_len - 1] = 0;
  return strlen(result_buf);
}
size_t Frame::getStack(int level, char *result_buf, size_t result_len) {
  // End recursion if we don't need deeper levels or
  // we don't have any deeper levels.
  if (!m_parent || level <= 1) {
    return getName(result_buf, result_len);
  }

  // Take care of all ancestors first.
  size_t len = m_parent->getStack(level - 1, result_buf, result_len);
  if (result_len < (len + HP_STACK_DELIM_LEN)) {
    return len; // Insufficient result_buf. Bail out!
  }

  // Add delimiter only if entry had ancestors.
  if (len) {
    strncat(result_buf + len, HP_STACK_DELIM, result_len - len);
    len += HP_STACK_DELIM_LEN;
  }

  // Append the current function name.
  return len + getName(result_buf + len, result_len - len);
}
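// Example (illustrative): for a stack main() -> foo() -> foo() -> bar(),
// getStack(2, ...) on the bar() frame yields "foo@1==>bar", while a large
// level (as SampleProfiler uses) yields "main()==>foo==>foo@1==>bar".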
const StaticString
  s_ct("ct"),
  s_wt("wt"),
  s_cpu("cpu"),
  s_mu("mu"),
  s_pmu("pmu"),
  s_alloc("alloc"),
  s_free("free"),
  s_compressed_trace("(compressed_trace)");
/**
 * Maintain profiles of a running stack.
 */
Profiler::Profiler(bool needCPUAffinity) : m_successful(true),
                                           m_stack(nullptr),
                                           m_frame_free_list(nullptr),
                                           m_has_affinity(needCPUAffinity) {
  if (!s_rand_initialized) {
    s_rand_initialized = true;
    srand(math_generate_seed());
  }

  if (m_has_affinity) {
    //
    // Bind to a random cpu so that we can use rdtsc instruction.
    //
    int cur_cpu_id = rand() % s_machine.m_cpu_num;
    GET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask);
    MachineInfo::BindToCPU(cur_cpu_id);
    m_MHz = s_machine.m_cpu_frequencies[cur_cpu_id];
  } else {
    //
    // Take cpu0's speed as a proxy for all cpus.
    //
    m_MHz = s_machine.m_cpu_frequencies[0];
  }

  memset(m_func_hash_counters, 0, sizeof(m_func_hash_counters));
}

Profiler::~Profiler() {
  if (m_has_affinity) {
    SET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask);
  }

  endAllFrames();
  for (Frame *p = m_frame_free_list; p;) {
    Frame *cur = p;
    p = p->m_parent;
    delete cur;
  }
}
/**
 * Called right before a function call.
 */
void Profiler::beginFrameEx(const char *symbol) {
}

/**
 * Called right after a function is finished.
 */
void Profiler::endFrameEx(const TypedValue *retval,
                          const char *_symbol) {
}

void Profiler::writeStats(Array &ret) {
}

void Profiler::endAllFrames() {
  while (m_stack) {
    endFrame(nullptr, nullptr, true);
  }
}
template<class phpret, class Name, class Counts>
void Profiler::returnVals(phpret& ret, const Name& name, const Counts& counts,
                          int flags, int64_t MHz)
{
  ArrayInit arr(5, ArrayInit::Map{});
  arr.set(s_ct, counts.count);
  arr.set(s_wt, to_usec(counts.wall_time, MHz));
  if (flags & TrackCPU) {
    arr.set(s_cpu, to_usec(counts.cpu, MHz, true));
  }
  if (flags & TrackMemory) {
    arr.set(s_mu, counts.memory);
    arr.set(s_pmu, counts.peak_memory);
  } else if (flags & TrackMalloc) {
    arr.set(s_alloc, counts.memory);
    arr.set(s_free, counts.peak_memory);
  }
  ret.set(String(name), arr.toArray());
}

template<class phpret, class StatsMap>
bool Profiler::extractStats(phpret& ret, StatsMap& stats, int flags,
                            int64_t MHz)
{
  for (typename StatsMap::const_iterator iter = stats.begin();
       iter != stats.end(); ++iter) {
    returnVals(ret, iter->first, iter->second, flags, MHz);
  }
  return true;
}
bool Profiler::s_rand_initialized = false;

void Profiler::beginFrame(const char *symbol) {
  Frame *current = createFrame(symbol);

  // NOTE(cjiang): use hash code to fend off most of call-stack traversal
  int recursion_level = 0;
  if (m_func_hash_counters[current->m_hash_code] > 0) {
    // Find this symbol's recursion level
    for (Frame *p = current->m_parent; p; p = p->m_parent) {
      if (strcmp(current->m_name, p->m_name) == 0) {
        recursion_level = p->m_recursion + 1;
        break;
      }
    }
  }
  current->m_recursion = recursion_level;

  m_func_hash_counters[current->m_hash_code]++;
  beginFrameEx(symbol);
}

/**
 * End top of the stack.
 */
void Profiler::endFrame(const TypedValue *retval,
                        const char *symbol,
                        bool endMain) {
  if (m_stack) {
    // special case for main() frame that's only ended by endAllFrames()
    if (!endMain && m_stack->m_parent == nullptr) {
      return;
    }
    endFrameEx(retval, symbol);
    m_func_hash_counters[m_stack->m_hash_code]--;
    releaseFrame();
  }
}
///////////////////////////////////////////////////////////////////////////////
// HierarchicalProfiler

class HierarchicalProfiler final : public Profiler {
private:
  class CountMap {
  public:
    CountMap() : count(0), wall_time(0), cpu(0), memory(0), peak_memory(0) {}

    int64_t count;
    int64_t wall_time;
    int64_t cpu;
    int64_t memory;
    int64_t peak_memory;
  };

  class HierarchicalProfilerFrame : public Frame {
  public:
    virtual ~HierarchicalProfilerFrame() {
    }

    uint64_t m_tsc_start;  // start value for TSC counter
    int64_t m_mu_start;    // memory usage
    int64_t m_pmu_start;   // peak memory usage
    int64_t m_vtsc_start;  // user/sys time start
  };

  typedef hphp_hash_map<std::string, CountMap, string_hash> StatsMap;
  StatsMap m_stats; // outcome

public:
  explicit HierarchicalProfiler(int flags) : Profiler(true), m_flags(flags) {
  }

  Frame *allocateFrame() override {
    return new HierarchicalProfilerFrame();
  }

  void beginFrameEx(const char *symbol) override {
    HierarchicalProfilerFrame *frame =
      dynamic_cast<HierarchicalProfilerFrame *>(m_stack);
    frame->m_tsc_start = cpuCycles();

    if (m_flags & TrackCPU) {
      frame->m_vtsc_start = cpuTime(m_MHz);
    }

    if (m_flags & TrackMemory) {
      auto const& stats = MM().getStats();
      frame->m_mu_start = stats.usage;
      frame->m_pmu_start = stats.peakUsage;
    } else if (m_flags & TrackMalloc) {
      frame->m_mu_start = get_allocs();
      frame->m_pmu_start = get_frees();
    }
  }

  void endFrameEx(const TypedValue *retval, const char *given_symbol) override {
    char symbol[512];
    HierarchicalProfilerFrame *frame =
      dynamic_cast<HierarchicalProfilerFrame *>(m_stack);
    frame->getStack(2, symbol, sizeof(symbol));
    CountMap &counts = m_stats[symbol];
    counts.count++;
    counts.wall_time += cpuCycles() - frame->m_tsc_start;

    if (m_flags & TrackCPU) {
      counts.cpu += cpuTime(m_MHz) - frame->m_vtsc_start;
    }

    if (m_flags & TrackMemory) {
      auto const& stats = MM().getStats();
      int64_t mu_end = stats.usage;
      int64_t pmu_end = stats.peakUsage;
      counts.memory += mu_end - frame->m_mu_start;
      counts.peak_memory += pmu_end - frame->m_pmu_start;
    } else if (m_flags & TrackMalloc) {
      counts.memory += get_allocs() - frame->m_mu_start;
      counts.peak_memory += get_frees() - frame->m_pmu_start;
    }
  }

  void writeStats(Array &ret) override {
    extractStats(ret, m_stats, m_flags, m_MHz);
  }

  bool shouldSkipBuiltins() const override {
    return m_flags & NoTrackBuiltins;
  }

private:
  uint32_t m_flags;
};
///////////////////////////////////////////////////////////////////////////////
// TraceProfiler

// Walks a log of function enter and exit events captured by
// TraceProfiler and generates statistics for each function executed.
template <class TraceIterator, class Stats>
class TraceWalker {
 public:
  struct Frame {
    TraceIterator trace; // Pointer to the log entry which pushed this frame
    int level; // Recursion level for this function
    int len; // Length of the function name
  };

  TraceWalker()
    : m_arcBuffLen(200)
    , m_arcBuff((char*)malloc(200))
    , m_badArcCount(0)
  {}

  ~TraceWalker() {
    free(m_arcBuff);
    for (auto& r : m_recursion) delete[] r.first;
  }

  void walk(TraceIterator begin, TraceIterator end, TraceIterator final,
            Stats& stats) {
    if (begin == end) return;
    m_recursion.push_back(std::make_pair(nullptr, 0));
    // Trim exit traces off the front of the log. These may be due to
    // the return from turning tracing on.
    std::map<const char*, unsigned> functionLevel;
    auto current = begin;
    while (current != end && !current->symbol) ++current;
    while (current != end) {
      if (!current->is_func_exit) {
        unsigned level = ++functionLevel[current->symbol];
        if (level >= m_recursion.size()) {
          // 12 bytes covers "@" plus any 32-bit level and the NUL.
          char *level_string = new char[12];
          snprintf(level_string, 12, "@%u", level);
          m_recursion.push_back(std::make_pair(level_string,
                                               strlen(level_string)));
        }
        Frame fr;
        fr.trace = current;
        fr.level = level - 1;
        fr.len = strlen(current->symbol);
        checkArcBuff(fr.len);
        m_stack.push_back(fr);
      } else if (m_stack.size() > 1) {
        validateStack(current, stats); // NB: may update m_stack.
        --functionLevel[m_stack.back().trace->symbol];
        popFrame(current, stats);
      }
      ++current;
    }
    // Close the dangling stack with the last entry. This
    // under-represents any functions still on the stack.
    --current;
    while (m_stack.size() > 1) {
      popFrame(current, stats);
    }
    // Close main() with the final data from when profiling was turned
    // off. This ensures main() represents the entire run, even if we
    // run out of log space.
    if (!m_stack.empty()) {
      assert(strcmp(m_stack.back().trace->symbol, "main()") == 0);
      incStats(m_stack.back().trace->symbol, final, m_stack.back(), stats);
    }
    if (m_badArcCount > 0) {
      stats["(trace has mismatched calls and returns)"].count = m_badArcCount;
    }
  }
 private:
  void checkArcBuff(int len) {
    len = 2*len + HP_STACK_DELIM_LEN + 2;
    if (len >= m_arcBuffLen) {
      m_arcBuffLen *= 2;
      m_arcBuff = (char *)realloc(m_arcBuff, m_arcBuffLen);
      if (m_arcBuff == nullptr) {
        throw std::bad_alloc();
      }
    }
  }

  void incStats(const char* arc, TraceIterator tr, const Frame& fr,
                Stats& stats) {
    auto& st = stats[arc];
    ++st.count;
    st.wall_time += tr->wall_time - fr.trace->wall_time;
    st.cpu += tr->cpu - fr.trace->cpu;
    st.memory += tr->memory - fr.trace->memory;
    st.peak_memory += tr->peak_memory - fr.trace->peak_memory;
  }

  // Look for mismatched enter and exit events, and try to correct if
  // we can. Only try to correct very simple imbalances... we could go
  // nuts here, but it's likely not worth it.
  void validateStack(TraceIterator tIt, Stats& stats) {
    auto enteredName = m_stack.back().trace->symbol;
    auto exitedName = tIt->symbol;
    if ((exitedName != nullptr) &&
        ((enteredName == nullptr) || (strcmp(enteredName, exitedName) != 0))) {
      // We have a few special names that we form on entry. We don't
      // have the information to form them again on exit, so tolerate
      // them here. See EventHook::GetFunctionNameForProfiler().
      if ((enteredName != nullptr) &&
          ((strncmp(enteredName, "run_init::", 10) == 0) ||
           (strcmp(enteredName, "_") == 0))) return;
      bool fixed = false;
      if (m_stack.size() > 1) {
        auto callerName = (m_stack.end() - 2)->trace->symbol;
        if ((callerName != nullptr) && (strcmp(callerName, exitedName) == 0)) {
          // We have an exit for Foo(), but we were in Bar(). However,
          // it appears that Foo() was the caller of Bar(). This
          // suggests we've missed the exit event for Bar() and have
          // the exit event for Foo() in hand. So remove Bar() to
          // re-balance the stack.
          m_stack.pop_back();
          fixed = true;
        }
      }
      // The first few bad arcs typically point at the problem, so
      // report them. The rest we'll just count.
      if (++m_badArcCount < 20) {
        std::string badArc;
        if (fixed) {
          badArc = folly::format("(warning: corrected bad arc #{}: "
                                 "enter '{}', exit '{}')",
                                 m_badArcCount,
                                 enteredName, exitedName).str();
        } else {
          badArc = folly::format("(error: bad arc #{}: "
                                 "enter '{}', exit '{}')",
                                 m_badArcCount,
                                 enteredName, exitedName).str();
        }
        ++stats[badArc.data()].count;
      }
    }
  }
  void popFrame(TraceIterator tIt, Stats& stats) {
    Frame callee = m_stack.back();
    m_stack.pop_back();
    Frame& caller = m_stack.back();
    char *cp = m_arcBuff;
    memcpy(cp, caller.trace->symbol, caller.len);
    cp += caller.len;
    if (caller.level >= 1) {
      std::pair<char*, int>& lvl = m_recursion[caller.level];
      memcpy(cp, lvl.first, lvl.second);
      cp += lvl.second;
    }
    memcpy(cp, HP_STACK_DELIM, HP_STACK_DELIM_LEN);
    cp += HP_STACK_DELIM_LEN;
    memcpy(cp, callee.trace->symbol, callee.len);
    cp += callee.len;
    if (callee.level >= 1) {
      std::pair<char*, int>& lvl = m_recursion[callee.level];
      memcpy(cp, lvl.first, lvl.second);
      cp += lvl.second;
    }
    *cp = 0;
    incStats(m_arcBuff, tIt, callee, stats);
  }
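  // Arc example (illustrative): popping callee bar() at recursion level 2,
  // called from foo() at level 0, renders "foo==>bar@2" into m_arcBuff, so
  // stats are keyed per caller/callee pair with recursion levels distinct.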
  vector<std::pair<char*, int>> m_recursion;
  vector<Frame> m_stack;
  int m_arcBuffLen;
  char *m_arcBuff;
  int m_badArcCount;
};
// Profiler which makes a log of all function enter and exit events,
// then processes that into per-function statistics. A single-frame
// stack trace is used to aggregate stats for each function when
// called from different call sites.
class TraceProfiler : public Profiler {
 public:
  explicit TraceProfiler(int flags)
    : Profiler(true)
    , m_traceBuffer(nullptr)
    , m_traceBufferSize(0)
    , m_nextTraceEntry(0)
    , m_traceBufferFilled(false)
    , m_maxTraceBuffer(0)
    , m_overflowCalls(0)
    , m_flags(flags)
  {
    if (!(m_flags & IHaveInfiniteMemory) && pthread_mutex_trylock(&s_inUse)) {
      // This profiler uses a very large amount of memory. Only allow
      // one in the process at any time.
      m_successful = false;
    } else {
      m_maxTraceBuffer = RuntimeOption::ProfilerMaxTraceBuffer;
      Extension* ext = ExtensionRegistry::get(s_hotprofiler);
      assert(ext);
      IniSetting::Bind(ext, IniSetting::PHP_INI_ALL,
                       "profiler.max_trace_buffer",
                       &m_maxTraceBuffer);
    }
  }

  ~TraceProfiler() {
    if (m_successful) {
      free(m_traceBuffer);
      IniSetting::Unbind("profiler.max_trace_buffer");
      pthread_mutex_unlock(&s_inUse);
    }
  }
 private:
  // Data measured on function entry and exit
  struct TraceData {
    int64_t wall_time;
    int64_t cpu;

    // It's not plausible that we need a full 64 bits to hold memory
    // stats, no matter what the collection mode. So we steal one bit
    // from the memory field to use as a flag. We want to keep this
    // data structure small since we need a huge number of them during
    // a profiled run.
    int64_t memory : 63; // Total memory, or memory allocated depending on flags
    bool is_func_exit : 1; // Is the entry for a function exit?
    int64_t peak_memory : 63; // Peak memory, or memory freed depending on flags
    uint64_t unused : 1; // Unused, to keep memory and peak_memory the same size

    void clear() {
      wall_time = cpu = memory = peak_memory = 0;
    }
    static void compileTimeAssertions() {
      static_assert(sizeof(TraceData) == (sizeof(uint64_t) * 4), "");
    }
  };
  // One entry in the log, representing a function enter or exit event
  struct TraceEntry : TraceData {
    const char *symbol; // Function name
  };
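  // Size sketch (illustrative; assumes LP64): TraceData packs into four
  // 64-bit words (see compileTimeAssertions), and the symbol pointer brings
  // sizeof(TraceEntry) to 40 bytes, so a log of 10M enter/exit events costs
  // ~400MB. This is why only one TraceProfiler may run per process.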
  bool isTraceSpaceAvailable() {
    // A few slots are reserved for internal bookkeeping entries.
    return m_nextTraceEntry < m_traceBufferSize - 3;
  }
  bool ensureTraceSpace() {
    bool track_realloc = false;
    if (m_traceBufferFilled) {
      m_overflowCalls++;
      return false;
    }
    int new_array_size;
    if (m_traceBufferSize == 0) {
      new_array_size = RuntimeOption::ProfilerTraceBuffer;
    } else {
      new_array_size = m_traceBufferSize *
        RuntimeOption::ProfilerTraceExpansion;
      if (m_maxTraceBuffer != 0 && new_array_size > m_maxTraceBuffer) {
        new_array_size = m_maxTraceBuffer > m_traceBufferSize ?
          m_maxTraceBuffer : m_traceBufferSize;
      }
      if (new_array_size - m_nextTraceEntry <= 5) {
        // for this operation to succeed, we need room for the entry we're
        // adding, two realloc entries, and two entries to mark the end of
        // the trace.
        m_traceBufferFilled = true;
        collectStats("(trace buffer terminated)", false,
                     m_traceBuffer[m_nextTraceEntry++]);
        return false;
      }
      track_realloc = true;
    }
    if (track_realloc) {
      collectStats("(trace buffer realloc)", false,
                   m_traceBuffer[m_nextTraceEntry++]);
    }

    {
      MemoryManager::MaskAlloc masker(MM());
      auto r = (TraceEntry*)realloc((void*)m_traceBuffer,
                                    new_array_size * sizeof(TraceEntry));

      if (!r) {
        m_traceBufferFilled = true;
        if (m_traceBuffer) {
          collectStats("(trace buffer terminated)", false,
                       m_traceBuffer[m_nextTraceEntry++]);
        }
        return false;
      }
      m_traceBufferSize = new_array_size;
      m_traceBuffer = r;
    }

    if (track_realloc) {
      collectStats("(trace buffer realloc)", true,
                   m_traceBuffer[m_nextTraceEntry++]);
    }
    return true;
  }
  virtual void beginFrame(const char *symbol) override {
    doTrace(symbol, false);
  }

  virtual void endFrame(const TypedValue *retval,
                        const char *symbol,
                        bool endMain = false) override {
    doTrace(symbol, true);
  }

  virtual void endAllFrames() override {
    if (m_traceBuffer && m_nextTraceEntry < m_traceBufferSize - 1) {
      collectStats(nullptr, true, m_finalEntry);
      m_traceBufferFilled = true;
    }
  }

  void collectStats(const char *symbol, bool isFuncExit, TraceEntry& te) {
    te.symbol = symbol;
    te.is_func_exit = isFuncExit;
    collectStats(te);
  }

  void collectStats(TraceData& te) {
    te.wall_time = cpuCycles();
    te.cpu = 0;
    if (m_flags & TrackCPU) {
      te.cpu = cpuTime(m_MHz);
    }
    if (m_flags & TrackMemory) {
      auto const& stats = MM().getStats();
      te.memory = stats.usage;
      te.peak_memory = stats.peakUsage;
    } else if (m_flags & TrackMalloc) {
      te.memory = get_allocs();
      te.peak_memory = get_frees();
    } else {
      te.memory = 0;
      te.peak_memory = 0;
    }
  }

  TraceEntry* nextTraceEntry() {
    if (!isTraceSpaceAvailable() && !ensureTraceSpace()) {
      return nullptr;
    }
    return &m_traceBuffer[m_nextTraceEntry++];
  }

  void doTrace(const char *symbol, bool isFuncExit) {
    TraceEntry *te = nextTraceEntry();
    if (te != nullptr) {
      collectStats(symbol, isFuncExit, *te);
    }
  }
  template<class TraceIterator, class Stats>
  void walkTrace(TraceIterator begin, TraceIterator end, TraceIterator final,
                 Stats& stats) {
    TraceWalker<TraceIterator, Stats> walker;
    walker.walk(begin, end, final, stats);
  }

  virtual void writeStats(Array &ret) override {
    TraceData my_begin;
    collectStats(my_begin);
    walkTrace(m_traceBuffer, m_traceBuffer + m_nextTraceEntry, &m_finalEntry,
              m_stats);
    if (m_overflowCalls) {
      m_stats["(trace buffer terminated)"].count += m_overflowCalls/2;
    }
    extractStats(ret, m_stats, m_flags, m_MHz);
    CountedTraceData allocStats;
    allocStats.count = 0;
    allocStats.peak_memory = allocStats.memory =
      m_nextTraceEntry * sizeof(*m_traceBuffer);
    returnVals(ret, "(trace buffer alloc)", allocStats, m_flags, m_MHz);
    if (m_flags & MeasureXhprofDisable) {
      CountedTraceData my_end;
      collectStats(my_end);
      my_end.count = 1;
      my_end.cpu -= my_begin.cpu;
      my_end.wall_time -= my_begin.wall_time;
      my_end.memory -= my_begin.memory;
      my_end.peak_memory -= my_begin.peak_memory;
      returnVals(ret, "xhprof_post_processing()", my_end, m_flags, m_MHz);
    }
  }
  virtual bool shouldSkipBuiltins() const override {
    return m_flags & NoTrackBuiltins;
  }

  TraceEntry* m_traceBuffer;
  TraceEntry m_finalEntry;
  int m_traceBufferSize;
  int m_nextTraceEntry;
  bool m_traceBufferFilled;
  int64_t m_maxTraceBuffer;
  int64_t m_overflowCalls;
  uint32_t m_flags;

  // Final stats, per-function per-callsite, with a count of how many
  // times the function was called from that callsite.
  class CountedTraceData : public TraceData {
  public:
    int64_t count;
    CountedTraceData() : count(0) { clear(); }
  };
  typedef hphp_hash_map<std::string, CountedTraceData, string_hash> StatsMap;
  StatsMap m_stats; // outcome

  static pthread_mutex_t s_inUse;
};

pthread_mutex_t TraceProfiler::s_inUse = PTHREAD_MUTEX_INITIALIZER;
///////////////////////////////////////////////////////////////////////////////
// SampleProfiler

/**
 * Sampling based profiler.
 */
class SampleProfiler : public Profiler {
 private:
  typedef std::pair<int64_t, int64_t> Timestamp;
  typedef smart::vector<std::pair<Timestamp, std::string>> SampleVec;
  SampleVec m_samples; // outcome

 public:
  SampleProfiler() : Profiler(true) {
    struct timeval now;
    uint64_t truncated_us;
    uint64_t truncated_tsc;

    // Init the last_sample in tsc
    m_last_sample_tsc = cpuCycles();

    // Find the microseconds that need to be truncated
    gettimeofday(&m_last_sample_time, 0);
    now = m_last_sample_time;
    hp_trunc_time(&m_last_sample_time, SAMPLING_INTERVAL);

    // Subtract truncated time from last_sample_tsc
    truncated_us = get_us_interval(&m_last_sample_time, &now);
    truncated_tsc = truncated_us * m_MHz;
    if (m_last_sample_tsc > truncated_tsc) {
      // just to be safe while subtracting unsigned ints
      m_last_sample_tsc -= truncated_tsc;
    }

    // Convert sampling interval to ticks
    m_sampling_interval_tsc = SAMPLING_INTERVAL * m_MHz;
  }
  virtual void beginFrameEx(const char *symbol) override {
    sample_check();
  }

  virtual void endFrameEx(const TypedValue *retvalue,
                          const char *symbol) override {
    sample_check();
  }

  virtual void writeStats(Array &ret) override {
    for (auto const& sample : m_samples) {
      auto const& time = sample.first;
      char timestr[512];
      snprintf(timestr, sizeof(timestr), "%" PRId64 ".%06" PRId64,
               time.first, time.second);

      ret.set(String(timestr), String(sample.second));
    }
  }
 private:
  static const int SAMPLING_INTERVAL = 100000; // microsecs

  struct timeval m_last_sample_time;
  uint64_t m_last_sample_tsc;
  uint64_t m_sampling_interval_tsc;

  /**
   * Sample the current stack and record it against the current sample
   * timestamp (m_last_sample_time).
   *
   * @author veeve
   */
  void sample_stack() {
    char symbol[5120];
    m_stack->getStack(INT_MAX, symbol, sizeof(symbol));

    auto time = std::make_pair((int64_t)m_last_sample_time.tv_sec,
                               (int64_t)m_last_sample_time.tv_usec);
    m_samples.push_back(std::make_pair(time, symbol));
  }

  /**
   * Check whether it is time to sample the stack, and call sample_stack()
   * if it is.
   *
   * @author veeve
   */
  void sample_check() {
    if (m_stack) {
      // The while loop handles a single function taking longer than one
      // sampling interval: each missed interval gets its own sample.
      while ((cpuCycles() - m_last_sample_tsc) > m_sampling_interval_tsc) {
        m_last_sample_tsc += m_sampling_interval_tsc;
        // HAS TO BE UPDATED BEFORE calling sample_stack
        incr_us_interval(&m_last_sample_time, SAMPLING_INTERVAL);
        sample_stack();
      }
    }
  }
};
///////////////////////////////////////////////////////////////////////////////
// Memoization Opportunity Profiler
//
// Identifies potential memoization opportunities by comparing the
// serialized form of function return values. Functions which return
// the same data every time, but not the same instance, are
// candidates.
//
// For member functions, the args are serialized and kept along with
// the 'this' pointer, and used to look for constant returns given the
// same args.
//
// This profiler is very, very slow. It tries to be faster by giving
// up on functions quickly, and by making a quick test to ignore them
// later. It also ignores functions which return "small"
// things. Nevertheless, you likely need to adjust
// Server.RequestTimeoutSeconds to get a full request processed.
//
// Future options:
// - Collect the data cross-request to find APC opportunities.
// - Auto-identify keys to functions which could be used to place things in APC
//   - i.e., same per-request, but not cross-request unless, say, locale is
//     considered.
//
// @TODO: this is quite rough right now, but is useful as is to
// others. In particular, it should provide the results via the return
// value from writeStats, not print to stderr :) Task 3396401 tracks this.
class MemoProfiler : public Profiler {
 public:
  explicit MemoProfiler(int flags) : Profiler(true) {}

  ~MemoProfiler() {
  }

 private:
  virtual void beginFrame(const char *symbol) override {
    VMRegAnchor _;
    ActRec *ar = vmfp();
    Frame f(symbol);
    if (ar->hasThis()) {
      auto& memo = m_memos[symbol];
      if (!memo.m_ignore) {
        auto args = hhvm_get_frame_args(ar, 0);
        args.append((int64_t)(ar->getThis())); // Use the pointer not the obj
        VariableSerializer vs(VariableSerializer::Type::DebuggerSerialize);
        String sdata;
        try {
          sdata = vs.serialize(args, true);
          f.m_args = sdata;
        } catch (...) {
          fprintf(stderr, "Args Serialization failure: %s\n", symbol);
        }
      }
    }
    m_stack.push_back(f);
  }
  virtual void endFrame(const TypedValue *retval,
                        const char *symbol,
                        bool endMain = false) override {
    if (m_stack.empty()) {
      fprintf(stderr, "STACK IMBALANCE empty %s\n", symbol);
      return;
    }
    auto f = m_stack.back();
    m_stack.pop_back();
    if (strcmp(f.m_symbol, symbol) != 0) {
      fprintf(stderr, "STACK IMBALANCE %s\n", symbol);
      return;
    }
    auto& memo = m_memos[symbol];
    if (memo.m_ignore) return;
    ++memo.m_count;
    memo.m_ignore = true; // Assume failure; cleared below on success.
    VMRegAnchor _;
    ActRec *ar = vmfp();
    // Lots of random cases to skip just to keep this simple for
    // now. There's no reason not to do more later.
    if (!g_context->m_faults.empty()) return;
    if (ar->m_func->isCPPBuiltin() || ar->resumed()) return;
    auto ret = tvAsCVarRef(retval);
    if (ret.isNull()) return;
    if (!(ret.isString() || ret.isObject() || ret.isArray())) return;
    VariableSerializer vs(VariableSerializer::Type::DebuggerSerialize);
    String sdata;
    try {
      sdata = vs.serialize(ret, true);
    } catch (...) {
      fprintf(stderr, "Serialization failure: %s\n", symbol);
      return;
    }
    if (sdata.length() < 3) return;
    if (ar->hasThis()) {
      memo.m_has_this = true;
      auto& member_memo = memo.m_member_memos[f.m_args.data()];
      ++member_memo.m_count;
      if (member_memo.m_return_value.length() == 0) { // First time
        member_memo.m_return_value = sdata;
        // Intentionally copy the raw pointer value
        member_memo.m_ret_tv = *retval;
        memo.m_ignore = false;
      } else if (member_memo.m_return_value == sdata) { // Same
        memo.m_ignore = false;
        if ((member_memo.m_ret_tv.m_data.num != retval->m_data.num) ||
            (member_memo.m_ret_tv.m_type != retval->m_type)) {
          memo.m_ret_tv_same = false;
        }
      } else {
        memo.m_member_memos.clear(); // Cleanup and ignore
      }
    } else {
      if (memo.m_return_value.length() == 0) { // First time
        memo.m_return_value = sdata;
        // Intentionally copy the raw pointer value
        memo.m_ret_tv = *retval;
        memo.m_ignore = false;
      } else if (memo.m_return_value == sdata) { // Same
        memo.m_ignore = false;
        if ((memo.m_ret_tv.m_data.num != retval->m_data.num) ||
            (memo.m_ret_tv.m_type != retval->m_type)) {
          memo.m_ret_tv_same = false;
        }
      } else {
        memo.m_return_value = ""; // Different, cleanup and ignore
      }
    }
  }
  virtual void endAllFrames() override {
    // Nothing to do for this profiler since all work is done as we go.
  }

  virtual void writeStats(Array &ret) override {
    fprintf(stderr, "writeStats start\n");
    // RetSame: the return value is the same instance every time
    // HasThis: call has a this argument
    // AllSame: all returns were the same data even though args are different
    // MemberCount: number of different arg sets (including this)
    fprintf(stderr, "Count Function MinSerLen MaxSerLen RetSame HasThis "
            "AllSame MemberCount\n");
    for (auto& me : m_memos) {
      if (me.second.m_ignore) continue;
      if (me.second.m_count == 1) continue;
      int min_ser_len = 999999999;
      int max_ser_len = 0;
      int count = 0;
      int member_count = 0;
      bool all_same = true;
      if (me.second.m_has_this) {
        bool any_multiple = false;
        auto& fr = me.second.m_member_memos.begin()->second.m_return_value;
        member_count = me.second.m_member_memos.size();
        for (auto& mme : me.second.m_member_memos) {
          if (mme.second.m_return_value != fr) all_same = false;
          count += mme.second.m_count;
          auto ser_len = mme.second.m_return_value.length();
          min_ser_len = std::min(min_ser_len, ser_len);
          max_ser_len = std::max(max_ser_len, ser_len);
          if (mme.second.m_count > 1) any_multiple = true;
        }
        if (!any_multiple && !all_same) continue;
      } else {
        min_ser_len = max_ser_len = me.second.m_return_value.length();
        count = me.second.m_count;
        all_same = me.second.m_ret_tv_same;
      }
      fprintf(stderr, "%d %s %d %d %s %s %s %d\n",
              count, me.first.data(),
              min_ser_len, max_ser_len,
              me.second.m_ret_tv_same ? " true" : "false",
              me.second.m_has_this ? " true" : "false",
              all_same ? " true" : "false",
              member_count);
    }
    fprintf(stderr, "writeStats end\n");
  }
  struct MemberMemoInfo {
    String m_return_value;
    TypedValue m_ret_tv;
    int m_count{0};
  };
  using MemberMemoMap = hphp_hash_map<std::string, MemberMemoInfo, string_hash>;

  struct MemoInfo {
    MemberMemoMap m_member_memos; // Keyed by serialized args
    String m_return_value;
    TypedValue m_ret_tv;
    int m_count{0};
    bool m_ignore{false};
    bool m_has_this{false};
    bool m_ret_tv_same{true};
  };
  using MemoMap = hphp_hash_map<std::string, MemoInfo, string_hash>;

  struct Frame {
    explicit Frame(const char* symbol) : m_symbol(symbol) {}
    const char* m_symbol;
    String m_args;
  };

 public:
  MemoMap m_memos; // Keyed by function name
  vector<Frame> m_stack;
};
///////////////////////////////////////////////////////////////////////////////
// ProfilerFactory

bool ProfilerFactory::start(ProfilerKind kind,
                            long flags,
                            bool beginFrame /* = true */) {
  if (m_profiler != nullptr) {
    return false;
  }

  switch (kind) {
  case ProfilerKind::Hierarchical:
    m_profiler = new HierarchicalProfiler(flags);
    break;
  case ProfilerKind::Sample:
    m_profiler = new SampleProfiler();
    break;
  case ProfilerKind::Trace:
    m_profiler = new TraceProfiler(flags);
    break;
  case ProfilerKind::Memo:
    m_profiler = new MemoProfiler(flags);
    break;
  case ProfilerKind::XDebug:
    m_profiler = new XDebugProfiler();
    break;
  case ProfilerKind::External:
    if (g_system_profiler) {
      m_profiler = g_system_profiler->getHotProfiler();
    } else if (m_external_profiler) {
      m_profiler = m_external_profiler;
    } else {
      throw_invalid_argument(
        "ProfilerFactory::setExternalProfiler() not yet called");
      return false;
    }
    break;
  default:
    throw_invalid_argument("level: %d", static_cast<int>(kind));
    return false;
  }

  if (m_profiler && m_profiler->m_successful) {
    // This will be disabled automatically when the thread completes the
    // request.
    HPHP::EventHook::Enable();
    ThreadInfo::s_threadInfo->m_profiler = m_profiler;
    if (beginFrame) {
      m_profiler->beginFrame("main()");
    }
    return true;
  } else {
    delete m_profiler;
    m_profiler = nullptr;
    return false;
  }
}
Variant ProfilerFactory::stop() {
  if (m_profiler) {
    m_profiler->endAllFrames();

    Array ret;
    m_profiler->writeStats(ret);
    delete m_profiler;
    m_profiler = nullptr;
    ThreadInfo::s_threadInfo->m_profiler = nullptr;

    return ret;
  }
  return init_null();
}

bool ProfilerFactory::EnableNetworkProfiler = false;

IMPLEMENT_REQUEST_LOCAL(ProfilerFactory, s_profiler_factory);
///////////////////////////////////////////////////////////////////////////////
// main functions

void f_hotprofiler_enable(int ikind) {
  auto kind = static_cast<ProfilerKind>(ikind);
  long flags = 0;
  if (kind == ProfilerKind::Hierarchical) {
    flags = NoTrackBuiltins;
  } else if (kind == ProfilerKind::Memory) {
    kind = ProfilerKind::Hierarchical;
    flags = NoTrackBuiltins | TrackMemory;
  }
  if (RuntimeOption::EnableHotProfiler) {
    s_profiler_factory->start(kind, flags);
  }
}

Variant f_hotprofiler_disable() {
  return s_profiler_factory->stop();
}

void f_phprof_enable(int flags /* = 0 */) {
  if (RuntimeOption::EnableHotProfiler) {
    s_profiler_factory->start(ProfilerKind::Hierarchical, flags);
  }
}

Variant f_phprof_disable() {
  return s_profiler_factory->stop();
}
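// Usage sketch (illustrative, from PHP userland; the f_* functions above back
// the hotprofiler_enable/hotprofiler_disable builtins, and the integer kind
// values follow the ProfilerKind enum, which is defined elsewhere):
//
//   hotprofiler_enable($kind);       // start profiling this request
//   my_workload();                   // hypothetical user code
//   $stats = hotprofiler_disable();  // per-function "ct"/"wt"/... arrays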
///////////////////////////////////////////////////////////////////////////////
// injected code

void begin_profiler_frame(Profiler *p,
                          const char *symbol) {
  p->beginFrame(symbol);
}

void end_profiler_frame(Profiler *p,
                        const TypedValue *retval,
                        const char *symbol) {
  p->endFrame(retval, symbol);
}

///////////////////////////////////////////////////////////////////////////////
}