2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-2013 Facebook, Inc. (http://www.facebook.com) |
6 | Copyright (c) 1997-2010 The PHP Group |
7 +----------------------------------------------------------------------+
8 | This source file is subject to version 3.01 of the PHP license, |
9 | that is bundled with this package in the file LICENSE, and is |
10 | available through the world-wide-web at the following url: |
11 | http://www.php.net/license/3_01.txt |
12 | If you did not receive a copy of the PHP license and are unable to |
13 | obtain it through the world-wide-web, please send a note to |
14 | license@php.net so we can mail you a copy immediately. |
15 +----------------------------------------------------------------------+
18 #include "hphp/runtime/ext/ext_fb.h"
19 #include "hphp/runtime/base/memory-manager.h"
20 #include "hphp/runtime/base/request-local.h"
21 #include "hphp/runtime/base/zend-math.h"
22 #include "hphp/runtime/server/server-stats.h"
23 #include "hphp/runtime/base/ini-setting.h"
24 #include "hphp/runtime/vm/event-hook.h"
25 #include "hphp/util/alloc.h"
26 #include "hphp/util/vdso.h"
27 #include "hphp/util/cycles.h"
30 #include <sys/resource.h>
31 #include <sys/param.h>
32 #include <sys/cpuset.h>
33 #define cpu_set_t cpuset_t
34 #define SET_AFFINITY(pid, size, mask) \
35 cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, size, mask)
36 #define GET_AFFINITY(pid, size, mask) \
37 cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, size, mask)
38 #elif defined(__APPLE__)
39 #include <mach/mach_init.h>
40 #include <mach/thread_policy.h>
41 #include <mach/thread_act.h>
43 #define cpu_set_t thread_affinity_policy_data_t
44 #define CPU_SET(cpu_id, new_mask) \
45 (*(new_mask)).affinity_tag = (cpu_id + 1)
46 #define CPU_ZERO(new_mask) \
47 (*(new_mask)).affinity_tag = THREAD_AFFINITY_TAG_NULL
48 #define GET_AFFINITY(pid, size, mask) \
49 (*(mask)).affinity_tag = THREAD_AFFINITY_TAG_NULL
50 #define SET_AFFINITY(pid, size, mask) \
51 thread_policy_set(mach_thread_self(), THREAD_AFFINITY_POLICY, \
52 (int *)mask, THREAD_AFFINITY_POLICY_COUNT)
55 #define SET_AFFINITY(pid, size, mask) sched_setaffinity(0, size, mask)
56 #define GET_AFFINITY(pid, size, mask) sched_getaffinity(0, size, mask)
64 // Append the delimiter
65 #define HP_STACK_DELIM "==>"
66 #define HP_STACK_DELIM_LEN (sizeof(HP_STACK_DELIM) - 1)
70 IMPLEMENT_DEFAULT_EXTENSION(hotprofiler
);
71 IMPLEMENT_DEFAULT_EXTENSION(xhprof
);
72 ///////////////////////////////////////////////////////////////////////////////
76 * A hash function to calculate a 8-bit hash code for a function name.
77 * This is based on a small modification to 'zend_inline_hash_func' by summing
78 * up all bytes of the ulong returned by 'zend_inline_hash_func'.
80 * @param str, char *, string to be calculated hash code for.
static inline uint8_t hprof_inline_hash(const char * str) {
  unsigned long h = 5381;
  unsigned int i = 0;
  uint8_t res = 0;

  // DJB2-style accumulation (zend_inline_hash_func): h = h * 33 ^ byte,
  // over every byte of the input string.
  while (*str) {
    h += (h << 5);
    h ^= (unsigned long) *str++;
  }

  // Fold the wide hash down to 8 bits by summing all bytes of h.
  for (i = 0; i < sizeof(unsigned long); i++) {
    res += ((uint8_t *)&h)[i];
  }
  return res;
}
101 * Get time delta in microseconds.
static long get_us_interval(struct timeval *start, struct timeval *end) {
  // Flatten both timestamps to microseconds and take the difference.
  // Assumes end >= start (callers measure forward in time).
  return (((end->tv_sec - start->tv_sec) * 1000000)
          + (end->tv_usec - start->tv_usec));
}
109 * Incr time with the given microseconds.
static void incr_us_interval(struct timeval *start, uint64_t incr) {
  // Convert *start to a flat microsecond count, add the increment,
  // then split back into the seconds/microseconds fields in place.
  incr += (start->tv_sec * 1000000 + start->tv_usec);
  start->tv_sec  = incr/1000000;
  start->tv_usec = incr%1000000;
}
119 * Truncates the given timeval to the nearest slot begin, where
120 * the slot size is determined by intr
122 * @param tv Input timeval to be truncated in place
123 * @param intr Time interval in microsecs - slot width
static void hp_trunc_time(struct timeval *tv, uint64_t intr) {
  uint64_t time_in_micro;

  // Convert to microsecs and trunc that first
  time_in_micro = (tv->tv_sec * 1000000) + tv->tv_usec;
  time_in_micro /= intr;
  time_in_micro *= intr;

  // Split the truncated microsecond count back into tv's fields.
  tv->tv_sec  = (time_in_micro / 1000000);
  tv->tv_usec = (time_in_micro % 1000000);
}
140 ///////////////////////////////////////////////////////////////////////////////
141 // High precision timer related functions.
144 * This is a microbenchmark to get cpu frequency the process is running on. The
145 * returned value is used to convert TSC counter values to microseconds.
150 static int64_t get_cpu_frequency() {
151 struct timeval start
;
154 if (gettimeofday(&start
, 0)) {
155 perror("gettimeofday");
158 uint64_t tsc_start
= cpuCycles();
159 // Sleep for 5 miliseconds. Comparaing with gettimeofday's few microseconds
160 // execution time, this should be enough.
162 if (gettimeofday(&end
, 0)) {
163 perror("gettimeofday");
166 uint64_t tsc_end
= cpuCycles();
167 return nearbyint((tsc_end
- tsc_start
) * 1.0
168 / (get_us_interval(&start
, &end
)));
#define MAX_LINELENGTH 1024

// Parses /proc/cpuinfo-style text and returns a heap-allocated array of
// ncpus per-cpu frequencies (MHz, rounded to nearest integer), or nullptr
// if the file can't be opened or any cpu's frequency is missing.
// The caller owns the returned array and must delete[] it.
static int64_t* get_cpu_frequency_from_file(const char *file, int ncpus)
{
  std::ifstream cpuinfo(file);
  if (cpuinfo.fail()) {
    return nullptr;
  }
  char line[MAX_LINELENGTH];
  int64_t* freqs = new int64_t[ncpus];
  for (int i = 0; i < ncpus; ++i) {
    freqs[i] = 0;
  }
  int processor = -1;
  float freq;
  while (cpuinfo.getline(line, sizeof(line))) {
    // "processor : N" lines name the cpu; the following "cpu MHz : F"
    // line carries its frequency.
    if (sscanf(line, "processor : %d", &processor) == 1) {
      continue;
    }
    if (sscanf(line, "cpu MHz : %f", &freq) == 1) {
      if (processor != -1 && processor < ncpus) {
        freqs[processor] = nearbyint(freq);
        processor = -1;
      }
    }
  }
  for (int i = 0; i < ncpus; ++i) {
    // A cpu whose frequency never appeared means the file is unusable;
    // release the array rather than leak it.
    if (freqs[i] == 0) {
      delete[] freqs;
      return nullptr;
    }
  }
  return freqs;
}
211 explicit esyscall(const char *syscall_name
) {
213 char format
[strlen(syscall_name
) + sizeof(" %d")];
214 sprintf(format
, "%s %%d", syscall_name
);
216 std::ifstream
syscalls("/proc/esyscall");
217 if (syscalls
.fail()) {
220 char line
[MAX_LINELENGTH
];
221 if (!syscalls
.getline(line
, sizeof(line
))) {
224 // perhaps we should check the format, but we're just going to assume
226 while (syscalls
.getline(line
, sizeof(line
))) {
228 if (sscanf(line
, format
, &number
) == 1) {
236 ///////////////////////////////////////////////////////////////////////////////
237 // Machine information that we collect just once.
242 * Bind the current process to a specified CPU. This function is to ensure
243 * that the OS won't schedule the process to different processors, which
244 * would make values read by rdtsc unreliable.
246 * @param uint32 cpu_id, the id of the logical cpu to be bound to.
250 static void BindToCPU(uint32_t cpu_id
) {
253 CPU_SET(cpu_id
, &new_mask
);
254 SET_AFFINITY(0, sizeof(cpu_set_t
), &new_mask
);
258 // The number of logical CPUs this machine has.
260 // Store the cpu frequency. Get it from /proc/cpuinfo if we can.
261 int64_t* m_cpu_frequencies
;
264 m_cpu_num
= sysconf(_SC_NPROCESSORS_CONF
);
265 m_cpu_frequencies
= get_cpu_frequency_from_file("/proc/cpuinfo", m_cpu_num
);
267 if (m_cpu_frequencies
)
270 m_cpu_frequencies
= new int64_t[m_cpu_num
];
271 for (int i
= 0; i
< m_cpu_num
; i
++) {
273 GET_AFFINITY(0, sizeof(cpu_set_t
), &prev_mask
);
275 // Make sure the current process gets scheduled to the target cpu. This
276 // might not be necessary though.
278 m_cpu_frequencies
[i
] = get_cpu_frequency();
279 SET_AFFINITY(0, sizeof(cpu_set_t
), &prev_mask
);
284 delete[] m_cpu_frequencies
;
287 static MachineInfo s_machine
;
// Converts a timeval to a cycle count, given the cpu speed in MHz
// (MHz == cycles per microsecond).
static inline uint64_t
tv_to_cycles(const struct timeval& tv, int64_t MHz)
{
  return (((uint64_t)tv.tv_sec * 1000000) + tv.tv_usec) * MHz;
}
295 static inline uint64_t
296 to_usec(int64_t cycles
, int64_t MHz
, bool cpu_time
= false)
298 #ifdef CLOCK_THREAD_CPUTIME_ID
299 static int64_t vdso_usable
=
300 Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID
);
302 static int64_t vdso_usable
= -1;
305 if (cpu_time
&& vdso_usable
>= 0)
306 return cycles
/ 1000;
307 return (cycles
+ MHz
/2) / MHz
;
310 static esyscall
vtsc_syscall("vtsc");
312 static inline uint64_t vtsc(int64_t MHz
) {
313 #ifdef CLOCK_THREAD_CPUTIME_ID
314 int64_t rval
= Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID
);
319 if (vtsc_syscall
.num
> 0) {
320 return syscall(vtsc_syscall
.num
);
323 getrusage(RUSAGE_SELF
, &usage
);
325 tv_to_cycles(usage
.ru_utime
, MHz
) + tv_to_cycles(usage
.ru_stime
, MHz
);
333 return mm
.getAllocated();
336 if (MallocExtensionInstance
) {
338 MallocExtensionInstance()->GetNumericProperty(
339 "generic.thread_bytes_allocated", &stat
);
351 return mm
.getDeallocated();
354 if (MallocExtensionInstance
) {
356 MallocExtensionInstance()->GetNumericProperty(
357 "generic.thread_bytes_freed", &stat
);
364 ///////////////////////////////////////////////////////////////////////////////
368 * All information we collect about a frame.
372 Frame
*m_parent
; // ptr to parent frame
373 const char *m_name
; // function name
374 uint8_t m_hash_code
; // hash_code for the function name
375 int m_recursion
; // recursion level for function
377 uint64_t m_tsc_start
; // start value for TSC counter
378 int64_t m_mu_start
; // memory usage
379 int64_t m_pmu_start
; // peak memory usage
380 int64_t m_vtsc_start
; // user/sys time start
383 * Returns formatted function name
385 * @param result_buf ptr to result buf
386 * @param result_len max size of result buf
387 * @return total size of the function name returned in result_buf
390 size_t getName(char *result_buf
, size_t result_len
) {
391 if (result_len
<= 1) {
392 return 0; // Insufficient result_bug. Bail!
395 // Add '@recurse_level' if required
396 // NOTE: Dont use snprintf's return val as it is compiler dependent
398 snprintf(result_buf
, result_len
, "%s@%d", m_name
, m_recursion
);
400 snprintf(result_buf
, result_len
, "%s", m_name
);
403 // Force null-termination at MAX
404 result_buf
[result_len
- 1] = 0;
405 return strlen(result_buf
);
409 * Build a caller qualified name for a callee.
411 * For example, if A() is caller for B(), then it returns "A==>B".
412 * Recursive invocations are denoted with @<n> where n is the recursion
415 * For example, "foo==>foo@1", and "foo@2==>foo@3" are examples of direct
416 * recursion. And "bar==>foo@1" is an example of an indirect recursive
417 * call to foo (implying the foo() is on the call stack some levels
420 size_t getStack(int level
, char *result_buf
, size_t result_len
) {
421 // End recursion if we dont need deeper levels or
422 // we dont have any deeper levels
423 if (!m_parent
|| level
<= 1) {
424 return getName(result_buf
, result_len
);
427 // Take care of all ancestors first
428 size_t len
= m_parent
->getStack(level
- 1, result_buf
, result_len
);
429 if (result_len
< (len
+ HP_STACK_DELIM_LEN
)) {
430 return len
; // Insufficient result_buf. Bail out!
433 // Add delimiter only if entry had ancestors
435 strncat(result_buf
+ len
, HP_STACK_DELIM
, result_len
- len
);
436 len
+= HP_STACK_DELIM_LEN
;
439 // Append the current function name
440 return len
+ getName(result_buf
+ len
, result_len
- len
);
450 MeasureXhprofDisable
= 0x20,
463 s_compressed_trace("(compressed_trace)");
466 * Maintain profiles of a running stack.
470 Profiler() : m_successful(true), m_stack(nullptr),
471 m_frame_free_list(nullptr) {
472 if (!s_rand_initialized
) {
473 s_rand_initialized
= true;
474 srand(math_generate_seed());
477 // bind to a random cpu so that we can use rdtsc instruction.
478 int cur_cpu_id
= rand() % s_machine
.m_cpu_num
;
479 GET_AFFINITY(0, sizeof(cpu_set_t
), &m_prev_mask
);
480 MachineInfo::BindToCPU(cur_cpu_id
);
481 m_MHz
= s_machine
.m_cpu_frequencies
[cur_cpu_id
];
483 memset(m_func_hash_counters
, 0, sizeof(m_func_hash_counters
));
486 virtual ~Profiler() {
487 SET_AFFINITY(0, sizeof(cpu_set_t
), &m_prev_mask
);
490 for (Frame
*p
= m_frame_free_list
; p
;) {
498 * Subclass can do extra work by overriding these two virtual functions.
500 virtual void beginFrameEx() {} // called right before a function call
501 virtual void endFrameEx() {} // called right after a function is finished
506 virtual void writeStats(Array
&ret
) {}
509 * Start a new frame with the specified symbol.
511 virtual void beginFrame(const char *symbol
) __attribute__ ((noinline
)) ;
514 * End top of the stack.
516 virtual void endFrame(const char *symbol
,
517 bool endMain
= false) __attribute__ ((noinline
)) ;
519 virtual void endAllFrames() {
521 endFrame(nullptr, true);
525 template<class phpret
, class Name
, class Counts
>
526 static void returnVals(phpret
& ret
, const Name
& name
, const Counts
& counts
,
527 int flags
, int64_t MHz
)
530 arr
.set(s_ct
, counts
.count
);
531 arr
.set(s_wt
, to_usec(counts
.wall_time
, MHz
));
532 if (flags
& TrackCPU
) {
533 arr
.set(s_cpu
, to_usec(counts
.cpu
, MHz
, true));
535 if (flags
& TrackMemory
) {
536 arr
.set(s_mu
, counts
.memory
);
537 arr
.set(s_pmu
, counts
.peak_memory
);
538 } else if (flags
& TrackMalloc
) {
539 arr
.set(s_alloc
, counts
.memory
);
540 arr
.set(s_free
, counts
.peak_memory
);
542 ret
.set(String(name
), arr
.create());
545 template<class phpret
, class StatsMap
>
546 static bool extractStats(phpret
& ret
, StatsMap
& stats
, int flags
, int64_t MHz
)
548 for (typename
StatsMap::const_iterator iter
= stats
.begin();
549 iter
!= stats
.end(); ++iter
) {
550 returnVals(ret
, iter
->first
, iter
->second
, flags
, MHz
);
557 int64_t m_MHz
; // cpu freq for either the local cpu or the saved trace
558 Frame
*m_stack
; // top of the profile stack
560 static bool s_rand_initialized
;
562 cpu_set_t m_prev_mask
; // saved cpu affinity
563 Frame
*m_frame_free_list
; // freelist of Frame
564 uint8_t m_func_hash_counters
[256]; // counter table by hash code;
567 * Fast allocate a Frame structure. Picks one from the
568 * free list if available, else does an actual allocate.
570 Frame
*createFrame(const char *symbol
) {
571 Frame
*p
= m_frame_free_list
;
573 m_frame_free_list
= p
->m_parent
;
575 p
= (Frame
*)malloc(sizeof(Frame
));
577 p
->m_parent
= m_stack
;
579 p
->m_hash_code
= hprof_inline_hash(symbol
);
585 * Fast free a Frame structure. Simply returns back the Frame to a free list
586 * and doesn't actually perform the free.
588 void releaseFrame() {
592 m_stack
= p
->m_parent
;
593 p
->m_parent
= m_frame_free_list
; // we overload the m_parent field here
594 m_frame_free_list
= p
;
597 bool Profiler::s_rand_initialized
= false;
599 void Profiler::beginFrame(const char *symbol
) {
600 Frame
*current
= createFrame(symbol
);
602 // NOTE(cjiang): use hash code to fend off most of call-stack traversal
603 int recursion_level
= 0;
604 if (m_func_hash_counters
[current
->m_hash_code
] > 0) {
605 // Find this symbols recurse level
606 for (Frame
*p
= current
->m_parent
; p
; p
= p
->m_parent
) {
607 if (strcmp(current
->m_name
, p
->m_name
) == 0) {
608 recursion_level
= p
->m_recursion
+ 1;
613 current
->m_recursion
= recursion_level
;
615 m_func_hash_counters
[current
->m_hash_code
]++;
620 * End top of the stack.
622 void Profiler::endFrame(const char *symbol
, bool endMain
) {
624 // special case for main() frame that's only ended by endAllFrames()
625 if (!endMain
&& m_stack
->m_parent
== nullptr) {
629 m_func_hash_counters
[m_stack
->m_hash_code
]--;
634 ///////////////////////////////////////////////////////////////////////////////
638 * vtsc() based profiler, but simple enough to print basic information.
640 * When available, we now use the vtsc() call, which is relatively inexpensive
641 * and accurate. It's still a system call, but a cheap one. If the call isn't
642 * available, the comment below still applies. --renglish
644 * COMMENT(cjiang): getrusage is very expensive and inaccurate. It is based
645 * on sampling at the rate about once every 5 to 10 milliseconds. The sampling
646 * error can be very significant, especially given that we are
647 * instrumenting at a very fine granularity. (every PHP function call will
648 * lead to one invocation of getrusage.) Most PHP functions such as the
649 * built-ins typically finish in microseconds. Thus the result we get from
650 * getrusage is very likely going to be skewed. Also worth noting that
651 * getrusage actually is a system call, which involves expensive swapping
652 * between user-mode and kernel mode. I would suggest we remove collecting
653 * CPU usage all together, as exclusive wall-time is very useful already.
654 * Or at least we should make it an opt-in choice.
656 * See: http://ww2.cs.fsu.edu/~hines/present/timing_linux.pdf
658 * Above is a nice paper talking about the overhead and the inaccuracy problem
659 * associated with getrusage.
661 class SimpleProfiler
: public Profiler
{
665 CountMap() : count(0), tsc(0), vtsc(0) {}
671 typedef hphp_hash_map
<std::string
, CountMap
, string_hash
> StatsMap
;
672 StatsMap m_stats
; // outcome
676 echo("<div style='display:none'>");
684 virtual void beginFrameEx() {
685 m_stack
->m_tsc_start
= cpuCycles();
686 m_stack
->m_vtsc_start
= vtsc(m_MHz
);
689 virtual void endFrameEx() {
690 CountMap
&counts
= m_stats
[m_stack
->m_name
];
692 counts
.tsc
+= cpuCycles() - m_stack
->m_tsc_start
;
693 counts
.vtsc
+= vtsc(m_MHz
) - m_stack
->m_vtsc_start
;
697 void print_output() {
698 echo("<link rel='stylesheet' href='/css/hotprofiler.css' type='text/css'>"
699 "<script language='javascript' src='/js/hotprofiler.js'></script>"
700 "<p><center><h2>Hotprofiler Data</h2></center><br>"
701 "<div id='hotprofiler_stats'></div>"
702 "<script language='javascript'>hotprofiler_data = [");
703 for (StatsMap::const_iterator iter
= m_stats
.begin();
704 iter
!= m_stats
.end(); ++iter
) {
706 echo(iter
->first
.c_str());
709 const CountMap
&counts
= iter
->second
;
714 ",\"ct\": %" PRId64
",\"wt\": %" PRId64
",\"ut\": %" PRId64
",\"st\": 0",
715 counts
.count
, (int64_t)to_usec(counts
.tsc
, m_MHz
),
716 (int64_t)to_usec(counts
.vtsc
, m_MHz
, true));
721 echo("]; write_data('ut', false);</script><br><br> <br>");
725 ///////////////////////////////////////////////////////////////////////////////
726 // HierarchicalProfiler
728 class HierarchicalProfiler
: public Profiler
{
732 CountMap() : count(0), wall_time(0), cpu(0), memory(0), peak_memory(0) {}
740 typedef hphp_hash_map
<std::string
, CountMap
, string_hash
> StatsMap
;
741 StatsMap m_stats
; // outcome
746 explicit HierarchicalProfiler(int flags
) : m_flags(flags
) {
749 virtual void beginFrameEx() {
750 m_stack
->m_tsc_start
= cpuCycles();
752 if (m_flags
& TrackCPU
) {
753 m_stack
->m_vtsc_start
= vtsc(m_MHz
);
756 if (m_flags
& TrackMemory
) {
757 auto const& stats
= MM().getStats();
758 m_stack
->m_mu_start
= stats
.usage
;
759 m_stack
->m_pmu_start
= stats
.peakUsage
;
760 } else if (m_flags
& TrackMalloc
) {
761 m_stack
->m_mu_start
= get_allocs();
762 m_stack
->m_pmu_start
= get_frees();
766 virtual void endFrameEx() {
768 m_stack
->getStack(2, symbol
, sizeof(symbol
));
769 CountMap
&counts
= m_stats
[symbol
];
771 counts
.wall_time
+= cpuCycles() - m_stack
->m_tsc_start
;
773 if (m_flags
& TrackCPU
) {
774 counts
.cpu
+= vtsc(m_MHz
) - m_stack
->m_vtsc_start
;
777 if (m_flags
& TrackMemory
) {
778 auto const& stats
= MM().getStats();
779 int64_t mu_end
= stats
.usage
;
780 int64_t pmu_end
= stats
.peakUsage
;
781 counts
.memory
+= mu_end
- m_stack
->m_mu_start
;
782 counts
.peak_memory
+= pmu_end
- m_stack
->m_pmu_start
;
783 } else if (m_flags
& TrackMalloc
) {
784 counts
.memory
+= get_allocs() - m_stack
->m_mu_start
;
785 counts
.peak_memory
+= get_frees() - m_stack
->m_pmu_start
;
789 virtual void writeStats(Array
&ret
) {
790 extractStats(ret
, m_stats
, m_flags
, m_MHz
);
797 ///////////////////////////////////////////////////////////////////////////////
800 // Walks a log of function enter and exit events captured by
801 // TraceProfiler and generates statistics for each function executed.
802 template <class TraceIterator
, class Stats
>
806 TraceIterator trace
; // Pointer to the log entry which pushed this frame
807 int level
; // Recursion level for this function
808 int len
; // Length of the function name
813 , m_arcBuff((char*)malloc(200))
819 for (auto& r
: m_recursion
) delete[] r
.first
;
822 void walk(TraceIterator begin
, TraceIterator end
, Stats
& stats
) {
823 if (begin
== end
) return;
824 m_recursion
.push_back(std::make_pair(nullptr, 0));
825 // Trim exit traces off the front of the log. These may be due to
826 // the return from turning tracing on.
827 std::map
<const char*, unsigned> functionLevel
;
828 auto current
= begin
;
829 while (current
!= end
&& !current
->symbol
) ++current
;
830 while (current
!= end
) {
831 if (!current
->is_func_exit
) {
832 unsigned level
= ++functionLevel
[current
->symbol
];
833 if (level
>= m_recursion
.size()) {
834 char *level_string
= new char[8];
835 sprintf(level_string
, "@%u", level
);
836 m_recursion
.push_back(std::make_pair(level_string
,
837 strlen(level_string
)));
841 fr
.level
= level
- 1;
842 fr
.len
= strlen(current
->symbol
);
843 checkArcBuff(fr
.len
);
844 m_stack
.push_back(fr
);
845 } else if (m_stack
.size() > 1) {
846 validateStack(current
, stats
); // NB: may update m_stack.
847 --functionLevel
[m_stack
.back().trace
->symbol
];
848 popFrame(current
, stats
);
853 while (m_stack
.size() > 1) {
854 popFrame(current
, stats
);
856 if (!m_stack
.empty()) {
857 incStats(m_stack
.back().trace
->symbol
, current
, m_stack
.back(), stats
);
859 if (m_badArcCount
> 0) {
860 stats
["(trace has mismatched calls and returns)"].count
= m_badArcCount
;
865 void checkArcBuff(int len
) {
866 len
= 2*len
+ HP_STACK_DELIM_LEN
+ 2;
867 if (len
>= m_arcBuffLen
) {
869 m_arcBuff
= (char *)realloc(m_arcBuff
, m_arcBuffLen
);
870 if (m_arcBuff
== nullptr) {
871 throw std::bad_alloc();
876 void incStats(const char* arc
, TraceIterator tr
, const Frame
& fr
,
878 auto& st
= stats
[arc
];
880 st
.wall_time
+= tr
->wall_time
- fr
.trace
->wall_time
;
881 st
.cpu
+= tr
->cpu
- fr
.trace
->cpu
;
882 st
.memory
+= tr
->memory
- fr
.trace
->memory
;
883 st
.peak_memory
+= tr
->peak_memory
- fr
.trace
->peak_memory
;
886 // Look for mismatched enter and exit events, and try to correct if
887 // we can. Only try to correct very simple imbalances... we could go
888 // nuts here, but it's likely not worth it.
889 void validateStack(TraceIterator tIt
, Stats
& stats
) {
890 auto enteredName
= m_stack
.back().trace
->symbol
;
891 auto exitedName
= tIt
->symbol
;
892 if ((exitedName
!= nullptr) &&
893 ((enteredName
== nullptr) || (strcmp(enteredName
, exitedName
) != 0))) {
894 // We have a few special names that we form on entry. We don't
895 // have the information to form them again on exit, so tolerate
896 // them here. See EventHook::GetFunctionNameForProfiler().
897 if ((enteredName
!= nullptr) &&
898 ((strncmp(enteredName
, "run_init::", 10) == 0) ||
899 (strcmp(enteredName
, "_") == 0))) return;
901 if (m_stack
.size() > 1) {
902 auto callerName
= (m_stack
.end() - 2)->trace
->symbol
;
903 if ((callerName
!= nullptr) && (strcmp(callerName
, exitedName
) == 0)) {
904 // We have an exit for Foo(), but we were in Bar(). However,
905 // it appears that Foo() was the caller of Bar(). This
906 // suggests we've missed the exit event for Bar() and have
907 // the exit event for Foo() in hand. So remove Bar() to
908 // re-balance the stack.
913 // The first few bad arcs typically point at the problem, so
914 // report them. The rest we'll just count.
915 if (++m_badArcCount
< 20) {
918 badArc
= folly::format("(warning: corrected bad arc #{}: "
919 "enter '{}', exit '{}')",
921 enteredName
, exitedName
).str();
923 badArc
= folly::format("(error: bad arc #{}: "
924 "enter '{}', exit '{}')",
926 enteredName
, exitedName
).str();
928 ++stats
[badArc
.data()].count
;
933 void popFrame(TraceIterator tIt
, Stats
& stats
) {
934 Frame callee
= m_stack
.back();
936 Frame
& caller
= m_stack
.back();
937 char *cp
= m_arcBuff
;
938 memcpy(cp
, caller
.trace
->symbol
, caller
.len
);
940 if (caller
.level
>= 1) {
941 std::pair
<char*, int>& lvl
= m_recursion
[caller
.level
];
942 memcpy(cp
, lvl
.first
, lvl
.second
);
945 memcpy(cp
, HP_STACK_DELIM
, HP_STACK_DELIM_LEN
);
946 cp
+= HP_STACK_DELIM_LEN
;
947 memcpy(cp
, callee
.trace
->symbol
, callee
.len
);
949 if (callee
.level
>= 1) {
950 std::pair
<char*, int>& lvl
= m_recursion
[callee
.level
];
951 memcpy(cp
, lvl
.first
, lvl
.second
);
955 incStats(m_arcBuff
, tIt
, callee
, stats
);
958 vector
<std::pair
<char*, int>> m_recursion
;
959 vector
<Frame
> m_stack
;
965 // Profiler which makes a log of all function enter and exit events,
966 // then processes that into per-function statistics. A single-frame
967 // stack trace is used to aggregate stats for each function when
968 // called from different call sites.
969 class TraceProfiler
: public Profiler
{
971 explicit TraceProfiler(int flags
)
972 : m_traceBuffer(nullptr)
973 , m_traceBufferSize(0)
974 , m_nextTraceEntry(0)
975 , m_traceBufferFilled(false)
976 , m_maxTraceBuffer(0)
980 if (pthread_mutex_trylock(&s_inUse
)) {
981 // This profiler uses a very large amount of memory. Only allow
982 // one in the process at any time.
983 m_successful
= false;
986 sprintf(buf
, "%d", RuntimeOption::ProfilerMaxTraceBuffer
);
987 IniSetting::Bind("profiler.max_trace_buffer", buf
,
988 ini_on_update_long
, &m_maxTraceBuffer
);
995 IniSetting::Unbind("profiler.max_trace_buffer");
996 pthread_mutex_unlock(&s_inUse
);
1001 // Data measured on function entry and exit
1006 // It's not plausible that we need a full 64bits to hold memory
1007 // stats, no matter what the collection mode. So we steal one bit
1008 // from the memory field to use as a flag. We want to keep this
1009 // data structure small since we need a huge number of them during
1011 int64_t memory
: 63; // Total memory, or memory allocated depending on flags
1012 bool is_func_exit
: 1; // Is the entry for a function exit?
1013 int64_t peak_memory
: 63; // Peak memory, or memory freed depending on flags
1014 uint64_t unused
: 1; // Unused, to keep memory and peak_memory the same size
1017 wall_time
= cpu
= memory
= peak_memory
= 0;
1019 static void compileTimeAssertions() {
1020 static_assert(sizeof(TraceData
) == (sizeof(uint64_t) * 4), "");
1024 // One entry in the log, representing a function enter or exit event
1025 struct TraceEntry
: TraceData
{
1026 const char *symbol
; // Function name
1029 bool isTraceSpaceAvailable() {
1030 // the two slots are reserved for internal use
1031 return m_nextTraceEntry
< m_traceBufferSize
- 3;
1034 bool ensureTraceSpace() {
1035 bool track_realloc
= FALSE
;
1036 if (m_traceBufferFilled
) {
1041 if (m_traceBufferSize
== 0) {
1042 new_array_size
= RuntimeOption::ProfilerTraceBuffer
;
1044 new_array_size
= m_traceBufferSize
*
1045 RuntimeOption::ProfilerTraceExpansion
;
1046 if (m_maxTraceBuffer
!= 0 && new_array_size
> m_maxTraceBuffer
) {
1047 new_array_size
= m_maxTraceBuffer
> m_traceBufferSize
?
1048 m_maxTraceBuffer
: m_traceBufferSize
;
1050 if (new_array_size
- m_nextTraceEntry
<= 5) {
1051 // for this operation to succeed, we need room for the entry we're
1052 // adding, two realloc entries, and two entries to mark the end of
1054 m_traceBufferFilled
= true;
1055 collectStats("(trace buffer terminated)", false,
1056 m_traceBuffer
[m_nextTraceEntry
++]);
1059 track_realloc
= TRUE
;
1061 if (track_realloc
) {
1062 collectStats("(trace buffer realloc)", false,
1063 m_traceBuffer
[m_nextTraceEntry
++]);
1067 MemoryManager::MaskAlloc
masker(*info
->m_mm
);
1068 TraceEntry
*r
= (TraceEntry
*)realloc((void *)m_traceBuffer
,
1069 new_array_size
* sizeof(TraceEntry
));
1072 m_traceBufferFilled
= true;
1073 if (m_traceBuffer
) {
1074 collectStats("(trace buffer terminated)", false,
1075 m_traceBuffer
[m_nextTraceEntry
++]);
1079 m_traceBufferSize
= new_array_size
;
1082 if (track_realloc
) {
1083 collectStats("(trace buffer realloc)", true,
1084 m_traceBuffer
[m_nextTraceEntry
++]);
1089 virtual void beginFrame(const char *symbol
) {
1090 doTrace(symbol
, false);
1093 virtual void endFrame(const char *symbol
, bool endMain
= false) {
1094 doTrace(symbol
, true);
1097 virtual void endAllFrames() {
1098 if (m_traceBuffer
&& m_nextTraceEntry
< m_traceBufferSize
- 1) {
1100 m_traceBufferFilled
= true;
1104 void collectStats(const char *symbol
, bool isFuncExit
, TraceEntry
& te
) {
1106 te
.is_func_exit
= isFuncExit
;
1110 void collectStats(TraceData
& te
) {
1111 te
.wall_time
= cpuCycles();
1113 if (m_flags
& TrackCPU
) {
1114 te
.cpu
= vtsc(m_MHz
);
1116 if (m_flags
& TrackMemory
) {
1117 auto const& stats
= MM().getStats();
1118 te
.memory
= stats
.usage
;
1119 te
.peak_memory
= stats
.peakUsage
;
1120 } else if (m_flags
& TrackMalloc
) {
1121 te
.memory
= get_allocs();
1122 te
.peak_memory
= get_frees();
1129 TraceEntry
* nextTraceEntry() {
1130 if (!isTraceSpaceAvailable() && !ensureTraceSpace()) {
1133 return &m_traceBuffer
[m_nextTraceEntry
++];
1136 void doTrace(const char *symbol
, bool isFuncExit
) {
1137 TraceEntry
*te
= nextTraceEntry();
1138 if (te
!= nullptr) {
1139 collectStats(symbol
, isFuncExit
, *te
);
1143 template<class TraceIterator
, class Stats
>
1144 void walkTrace(TraceIterator begin
, TraceIterator end
, Stats
& stats
) {
1145 TraceWalker
<TraceIterator
, Stats
> walker
;
1146 walker
.walk(begin
, end
, stats
);
1149 virtual void writeStats(Array
&ret
) {
1151 collectStats(my_begin
);
1152 walkTrace(m_traceBuffer
, m_traceBuffer
+ m_nextTraceEntry
, m_stats
);
1153 if (m_overflowCalls
) {
1154 m_stats
["(trace buffer terminated)"].count
+= m_overflowCalls
/2;
1156 extractStats(ret
, m_stats
, m_flags
, m_MHz
);
1157 CountedTraceData allocStats
;
1158 allocStats
.count
= 0;
1159 allocStats
.peak_memory
= allocStats
.memory
=
1160 m_nextTraceEntry
* sizeof(*m_traceBuffer
);
1161 returnVals(ret
, "(trace buffer alloc)", allocStats
, m_flags
, m_MHz
);
1162 if (m_flags
& MeasureXhprofDisable
) {
1163 CountedTraceData my_end
;
1164 collectStats(my_end
);
1166 my_end
.cpu
-= my_begin
.cpu
;
1167 my_end
.wall_time
-= my_begin
.wall_time
;
1168 my_end
.memory
-= my_begin
.memory
;
1169 my_end
.peak_memory
-= my_begin
.peak_memory
;
1170 returnVals(ret
, "xhprof_post_processing()", my_end
, m_flags
, m_MHz
);
1174 TraceEntry
* m_traceBuffer
;
1175 int m_traceBufferSize
;
1176 int m_nextTraceEntry
;
1177 bool m_traceBufferFilled
;
1178 int64_t m_maxTraceBuffer
;
1179 int64_t m_overflowCalls
;
1182 // Final stats, per-function per-callsite, with a count of how many
1183 // times the function was called from that callsite.
1184 class CountedTraceData
: public TraceData
{
1187 CountedTraceData() : count(0) { clear(); }
1189 typedef hphp_hash_map
<std::string
, CountedTraceData
, string_hash
> StatsMap
;
1190 StatsMap m_stats
; // outcome
1192 static pthread_mutex_t s_inUse
;
1195 pthread_mutex_t
TraceProfiler::s_inUse
= PTHREAD_MUTEX_INITIALIZER
;
1197 ///////////////////////////////////////////////////////////////////////////////
1201 * Sampling based profiler.
1203 class SampleProfiler
: public Profiler
{
1205 typedef hphp_hash_map
<std::string
, int64_t, string_hash
> CountMap
;
1206 typedef hphp_hash_map
<std::string
, CountMap
, string_hash
> StatsMap
;
1207 StatsMap m_stats
; // outcome
1212 uint64_t truncated_us
;
1213 uint64_t truncated_tsc
;
1215 // Init the last_sample in tsc
1216 m_last_sample_tsc
= cpuCycles();
1218 // Find the microseconds that need to be truncated
1219 gettimeofday(&m_last_sample_time
, 0);
1220 now
= m_last_sample_time
;
1221 hp_trunc_time(&m_last_sample_time
, SAMPLING_INTERVAL
);
1223 // Subtract truncated time from last_sample_tsc
1224 truncated_us
= get_us_interval(&m_last_sample_time
, &now
);
1225 truncated_tsc
= truncated_us
* m_MHz
;
1226 if (m_last_sample_tsc
> truncated_tsc
) {
1227 // just to be safe while subtracting unsigned ints
1228 m_last_sample_tsc
-= truncated_tsc
;
1231 // Convert sampling interval to ticks
1232 m_sampling_interval_tsc
= SAMPLING_INTERVAL
* m_MHz
;
1235 virtual void beginFrameEx() {
1239 virtual void endFrameEx() {
// Emits m_stats into `ret` as a nested PHP array:
// timestamp key -> (serialized stack -> count).
// NOTE(review): the declaration/reset of `arr` and closing braces were
// dropped by the extractor.
1243 virtual void writeStats(Array
 &ret
 ) {
1244 for (StatsMap::const_iterator iter
 = m_stats
 .begin();
1245 iter
 != m_stats
 .end(); ++iter
 ) {
1247 const CountMap
 &counts
 = iter
 ->second
 ;
1248 for (CountMap::const_iterator iterCount
 = counts
 .begin();
1249 iterCount
 != counts
 .end(); ++iterCount
 ) {
1250 arr
 .set(String(iterCount
 ->first
 ), iterCount
 ->second
 );
1252 ret
 .set(String(iter
 ->first
 ), arr
 );
// Sampling cadence (100 ms) and the bookkeeping for when the next sample is
// due, tracked both in wall time and in TSC ticks.
1257 static const int SAMPLING_INTERVAL
 = 100000; // microsecs
1259 struct timeval m_last_sample_time
 ;
1260 uint64_t m_last_sample_tsc
 ;
1261 uint64_t m_sampling_interval_tsc
 ;
1264 * Sample the stack. Add it to the stats_count global.
1266 * @param tv current time
1267 * @param entries func stack as linked list of hprof_entry_t
// Key is "seconds.microseconds" of the sample time; the value maps the
// serialized stack string to a hit count (set to 1 per sample slot).
// NOTE(review): the local `key`/`symbol` buffer declarations were dropped
// by the extractor.
1271 void sample_stack() {
1273 snprintf(key
 , sizeof(key
 ), "%" PRId64
 ".%06" PRId64
 ,
1274 (int64_t)m_last_sample_time
 .tv_sec
 ,
1275 (int64_t)m_last_sample_time
 .tv_usec
 );
1278 m_stack
 ->getStack(INT_MAX
 , symbol
 , sizeof(symbol
 ));
1279 m_stats
 [key
 ][symbol
 ] = 1;
1283 * Checks to see if it is time to sample the stack.
1284 * Calls hp_sample_stack() if its time.
1286 * @param entries func stack as linked list of hprof_entry_t
1287 * @param last_sample time the last sample was taken
1288 * @param sampling_intr sampling interval in microsecs
// Advances the sample clock one interval at a time so a single long-running
// function still yields one sample per elapsed interval.
1292 void sample_check() {
1294 // While loop is to handle a single function taking a long time
1295 // and passing several sampling intervals
1296 while ((cpuCycles() - m_last_sample_tsc
 ) > m_sampling_interval_tsc
 ) {
1297 m_last_sample_tsc
 += m_sampling_interval_tsc
 ;
1298 // HAS TO BE UPDATED BEFORE calling sample_stack
1299 incr_us_interval(&m_last_sample_time
 , SAMPLING_INTERVAL
 );
1306 ///////////////////////////////////////////////////////////////////////////////
// Request-local factory owning at most one active Profiler per request and
// wiring it into the current thread's ThreadInfo.
// NOTE(review): the enum's other enumerators, access specifiers, and most
// method bodies were dropped by the extractor.
1308 class ProfilerFactory
 : public RequestEventHandler
 {
1315 Sample
 = 620002, // Rockfort's zip code
1318 static bool EnableNetworkProfiler
 ;
1321 ProfilerFactory() : m_profiler(nullptr) {
1324 ~ProfilerFactory() {
1328 Profiler
 *getProfiler() {
1332 virtual void requestInit() {
// End-of-request: drop the cached artificial frame-name strings.
1335 virtual void requestShutdown() {
1337 m_artificialFrameNames
 .reset();
// Creates and installs a profiler for `level` (unless hot profiling is
// disabled in RuntimeOption) and enables the event hook so frame callbacks
// fire. NOTE(review): the switch/case scaffolding around the `new ...`
// branches and the failure-path cleanup were dropped by the extractor.
1340 void start(Level level
 , long flags
 ) {
1341 if (!RuntimeOption::EnableHotProfiler
 ) {
1344 // This will be disabled automatically when the thread completes the request
1345 HPHP::EventHook::Enable();
1346 if (m_profiler
 == nullptr) {
1349 m_profiler
 = new SimpleProfiler();
1352 m_profiler
 = new HierarchicalProfiler(flags
 );
1355 m_profiler
 = new SampleProfiler();
1358 m_profiler
 = new TraceProfiler(flags
 );
1361 throw_invalid_argument("level: %d", level
 );
// Publish the profiler only if its setup succeeded; otherwise discard it.
1364 if (m_profiler
 ->m_successful
 ) {
1365 m_profiler
 ->beginFrame("main()");
1366 ThreadInfo::s_threadInfo
 ->m_profiler
 = m_profiler
 ;
1369 m_profiler
 = nullptr;
// NOTE(review): fragment of the factory's stop() — its signature line was
// dropped by the extractor. Visible behavior: flush all open frames, write
// the stats out, then detach the profiler from the factory and the thread.
1376 m_profiler
 ->endAllFrames();
1379 m_profiler
 ->writeStats(ret
 );
1381 m_profiler
 = nullptr;
1382 ThreadInfo::s_threadInfo
 ->m_profiler
 = nullptr;
1386 return uninit_null();
1390 * The whole purpose to make sure "const char *" is safe to take on these
// Pins `name` in a request-local array so raw char* views of its buffer
// stay valid for the rest of the request.
1393 void cacheString(const String
 & name
 ) {
1394 m_artificialFrameNames
 .append(name
 );
// Active profiler (null when none) and the array of pinned name strings.
1398 Profiler
 *m_profiler
 ;
1399 Array m_artificialFrameNames
 ;
// Out-of-class definition: network profiling is off by default.
1402 bool ProfilerFactory::EnableNetworkProfiler
 = false;
// One request-local ProfilerFactory per thread, reachable as s_factory.
1405 IMPLEMENT_STATIC_REQUEST_LOCAL(ProfilerFactory
 , s_factory
 );
1408 ///////////////////////////////////////////////////////////////////////////////
// PHP builtin: start hot profiling at `level`. The Memory level is an alias
// for Hierarchical with memory tracking added.
// NOTE(review): the declaration/initialization of `flags` and the enclosing
// #ifdef/closing braces were dropped by the extractor.
1411 void f_hotprofiler_enable(int level
 ) {
1414 if (level
 == ProfilerFactory::Hierarchical
 ) {
1415 flags
 = TrackBuiltins
 ;
1416 } else if (level
 == ProfilerFactory::Memory
 ) {
1417 level
 = ProfilerFactory::Hierarchical
 ;
1418 flags
 = TrackBuiltins
 | TrackMemory
 ;
1420 s_factory
 ->start((ProfilerFactory::Level
 )level
 , flags
 );
// PHP builtin: stop profiling and return the collected stats; falls back to
// null (the #ifdef guard around the two returns was dropped by the extractor).
1424 Variant
 f_hotprofiler_disable() {
1426 return s_factory
 ->stop();
1428 return uninit_null();
// PHP builtin: phprof_enable — start hierarchical profiling with the
// caller-supplied flag bits.
1432 void f_phprof_enable(int flags
 /* = 0 */) {
1434 s_factory
 ->start(ProfilerFactory::Hierarchical
 , flags
 );
// PHP builtin: phprof_disable — stop profiling and return stats, or null
// (guard preprocessor lines dropped by the extractor).
1438 Variant
 f_phprof_disable() {
1440 return s_factory
 ->stop();
1442 return uninit_null();
// PHP builtin: install/remove a userland profiling callback. Ignored while
// a native profiler is active; a null callback disables the event hook,
// anything else enables it.
1446 void f_fb_setprofile(CVarRef callback
 ) {
1448 if (ThreadInfo::s_threadInfo
 ->m_profiler
 != nullptr) {
1449 // phpprof is enabled, don't let PHP code override it
1453 g_vmContext
 ->m_setprofileCallback
 = callback
 ;
1454 if (callback
 .isNull()) {
1455 HPHP::EventHook::Disable();
1457 HPHP::EventHook::Enable();
// PHP builtin: open an artificial profiling frame named `name`. The string
// is cached via the factory so the profiler's raw char* stays valid.
// NOTE(review): the `if (prof)` guard lines were dropped by the extractor.
1461 void f_xhprof_frame_begin(const String
 & name
 ) {
1463 Profiler
 *prof
 = ThreadInfo::s_threadInfo
 ->m_profiler
 ;
1465 s_factory
 ->cacheString(name
 );
1466 prof
 ->beginFrame(name
 .data());
// PHP builtin: close the innermost artificial frame if a profiler is live
// (the guard lines were dropped by the extractor).
1471 void f_xhprof_frame_end() {
1473 Profiler
 *prof
 = ThreadInfo::s_threadInfo
 ->m_profiler
 ;
1475 prof
 ->endFrame(nullptr);
// PHP builtin: xhprof_enable. Clears TrackVtsc when neither the vtsc
// syscall nor CLOCK_THREAD_CPUTIME_ID (via vDSO) is usable, then starts a
// Trace or Hierarchical profiler depending on the XhpTrace flag bit.
// NOTE(review): #else/#endif lines and several closing braces were dropped
// by the extractor; `args` is accepted but not visibly used in this chunk.
1480 void f_xhprof_enable(int flags
 /* = 0 */,
1481 CArrRef args
 /* = null_array */) {
1483 #ifdef CLOCK_THREAD_CPUTIME_ID
1484 bool missingClockGetTimeNS
 =
1485 Util::Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID
 ) == -1;
1487 bool missingClockGetTimeNS
 = true;
1489 if (vtsc_syscall
 .num
 <= 0 && missingClockGetTimeNS
 ) {
1490 flags
 &= ~TrackVtsc
 ;
1492 if (flags
 & TrackVtsc
 ) {
1495 if (flags
 & XhpTrace
 ) {
1496 s_factory
 ->start(ProfilerFactory::Trace
 , flags
 );
1498 s_factory
 ->start(ProfilerFactory::Hierarchical
 , flags
 );
// PHP builtin: xhprof_disable — stop profiling and return stats, or null
// (guard preprocessor lines dropped by the extractor).
1503 Variant
 f_xhprof_disable() {
1505 return s_factory
 ->stop();
1507 return uninit_null();
// PHP builtin: begin server-wide network profiling via ServerStats.
1511 void f_xhprof_network_enable() {
1512 ServerStats::StartNetworkProfile();
// PHP builtin: finish network profiling and return its results.
1515 Variant
 f_xhprof_network_disable() {
1516 return ServerStats::EndNetworkProfile();
// PHP builtin: start the sampling profiler; no flag bits apply, so 0 is
// passed for flags.
1519 void f_xhprof_sample_enable() {
1521 s_factory
 ->start(ProfilerFactory::Sample
 , 0);
// PHP builtin: stop the sampling profiler and return its stats, or null
// (guard preprocessor lines dropped by the extractor).
1525 Variant
 f_xhprof_sample_disable() {
1527 return s_factory
 ->stop();
1529 return uninit_null();
1533 ///////////////////////////////////////////////////////////////////////////////
// PHP-visible XHPROF_FLAGS_* constants, mapped 1:1 onto the internal
// profiler flag bits used by the factory's start paths above.
1535 const int64_t k_XHPROF_FLAGS_NO_BUILTINS
 = TrackBuiltins
 ;
1536 const int64_t k_XHPROF_FLAGS_CPU
 = TrackCPU
 ;
1537 const int64_t k_XHPROF_FLAGS_MEMORY
 = TrackMemory
 ;
1538 const int64_t k_XHPROF_FLAGS_VTSC
 = TrackVtsc
 ;
1539 const int64_t k_XHPROF_FLAGS_TRACE
 = XhpTrace
 ;
1540 const int64_t k_XHPROF_FLAGS_MEASURE_XHPROF_DISABLE
 = MeasureXhprofDisable
 ;
1541 const int64_t k_XHPROF_FLAGS_MALLOC
 = TrackMalloc
 ;
1543 ///////////////////////////////////////////////////////////////////////////////
// VM helper: open a profiler frame for `symbol` on profiler `p` (caller
// guarantees p is non-null). Closing brace dropped by the extractor.
1546 void begin_profiler_frame(Profiler
 *p
 , const char *symbol
 ) {
1547 p
 ->beginFrame(symbol
 );
// VM helper: close the frame previously opened for `symbol` on profiler `p`
// (caller guarantees p is non-null). Closing brace dropped by the extractor.
1550 void end_profiler_frame(Profiler
 *p
 , const char *symbol
 ) {
1551 p
 ->endFrame(symbol
 );
1554 ///////////////////////////////////////////////////////////////////////////////