/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-2014 Facebook, Inc. (http://www.facebook.com)     |
   | Copyright (c) 1997-2010 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
#include "hphp/runtime/ext/ext_hotprofiler.h"

#include "hphp/runtime/base/builtin-functions.h"
#include "hphp/runtime/base/memory-manager.h"
#include "hphp/runtime/base/request-local.h"
#include "hphp/runtime/base/zend-math.h"
#include "hphp/runtime/base/ini-setting.h"
#include "hphp/runtime/vm/event-hook.h"
#include "hphp/runtime/vm/jit/translator-inline.h"
#include "hphp/util/alloc.h"
#include "hphp/util/vdso.h"
#include "hphp/util/cycles.h"
#include "hphp/runtime/base/variable-serializer.h"
#include "hphp/runtime/ext/std/ext_std_function.h"
#include "hphp/runtime/ext/ext_system_profiler.h"
#include "hphp/runtime/ext/xdebug/xdebug_profiler.h"
#include "hphp/runtime/ext/extension-registry.h"
#include "hphp/runtime/base/request-event-handler.h"

#include <sys/resource.h>
// Delimiter appended between caller and callee when building stack names
#define HP_STACK_DELIM        "==>"
#define HP_STACK_DELIM_LEN    (sizeof(HP_STACK_DELIM) - 1)
IMPLEMENT_DEFAULT_EXTENSION_VERSION(hotprofiler, NO_VERSION_YET);

const StaticString
  s_hotprofiler("hotprofiler");

///////////////////////////////////////////////////////////////////////////////
/*
 * Get time delta in microseconds.
 */
static long get_us_interval(struct timeval *start, struct timeval *end) {
  return (((end->tv_sec - start->tv_sec) * 1000000)
          + (end->tv_usec - start->tv_usec));
}
/*
 * Advance the given timeval by incr microseconds.
 */
static void incr_us_interval(struct timeval *start, uint64_t incr) {
  incr += (start->tv_sec * 1000000 + start->tv_usec);
  start->tv_sec  = incr / 1000000;
  start->tv_usec = incr % 1000000;
}
/**
 * Truncates the given timeval to the beginning of its slot, where the
 * slot size is determined by intr.
 *
 * @param  tv    Input timeval to be truncated in place
 * @param  intr  Time interval in microsecs - slot width
 */
static void hp_trunc_time(struct timeval *tv, uint64_t intr) {
  uint64_t time_in_micro;

  // Convert to microsecs and truncate to the slot boundary first
  time_in_micro = (tv->tv_sec * 1000000) + tv->tv_usec;
  time_in_micro /= intr;
  time_in_micro *= intr;

  // Update tv
  tv->tv_sec  = (time_in_micro / 1000000);
  tv->tv_usec = (time_in_micro % 1000000);
}
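// Illustrative example: with intr = 100000 (the sampling interval used by the
// SampleProfiler below), a timeval of 12.345678s truncates in place to
// 12.300000s, i.e. the start of the 100ms slot that contains it.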
///////////////////////////////////////////////////////////////////////////////
// High precision timer related functions.

/**
 * This is a microbenchmark to get cpu frequency the process is running on. The
 * returned value is used to convert TSC counter values to microseconds.
 */
static int64_t get_cpu_frequency() {
  struct timeval start;
  struct timeval end;

  if (gettimeofday(&start, 0)) {
    perror("gettimeofday");
    return 0;
  }
  uint64_t tsc_start = cpuCycles();
  uint64_t tsc_end;
  volatile int i;
  // Busy loop for 5 milliseconds. Don't use usleep() here since it causes the
  // CPU to halt which will generate meaningless results.
  do {
    for (i = 0; i < 1000000; i++);
    if (gettimeofday(&end, 0)) {
      perror("gettimeofday");
      return 0;
    }
    tsc_end = cpuCycles();
  } while (get_us_interval(&start, &end) < 5000);

  return nearbyint((tsc_end - tsc_start) * 1.0
                   / (get_us_interval(&start, &end)));
}
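// The value returned above is effectively (elapsed TSC ticks / elapsed
// microseconds), i.e. the core's frequency in MHz; a 2.4GHz core yields
// roughly 2400. to_usec() below uses it to convert tick deltas back to
// microseconds.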
#define MAX_LINELENGTH 1024

static int64_t* get_cpu_frequency_from_file(const char *file, int ncpus)
{
  std::ifstream cpuinfo(file);
  if (cpuinfo.fail()) {
    return nullptr;
  }
  char line[MAX_LINELENGTH];
  int64_t* freqs = new int64_t[ncpus];
  for (int i = 0; i < ncpus; ++i) {
    freqs[i] = 0;
  }
  int processor = -1;
  float freq;
  while (cpuinfo.getline(line, sizeof(line))) {
    if (sscanf(line, "processor : %d", &processor) == 1) {
      continue;
    }
    if (sscanf(line, "cpu MHz : %f", &freq) == 1) {
      if (processor != -1 && processor < ncpus) {
        freqs[processor] = nearbyint(freq);
        processor = -1;
      }
    }
  }
  for (int i = 0; i < ncpus; ++i) {
    if (freqs[i] == 0) {
      delete[] freqs;
      return nullptr;
    }
  }
  return freqs;
}
///////////////////////////////////////////////////////////////////////////////
// Machine information that we collect just once.

class MachineInfo {
public:
  /**
   * Bind the current process to a specified CPU. This function is to ensure
   * that the OS won't schedule the process to different processors, which
   * would make values read by rdtsc unreliable.
   *
   * @param uint32 cpu_id, the id of the logical cpu to be bound to.
   */
  static void BindToCPU(uint32_t cpu_id) {
    cpu_set_t new_mask;
    CPU_ZERO(&new_mask);
    CPU_SET(cpu_id, &new_mask);
    SET_AFFINITY(0, sizeof(cpu_set_t), &new_mask);
  }

  // The number of logical CPUs this machine has.
  int m_cpu_num;
  // Store the cpu frequency. Get it from /proc/cpuinfo if we can.
  int64_t* m_cpu_frequencies;

  MachineInfo() {
    m_cpu_num = sysconf(_SC_NPROCESSORS_CONF);
    m_cpu_frequencies = get_cpu_frequency_from_file("/proc/cpuinfo", m_cpu_num);

    if (m_cpu_frequencies)
      return;

    m_cpu_frequencies = new int64_t[m_cpu_num];
    for (int i = 0; i < m_cpu_num; i++) {
      cpu_set_t prev_mask;
      GET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask);
      BindToCPU(i);
      // Make sure the current process gets scheduled to the target cpu. This
      // might not be necessary though.
      usleep(0);
      m_cpu_frequencies[i] = get_cpu_frequency();
      SET_AFFINITY(0, sizeof(cpu_set_t), &prev_mask);
    }
  }

  ~MachineInfo() {
    delete[] m_cpu_frequencies;
  }
};
static MachineInfo s_machine;
static inline uint64_t
tv_to_cycles(const struct timeval& tv, int64_t MHz)
{
  return (((uint64_t)tv.tv_sec * 1000000) + tv.tv_usec) * MHz;
}
static inline uint64_t
to_usec(int64_t cycles, int64_t MHz, bool cpu_time = false)
{
#ifdef CLOCK_THREAD_CPUTIME_ID
  static int64_t vdso_usable = Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID);
#else
  static int64_t vdso_usable = -1;
#endif

  if (cpu_time && vdso_usable >= 0)
    return cycles / 1000;
  return (cycles + MHz/2) / MHz;
}
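// Note: when the vdso thread-cpu clock is usable, cpu-time samples are stored
// as nanoseconds, hence the divide-by-1000; wall-time samples are TSC ticks,
// so dividing by MHz (with +MHz/2 for rounding) yields microseconds. For
// example, 2400123 ticks at 2400 MHz rounds to 1000us.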
static inline uint64_t cpuTime(int64_t MHz) {
#ifdef CLOCK_THREAD_CPUTIME_ID
  int64_t rval = Vdso::ClockGetTimeNS(CLOCK_THREAD_CPUTIME_ID);
  if (rval >= 0) {
    return rval;
  }
#endif
  struct rusage usage;
  getrusage(RUSAGE_SELF, &usage);
  return
    tv_to_cycles(usage.ru_utime, MHz) + tv_to_cycles(usage.ru_stime, MHz);
}
static int64_t get_allocs() {
#ifdef USE_JEMALLOC
  auto& mm = MM();
  return mm.getAllocated();
#endif
#ifdef USE_TCMALLOC
  if (MallocExtensionInstance) {
    size_t stat;
    MallocExtensionInstance()->GetNumericProperty(
      "generic.thread_bytes_allocated", &stat);
    return stat;
  }
#endif
  return 0;
}

static int64_t get_frees() {
#ifdef USE_JEMALLOC
  auto& mm = MM();
  return mm.getDeallocated();
#endif
#ifdef USE_TCMALLOC
  if (MallocExtensionInstance) {
    size_t stat;
    MallocExtensionInstance()->GetNumericProperty(
      "generic.thread_bytes_freed", &stat);
    return stat;
  }
#endif
  return 0;
}
size_t Frame::getName(char *result_buf, size_t result_len) {
  if (result_len <= 1) {
    return 0; // Insufficient result_buf. Bail!
  }

  // Add '@recurse_level' if required
  // NOTE: Don't use snprintf's return val as it is compiler dependent
  if (m_recursion) {
    snprintf(result_buf, result_len, "%s@%d", m_name, m_recursion);
  } else {
    snprintf(result_buf, result_len, "%s", m_name);
  }

  // Force null-termination at MAX
  result_buf[result_len - 1] = 0;
  return strlen(result_buf);
}
size_t Frame::getStack(int level, char *result_buf, size_t result_len) {
  // End recursion if we don't need deeper levels or
  // we don't have any deeper levels
  if (!m_parent || level <= 1) {
    return getName(result_buf, result_len);
  }

  // Take care of all ancestors first
  size_t len = m_parent->getStack(level - 1, result_buf, result_len);
  if (result_len < (len + HP_STACK_DELIM_LEN)) {
    return len; // Insufficient result_buf. Bail out!
  }

  // Add delimiter only if entry had ancestors
  if (len) {
    strncat(result_buf + len, HP_STACK_DELIM, result_len - len);
    len += HP_STACK_DELIM_LEN;
  }

  // Append the current function name
  return len + getName(result_buf + len, result_len - len);
}
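// Illustrative example: getStack(2, ...) on a frame for foo() called from
// main() produces "main()==>foo", and a directly self-recursive call appears
// as "foo==>foo@1" thanks to the '@recursion' suffix added by getName().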
static const StaticString
  s_ct("ct"),
  s_wt("wt"),
  s_cpu("cpu"),
  s_mu("mu"),
  s_pmu("pmu"),
  s_alloc("alloc"),
  s_free("free"),
  s_compressed_trace("(compressed_trace)");
/**
 * Maintain profiles of a running stack.
 */
Profiler::Profiler(bool needCPUAffinity) : m_successful(true),
                                           m_stack(nullptr),
                                           m_frame_free_list(nullptr),
                                           m_has_affinity(needCPUAffinity) {
  if (!s_rand_initialized) {
    s_rand_initialized = true;
    srand(math_generate_seed());
  }

  if (m_has_affinity) {
    // Bind to a random cpu so that we can use rdtsc instruction.
    int cur_cpu_id = rand() % s_machine.m_cpu_num;
    GET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask);
    MachineInfo::BindToCPU(cur_cpu_id);
    m_MHz = s_machine.m_cpu_frequencies[cur_cpu_id];
  } else {
    // Take cpu0's speed as a proxy for all cpus.
    m_MHz = s_machine.m_cpu_frequencies[0];
  }

  memset(m_func_hash_counters, 0, sizeof(m_func_hash_counters));
}
Profiler::~Profiler() {
  if (m_has_affinity) {
    SET_AFFINITY(0, sizeof(cpu_set_t), &m_prev_mask);
  }

  for (Frame *p = m_frame_free_list; p;) {
    Frame *cur = p;
    p = p->m_parent;
    delete cur;
  }
}
/*
 * Called right before a function call.
 */
void Profiler::beginFrameEx(const char *symbol) {
}

/*
 * Called right after a function is finished.
 */
void Profiler::endFrameEx(const TypedValue *retval,
                          const char *_symbol) {
}

void Profiler::writeStats(Array &ret) {
}

void Profiler::endAllFrames() {
  while (m_stack) {
    endFrame(nullptr, nullptr, true);
  }
}
template<class phpret, class Name, class Counts>
void Profiler::returnVals(phpret& ret, const Name& name, const Counts& counts,
                          int flags, int64_t MHz)
{
  ArrayInit arr(5, ArrayInit::Map{});
  arr.set(s_ct,  counts.count);
  arr.set(s_wt,  to_usec(counts.wall_time, MHz));
  if (flags & TrackCPU) {
    arr.set(s_cpu, to_usec(counts.cpu, MHz, true));
  }
  if (flags & TrackMemory) {
    arr.set(s_mu,  counts.memory);
    arr.set(s_pmu, counts.peak_memory);
  } else if (flags & TrackMalloc) {
    arr.set(s_alloc, counts.memory);
    arr.set(s_free, counts.peak_memory);
  }
  ret.set(String(name), arr.toArray());
}
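// Each per-function entry thus carries a call count (s_ct), inclusive wall
// time in microseconds (s_wt), optionally cpu time (s_cpu), and either the
// mu/pmu or alloc/free pair depending on which Track* flags are set.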
template<class phpret, class StatsMap>
bool Profiler::extractStats(phpret& ret, StatsMap& stats, int flags,
                            int64_t MHz)
{
  for (typename StatsMap::const_iterator iter = stats.begin();
       iter != stats.end(); ++iter) {
    returnVals(ret, iter->first, iter->second, flags, MHz);
  }
  return true;
}
bool Profiler::s_rand_initialized = false;
void Profiler::beginFrame(const char *symbol) {
  Frame *current = createFrame(symbol);

  // NOTE(cjiang): use hash code to fend off most of call-stack traversal
  int recursion_level = 0;
  if (m_func_hash_counters[current->m_hash_code] > 0) {
    // Find this symbol's recursion level
    for (Frame *p = current->m_parent; p; p = p->m_parent) {
      if (strcmp(current->m_name, p->m_name) == 0) {
        recursion_level = p->m_recursion + 1;
        break;
      }
    }
  }
  current->m_recursion = recursion_level;

  m_func_hash_counters[current->m_hash_code]++;
  beginFrameEx(symbol);
}
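// Illustrative example: for main() -> foo() -> foo(), the inner foo frame
// walks its parents, finds the outer foo with m_recursion == 0, and records
// itself at recursion level 1, so getName() later reports it as "foo@1".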
/**
 * End top of the stack.
 */
void Profiler::endFrame(const TypedValue *retval,
                        const char *symbol,
                        bool endMain) {
  if (m_stack) {
    // special case for main() frame that's only ended by endAllFrames()
    if (!endMain && m_stack->m_parent == nullptr) {
      return;
    }
    endFrameEx(retval, symbol);
    m_func_hash_counters[m_stack->m_hash_code]--;
  }
}
///////////////////////////////////////////////////////////////////////////////
// HierarchicalProfiler

class HierarchicalProfiler final : public Profiler {
private:
  class CountMap {
  public:
    CountMap() : count(0), wall_time(0), cpu(0), memory(0), peak_memory(0) {}

    int64_t count;
    int64_t wall_time;
    int64_t cpu;
    int64_t memory;
    int64_t peak_memory;
  };
  class HierarchicalProfilerFrame : public Frame {
  public:
    virtual ~HierarchicalProfilerFrame() {
    }

    uint64_t m_tsc_start;   // start value for TSC counter
    int64_t  m_mu_start;    // memory usage
    int64_t  m_pmu_start;   // peak memory usage
    int64_t  m_vtsc_start;  // user/sys time start
  };

  typedef hphp_hash_map<std::string, CountMap, string_hash> StatsMap;
  StatsMap m_stats; // outcome
public:
  explicit HierarchicalProfiler(int flags) : Profiler(true), m_flags(flags) {
  }

  Frame *allocateFrame() override {
    return new HierarchicalProfilerFrame();
  }

  void beginFrameEx(const char *symbol) override {
    HierarchicalProfilerFrame *frame =
      dynamic_cast<HierarchicalProfilerFrame *>(m_stack);
    frame->m_tsc_start = cpuCycles();

    if (m_flags & TrackCPU) {
      frame->m_vtsc_start = cpuTime(m_MHz);
    }

    if (m_flags & TrackMemory) {
      auto const& stats = MM().getStats();
      frame->m_mu_start  = stats.usage;
      frame->m_pmu_start = stats.peakUsage;
    } else if (m_flags & TrackMalloc) {
      frame->m_mu_start = get_allocs();
      frame->m_pmu_start = get_frees();
    }
  }
  void endFrameEx(const TypedValue *retval, const char *given_symbol) override {
    char symbol[512];
    HierarchicalProfilerFrame *frame =
      dynamic_cast<HierarchicalProfilerFrame *>(m_stack);
    frame->getStack(2, symbol, sizeof(symbol));
    CountMap &counts = m_stats[symbol];
    counts.count++;
    counts.wall_time += cpuCycles() - frame->m_tsc_start;

    if (m_flags & TrackCPU) {
      counts.cpu += cpuTime(m_MHz) - frame->m_vtsc_start;
    }

    if (m_flags & TrackMemory) {
      auto const& stats = MM().getStats();
      int64_t mu_end = stats.usage;
      int64_t pmu_end = stats.peakUsage;
      counts.memory += mu_end - frame->m_mu_start;
      counts.peak_memory += pmu_end - frame->m_pmu_start;
    } else if (m_flags & TrackMalloc) {
      counts.memory += get_allocs() - frame->m_mu_start;
      counts.peak_memory += get_frees() - frame->m_pmu_start;
    }
  }

  void writeStats(Array &ret) override {
    extractStats(ret, m_stats, m_flags, m_MHz);
  }

  bool shouldSkipBuiltins() const override {
    return m_flags & NoTrackBuiltins;
  }

private:
  uint32_t m_flags;
};
///////////////////////////////////////////////////////////////////////////////
// TraceWalker

// Walks a log of function enter and exit events captured by
// TraceProfiler and generates statistics for each function executed.
template <class TraceIterator, class Stats>
class TraceWalker {
public:
  struct Frame {
    TraceIterator trace; // Pointer to the log entry which pushed this frame
    int level; // Recursion level for this function
    int len; // Length of the function name
  };
  TraceWalker()
    : m_arcBuffLen(200)
    , m_arcBuff((char*)malloc(200))
    , m_badArcCount(0)
  {}

  ~TraceWalker() {
    free(m_arcBuff);
    for (auto& r : m_recursion) delete[] r.first;
  }
  void walk(TraceIterator begin, TraceIterator end, TraceIterator final,
            Stats& stats) {
    if (begin == end) return;
    m_recursion.push_back(std::make_pair(nullptr, 0));
    // Trim exit traces off the front of the log. These may be due to
    // the return from turning tracing on.
    std::map<const char*, unsigned> functionLevel;
    auto current = begin;
    while (current != end && !current->symbol) ++current;
    while (current != end) {
      if (!current->is_func_exit) {
        unsigned level = ++functionLevel[current->symbol];
        if (level >= m_recursion.size()) {
          char *level_string = new char[8];
          sprintf(level_string, "@%u", level);
          m_recursion.push_back(std::make_pair(level_string,
                                               strlen(level_string)));
        }
        Frame fr;
        fr.trace = current;
        fr.level = level - 1;
        fr.len = strlen(current->symbol);
        checkArcBuff(fr.len);
        m_stack.push_back(fr);
      } else if (m_stack.size() > 1) {
        validateStack(current, stats); // NB: may update m_stack.
        --functionLevel[m_stack.back().trace->symbol];
        popFrame(current, stats);
      }
      ++current;
    }
    // Close the dangling stack with the last entry. This
    // under-represents any functions still on the stack.
    --current;
    while (m_stack.size() > 1) {
      popFrame(current, stats);
    }
    // Close main() with the final data from when profiling was turned
    // off. This ensures main() represents the entire run, even if we
    // run out of log space.
    if (!m_stack.empty()) {
      assert(strcmp(m_stack.back().trace->symbol, "main()") == 0);
      incStats(m_stack.back().trace->symbol, final, m_stack.back(), stats);
    }
    if (m_badArcCount > 0) {
      stats["(trace has mismatched calls and returns)"].count = m_badArcCount;
    }
  }
  void checkArcBuff(int len) {
    len = 2*len + HP_STACK_DELIM_LEN + 2;
    if (len >= m_arcBuffLen) {
      m_arcBuffLen *= 2;
      m_arcBuff = (char *)realloc(m_arcBuff, m_arcBuffLen);
      if (m_arcBuff == nullptr) {
        throw std::bad_alloc();
      }
    }
  }
  void incStats(const char* arc, TraceIterator tr, const Frame& fr,
                Stats& stats) {
    auto& st = stats[arc];
    ++st.count;
    st.wall_time += tr->wall_time - fr.trace->wall_time;
    st.cpu += tr->cpu - fr.trace->cpu;
    st.memory += tr->memory - fr.trace->memory;
    st.peak_memory += tr->peak_memory - fr.trace->peak_memory;
  }
  // Look for mismatched enter and exit events, and try to correct if
  // we can. Only try to correct very simple imbalances... we could go
  // nuts here, but it's likely not worth it.
  void validateStack(TraceIterator tIt, Stats& stats) {
    auto enteredName = m_stack.back().trace->symbol;
    auto exitedName = tIt->symbol;
    if ((exitedName != nullptr) &&
        ((enteredName == nullptr) || (strcmp(enteredName, exitedName) != 0))) {
      // We have a few special names that we form on entry. We don't
      // have the information to form them again on exit, so tolerate
      // them here. See EventHook::GetFunctionNameForProfiler().
      if ((enteredName != nullptr) &&
          ((strncmp(enteredName, "run_init::", 10) == 0) ||
           (strcmp(enteredName, "_") == 0))) return;
      bool fixed = false;
      if (m_stack.size() > 1) {
        auto callerName = (m_stack.end() - 2)->trace->symbol;
        if ((callerName != nullptr) && (strcmp(callerName, exitedName) == 0)) {
          // We have an exit for Foo(), but we were in Bar(). However,
          // it appears that Foo() was the caller of Bar(). This
          // suggests we've missed the exit event for Bar() and have
          // the exit event for Foo() in hand. So remove Bar() to
          // re-balance the stack.
          m_stack.pop_back();
          fixed = true;
        }
      }
      // The first few bad arcs typically point at the problem, so
      // report them. The rest we'll just count.
      if (++m_badArcCount < 20) {
        std::string badArc;
        if (fixed) {
          badArc = folly::format("(warning: corrected bad arc #{}: "
                                 "enter '{}', exit '{}')",
                                 m_badArcCount,
                                 enteredName, exitedName).str();
        } else {
          badArc = folly::format("(error: bad arc #{}: "
                                 "enter '{}', exit '{}')",
                                 m_badArcCount,
                                 enteredName, exitedName).str();
        }
        ++stats[badArc.data()].count;
      }
    }
  }
  void popFrame(TraceIterator tIt, Stats& stats) {
    Frame callee = m_stack.back();
    m_stack.pop_back();
    Frame& caller = m_stack.back();
    char *cp = m_arcBuff;
    memcpy(cp, caller.trace->symbol, caller.len);
    cp += caller.len;
    if (caller.level >= 1) {
      std::pair<char*, int>& lvl = m_recursion[caller.level];
      memcpy(cp, lvl.first, lvl.second);
      cp += lvl.second;
    }
    memcpy(cp, HP_STACK_DELIM, HP_STACK_DELIM_LEN);
    cp += HP_STACK_DELIM_LEN;
    memcpy(cp, callee.trace->symbol, callee.len);
    cp += callee.len;
    if (callee.level >= 1) {
      std::pair<char*, int>& lvl = m_recursion[callee.level];
      memcpy(cp, lvl.first, lvl.second);
      cp += lvl.second;
    }
    *cp = 0;
    incStats(m_arcBuff, tIt, callee, stats);
  }
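  // The arc assembled above reads "caller==>callee", with an "@N" recursion
  // suffix appended to either name when needed (e.g. "foo@1==>bar"); stats
  // are then accumulated per arc string.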
  vector<std::pair<char*, int>> m_recursion;
  vector<Frame> m_stack;
  int m_arcBuffLen;
  char *m_arcBuff;
  int m_badArcCount;
};
// Profiler which makes a log of all function enter and exit events,
// then processes that into per-function statistics. A single-frame
// stack trace is used to aggregate stats for each function when
// called from different call sites.
class TraceProfiler : public Profiler {
 public:
  explicit TraceProfiler(int flags)
    : Profiler(true)
    , m_traceBuffer(nullptr)
    , m_traceBufferSize(0)
    , m_nextTraceEntry(0)
    , m_traceBufferFilled(false)
    , m_maxTraceBuffer(0)
    , m_overflowCalls(0)
    , m_flags(flags)
  {
    if (!(m_flags & IHaveInfiniteMemory) && pthread_mutex_trylock(&s_inUse)) {
      // This profiler uses a very large amount of memory. Only allow
      // one in the process at any time.
      m_successful = false;
    } else {
      m_maxTraceBuffer = RuntimeOption::ProfilerMaxTraceBuffer;
      Extension* ext = ExtensionRegistry::get(s_hotprofiler);
      assert(ext);
      IniSetting::Bind(ext, IniSetting::PHP_INI_ALL,
                       "profiler.max_trace_buffer",
                       &m_maxTraceBuffer);
    }
  }

  ~TraceProfiler() {
    if (m_successful) {
      free(m_traceBuffer);
      IniSetting::Unbind("profiler.max_trace_buffer");
      pthread_mutex_unlock(&s_inUse);
    }
  }
 private:
  // Data measured on function entry and exit
  struct TraceData {
    int64_t wall_time;
    int64_t cpu;

    // It's not plausible that we need a full 64bits to hold memory
    // stats, no matter what the collection mode. So we steal one bit
    // from the memory field to use as a flag. We want to keep this
    // data structure small since we need a huge number of them during
    // a profiled run.
    int64_t memory : 63; // Total memory, or memory allocated depending on flags
    bool is_func_exit : 1; // Is the entry for a function exit?
    int64_t peak_memory : 63; // Peak memory, or memory freed depending on flags
    uint64_t unused : 1; // Unused, to keep memory and peak_memory the same size

    void clear() {
      wall_time = cpu = memory = peak_memory = 0;
    }
    static void compileTimeAssertions() {
      static_assert(sizeof(TraceData) == (sizeof(uint64_t) * 4), "");
    }
  };
  // One entry in the log, representing a function enter or exit event
  struct TraceEntry : TraceData {
    const char *symbol; // Function name
  };
  bool isTraceSpaceAvailable() {
    // Slots at the end of the buffer are reserved for internal entries
    // (realloc markers and the terminating entry).
    return m_nextTraceEntry < m_traceBufferSize - 3;
  }
  bool ensureTraceSpace() {
    bool track_realloc = false;
    if (m_traceBufferFilled) {
      m_overflowCalls++;
      return false;
    }
    int new_array_size;
    if (m_traceBufferSize == 0) {
      new_array_size = RuntimeOption::ProfilerTraceBuffer;
    } else {
      new_array_size = m_traceBufferSize *
        RuntimeOption::ProfilerTraceExpansion;
      if (m_maxTraceBuffer != 0 && new_array_size > m_maxTraceBuffer) {
        new_array_size = m_maxTraceBuffer > m_traceBufferSize ?
          m_maxTraceBuffer : m_traceBufferSize;
      }
      if (new_array_size - m_nextTraceEntry <= 5) {
        // for this operation to succeed, we need room for the entry we're
        // adding, two realloc entries, and two entries to mark the end of
        // the trace.
        m_traceBufferFilled = true;
        collectStats("(trace buffer terminated)", false,
                     m_traceBuffer[m_nextTraceEntry++]);
        return false;
      }
      track_realloc = true;
    }
    if (track_realloc) {
      collectStats("(trace buffer realloc)", false,
                   m_traceBuffer[m_nextTraceEntry++]);
    }
    {
      MemoryManager::MaskAlloc masker(MM());
      auto r = (TraceEntry*)realloc((void*)m_traceBuffer,
                                    new_array_size * sizeof(TraceEntry));
      if (!r) {
        m_traceBufferFilled = true;
        if (m_traceBuffer) {
          collectStats("(trace buffer terminated)", false,
                       m_traceBuffer[m_nextTraceEntry++]);
        }
        return false;
      }
      m_traceBuffer = r;
      m_traceBufferSize = new_array_size;
    }
    if (track_realloc) {
      collectStats("(trace buffer realloc)", true,
                   m_traceBuffer[m_nextTraceEntry++]);
    }
    return true;
  }
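  // Growth policy, as wired up above: the buffer starts at
  // RuntimeOption::ProfilerTraceBuffer entries, is multiplied by
  // RuntimeOption::ProfilerTraceExpansion on each realloc, and is capped by
  // profiler.max_trace_buffer (m_maxTraceBuffer) when that setting is
  // non-zero; realloc and termination events are themselves logged as
  // synthetic trace entries.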
  virtual void beginFrame(const char *symbol) override {
    doTrace(symbol, false);
  }

  virtual void endFrame(const TypedValue *retval,
                        const char *symbol,
                        bool endMain = false) override {
    doTrace(symbol, true);
  }

  virtual void endAllFrames() override {
    if (m_traceBuffer && m_nextTraceEntry < m_traceBufferSize - 1) {
      collectStats(nullptr, true, m_finalEntry);
      m_traceBufferFilled = true;
    }
  }
  void collectStats(const char *symbol, bool isFuncExit, TraceEntry& te) {
    te.symbol = symbol;
    te.is_func_exit = isFuncExit;
    collectStats(te);
  }

  void collectStats(TraceData& te) {
    te.wall_time = cpuCycles();
    te.cpu = 0;
    if (m_flags & TrackCPU) {
      te.cpu = cpuTime(m_MHz);
    }
    if (m_flags & TrackMemory) {
      auto const& stats = MM().getStats();
      te.memory = stats.usage;
      te.peak_memory = stats.peakUsage;
    } else if (m_flags & TrackMalloc) {
      te.memory = get_allocs();
      te.peak_memory = get_frees();
    } else {
      te.memory = 0;
      te.peak_memory = 0;
    }
  }
  TraceEntry* nextTraceEntry() {
    if (!isTraceSpaceAvailable() && !ensureTraceSpace()) {
      return nullptr;
    }
    return &m_traceBuffer[m_nextTraceEntry++];
  }

  void doTrace(const char *symbol, bool isFuncExit) {
    TraceEntry *te = nextTraceEntry();
    if (te != nullptr) {
      collectStats(symbol, isFuncExit, *te);
    }
  }
  template<class TraceIterator, class Stats>
  void walkTrace(TraceIterator begin, TraceIterator end, TraceIterator final,
                 Stats& stats) {
    TraceWalker<TraceIterator, Stats> walker;
    walker.walk(begin, end, final, stats);
  }
  virtual void writeStats(Array &ret) override {
    TraceData my_begin;
    collectStats(my_begin);
    walkTrace(m_traceBuffer, m_traceBuffer + m_nextTraceEntry, &m_finalEntry,
              m_stats);
    if (m_overflowCalls) {
      m_stats["(trace buffer terminated)"].count += m_overflowCalls/2;
    }
    extractStats(ret, m_stats, m_flags, m_MHz);
    CountedTraceData allocStats;
    allocStats.count = 0;
    allocStats.peak_memory = allocStats.memory =
      m_nextTraceEntry * sizeof(*m_traceBuffer);
    returnVals(ret, "(trace buffer alloc)", allocStats, m_flags, m_MHz);
    if (m_flags & MeasureXhprofDisable) {
      CountedTraceData my_end;
      collectStats(my_end);
      my_end.count = 1;
      my_end.cpu -= my_begin.cpu;
      my_end.wall_time -= my_begin.wall_time;
      my_end.memory -= my_begin.memory;
      my_end.peak_memory -= my_begin.peak_memory;
      returnVals(ret, "xhprof_post_processing()", my_end, m_flags, m_MHz);
    }
  }
  virtual bool shouldSkipBuiltins() const override {
    return m_flags & NoTrackBuiltins;
  }

  TraceEntry* m_traceBuffer;
  TraceEntry m_finalEntry;
  int m_traceBufferSize;
  int m_nextTraceEntry;
  bool m_traceBufferFilled;
  int64_t m_maxTraceBuffer;
  int64_t m_overflowCalls;
  int m_flags;
  // Final stats, per-function per-callsite, with a count of how many
  // times the function was called from that callsite.
  class CountedTraceData : public TraceData {
  public:
    int64_t count;
    CountedTraceData() : count(0) { clear(); }
  };
  typedef hphp_hash_map<std::string, CountedTraceData, string_hash> StatsMap;
  StatsMap m_stats; // outcome

  static pthread_mutex_t s_inUse;
};

pthread_mutex_t TraceProfiler::s_inUse = PTHREAD_MUTEX_INITIALIZER;
///////////////////////////////////////////////////////////////////////////////
// SampleProfiler

/**
 * Sampling based profiler.
 */
class SampleProfiler : public Profiler {
 private:
  typedef std::pair<int64_t, int64_t> Timestamp;
  typedef smart::vector<std::pair<Timestamp, std::string>> SampleVec;
  SampleVec m_samples; // outcome

 public:
  SampleProfiler() : Profiler(true) {
    struct timeval now;
    uint64_t truncated_us;
    uint64_t truncated_tsc;

    // Init the last_sample in tsc
    m_last_sample_tsc = cpuCycles();

    // Find the microseconds that need to be truncated
    gettimeofday(&m_last_sample_time, 0);
    now = m_last_sample_time;
    hp_trunc_time(&m_last_sample_time, SAMPLING_INTERVAL);

    // Subtract truncated time from last_sample_tsc
    truncated_us  = get_us_interval(&m_last_sample_time, &now);
    truncated_tsc = truncated_us * m_MHz;
    if (m_last_sample_tsc > truncated_tsc) {
      // just to be safe while subtracting unsigned ints
      m_last_sample_tsc -= truncated_tsc;
    }

    // Convert sampling interval to ticks
    m_sampling_interval_tsc = SAMPLING_INTERVAL * m_MHz;
  }
  virtual void beginFrameEx(const char *symbol) override {
    sample_check();
  }

  virtual void endFrameEx(const TypedValue *retvalue,
                          const char *symbol) override {
    sample_check();
  }
  virtual void writeStats(Array &ret) override {
    for (auto const& sample : m_samples) {
      auto const& time = sample.first;
      char timestr[512];
      snprintf(timestr, sizeof(timestr), "%" PRId64 ".%06" PRId64,
               time.first, time.second);

      ret.set(String(timestr), String(sample.second));
    }
  }
 private:
  static const int SAMPLING_INTERVAL = 100000; // microsecs

  struct timeval m_last_sample_time;
  uint64_t m_last_sample_tsc;
  uint64_t m_sampling_interval_tsc;
  /**
   * Sample the stack and record it, keyed by the current sample timestamp.
   */
  void sample_stack() {
    char symbol[5120];
    m_stack->getStack(INT_MAX, symbol, sizeof(symbol));

    auto time = std::make_pair((int64_t)m_last_sample_time.tv_sec,
                               (int64_t)m_last_sample_time.tv_usec);
    m_samples.push_back(std::make_pair(time, symbol));
  }
  /**
   * Checks to see if it is time to sample the stack; calls sample_stack()
   * once for each sampling interval that has elapsed since the last sample.
   */
  void sample_check() {
    if (m_stack) {
      // While loop is to handle a single function taking a long time
      // and passing several sampling intervals
      while ((cpuCycles() - m_last_sample_tsc) > m_sampling_interval_tsc) {
        m_last_sample_tsc += m_sampling_interval_tsc;
        // HAS TO BE UPDATED BEFORE calling sample_stack
        incr_us_interval(&m_last_sample_time, SAMPLING_INTERVAL);
        sample_stack();
      }
    }
  }
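  // Illustrative example: with SAMPLING_INTERVAL = 100000us, a frame that runs
  // for 350ms spans three full sampling intervals, so the while loop above
  // fires three times and three stack samples are recorded at 100ms spacing.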
};

///////////////////////////////////////////////////////////////////////////////
// Memoization Opportunity Profiler
//
// Identifies potential memoization opportunities by comparing the
// serialized form of function return values. Functions which return
// the same data every time, but not the same instance, are candidates
// for memoization.
//
// For member functions, the args are serialized and kept along with
// the 'this' pointer, and used to look for constant returns given the
// same arguments.
//
// This profiler is very, very slow. It tries to be faster by giving
// up on functions quickly, and making a quick test to ignore them
// later. It also ignores functions which return "small"
// things. Nevertheless, you likely need to adjust
// Server.RequestTimeoutSeconds to get a full request processed.
//
// - Collect the data cross-request to find APC opportunities.
// - Auto-identify keys to functions which could be used to place things in APC
//   - i.e., same per-request, but not cross-request unless, say, locale is
//     taken into account.
//
// @TODO: this is quite rough right now, but is useful as is to
// others. In particular, it should provide the results via the return
// value from writeStats, not print to stderr :) Task 3396401 tracks this.
class MemoProfiler : public Profiler {
 public:
  explicit MemoProfiler(int flags) : Profiler(true) {}

 private:
  virtual void beginFrame(const char *symbol) override {
    ActRec *ar = vmfp();
    Frame f(symbol);
    if (ar->hasThis()) {
      auto& memo = m_memos[symbol];
      if (!memo.m_ignore) {
        auto args = hhvm_get_frame_args(ar, 0);
        args.append((int64_t)(ar->getThis())); // Use the pointer not the obj
        VariableSerializer vs(VariableSerializer::Type::DebuggerSerialize);
        String sdata;
        try {
          sdata = vs.serialize(args, true);
          f.m_args = sdata;
        } catch (...) {
          fprintf(stderr, "Args Serialization failure: %s\n", symbol);
        }
      }
    }
    m_stack.push_back(f);
  }
  virtual void endFrame(const TypedValue *retval,
                        const char *symbol,
                        bool endMain = false) override {
    if (m_stack.empty()) {
      fprintf(stderr, "STACK IMBALANCE empty %s\n", symbol);
      return;
    }
    auto f = m_stack.back();
    m_stack.pop_back();
    if (strcmp(f.m_symbol, symbol) != 0) {
      fprintf(stderr, "STACK IMBALANCE %s\n", symbol);
      return;
    }
    auto& memo = m_memos[symbol];
    if (memo.m_ignore) return;
    ++memo.m_count;
    memo.m_ignore = true;
    ActRec *ar = vmfp();
    // Lots of random cases to skip just to keep this simple for
    // now. There's no reason not to do more later.
    if (!g_context->m_faults.empty()) return;
    if (ar->m_func->isCPPBuiltin() || ar->resumed()) return;
    auto ret = tvAsCVarRef(retval);
    if (ret.isNull()) return;
    if (!(ret.isString() || ret.isObject() || ret.isArray())) return;
    VariableSerializer vs(VariableSerializer::Type::DebuggerSerialize);
    String sdata;
    try {
      sdata = vs.serialize(ret, true);
    } catch (...) {
      fprintf(stderr, "Serialization failure: %s\n", symbol);
      return;
    }
    if (sdata.length() < 3) return;
    if (ar->hasThis()) {
      memo.m_has_this = true;
      auto& member_memo = memo.m_member_memos[f.m_args.data()];
      ++member_memo.m_count;
      if (member_memo.m_return_value.length() == 0) { // First time
        member_memo.m_return_value = sdata;
        // Intentionally copy the raw pointer value
        member_memo.m_ret_tv = *retval;
        memo.m_ignore = false;
      } else if (member_memo.m_return_value == sdata) { // Same
        memo.m_ignore = false;
        if ((member_memo.m_ret_tv.m_data.num != retval->m_data.num) ||
            (member_memo.m_ret_tv.m_type != retval->m_type)) {
          memo.m_ret_tv_same = false;
        }
      } else {
        memo.m_member_memos.clear(); // Cleanup and ignore
      }
    } else {
      if (memo.m_return_value.length() == 0) { // First time
        memo.m_return_value = sdata;
        // Intentionally copy the raw pointer value
        memo.m_ret_tv = *retval;
        memo.m_ignore = false;
      } else if (memo.m_return_value == sdata) { // Same
        memo.m_ignore = false;
        if ((memo.m_ret_tv.m_data.num != retval->m_data.num) ||
            (memo.m_ret_tv.m_type != retval->m_type)) {
          memo.m_ret_tv_same = false;
        }
      } else {
        memo.m_return_value = ""; // Different, cleanup and ignore
      }
    }
  }
  virtual void endAllFrames() override {
    // Nothing to do for this profiler since all work is done as we go.
  }
  virtual void writeStats(Array &ret) override {
    fprintf(stderr, "writeStats start\n");
    // RetSame: the return value is the same instance every time
    // HasThis: call has a this argument
    // AllSame: all returns were the same data even though args are different
    // MemberCount: number of different arg sets (including this)
    fprintf(stderr, "Count Function MinSerLen MaxSerLen RetSame HasThis "
            "AllSame MemberCount\n");
    for (auto& me : m_memos) {
      if (me.second.m_ignore) continue;
      if (me.second.m_count == 1) continue;
      int min_ser_len = 999999999;
      int max_ser_len = 0;
      int count = 0;
      int member_count = 0;
      bool all_same = true;
      if (me.second.m_has_this) {
        bool any_multiple = false;
        auto& fr = me.second.m_member_memos.begin()->second.m_return_value;
        member_count = me.second.m_member_memos.size();
        for (auto& mme : me.second.m_member_memos) {
          if (mme.second.m_return_value != fr) all_same = false;
          count += mme.second.m_count;
          auto ser_len = mme.second.m_return_value.length();
          min_ser_len = std::min(min_ser_len, ser_len);
          max_ser_len = std::max(max_ser_len, ser_len);
          if (mme.second.m_count > 1) any_multiple = true;
        }
        if (!any_multiple && !all_same) continue;
      } else {
        min_ser_len = max_ser_len = me.second.m_return_value.length();
        count = me.second.m_count;
        all_same = me.second.m_ret_tv_same;
      }
      fprintf(stderr, "%d %s %d %d %s %s %s %d\n",
              count, me.first.data(),
              min_ser_len, max_ser_len,
              me.second.m_ret_tv_same ? " true" : "false",
              me.second.m_has_this ? " true" : "false",
              all_same ? " true" : "false",
              member_count);
    }
    fprintf(stderr, "writeStats end\n");
  }
  struct MemberMemoInfo {
    String m_return_value;
    TypedValue m_ret_tv;
    int m_count{0};
  };
  using MemberMemoMap = hphp_hash_map<std::string, MemberMemoInfo, string_hash>;

  struct MemoInfo {
    MemberMemoMap m_member_memos; // Keyed by serialized args
    String m_return_value;
    TypedValue m_ret_tv;
    int m_count{0};
    bool m_ignore{false};
    bool m_has_this{false};
    bool m_ret_tv_same{true};
  };
  using MemoMap = hphp_hash_map<std::string, MemoInfo, string_hash>;

  struct Frame {
    explicit Frame(const char* symbol) : m_symbol(symbol) {}
    const char* m_symbol;
    String m_args;
  };

  MemoMap m_memos; // Keyed by function name
  vector<Frame> m_stack;
};
///////////////////////////////////////////////////////////////////////////////
// ProfilerFactory

bool ProfilerFactory::start(ProfilerKind kind,
                            long flags,
                            bool beginFrame /* = true */) {
  if (m_profiler != nullptr) {
    return false;
  }

  switch (kind) {
  case ProfilerKind::Hierarchical:
    m_profiler = new HierarchicalProfiler(flags);
    break;
  case ProfilerKind::Sample:
    m_profiler = new SampleProfiler();
    break;
  case ProfilerKind::Trace:
    m_profiler = new TraceProfiler(flags);
    break;
  case ProfilerKind::Memo:
    m_profiler = new MemoProfiler(flags);
    break;
  case ProfilerKind::XDebug:
    m_profiler = new XDebugProfiler();
    break;
  case ProfilerKind::External:
    if (g_system_profiler) {
      m_profiler = g_system_profiler->getHotProfiler();
    } else if (m_external_profiler) {
      m_profiler = m_external_profiler;
    } else {
      throw_invalid_argument(
        "ProfilerFactory::setExternalProfiler() not yet called");
    }
    break;
  default:
    throw_invalid_argument("level: %d", static_cast<int>(kind));
    return false;
  }

  if (m_profiler && m_profiler->m_successful) {
    // This will be disabled automatically when the thread completes the request
    HPHP::EventHook::Enable();
    ThreadInfo::s_threadInfo->m_profiler = m_profiler;
    if (beginFrame) {
      m_profiler->beginFrame("main()");
    }
    return true;
  }

  delete m_profiler;
  m_profiler = nullptr;
  return false;
}

Variant ProfilerFactory::stop() {
  if (m_profiler) {
    m_profiler->endAllFrames();

    Array ret;
    m_profiler->writeStats(ret);
    delete m_profiler;
    m_profiler = nullptr;
    ThreadInfo::s_threadInfo->m_profiler = nullptr;

    return ret;
  }
  return init_null();
}
bool ProfilerFactory::EnableNetworkProfiler = false;

IMPLEMENT_REQUEST_LOCAL(ProfilerFactory, s_profiler_factory);

///////////////////////////////////////////////////////////////////////////////
void f_hotprofiler_enable(int ikind) {
  auto kind = static_cast<ProfilerKind>(ikind);
  long flags = 0;
  if (kind == ProfilerKind::Hierarchical) {
    flags = NoTrackBuiltins;
  } else if (kind == ProfilerKind::Memory) {
    kind = ProfilerKind::Hierarchical;
    flags = NoTrackBuiltins | TrackMemory;
  }
  if (RuntimeOption::EnableHotProfiler) {
    s_profiler_factory->start(kind, flags);
  }
}

Variant f_hotprofiler_disable() {
  return s_profiler_factory->stop();
}

void f_phprof_enable(int flags /* = 0 */) {
  if (RuntimeOption::EnableHotProfiler) {
    s_profiler_factory->start(ProfilerKind::Hierarchical, flags);
  }
}

Variant f_phprof_disable() {
  return s_profiler_factory->stop();
}
///////////////////////////////////////////////////////////////////////////////

void begin_profiler_frame(Profiler *p,
                          const char *symbol) {
  p->beginFrame(symbol);
}

void end_profiler_frame(Profiler *p,
                        const TypedValue *retval,
                        const char *symbol) {
  p->endFrame(retval, symbol);
}

///////////////////////////////////////////////////////////////////////////////