enhanced PROCESS_FILE
[hiphop-php.git] / hphp / hack / src / utils / cgroup / cgroupProfiler.ml
blob00828f36a53814ef4b0a7a77e8548b10b021f6b1
1 open Hh_prelude
3 (** `cgroup_watcher_start filename subtract_kb_for_array` will reset cgroup_watcher counters, and have it
4 start monitoring the filename (e.g. "/sys/fs/.../memory.current". See [cgroup_watcher_get] for
5 the meaning of [subtract_bytes_for_array]. *)
6 external cgroup_watcher_start : string -> subtract_kb_for_array:int -> unit
7 = "cgroup_watcher_start"
9 (** This returns (hwm_kb, num_readings, seconds_at_gb[||]) that have been tallied since
10 [cgroup_watcher_start]. Hwm_kb is the high water mark of the cgroup. Num_readings is the
11 number of readings that succeeded. The meaning of seconds_at_gb.(i) is that we spent
12 this many seconds with the value between (subtract_kb_for_array+i) and
13 (subtract_kb_for_array+i+1) gb. *)
14 external cgroup_watcher_get : unit -> int * int * float array
15 = "cgroup_watcher_get"
17 type initial_reading =
18 HackEventLogger.ProfileTypeCheck.stats * (CGroup.stats, string) result
20 (** This is the [initial_reading] that we capture upon module load, i.e. process startup *)
21 let initial_reading_capture : initial_reading =
22 let stats =
23 HackEventLogger.ProfileTypeCheck.get_stats
24 ~include_current_process:false
25 (Telemetry.create ())
27 (stats, CGroup.get_stats ())
29 (** This is the [initial_reading] that we'll actually use, i.e. we'll record data relative to this.
30 None means we won't use anything. *)
31 let initial_reading : initial_reading option ref = ref None
33 let get_initial_reading () : initial_reading =
34 match !initial_reading with
35 | None -> initial_reading_capture
36 | Some initial_reading -> initial_reading
38 let use_initial_reading (new_initial_reading : initial_reading) : unit =
39 initial_reading := Some new_initial_reading
41 let get_initial_stats () = get_initial_reading () |> fst
43 (** A small helper: returns either (initial_reading, f initial_reading) if (1) someone
44 has previously called [use_initial_reading] and provided an Ok one, (2) the [current_cgroup]
45 parameter is Ok, (3) the [current_cgroup] has the same cgroup name as the initial one.
46 Otherwise, it returns ({0...}, default). *)
47 let initial_value_map current_cgroup ~f ~default =
48 let open CGroup in
49 match (!initial_reading, current_cgroup) with
50 | (Some (_stats, Ok initial_cgroup), Ok current_cgroup)
51 when String.equal initial_cgroup.cgroup_name current_cgroup.cgroup_name ->
52 (initial_cgroup, f initial_cgroup)
53 | _ ->
54 ( {
55 CGroup.total = 0;
56 total_swap = 0;
57 anon = 0;
58 shmem = 0;
59 file = 0;
60 cgroup_name = "";
62 default )
64 type step_summary = {
65 time: float;
66 log_label_and_suffix: string;
67 step_total: int;
70 type step_group = {
71 name: string; (** e.g. "lazy_init" *)
72 step_count: int ref;
73 (** used to prefix the step-names for telemetry+logging *)
74 log: bool; (** should it be logged to Hh_logger? *)
75 last_printed_total: int ref;
76 (** we only print to Hh_logger if it's markedly changed since last we printed. *)
77 prev_step_to_show: step_summary option ref;
78 (** in case we print, we might want to include something about the previous one, if it hasn't already been printed *)
81 (** We'll only bother writing to the log if the total has changed by at least this many bytes *)
82 let threshold_for_logging = 100 * 1024 * 1024
84 (** to avoid flooding logs with errors *)
85 let has_logged_error = ref SSet.empty
87 (** prints bytes in gb *)
88 let pretty_num i = Printf.sprintf "%0.2fGiB" (float i /. 1073741824.0)
90 (** Records to Hh_logger if it differs from the previous total, and update total *)
91 let log_cgroup_total cgroup ~step_group ~step ~suffix ~total_hwm =
92 let (initial, initial_msg) =
93 initial_value_map cgroup ~default:"" ~f:(fun i ->
94 Printf.sprintf " (relative to initial %s)" (pretty_num i.CGroup.total))
96 match cgroup with
97 | Error _ -> ()
98 | Ok cgroup ->
99 let log_label_and_suffix =
100 Printf.sprintf "%s/%s%s" step_group.name step suffix
102 let prev = !(step_group.prev_step_to_show) in
103 let current =
105 time = Unix.gettimeofday ();
106 log_label_and_suffix;
107 step_total = cgroup.CGroup.total;
111 abs (cgroup.CGroup.total - !(step_group.last_printed_total))
112 < threshold_for_logging
113 then
114 (* We didn't print this current step. But if the *next* step proves to have a big
115 increase in memory, then when we print the next step, we'll also print this one. *)
116 step_group.prev_step_to_show := Some current
117 else begin
118 (* This step is being shown right now. So if the next step proves to have a big
119 increase in memory, we won't need to show this one again! *)
120 step_group.prev_step_to_show := None;
121 step_group.last_printed_total := cgroup.CGroup.total;
122 let hwm =
123 if cgroup.CGroup.total >= total_hwm then
125 else
126 Printf.sprintf
127 " (hwm %s)"
128 (pretty_num (total_hwm - initial.CGroup.total))
130 let prev =
131 match prev with
132 | None -> ""
133 | Some prev ->
134 Printf.sprintf
135 "\n >%s %s [@%s] previous measurement"
136 (Utils.timestring prev.time)
137 (pretty_num (prev.step_total - initial.CGroup.total))
138 prev.log_label_and_suffix
140 Hh_logger.log
141 "Cgroup: %s%s [@%s]%s%s"
142 (pretty_num (cgroup.CGroup.total - initial.CGroup.total))
144 current.log_label_and_suffix
145 initial_msg
146 prev
149 (** Given a float array [secs_at_gb] where secs_at_gb[0] says how many secs were spent at cgroup memory 0gb,
150 secs_at_gb[1] says how many secs were spent at cgroup memory 1gb, and so on, produces an SMap
151 where keys are some arbitrary thresholds "secs_above_20gb" and values are (int) number of seconds
152 spent at that memory or higher. We pick just a few arbitrary thresholds that we think are useful
153 for telemetry, and their sole purpose is telemetry. *)
154 let secs_above_gb_summary (secs_at_gb : float array) : int SMap.t =
155 if Array.is_empty secs_at_gb then
156 SMap.empty
157 else
158 List.init 15 ~f:(fun i -> i * 5) (* 0gb, 5gb, ..., 70gb) *)
159 |> List.filter_map ~f:(fun threshold ->
160 let title = Printf.sprintf "secs_above_%02dgb" threshold in
161 let secs =
162 Array.foldi secs_at_gb ~init:0. ~f:(fun i acc s ->
165 if i < threshold then
167 else
170 let secs = Float.round secs |> int_of_float in
171 Option.some_if (secs > 0) (title, secs))
172 |> SMap.of_list
174 (** Records to HackEventLogger *)
175 let log_telemetry
176 ~step_group
177 ~step
178 ~start_time
179 ~total_hwm
180 ~start_cgroup
181 ~cgroup
182 ~telemetry_ref
183 ~secs_at_total_gb =
184 let open CGroup in
185 let (initial, initial_opt) =
186 initial_value_map cgroup ~default:false ~f:(fun _ -> true)
188 match (start_cgroup, cgroup) with
189 | (Some (Error e), _)
190 | (_, Error e) ->
191 telemetry_ref := Telemetry.create () |> Telemetry.error ~e |> Option.some;
192 if SSet.mem e !has_logged_error then
194 else begin
195 has_logged_error := SSet.add e !has_logged_error;
196 HackEventLogger.CGroup.error e
198 | (Some (Ok start_cgroup), Ok cgroup) ->
199 let total_hwm = max (max start_cgroup.total cgroup.total) total_hwm in
200 let secs_above_total_gb_summary = secs_above_gb_summary secs_at_total_gb in
201 telemetry_ref :=
202 Telemetry.create ()
203 |> SMap.fold
204 (fun key value -> Telemetry.int_ ~key ~value)
205 secs_above_total_gb_summary
206 |> Telemetry.int_
207 ~key:"start_bytes"
208 ~value:(start_cgroup.total - initial.total)
209 |> Telemetry.int_ ~key:"end_bytes" ~value:(cgroup.total - initial.total)
210 |> Telemetry.int_ ~key:"hwm_bytes" ~value:(total_hwm - initial.total)
211 |> Telemetry.int_opt
212 ~key:"relative_to_initial"
213 ~value:(Option.some_if initial_opt initial.total)
214 |> Option.some;
215 HackEventLogger.CGroup.step
216 ~cgroup:cgroup.cgroup_name
217 ~step_group:step_group.name
218 ~step
219 ~start_time
220 ~total_relative_to:(Option.some_if initial_opt initial.total)
221 ~totalswap_relative_to:(Option.some_if initial_opt initial.total_swap)
222 ~anon_relative_to:(Option.some_if initial_opt initial.anon)
223 ~shmem_relative_to:(Option.some_if initial_opt initial.shmem)
224 ~file_relative_to:(Option.some_if initial_opt initial.file)
225 ~total_start:(Some (start_cgroup.total - initial.total))
226 ~totalswap_start:(Some (start_cgroup.total_swap - initial.total_swap))
227 ~anon_start:(Some (start_cgroup.anon - initial.anon))
228 ~shmem_start:(Some (start_cgroup.shmem - initial.shmem))
229 ~file_start:(Some (start_cgroup.file - initial.file))
230 ~total_hwm:(total_hwm - initial.total)
231 ~total:(cgroup.total - initial.total)
232 ~totalswap:(cgroup.total_swap - initial.total_swap)
233 ~anon:(cgroup.anon - initial.anon)
234 ~shmem:(cgroup.shmem - initial.shmem)
235 ~file:(cgroup.file - initial.file)
236 ~secs_at_total_gb:(Some (secs_at_total_gb |> Array.to_list))
237 ~secs_above_total_gb_summary
238 | (None, Ok cgroup) ->
239 telemetry_ref :=
240 Telemetry.create ()
241 |> Telemetry.int_ ~key:"end" ~value:cgroup.total
242 |> Option.some;
243 HackEventLogger.CGroup.step
244 ~cgroup:cgroup.cgroup_name
245 ~step_group:step_group.name
246 ~step
247 ~start_time:None
248 ~total_relative_to:(Option.some_if initial_opt initial.total)
249 ~totalswap_relative_to:(Option.some_if initial_opt initial.total_swap)
250 ~anon_relative_to:(Option.some_if initial_opt initial.anon)
251 ~shmem_relative_to:(Option.some_if initial_opt initial.shmem)
252 ~file_relative_to:(Option.some_if initial_opt initial.file)
253 ~total_start:None
254 ~totalswap_start:None
255 ~anon_start:None
256 ~shmem_start:None
257 ~file_start:None
258 ~total_hwm:(cgroup.total - initial.total)
259 ~total:(cgroup.total - initial.total)
260 ~totalswap:(cgroup.total_swap - initial.total_swap)
261 ~anon:(cgroup.anon - initial.anon)
262 ~shmem:(cgroup.shmem - initial.shmem)
263 ~file:(cgroup.file - initial.file)
264 ~secs_at_total_gb:None
265 ~secs_above_total_gb_summary:SMap.empty
267 let step_group name ~log f =
268 let log = log && Sys_utils.enable_telemetry () in
269 let profiling =
271 name;
272 log;
273 step_count = ref 0;
274 last_printed_total = ref 0;
275 prev_step_to_show = ref None;
278 f profiling
280 let step step_group ?(telemetry_ref = ref None) name =
281 if not step_group.log then
283 else begin
284 step_group.step_count := !(step_group.step_count) + 1;
285 let step = Printf.sprintf "%02d_%s" !(step_group.step_count) name in
286 let cgroup = CGroup.get_stats () in
287 log_cgroup_total cgroup ~step_group ~step ~suffix:"" ~total_hwm:0;
288 log_telemetry
289 ~step_group
290 ~step
291 ~start_time:None
292 ~total_hwm:0
293 ~start_cgroup:None
294 ~cgroup
295 ~telemetry_ref
296 ~secs_at_total_gb:[||]
299 let step_start_end step_group ?(telemetry_ref = ref None) name f =
300 if not step_group.log then
301 f ()
302 else begin
303 step_group.step_count := !(step_group.step_count) + 1;
304 let step = Printf.sprintf "%02d_%s" !(step_group.step_count) name in
305 let start_time = Unix.gettimeofday () in
306 let start_cgroup = CGroup.get_stats () in
307 log_cgroup_total
308 start_cgroup
309 ~step_group
310 ~step
311 ~suffix:" start"
312 ~total_hwm:0;
313 let (initial, ()) =
314 initial_value_map start_cgroup ~default:() ~f:(fun _ -> ())
316 Result.iter start_cgroup ~f:(fun { CGroup.cgroup_name; _ } ->
317 let path = "/sys/fs/cgroup/" ^ cgroup_name ^ "/memory.current" in
318 cgroup_watcher_start
319 path
320 ~subtract_kb_for_array:(initial.CGroup.total / 1024));
321 Utils.try_finally ~f ~finally:(fun () ->
323 let cgroup = CGroup.get_stats () in
324 let (hwm_kb, _num_readings, secs_at_total_gb) =
325 cgroup_watcher_get ()
327 let total_hwm = hwm_kb * 1024 in
328 log_cgroup_total cgroup ~step_group ~step ~suffix:" end" ~total_hwm;
329 log_telemetry
330 ~step_group
331 ~step
332 ~start_time:(Some start_time)
333 ~total_hwm
334 ~start_cgroup:(Some start_cgroup)
335 ~cgroup
336 ~telemetry_ref
337 ~secs_at_total_gb
338 with
339 | exn ->
340 let e = Exception.wrap exn in
341 telemetry_ref :=
342 Telemetry.create () |> Telemetry.exception_ ~e |> Option.some;
343 Hh_logger.log "cgroup - failed to log. %s" (Exception.to_string e))