3 (** `cgroup_watcher_start filename subtract_kb_for_array` will reset cgroup_watcher counters, and have it
4 start monitoring the filename (e.g. "/sys/fs/.../memory.current". See [cgroup_watcher_get] for
5 the meaning of [subtract_bytes_for_array]. *)
6 external cgroup_watcher_start
: string -> subtract_kb_for_array
:int -> unit
7 = "cgroup_watcher_start"
9 (** This returns (hwm_kb, num_readings, seconds_at_gb[||]) that have been tallied since
10 [cgroup_watcher_start]. Hwm_kb is the high water mark of the cgroup. Num_readings is the
11 number of readings that succeeded. The meaning of seconds_at_gb.(i) is that we spent
12 this many seconds with the value between (subtract_kb_for_array+i) and
13 (subtract_kb_for_array+i+1) gb. *)
14 external cgroup_watcher_get
: unit -> int * int * float array
15 = "cgroup_watcher_get"
17 type initial_reading
=
18 HackEventLogger.ProfileTypeCheck.stats
* (CGroup.stats
, string) result
20 (** This is the [initial_reading] that we capture upon module load, i.e. process startup *)
21 let initial_reading_capture : initial_reading
=
23 HackEventLogger.ProfileTypeCheck.get_stats
24 ~include_current_process
:false
27 (stats, CGroup.get_stats
())
29 (** This is the [initial_reading] that we'll actually use, i.e. we'll record data relative to this.
30 None means we won't use anything. *)
31 let initial_reading : initial_reading option ref = ref None
33 let get_initial_reading () : initial_reading =
34 match !initial_reading with
35 | None
-> initial_reading_capture
36 | Some
initial_reading -> initial_reading
38 let use_initial_reading (new_initial_reading
: initial_reading) : unit =
39 initial_reading := Some new_initial_reading
41 let get_initial_stats () = get_initial_reading () |> fst
43 (** A small helper: returns either (initial_reading, f initial_reading) if (1) someone
44 has previously called [use_initial_reading] and provided an Ok one, (2) the [current_cgroup]
45 parameter is Ok, (3) the [current_cgroup] has the same cgroup name as the initial one.
46 Otherwise, it returns ({0...}, default). *)
47 let initial_value_map current_cgroup ~f ~default
=
49 match (!initial_reading, current_cgroup
) with
50 | (Some
(_stats
, Ok initial_cgroup
), Ok current_cgroup
)
51 when String.equal initial_cgroup
.cgroup_name current_cgroup
.cgroup_name
->
52 (initial_cgroup
, f initial_cgroup
)
66 log_label_and_suffix
: string;
71 name
: string; (** e.g. "lazy_init" *)
73 (** used to prefix the step-names for telemetry+logging *)
74 log
: bool; (** should it be logged to Hh_logger? *)
75 last_printed_total
: int ref;
76 (** we only print to Hh_logger if it's markedly changed since last we printed. *)
77 prev_step_to_show
: step_summary
option ref;
78 (** in case we print, we might want to include something about the previous one, if it hasn't already been printed *)
81 (** We'll only bother writing to the log if the total has changed by at least this many bytes *)
82 let threshold_for_logging = 100 * 1024 * 1024
84 (** to avoid flooding logs with errors *)
85 let has_logged_error = ref SSet.empty
87 (** prints bytes in gb *)
88 let pretty_num i
= Printf.sprintf
"%0.2fGiB" (float i
/. 1073741824.0)
90 (** Records to Hh_logger if it differs from the previous total, and update total *)
91 let log_cgroup_total cgroup ~step_group ~step ~suffix ~total_hwm
=
92 let (initial
, initial_msg
) =
93 initial_value_map cgroup ~default
:"" ~f
:(fun i
->
94 Printf.sprintf
" (relative to initial %s)" (pretty_num i
.CGroup.total
))
99 let log_label_and_suffix =
100 Printf.sprintf
"%s/%s%s" step_group
.name step suffix
102 let prev = !(step_group
.prev_step_to_show
) in
105 time
= Unix.gettimeofday
();
106 log_label_and_suffix;
107 step_total
= cgroup
.CGroup.total
;
111 abs
(cgroup
.CGroup.total
- !(step_group
.last_printed_total
))
112 < threshold_for_logging
114 (* We didn't print this current step. But if the *next* step proves to have a big
115 increase in memory, then when we print the next step, we'll also print this one. *)
116 step_group
.prev_step_to_show
:= Some
current
118 (* This step is being shown right now. So if the next step proves to have a big
119 increase in memory, we won't need to show this one again! *)
120 step_group
.prev_step_to_show
:= None
;
121 step_group
.last_printed_total
:= cgroup
.CGroup.total
;
123 if cgroup
.CGroup.total
>= total_hwm
then
128 (pretty_num (total_hwm
- initial
.CGroup.total
))
135 "\n >%s %s [@%s] previous measurement"
136 (Utils.timestring
prev.time
)
137 (pretty_num (prev.step_total
- initial
.CGroup.total
))
138 prev.log_label_and_suffix
141 "Cgroup: %s%s [@%s]%s%s"
142 (pretty_num (cgroup
.CGroup.total
- initial
.CGroup.total
))
144 current.log_label_and_suffix
149 (** Given a float array [secs_at_gb] where secs_at_gb[0] says how many secs were spent at cgroup memory 0gb,
150 secs_at_gb[1] says how many secs were spent at cgroup memory 1gb, and so on, produces an SMap
151 where keys are some arbitrary thresholds "secs_above_20gb" and values are (int) number of seconds
152 spent at that memory or higher. We pick just a few arbitrary thresholds that we think are useful
153 for telemetry, and their sole purpose is telemetry. *)
154 let secs_above_gb_summary (secs_at_gb
: float array
) : int SMap.t
=
155 if Array.is_empty secs_at_gb
then
158 List.init
15 ~f
:(fun i
-> i
* 5) (* 0gb, 5gb, ..., 70gb) *)
159 |> List.filter_map ~f
:(fun threshold
->
160 let title = Printf.sprintf
"secs_above_%02dgb" threshold
in
162 Array.foldi secs_at_gb ~init
:0. ~f
:(fun i acc s
->
165 if i
< threshold
then
170 let secs = Float.round
secs |> int_of_float
in
171 Option.some_if
(secs > 0) (title, secs))
174 (** Records to HackEventLogger *)
185 let (initial
, initial_opt
) =
186 initial_value_map cgroup ~default
:false ~f
:(fun _
-> true)
188 match (start_cgroup
, cgroup
) with
189 | (Some
(Error e
), _
)
191 telemetry_ref
:= Telemetry.create
() |> Telemetry.error ~e
|> Option.some
;
192 if SSet.mem e
!has_logged_error then
195 has_logged_error := SSet.add e
!has_logged_error;
196 HackEventLogger.CGroup.error e
198 | (Some
(Ok start_cgroup
), Ok cgroup
) ->
199 let total_hwm = max
(max start_cgroup
.total cgroup
.total
) total_hwm in
200 let secs_above_total_gb_summary = secs_above_gb_summary secs_at_total_gb
in
204 (fun key
value -> Telemetry.int_ ~key ~
value)
205 secs_above_total_gb_summary
208 ~
value:(start_cgroup
.total
- initial
.total
)
209 |> Telemetry.int_ ~key
:"end_bytes" ~
value:(cgroup
.total
- initial
.total
)
210 |> Telemetry.int_ ~key
:"hwm_bytes" ~
value:(total_hwm - initial
.total
)
212 ~key
:"relative_to_initial"
213 ~
value:(Option.some_if initial_opt initial
.total
)
215 HackEventLogger.CGroup.step
216 ~cgroup
:cgroup
.cgroup_name
217 ~step_group
:step_group
.name
220 ~total_relative_to
:(Option.some_if initial_opt initial
.total
)
221 ~totalswap_relative_to
:(Option.some_if initial_opt initial
.total_swap
)
222 ~anon_relative_to
:(Option.some_if initial_opt initial
.anon
)
223 ~shmem_relative_to
:(Option.some_if initial_opt initial
.shmem
)
224 ~file_relative_to
:(Option.some_if initial_opt initial
.file
)
225 ~total_start
:(Some
(start_cgroup
.total
- initial
.total
))
226 ~totalswap_start
:(Some
(start_cgroup
.total_swap
- initial
.total_swap
))
227 ~anon_start
:(Some
(start_cgroup
.anon
- initial
.anon
))
228 ~shmem_start
:(Some
(start_cgroup
.shmem
- initial
.shmem
))
229 ~file_start
:(Some
(start_cgroup
.file
- initial
.file
))
230 ~
total_hwm:(total_hwm - initial
.total
)
231 ~total
:(cgroup
.total
- initial
.total
)
232 ~totalswap
:(cgroup
.total_swap
- initial
.total_swap
)
233 ~anon
:(cgroup
.anon
- initial
.anon
)
234 ~shmem
:(cgroup
.shmem
- initial
.shmem
)
235 ~file
:(cgroup
.file
- initial
.file
)
236 ~secs_at_total_gb
:(Some
(secs_at_total_gb
|> Array.to_list
))
237 ~
secs_above_total_gb_summary
238 | (None
, Ok cgroup
) ->
241 |> Telemetry.int_ ~key
:"end" ~
value:cgroup
.total
243 HackEventLogger.CGroup.step
244 ~cgroup
:cgroup
.cgroup_name
245 ~step_group
:step_group
.name
248 ~total_relative_to
:(Option.some_if initial_opt initial
.total
)
249 ~totalswap_relative_to
:(Option.some_if initial_opt initial
.total_swap
)
250 ~anon_relative_to
:(Option.some_if initial_opt initial
.anon
)
251 ~shmem_relative_to
:(Option.some_if initial_opt initial
.shmem
)
252 ~file_relative_to
:(Option.some_if initial_opt initial
.file
)
254 ~totalswap_start
:None
258 ~
total_hwm:(cgroup
.total
- initial
.total
)
259 ~total
:(cgroup
.total
- initial
.total
)
260 ~totalswap
:(cgroup
.total_swap
- initial
.total_swap
)
261 ~anon
:(cgroup
.anon
- initial
.anon
)
262 ~shmem
:(cgroup
.shmem
- initial
.shmem
)
263 ~file
:(cgroup
.file
- initial
.file
)
264 ~secs_at_total_gb
:None
265 ~
secs_above_total_gb_summary:SMap.empty
267 let step_group name ~log f
=
268 let log = log && Sys_utils.enable_telemetry
() in
274 last_printed_total
= ref 0;
275 prev_step_to_show
= ref None
;
280 let step step_group ?
(telemetry_ref
= ref None
) name
=
281 if not
step_group.log then
284 step_group.step_count
:= !(step_group.step_count
) + 1;
285 let step = Printf.sprintf
"%02d_%s" !(step_group.step_count
) name
in
286 let cgroup = CGroup.get_stats
() in
287 log_cgroup_total cgroup ~
step_group ~
step ~suffix
:"" ~
total_hwm:0;
296 ~secs_at_total_gb
:[||]
299 let step_start_end step_group ?
(telemetry_ref
= ref None
) name f
=
300 if not
step_group.log then
303 step_group.step_count
:= !(step_group.step_count
) + 1;
304 let step = Printf.sprintf
"%02d_%s" !(step_group.step_count
) name
in
305 let start_time = Unix.gettimeofday
() in
306 let start_cgroup = CGroup.get_stats
() in
314 initial_value_map start_cgroup ~default
:() ~f
:(fun _
-> ())
316 Result.iter
start_cgroup ~f
:(fun { CGroup.cgroup_name
; _
} ->
317 let path = "/sys/fs/cgroup/" ^ cgroup_name ^
"/memory.current" in
320 ~subtract_kb_for_array
:(initial
.CGroup.total
/ 1024));
321 Utils.try_finally ~f ~finally
:(fun () ->
323 let cgroup = CGroup.get_stats
() in
324 let (hwm_kb
, _num_readings
, secs_at_total_gb
) =
325 cgroup_watcher_get
()
327 let total_hwm = hwm_kb
* 1024 in
328 log_cgroup_total cgroup ~
step_group ~
step ~suffix
:" end" ~
total_hwm;
332 ~
start_time:(Some
start_time)
334 ~
start_cgroup:(Some
start_cgroup)
340 let e = Exception.wrap exn
in
342 Telemetry.create
() |> Telemetry.exception_ ~
e |> Option.some
;
343 Hh_logger.log "cgroup - failed to log. %s" (Exception.to_string
e))