track monitor times for request-handling
[hiphop-php.git] / hphp / hack / src / monitor / serverMonitor.ml
blobd2d056bc12790ba84bfb19857ea68762a93875f0
1 (*
2 * Copyright (c) 2015, Facebook, Inc.
3 * All rights reserved.
5 * This source code is licensed under the MIT license found in the
6 * LICENSE file in the "hack" directory of this source tree.
8 *)
(*
 * The server monitor is the parent process for a server. It
 * listens to a socket for client connections and passes the connections
 * to the server and serves the following objectives:
 *
 * 1) Readily accepts client connections
 * 2) Confirms a Build ID match (killing itself and the server quickly
 *    on mismatch)
 * 3) Hands the client connection to the daemon server
 * 4) Tracks when the server process crashes or OOMs and echos
 *    its fate to the next client.
 *)
23 open Hh_prelude
24 open ServerProcess
25 open ServerMonitorUtils
(* Log [s] prefixed with the connection tracker's id, i.e. "[<id>] <s>". *)
let log s ~tracker =
  let id = Connection_tracker.log_id tracker in
  Hh_logger.log ("[%s] " ^^ s) id
module Sent_fds_collector = struct
  (*
   This module exists to fix an issue with libancillary (passing a file descriptor
   to another process with sendmsg over Unix Domain Sockets) and certain operating
   systems. It allows us to delay closing of a File Descriptor inside the Monitor
   until it is safe to do so.

   Normally:
   Monitor sends client FD to Server process, and immediately closes the FD.
   This is fine even if the Server is busy and hasn't "recv_fd" the FD yet
   because this doesn't really "close" the file. The kernel still considers
   it to be open by the receiving process. If the server closes the FD
   then reads on the client will get an EOF. If the client closes the FD
   then reads on the server will get an EOF.

   Mac OS X:
   EOF isn't showing up correctly on file descriptors passed between
   processes.
   When the Monitor closes the FD after sending it to the Server (and
   before the Server receives it), the kernel thinks it is the last open
   descriptor on the file and actually closes it. After the server
   receieves the FD, it gets an EOF when reading from it (which it shouldn't
   because the client is still there; aside: oddly enough, writing to it
   succeeds instead of getting EPIPE). The server then closes that FD after
   reading the EOF. Normally (as noted above) the client would read an
   EOF after this. But (this is the bug) this EOF never shows up and the
   client blocks forever on "select" instead.

   To get around this problem, we want to close the FD in the monitor only
   after the server has received it. Unfortunately, we don't actually
   have a way to reliably detect that it has been received. So we just delay
   closing by 2 seconds.

   Note: It's not safe to detect the receiving by reading the
   Hello message from the server (since it could/would be consumed
   here instead of by the Client) nor by "select" (by a race condition
   with the client, the select might miss the Hello, and could prevent
   an EOF from being read by the server).
   *)

  (* Scheduler keyed on wall-clock time: entries fire once their time is up. *)
  module Fd_scheduler = Scheduler.Make (struct
    type t = (* Unix.time *) float
  end)

  (* Close [fd]; on macOS the close is deferred by 2 seconds via the
     scheduler (see the module comment above for why). *)
  let cleanup_fd fd =
    if Sys_utils.is_apple_os () then
      (* Close it 2 seconds later. *)
      let trigger = Unix.gettimeofday () +. 2.0 in
      Fd_scheduler.wait_for_fun
        ~once:true
        ~priority:1
        (fun time -> Float.(time >= trigger))
        (fun x ->
          let () = Printf.eprintf "Closing client fd\n" in
          let () = Unix.close fd in
          (* NOTE(review): at least one line appears to be missing here in
             this copy (the callback's result value and closing paren). *)
    else
      Unix.close fd

  (* Run any scheduled deferred closes that are now due (macOS only;
     a no-op elsewhere). *)
  let collect_garbage () =
    if Sys_utils.is_apple_os () then
      ignore (Fd_scheduler.wait_and_run_ready (Unix.gettimeofday ()))
    else
      (* NOTE(review): the "()" unit else-branch appears to be missing in
         this copy. *)
end
(* Raised by read_version when the client's build-id string is not followed
   by a newline byte. *)
exception Malformed_build_id

(* Raised when libancillary fails to send the client FD to the server;
   carries the failing status code. *)
exception Send_fd_failure of int

(* The monitor, parameterized over server-lifecycle callbacks (SC) and the
   informant implementation. *)
module Make_monitor
    (SC : ServerMonitorUtils.Server_config)
    (Informant : Informant_sig.S) =
struct
  (* Mutable-by-copy monitor state threaded through the main loop. *)
  type env = {
    informant: Informant.t;
    server: ServerProcess.server_process;
    server_start_options: SC.server_start_options;
    (* How many times have we tried to relaunch it? *)
    retries: int;
    sql_retries: int;
    watchman_retries: int;
    max_purgatory_clients: int;
    (* Version of this running server, as specified in the config file. *)
    current_version: Config_file.version;
    (* After sending a Server_not_alive_dormant during Prehandoff,
     * clients are put here waiting for a server to come alive, at
     * which point they get pushed through the rest of prehandoff and
     * then sent to the living server.
     * String is the server name it wants to connect to. *)
    purgatory_clients:
      (Connection_tracker.t * MonitorRpc.handoff_options * Unix.file_descr)
      Queue.t;
    (* Whether to ignore hh version mismatches *)
    ignore_hh_version: bool;
    (* What server is doing now *)
    server_progress: string;
    (* Why what it is doing now might not be going as well as it could *)
    server_progress_warning: string option;
  (* NOTE(review): the record's closing "}" appears to be missing in this
     copy. *)

  (* Full monitor state: env, static config, and the listening socket. *)
  type t = env * ServerMonitorUtils.monitor_config * Unix.file_descr
  (* Expose the raw integer behind a Unix.file_descr. On Unix, file_descr is
     an int at runtime, which this Obj.magic relies on; in this file the
     result is only used as an opaque id for event logging
     (HackEventLogger.accepted_client_fd). *)
  let fd_to_int (x : Unix.file_descr) : int = Obj.magic x
136 let msg_to_channel fd msg =
137 (* This FD will be passed to a server process, so avoid using Ocaml's
138 * channels which have built-in buffering. Even though we are only writing
139 * to the FD here, it seems using Ocaml's channels also causes read
140 * buffering to happen here, so the server process doesn't get what was
141 * meant for it. *)
142 Marshal_tools.to_fd_with_preamble fd msg |> ignore
144 let setup_handler_for_signals handler signals =
145 List.iter signals (fun signal ->
146 Sys_utils.set_signal signal (Sys.Signal_handle handler))
  (* Arrange for [process] to be killed when the monitor itself receives a
     termination signal, so the monitor/server pair exits together. *)
  let setup_autokill_server_on_exit process =
    (* NOTE(review): lines appear to be missing in this copy: a leading
       "try" (to match the "with" below) and the "end" closing "begin". *)
    setup_handler_for_signals
      begin
        fun _ ->
        Hh_logger.log "Got an exit signal. Killing server and exiting.";
        SC.kill_server process;
        Exit_status.exit Exit_status.Interrupted
      [Sys.sigint; Sys.sigquit; Sys.sigterm; Sys.sighup]
    with _ -> Hh_logger.log "Failed to set signal handler"
160 let sleep_and_check socket =
161 let (ready_socket_l, _, _) = Unix.select [socket] [] [] 1.0 in
162 not (List.is_empty ready_socket_l)
  (* Launch a new server process via SC.start_server, register the autokill
     hook, and wrap it as Alive. *)
  let start_server ?target_saved_state ~informant_managed options exit_status =
    let server_process =
      SC.start_server
        ?target_saved_state
        ~prior_exit_status:exit_status
        ~informant_managed
        options
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    setup_autokill_server_on_exit server_process;
    Alive server_process

  (* Start the first server only if the informant agrees; otherwise stay
     dormant (Not_yet_started) until the informant triggers a start later. *)
  let maybe_start_first_server options informant =
    if Informant.should_start_first_server informant then (
      Hh_logger.log "Starting first server";
      HackEventLogger.starting_first_server ();
      start_server
        ~informant_managed:(Informant.is_managing informant)
        options
        None
    ) else (
      Hh_logger.log
        ( "Not starting first server. "
        ^^ "Starting will be triggered by informant later." );
      Not_yet_started
    (* NOTE(review): the closing ")" appears to be missing in this copy. *)
190 let kill_server_with_check = function
191 | Alive server -> SC.kill_server server
192 | _ -> ()
194 let wait_for_server_exit_with_check server kill_signal_time =
195 match server with
196 | Alive server -> SC.wait_for_server_exit server kill_signal_time
197 | _ -> ()
199 let kill_server_and_wait_for_exit env =
200 kill_server_with_check env.server;
201 let kill_signal_time = Unix.gettimeofday () in
202 wait_for_server_exit_with_check env.server kill_signal_time
  (* Reads current hhconfig contents from disk and returns true if the
   * version specified in there matches our currently running version. *)
  let is_config_version_matching env =
    let filename =
      Relative_path.from_root
        ~suffix:Config_file.file_path_relative_to_repo_root
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    let (_hash, config) =
      Config_file.parse_hhconfig
        ~silent:true
        (Relative_path.to_absolute filename)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    let new_version =
      Config_file.parse_version (SMap.find_opt "version" config)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    0 = Config_file.compare_versions env.current_version new_version

  (* Actually starts a new server. *)
  let start_new_server ?target_saved_state env exit_status =
    let informant_managed = Informant.is_managing env.informant in
    let new_server =
      start_server
        ?target_saved_state
        ~informant_managed
        env.server_start_options
        exit_status
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    { env with server = new_server; retries = env.retries + 1 }
233 (* Kill the server (if it's running) and restart it - maybe. Obeying the rules
234 * of state transitions. See docs on the ServerProcess.server_process ADT for
235 * state transitions. *)
236 let kill_and_maybe_restart_server ?target_saved_state env exit_status =
237 (* Ideally, all restarts should be triggered by Changed_merge_base notification
238 * which generate target mini state. There are other kind of restarts too, mostly
239 * related to server crashing - if we just restart and keep going, we risk
240 * Changed_merge_base eventually arriving and restarting the already started server
241 * for no reason. Re-issuing merge base query here should bring the Monitor and Server
242 * understanding of current revision to be the same *)
243 if Option.is_none target_saved_state then Informant.reinit env.informant;
244 kill_server_and_wait_for_exit env;
245 let version_matches = is_config_version_matching env in
246 match (env.server, version_matches) with
247 | (Died_config_changed, _) ->
248 (* Now we can start a new instance safely.
249 * See diagram on ServerProcess.server_process docs. *)
250 start_new_server ?target_saved_state env exit_status
251 | (Not_yet_started, false)
252 | (Alive _, false)
253 | (Died_unexpectedly _, false) ->
254 (* Can't start server instance. State goes to Died_config_changed
255 * See diagram on ServerProcess.server_process docs. *)
256 Hh_logger.log
257 "Avoiding starting a new server because version in config no longer matches.";
258 { env with server = Died_config_changed }
259 | (Not_yet_started, true)
260 | (Alive _, true)
261 | (Died_unexpectedly _, true) ->
262 (* Start new server instance because config matches.
263 * See diagram on ServerProcess.server_process docs. *)
264 start_new_server ?target_saved_state env exit_status
  (* Read the client's build-id string (marshalled with a preamble) followed
     by a single newline byte. Returns the build id.
     Raises Malformed_build_id if the newline terminator is absent. *)
  let read_version fd =
    let client_build_id : string = Marshal_tools.from_fd_with_preamble fd in
    let newline_byte = Bytes.create 1 in
    let _ = Unix.read fd newline_byte 0 1 in
    if not (String.equal (Bytes.to_string newline_byte) "\n") then (
      Hh_logger.log "Did not find newline character after version";
      raise Malformed_build_id
    (* NOTE(review): the closing ");" appears to be missing in this copy. *)
    client_build_id
  (* Dispatch one MonitorRpc command read from [client_fd]: hand off to the
     server, shut the monitor down, or answer a progress query. *)
  let rec handle_monitor_rpc env client_fd =
    let cmd : MonitorRpc.command =
      Marshal_tools.from_fd_with_preamble client_fd
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    match cmd with
    | MonitorRpc.HANDOFF_TO_SERVER (tracker, handoff_options) ->
      (* Stamp the tracker so handoff latency can be measured downstream. *)
      tracker.Connection_tracker.t_received_handoff <- Unix.gettimeofday ();
      client_prehandoff
        ~tracker
        ~is_purgatory_client:false
        (* NOTE(review): the "env" argument line appears to be missing here. *)
        handoff_options
        client_fd
    | MonitorRpc.SHUT_DOWN tracker ->
      log "Got shutdown RPC. Shutting down." ~tracker;
      let kill_signal_time = Unix.gettimeofday () in
      kill_server_with_check env.server;
      wait_for_server_exit_with_check env.server kill_signal_time;
      Exit_status.(exit No_error)
    | MonitorRpc.SERVER_PROGRESS _tracker ->
      msg_to_channel client_fd (env.server_progress, env.server_progress_warning);
      Unix.close client_fd;
      (* NOTE(review): the trailing result line (likely "env") appears to be
         missing here in this copy. *)

  (* Send the client FD to the server over the FD-passing socket, then send
     the tracker and schedule the FD for (possibly deferred) closing.
     Raises Send_fd_failure on a non-zero libancillary status. *)
  and hand_off_client_connection ~tracker server_fd client_fd =
    let status = Libancillary.ancil_send_fd server_fd client_fd in
    if status = 0 then begin
      tracker.Connection_tracker.t_sent_fd <- Unix.gettimeofday ();
      msg_to_channel server_fd tracker;
      Sent_fds_collector.cleanup_fd client_fd
    end else begin
      Hh_logger.log "Failed to handoff FD to server.";
      raise (Send_fd_failure status)
    (* NOTE(review): the closing "end" appears to be missing in this copy. *)
  (* Sends the client connection FD to the server process then closes the
   * FD. Retries up to [retries] more times, both when the server socket is
   * not yet writable and when the handoff itself throws; gives up by
   * closing the client FD. *)
  and hand_off_client_connection_with_retries
      ~tracker server_fd retries client_fd =
    (* Wait up to 0.5s for the server's FD-passing socket to be writable. *)
    let (_, ready_l, _) = Unix.select [] [server_fd] [] 0.5 in
    if not (List.is_empty ready_l) then
      try hand_off_client_connection ~tracker server_fd client_fd
      with e ->
        if retries > 0 then (
          log "Retrying FD handoff" ~tracker;
          hand_off_client_connection_with_retries
            ~tracker
            server_fd
            (retries - 1)
            client_fd
        ) else (
          (* Out of retries: record the failure and drop the client. *)
          log "No more retries. Ignoring request." ~tracker;
          HackEventLogger.send_fd_failure e;
          Unix.close client_fd
        (* NOTE(review): a closing ")" appears to be missing here in this
           copy. *)
    else if retries > 0 then (
      log "server socket not yet ready. Retrying." ~tracker;
      hand_off_client_connection_with_retries
        ~tracker
        server_fd
        (retries - 1)
        client_fd
    ) else (
      (* NOTE(review): the first line of this "log" call appears to be
         missing here in this copy. *)
        "server socket not yet ready. No more retries. Ignoring request."
        ~tracker;
      Unix.close client_fd
      (* NOTE(review): a closing ")" appears to be missing here in this
         copy. *)
  (* Notify the client of a build-id mismatch and log the event. (The
     monitor then exits in client_out_of_date below.) *)
  and client_out_of_date_ client_fd mismatch_info =
    msg_to_channel client_fd (Build_id_mismatch_ex mismatch_info);
    HackEventLogger.out_of_date ()

  (* Kills servers, sends build ID mismatch message to client, and exits.
   * Does not return. Exits after waiting for server processes to exit. So
   * the client can wait for socket closure as indication that both the monitor
   * and server have exited. *)
  and client_out_of_date env client_fd mismatch_info =
    Hh_logger.log "Client out of date. Killing server.";
    kill_server_with_check env.server;
    let kill_signal_time = Unix.gettimeofday () in
    (* If we detect out of date client, should always kill server and exit
     * monitor, even if messaging to channel or event logger fails. *)
    (try client_out_of_date_ client_fd mismatch_info
     with e ->
       Hh_logger.log
         "Handling client_out_of_date threw with: %s"
         (Exn.to_string e));
    wait_for_server_exit_with_check env.server kill_signal_time;
    Exit_status.exit Exit_status.Build_id_mismatch
  (* Send (possibly empty) sequences of messages before handing off to
   * server. Behavior depends on the server's lifecycle state:
   * - Alive: ack with Sentinel, pass the FD across (with retries), and
   *   record handoff timestamps.
   * - Died_unexpectedly: tell the client and exit the monitor.
   * - Died_config_changed: restart (unless this is a purgatory client
   *   being re-pushed) and re-run prehandoff.
   * - Not_yet_started: possibly force-start, else park the client in
   *   purgatory. *)
  and client_prehandoff
      ~tracker ~is_purgatory_client env handoff_options client_fd =
    let module PH = Prehandoff in
    match env.server with
    | Alive server ->
      (* Find the named pipe the client asked for by pipe_name. *)
      let server_fd =
        (* NOTE(review): a line (likely "snd") appears to be missing here in
           this copy. *)
        @@ List.find_exn server.out_fds ~f:(fun x ->
               String.equal (fst x) handoff_options.MonitorRpc.pipe_name)
      (* NOTE(review): an "in" line appears to be missing here. *)
      tracker.Connection_tracker.t_monitor_ready <- Unix.gettimeofday ();
      (* TODO: Send this to client so it is visible. *)
      (* NOTE(review): the first line of this "log" call appears to be
         missing here in this copy. *)
        "Got %s request for typechecker. Prior request %.1f seconds ago"
        ~tracker
        handoff_options.MonitorRpc.pipe_name
        ( tracker.Connection_tracker.t_monitor_ready
        -. !(server.last_request_handoff) );
      msg_to_channel client_fd (PH.Sentinel server.finale_file);
      tracker.Connection_tracker.t_sent_ack_to_client <- Unix.gettimeofday ();
      hand_off_client_connection_with_retries ~tracker server_fd 8 client_fd;
      log "handed off client fd to server" ~tracker;
      HackEventLogger.client_connection_sent ();
      server.last_request_handoff := Unix.time ();
      { env with server = Alive server }
    | Died_unexpectedly (status, was_oom) ->
      (* Server has died; notify the client *)
      msg_to_channel client_fd (PH.Server_died { PH.status; PH.was_oom });
      (* Next client to connect starts a new server. *)
      Exit_status.exit Exit_status.No_error
    | Died_config_changed ->
      if not is_purgatory_client then (
        let env = kill_and_maybe_restart_server env None in
        (* Assert that the restart succeeded, and then push prehandoff through again. *)
        match env.server with
        | Alive _ ->
          (* Server restarted. We want to re-run prehandoff, which will
           * actually do the prehandoff this time. *)
          client_prehandoff
            ~tracker
            ~is_purgatory_client
            (* NOTE(review): the "env" argument line appears to be missing
               here in this copy. *)
            handoff_options
            client_fd
        | Died_unexpectedly _
        | Died_config_changed
        | Not_yet_started ->
          Hh_logger.log
            ( "Unreachable state. Server should be alive after trying a restart"
            ^^ " from Died_config_changed state" );
          failwith
            "Failed starting server transitioning off Died_config_changed state"
      ) else (
        msg_to_channel client_fd PH.Server_died_config_change;
        (* NOTE(review): lines (likely a result value and ")") appear to be
           missing here in this copy. *)
    | Not_yet_started ->
      let env =
        if handoff_options.MonitorRpc.force_dormant_start then (
          msg_to_channel
            client_fd
            (PH.Server_not_alive_dormant
               "Warning - starting a server by force-dormant-start option...");
          kill_and_maybe_restart_server env None
        ) else (
          msg_to_channel
            client_fd
            (PH.Server_not_alive_dormant
               "Server killed by informant. Waiting for next server...");
          (* NOTE(review): lines (likely a result value, ")" and "in") appear
             to be missing here in this copy. *)
      if Queue.length env.purgatory_clients >= env.max_purgatory_clients then
        let () =
          msg_to_channel client_fd PH.Server_dormant_connections_limit_reached
        (* NOTE(review): lines (likely "in", a close of client_fd, and the
           result value) appear to be missing here in this copy. *)
      else
        let () =
          Queue.enqueue
            env.purgatory_clients
            (tracker, handoff_options, client_fd)
        (* NOTE(review): lines (likely "in" and the result value) appear to
           be missing here in this copy. *)
  (* Handshake with a newly-accepted client: read its build id and compare
     against ours (unless ignore_hh_version is set); on mismatch, refuse via
     client_out_of_date (which exits the monitor), otherwise ack with
     Connection_ok and dispatch the client's RPC command. *)
  and ack_and_handoff_client env client_fd =
    (* NOTE(review): a "try" line appears to be missing here in this copy. *)
    let client_version = read_version client_fd in
    (* NOTE(review): an "if" line appears to be missing here in this copy. *)
    (not env.ignore_hh_version)
    && not (String.equal client_version Build_id.build_revision)
    then
      client_out_of_date env client_fd ServerMonitorUtils.current_build_info
    else (
      msg_to_channel client_fd Connection_ok;
      handle_monitor_rpc env client_fd
    (* NOTE(review): a ")" line appears to be missing here in this copy. *)
    with Malformed_build_id as e ->
      (* Preserve the original backtrace when re-raising for the caller. *)
      let stack = Caml.Printexc.get_raw_backtrace () in
      HackEventLogger.malformed_build_id ();
      Hh_logger.log "Malformed Build ID";
      Caml.Printexc.raise_with_backtrace e stack
  (* Re-run prehandoff for every parked purgatory client, dropping any
     whose connection has gone away (EPIPE/EBADF). *)
  and push_purgatory_clients env =
    (* We create a queue and transfer all the purgatory clients to it before
     * processing to avoid repeatedly retrying the same client even after
     * an EBADF. Control flow is easier this way than trying to manage an
     * immutable env in the face of exceptions. *)
    let clients = Queue.create () in
    Queue.blit_transfer ~src:env.purgatory_clients ~dst:clients ();
    let env =
      Queue.fold
        (* NOTE(review): a line (likely "~f:") appears to be missing here in
           this copy. *)
        begin
          fun env (tracker, handoff_options, client_fd) ->
          (* NOTE(review): a "try" line appears to be missing here. *)
          client_prehandoff
            ~tracker
            ~is_purgatory_client:true
            (* NOTE(review): the "env" argument line appears to be missing
               here in this copy. *)
            handoff_options
            client_fd
          with
          | Unix.Unix_error (Unix.EPIPE, _, _)
          | Unix.Unix_error (Unix.EBADF, _, _) ->
            log "Purgatory client disconnected. Dropping." ~tracker;
            (* NOTE(review): lines (likely the "env" result and "end") appear
               to be missing here in this copy. *)
        ~init:env
        clients
    (* NOTE(review): lines (likely "in" and the "env" result) appear to be
       missing here in this copy. *)

  (* Push purgatory clients through prehandoff only in states where they can
     make progress (server alive, or a config-change state they should react
     to); otherwise leave them parked. *)
  and maybe_push_purgatory_clients env =
    match (env.server, Queue.length env.purgatory_clients) with
    | (Alive _, 0) -> env
    | (Died_config_changed, _) ->
      (* These clients are waiting for a server to be started. But this Monitor
       * is waiting for a new client to connect (which confirms to us that we
       * are running the correct version of the Monitor). So let them know
       * that they might want to do something. *)
      push_purgatory_clients env
    | (Alive _, _) -> push_purgatory_clients env
    | (Not_yet_started, _)
    | (Died_unexpectedly _, _) ->
      (* NOTE(review): the trailing "env" result line appears to be missing
         here in this copy. *)
  (* Drain PROGRESS/PROGRESS_WARNING messages from the server's pipe into
     env, recursing until no message is pending. *)
  let rec read_server_messages process env =
    let msg =
      ServerProgress.(make_pipe_from_server process.in_fd |> read_from_server)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    match msg with
    | None -> env
    | Some msg ->
      let env =
        match msg with
        | MonitorRpc.PROGRESS msg -> { env with server_progress = msg }
        | MonitorRpc.PROGRESS_WARNING msg ->
          { env with server_progress_warning = msg }
      (* NOTE(review): an "in" line appears to be missing here in this
         copy. *)
      read_server_messages process env
  (* Kill command from client is handled by the server, so the monitor
   * needs to check liveness of the server process to know whether
   * to stop itself. Returns the refreshed env plus the server's exit code
   * (if it exited) and its state as reported to the informant. *)
  let update_status_ (env : env) monitor_config =
    let env =
      match env.server with
      | Alive process ->
        let (pid, proc_stat) = SC.wait_pid process in
        (match (pid, proc_stat) with
        | (0, _) ->
          (* "pid=0" means the pid we waited for (i.e. process) hasn't yet died/stopped *)
          read_server_messages process env
        | (_, _) ->
          (* "pid<>0" means the pid has died or received a stop signal *)
          let oom_code = Exit_status.(exit_code Out_of_shared_memory) in
          let was_oom =
            match proc_stat with
            | Unix.WEXITED code when code = oom_code -> true
            | _ -> Sys_utils.check_dmesg_for_oom process.pid "hh_server"
          (* NOTE(review): an "in" line appears to be missing here in this
             copy. *)
          SC.on_server_exit monitor_config;
          ServerProcessTools.check_exit_status proc_stat process monitor_config;
          { env with server = Died_unexpectedly (proc_stat, was_oom) })
      | Not_yet_started ->
        (* NOTE(review): an opening "{" appears to be missing here. *)
          env with
          server_progress = "server is currently stopped";
          server_progress_warning = None;
        (* NOTE(review): a closing "}" appears to be missing here. *)
      | Died_config_changed ->
        (* NOTE(review): an opening "{" appears to be missing here. *)
          env with
          server_progress = "server stopped because its configuration changed";
          server_progress_warning = None;
        (* NOTE(review): a closing "}" appears to be missing here. *)
      | Died_unexpectedly _ ->
        (* NOTE(review): an opening "{" appears to be missing here. *)
          env with
          server_progress = "server stopped because of an error";
          server_progress_warning = None;
        (* NOTE(review): lines (likely "}" and "in") appear to be missing
           here in this copy. *)
    let (exit_status, server_state) =
      match env.server with
      | Alive _ -> (None, Informant_sig.Server_alive)
      | Died_unexpectedly (Unix.WEXITED c, _) ->
        (Some c, Informant_sig.Server_dead)
      | Not_yet_started -> (None, Informant_sig.Server_not_yet_started)
      | Died_unexpectedly ((Unix.WSIGNALED _ | Unix.WSTOPPED _), _)
      | Died_config_changed ->
        (None, Informant_sig.Server_dead)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    (env, exit_status, server_state)

  (* Reset the server state to Not_yet_started (used before a restart). *)
  let server_not_started env = { env with server = Not_yet_started }
  (* One monitor heartbeat: refresh server liveness via update_status_,
     consult the informant, classify the server's exit code, and decide
     whether to restart the server (with bounded retries for watchman and
     sql failures). *)
  let update_status env monitor_config =
    let (env, exit_status, server_state) = update_status_ env monitor_config in
    let informant_report = Informant.report env.informant server_state in
    let is_watchman_fresh_instance =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Watchman_fresh_instance) -> true
      | _ -> false
    (* NOTE(review): an "in" line appears to be missing here — and likewise
       after each of the following "let is_..." bindings — in this copy. *)
    let is_watchman_failed =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Watchman_failed) -> true
      | _ -> false
    let is_config_changed =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Hhconfig_changed) -> true
      | _ -> false
    let is_heap_stale =
      match exit_status with
      | Some c
        when (c = Exit_status.(exit_code File_provider_stale))
             || c = Exit_status.(exit_code Decl_not_found) ->
        true
      | _ -> false
    let is_sql_assertion_failure =
      match exit_status with
      | Some c
        when (c = Exit_status.(exit_code Sql_assertion_failure))
             || (c = Exit_status.(exit_code Sql_cantopen))
             || (c = Exit_status.(exit_code Sql_corrupt))
             || c = Exit_status.(exit_code Sql_misuse) ->
        true
      | _ -> false
    let is_worker_error =
      match exit_status with
      | Some c
        when (c = Exit_status.(exit_code Worker_not_found_exception))
             || (c = Exit_status.(exit_code Worker_busy))
             || c = Exit_status.(exit_code Worker_failed_to_send_job) ->
        true
      | _ -> false
    let is_decl_heap_elems_bug =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Decl_heap_elems_bug) -> true
      | _ -> false
    let is_big_rebase =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Big_rebase_detected) -> true
      | _ -> false
    let max_watchman_retries = 3 in
    let max_sql_retries = 3 in
    match (informant_report, env.server) with
    | (Informant_sig.Move_along, Died_config_changed) -> env
    | (Informant_sig.Restart_server _, Died_config_changed) ->
      Hh_logger.log "%s"
      @@ "Ignoring Informant directed restart - waiting for next client "
      ^ "connection to verify server version first";
      (* NOTE(review): the trailing "env" result line appears to be missing
         here in this copy. *)
    | (Informant_sig.Restart_server target_saved_state, _) ->
      Hh_logger.log "Informant directed server restart. Restarting server.";
      HackEventLogger.informant_induced_restart ();
      kill_and_maybe_restart_server ?target_saved_state env exit_status
    | (Informant_sig.Move_along, _) ->
      (* NOTE(review): an "if" line appears to be missing here in this
         copy. *)
      (is_watchman_failed || is_watchman_fresh_instance)
      && env.watchman_retries < max_watchman_retries
      then (
        Hh_logger.log
          "Watchman died. Restarting hh_server (attempt: %d)"
          (env.watchman_retries + 1);
        let env = { env with watchman_retries = env.watchman_retries + 1 } in
        server_not_started env
      ) else if is_decl_heap_elems_bug then (
        Hh_logger.log "hh_server died due to Decl_heap_elems_bug. Restarting";
        server_not_started env
      ) else if is_worker_error then (
        Hh_logger.log "hh_server died due to worker error. Restarting";
        server_not_started env
      ) else if is_config_changed then (
        Hh_logger.log "hh_server died from hh config change. Restarting";
        server_not_started env
      ) else if is_heap_stale then (
        Hh_logger.log
          "Several large rebases caused shared heap to be stale. Restarting";
        server_not_started env
      ) else if is_big_rebase then (
        Hh_logger.log "Server exited because of big rebase. Restarting";
        server_not_started env
      ) else if is_sql_assertion_failure && env.sql_retries < max_sql_retries
        then (
        Hh_logger.log
          "Sql failed. Restarting hh_server in fresh mode (attempt: %d)"
          (env.sql_retries + 1);
        let env = { env with sql_retries = env.sql_retries + 1 } in
        server_not_started env
      ) else
        (* NOTE(review): the trailing "env" result line appears to be
           missing here in this copy. *)
  (* Main monitor loop: run one iteration, convert known fatal exceptions
     into monitor exit codes, and give up after too many consecutive
     unexpected exceptions (tracked in [consecutive_throws]). *)
  let rec check_and_run_loop
      ?(consecutive_throws = 0) env monitor_config (socket : Unix.file_descr) =
    let (env, consecutive_throws) =
      try (check_and_run_loop_ env monitor_config socket, 0) with
      | Unix.Unix_error (Unix.ECHILD, _, _) ->
        let stack = Printexc.get_backtrace () in
        ignore
          (Hh_logger.log
             "check_and_run_loop_ threw with Unix.ECHILD. Exiting. - %s"
             stack);
        Exit_status.exit Exit_status.No_server_running_should_retry
      | Watchman.Watchman_restarted ->
        Exit_status.exit Exit_status.Watchman_fresh_instance
      | Exit_status.Exit_with _ as e -> raise e
      | e ->
        let stack = Printexc.get_backtrace () in
        if consecutive_throws > 500 then (
          Hh_logger.log "Too many consecutive exceptions.";
          Hh_logger.log
            "Probably an uncaught exception rethrown each retry. Exiting. %s"
            stack;
          HackEventLogger.uncaught_exception e;
          Exit_status.exit Exit_status.Uncaught_exception
        (* NOTE(review): a ") else (" line appears to be missing here in
           this copy. *)
        Hh_logger.log
          "check_and_run_loop_ threw with exception: %s - %s"
          (Exn.to_string e)
          stack;
        (env, consecutive_throws + 1)
    (* NOTE(review): lines (likely ")" and "in") appear to be missing here
       in this copy. *)
    check_and_run_loop ~consecutive_throws env monitor_config socket

  (* One loop iteration: verify we still hold the lock, push purgatory
     clients, garbage-collect deferred FDs, poll for a client, refresh
     server status, then accept and handle at most one client connection. *)
  and check_and_run_loop_ env monitor_config (socket : Unix.file_descr) =
    let lock_file = monitor_config.lock_file in
    if not (Lock.grab lock_file) then (
      Hh_logger.log "Lost lock; terminating.\n%!";
      HackEventLogger.lock_stolen lock_file;
      Exit_status.(exit Lock_stolen)
    (* NOTE(review): a ");" line appears to be missing here in this copy. *)
    let env = maybe_push_purgatory_clients env in
    let () = Sent_fds_collector.collect_garbage () in
    let has_client = sleep_and_check socket in
    let env = update_status env monitor_config in
    if not has_client then
      let () = EventLogger.recheck_disk_files () in
      (* NOTE(review): the "env" result line appears to be missing here. *)
    else
      (* NOTE(review): a "try" line appears to be missing here. *)
      let (fd, _) = Unix.accept socket in
      (* NOTE(review): a "try" line appears to be missing here. *)
      HackEventLogger.accepted_client_fd (fd_to_int fd);
      ack_and_handoff_client env fd
      with
      | Exit_status.Exit_with _ as e -> raise e
      | e ->
        let e = Exception.wrap e in
        HackEventLogger.ack_and_handoff_exception e;
        Hh_logger.log
          "Handling client connection failed. Ignoring connection attempt.\n%s\n"
          (Exception.to_string e |> Exception.clean_stack);
        Unix.close fd;
        (* NOTE(review): the "env" result line appears to be missing here. *)
      with
      | Exit_status.Exit_with _ as e -> raise e
      | e ->
        HackEventLogger.accepting_on_socket_exception e;
        Hh_logger.log
          "Accepting on socket failed. Ignoring client connection attempt.";
        (* NOTE(review): the "env" result line appears to be missing here. *)
766 let check_and_run_loop_once (env, monitor_config, socket) =
767 let env = check_and_run_loop_ env monitor_config socket in
768 (env, monitor_config, socket)
  (* Build the monitor: open the listening socket, wake any client waiting
     on the ready-FD, start the informant (deliberately before the server —
     see comment below), maybe start the first server, and assemble the
     initial env. *)
  let start_monitor
      ~current_version
      ~waiting_client
      ~max_purgatory_clients
      server_start_options
      informant_init_env
      monitor_config =
    let socket = Socket.init_unix_socket monitor_config.socket_file in
    (* If the client started the server, it opened an FD before forking, so it
     * can be notified when the monitor socket is ready. The FD number was
     * passed in program args. *)
    Option.iter waiting_client (fun fd ->
        let oc = Unix.out_channel_of_descr fd in
        (* NOTE(review): a "try" line appears to be missing here in this
           copy. *)
        Out_channel.output_string oc (ServerMonitorUtils.ready ^ "\n");
        Out_channel.close oc
        with
        | (Sys_error _ | Unix.Unix_error _) as e ->
          Printf.eprintf
            "Caught exception while waking client: %s\n%!"
            (Exn.to_string e));
    (* It is essential that we initiate the Informant before the server if we
     * want to give the opportunity for the Informant to truly take
     * ownership over the lifetime of the server.
     * This is because start_server won't actually start a server if it sees
     * a hg update sentinel file indicating an hg update is in-progress.
     * Starting the informant first ensures that its Watchman watch is started
     * before we check for the hgupdate sentinel file - this is required
     * for the informant to properly observe an update is complete without
     * hitting race conditions. *)
    let informant = Informant.init informant_init_env in
    let server_process =
      maybe_start_first_server server_start_options informant
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    let env =
      (* NOTE(review): an opening "{" appears to be missing here. *)
        informant;
        max_purgatory_clients;
        current_version;
        purgatory_clients = Queue.create ();
        server = server_process;
        server_start_options;
        retries = 0;
        sql_retries = 0;
        watchman_retries = 0;
        ignore_hh_version =
          Informant.should_ignore_hh_version informant_init_env;
        server_progress_warning = None;
        server_progress = "server status is unknown";
      (* NOTE(review): lines (likely "}" and "in") appear to be missing here
         in this copy. *)
    (env, monitor_config, socket)

  (* Entry point: build the monitor, then run its loop (which does not
     return except by exiting the process). *)
  let start_monitoring
      ~current_version
      ~waiting_client
      ~max_purgatory_clients
      server_start_options
      informant_init_env
      monitor_config =
    let (env, monitor_config, socket) =
      start_monitor
        ~current_version
        ~waiting_client
        ~max_purgatory_clients
        server_start_options
        informant_init_env
        monitor_config
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    check_and_run_loop env monitor_config socket