track monitor times for request-handling
[hiphop-php.git] / hphp / hack / src / monitor / serverMonitor.ml
blobd2d056bc12790ba84bfb19857ea68762a93875f0
1 (*
2 * Copyright (c) 2015, Facebook, Inc.
3 * All rights reserved.
5 * This source code is licensed under the MIT license found in the
6 * LICENSE file in the "hack" directory of this source tree.
8 *)
(*
 * The server monitor is the parent process for a server. It
 * listens to a socket for client connections and passes the connections
 * to the server and serves the following objectives:
 *
 * 1) Readily accepts client connections
 * 2) Confirms a Build ID match (killing itself and the server quickly
 *    on mismatch)
 * 3) Hands the client connection to the daemon server
 * 4) Tracks when the server process crashes or OOMs and echos
 *    its fate to the next client.
 *)
23 open Hh_prelude
24 open ServerProcess
25 open ServerMonitorUtils
(* Log [s] prefixed with the connection tracker's id, i.e. "[<id>] <s>". *)
let log s ~tracker =
  let id = Connection_tracker.log_id tracker in
  Hh_logger.log ("[%s] " ^^ s) id
module Sent_fds_collector = struct
  (*
   This module exists to fix an issue with libancillary (passing a file descriptor
   to another process with sendmsg over Unix Domain Sockets) and certain operating
   systems. It allows us to delay closing of a File Descriptor inside the Monitor
   until it is safe to do so.

   Normally:
   Monitor sends client FD to Server process, and immediately closes the FD.
   This is fine even if the Server is busy and hasn't "recv_fd" the FD yet
   because this doesn't really "close" the file. The kernel still considers
   it to be open by the receiving process. If the server closes the FD
   then reads on the client will get an EOF. If the client closes the FD
   then reads on the server will get an EOF.

   Mac OS X:
   EOF isn't showing up correctly on file descriptors passed between
   processes.
   When the Monitor closes the FD after sending it to the Server (and
   before the Server receives it), the kernel thinks it is the last open
   descriptor on the file and actually closes it. After the server
   receieves the FD, it gets an EOF when reading from it (which it shouldn't
   because the client is still there; aside: oddly enough, writing to it
   succeeds instead of getting EPIPE). The server then closes that FD after
   reading the EOF. Normally (as noted above) the client would read an
   EOF after this. But (this is the bug) this EOF never shows up and the
   client blocks forever on "select" instead.

   To get around this problem, we want to close the FD in the monitor only
   after the server has received it. Unfortunately, we don't actually
   have a way to reliably detect that it has been received. So we just delay
   closing by 2 seconds.

   Note: It's not safe to detect the receiving by reading the
   Hello message from the server (since it could/would be consumed
   here instead of by the Client) nor by "select" (by a race condition
   with the client, the select might miss the Hello, and could prevent
   an EOF from being read by the server).
   *)

  (* Scheduler keyed on wall-clock time: entries fire once their time is up. *)
  module Fd_scheduler = Scheduler.Make (struct
    type t = (* Unix.time *) float
  end)

  (* Close [fd]; on macOS the close is deferred by 2 seconds via the
     scheduler (see the module comment above for why). *)
  let cleanup_fd fd =
    if Sys_utils.is_apple_os () then
      (* Close it 2 seconds later. *)
      let trigger = Unix.gettimeofday () +. 2.0 in
      Fd_scheduler.wait_for_fun
        ~once:true
        ~priority:1
        (fun time -> Float.(time >= trigger))
        (fun x ->
          let () = Printf.eprintf "Closing client fd\n" in
          let () = Unix.close fd in
          (* NOTE(review): at least one line appears to be missing here in
             this copy (the callback's result value and closing paren). *)
    else
      Unix.close fd

  (* Run any scheduled deferred closes that are now due (macOS only;
     a no-op elsewhere). *)
  let collect_garbage () =
    if Sys_utils.is_apple_os () then
      ignore (Fd_scheduler.wait_and_run_ready (Unix.gettimeofday ()))
    else
      (* NOTE(review): the "()" unit else-branch appears to be missing in
         this copy. *)
end
(* Raised by read_version when the client's build-id string is not followed
   by a newline byte. *)
exception Malformed_build_id

(* Raised when libancillary fails to send the client FD to the server;
   carries the failing status code. *)
exception Send_fd_failure of int

(* The monitor, parameterized over server-lifecycle callbacks (SC) and the
   informant implementation. *)
module Make_monitor
    (SC : ServerMonitorUtils.Server_config)
    (Informant : Informant_sig.S) =
struct
  (* Mutable-by-copy monitor state threaded through the main loop. *)
  type env = {
    informant: Informant.t;
    server: ServerProcess.server_process;
    server_start_options: SC.server_start_options;
    (* How many times have we tried to relaunch it? *)
    retries: int;
    sql_retries: int;
    watchman_retries: int;
    max_purgatory_clients: int;
    (* Version of this running server, as specified in the config file. *)
    current_version: Config_file.version;
    (* After sending a Server_not_alive_dormant during Prehandoff,
     * clients are put here waiting for a server to come alive, at
     * which point they get pushed through the rest of prehandoff and
     * then sent to the living server.
     * String is the server name it wants to connect to. *)
    purgatory_clients:
      (Connection_tracker.t * MonitorRpc.handoff_options * Unix.file_descr)
      Queue.t;
    (* Whether to ignore hh version mismatches *)
    ignore_hh_version: bool;
    (* What server is doing now *)
    server_progress: string;
    (* Why what it is doing now might not be going as well as it could *)
    server_progress_warning: string option;
  (* NOTE(review): the record's closing "}" appears to be missing in this
     copy. *)

  (* Full monitor state: env, static config, and the listening socket. *)
  type t = env * ServerMonitorUtils.monitor_config * Unix.file_descr
  (* Expose the raw integer behind a Unix.file_descr. On Unix, file_descr is
     an int at runtime, which this Obj.magic relies on; in this file the
     result is only used as an opaque id for event logging
     (HackEventLogger.accepted_client_fd). *)
  let fd_to_int (x : Unix.file_descr) : int = Obj.magic x
136 let msg_to_channel fd msg =
137 (* This FD will be passed to a server process, so avoid using Ocaml's
138 * channels which have built-in buffering. Even though we are only writing
139 * to the FD here, it seems using Ocaml's channels also causes read
140 * buffering to happen here, so the server process doesn't get what was
141 * meant for it. *)
142 Marshal_tools.to_fd_with_preamble fd msg |> ignore
144 let setup_handler_for_signals handler signals =
145 List.iter signals (fun signal ->
146 Sys_utils.set_signal signal (Sys.Signal_handle handler))
  (* Arrange for [process] to be killed when the monitor itself receives a
     termination signal, so the monitor/server pair exits together. *)
  let setup_autokill_server_on_exit process =
    (* NOTE(review): lines appear to be missing in this copy: a leading
       "try" (to match the "with" below) and the "end" closing "begin". *)
    setup_handler_for_signals
      begin
        fun _ ->
        Hh_logger.log "Got an exit signal. Killing server and exiting.";
        SC.kill_server process;
        Exit_status.exit Exit_status.Interrupted
      [Sys.sigint; Sys.sigquit; Sys.sigterm; Sys.sighup]
    with _ -> Hh_logger.log "Failed to set signal handler"
160 let sleep_and_check socket =
161 let (ready_socket_l, _, _) = Unix.select [socket] [] [] 1.0 in
162 not (List.is_empty ready_socket_l)
  (* Launch a new server process via SC.start_server, register the autokill
     hook, and wrap it as Alive. *)
  let start_server ?target_saved_state ~informant_managed options exit_status =
    let server_process =
      SC.start_server
        ?target_saved_state
        ~prior_exit_status:exit_status
        ~informant_managed
        options
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    setup_autokill_server_on_exit server_process;
    Alive server_process

  (* Start the first server only if the informant agrees; otherwise stay
     dormant (Not_yet_started) until the informant triggers a start later. *)
  let maybe_start_first_server options informant =
    if Informant.should_start_first_server informant then (
      Hh_logger.log "Starting first server";
      HackEventLogger.starting_first_server ();
      start_server
        ~informant_managed:(Informant.is_managing informant)
        options
        None
    ) else (
      Hh_logger.log
        ( "Not starting first server. "
        ^^ "Starting will be triggered by informant later." );
      Not_yet_started
    (* NOTE(review): the closing ")" appears to be missing in this copy. *)
190 let kill_server_with_check = function
191 | Alive server -> SC.kill_server server
192 | _ -> ()
194 let wait_for_server_exit_with_check server kill_signal_time =
195 match server with
196 | Alive server -> SC.wait_for_server_exit server kill_signal_time
197 | _ -> ()
199 let kill_server_and_wait_for_exit env =
200 kill_server_with_check env.server;
201 let kill_signal_time = Unix.gettimeofday () in
202 wait_for_server_exit_with_check env.server kill_signal_time
  (* Reads current hhconfig contents from disk and returns true if the
   * version specified in there matches our currently running version. *)
  let is_config_version_matching env =
    let filename =
      Relative_path.from_root
        ~suffix:Config_file.file_path_relative_to_repo_root
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    let (_hash, config) =
      Config_file.parse_hhconfig
        ~silent:true
        (Relative_path.to_absolute filename)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    let new_version =
      Config_file.parse_version (SMap.find_opt "version" config)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    0 = Config_file.compare_versions env.current_version new_version

  (* Actually starts a new server. *)
  let start_new_server ?target_saved_state env exit_status =
    let informant_managed = Informant.is_managing env.informant in
    let new_server =
      start_server
        ?target_saved_state
        ~informant_managed
        env.server_start_options
        exit_status
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    { env with server = new_server; retries = env.retries + 1 }
233 (* Kill the server (if it's running) and restart it - maybe. Obeying the rules
234 * of state transitions. See docs on the ServerProcess.server_process ADT for
235 * state transitions. *)
236 let kill_and_maybe_restart_server ?target_saved_state env exit_status =
237 (* Ideally, all restarts should be triggered by Changed_merge_base notification
238 * which generate target mini state. There are other kind of restarts too, mostly
239 * related to server crashing - if we just restart and keep going, we risk
240 * Changed_merge_base eventually arriving and restarting the already started server
241 * for no reason. Re-issuing merge base query here should bring the Monitor and Server
242 * understanding of current revision to be the same *)
243 if Option.is_none target_saved_state then Informant.reinit env.informant;
244 kill_server_and_wait_for_exit env;
245 let version_matches = is_config_version_matching env in
246 match (env.server, version_matches) with
247 | (Died_config_changed, _) ->
248 (* Now we can start a new instance safely.
249 * See diagram on ServerProcess.server_process docs. *)
250 start_new_server ?target_saved_state env exit_status
251 | (Not_yet_started, false)
252 | (Alive _, false)
253 | (Died_unexpectedly _, false) ->
254 (* Can't start server instance. State goes to Died_config_changed
255 * See diagram on ServerProcess.server_process docs. *)
256 Hh_logger.log
257 "Avoiding starting a new server because version in config no longer matches.";
258 { env with server = Died_config_changed }
259 | (Not_yet_started, true)
260 | (Alive _, true)
261 | (Died_unexpectedly _, true) ->
262 (* Start new server instance because config matches.
263 * See diagram on ServerProcess.server_process docs. *)
264 start_new_server ?target_saved_state env exit_status
  (* Read the client's build-id string (marshalled with a preamble) followed
     by a single newline byte. Returns the build id.
     Raises Malformed_build_id if the newline terminator is absent. *)
  let read_version fd =
    let client_build_id : string = Marshal_tools.from_fd_with_preamble fd in
    let newline_byte = Bytes.create 1 in
    let _ = Unix.read fd newline_byte 0 1 in
    if not (String.equal (Bytes.to_string newline_byte) "\n") then (
      Hh_logger.log "Did not find newline character after version";
      raise Malformed_build_id
    (* NOTE(review): the closing ");" appears to be missing in this copy. *)
    client_build_id
  (* Dispatch one MonitorRpc command read from [client_fd]: hand off to the
     server, shut the monitor down, or answer a progress query. *)
  let rec handle_monitor_rpc env client_fd =
    let cmd : MonitorRpc.command =
      Marshal_tools.from_fd_with_preamble client_fd
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    match cmd with
    | MonitorRpc.HANDOFF_TO_SERVER (tracker, handoff_options) ->
      (* Stamp the tracker so handoff latency can be measured downstream. *)
      tracker.Connection_tracker.t_received_handoff <- Unix.gettimeofday ();
      client_prehandoff
        ~tracker
        ~is_purgatory_client:false
        (* NOTE(review): the "env" argument line appears to be missing here. *)
        handoff_options
        client_fd
    | MonitorRpc.SHUT_DOWN tracker ->
      log "Got shutdown RPC. Shutting down." ~tracker;
      let kill_signal_time = Unix.gettimeofday () in
      kill_server_with_check env.server;
      wait_for_server_exit_with_check env.server kill_signal_time;
      Exit_status.(exit No_error)
    | MonitorRpc.SERVER_PROGRESS _tracker ->
      msg_to_channel client_fd (env.server_progress, env.server_progress_warning);
      Unix.close client_fd;
      (* NOTE(review): the trailing result line (likely "env") appears to be
         missing here in this copy. *)

  (* Send the client FD to the server over the FD-passing socket, then send
     the tracker and schedule the FD for (possibly deferred) closing.
     Raises Send_fd_failure on a non-zero libancillary status. *)
  and hand_off_client_connection ~tracker server_fd client_fd =
    let status = Libancillary.ancil_send_fd server_fd client_fd in
    if status = 0 then begin
      tracker.Connection_tracker.t_sent_fd <- Unix.gettimeofday ();
      msg_to_channel server_fd tracker;
      Sent_fds_collector.cleanup_fd client_fd
    end else begin
      Hh_logger.log "Failed to handoff FD to server.";
      raise (Send_fd_failure status)
    (* NOTE(review): the closing "end" appears to be missing in this copy. *)
  (* Sends the client connection FD to the server process then closes the
   * FD. Retries up to [retries] more times, both when the server socket is
   * not yet writable and when the handoff itself throws; gives up by
   * closing the client FD. *)
  and hand_off_client_connection_with_retries
      ~tracker server_fd retries client_fd =
    (* Wait up to 0.5s for the server's FD-passing socket to be writable. *)
    let (_, ready_l, _) = Unix.select [] [server_fd] [] 0.5 in
    if not (List.is_empty ready_l) then
      try hand_off_client_connection ~tracker server_fd client_fd
      with e ->
        if retries > 0 then (
          log "Retrying FD handoff" ~tracker;
          hand_off_client_connection_with_retries
            ~tracker
            server_fd
            (retries - 1)
            client_fd
        ) else (
          (* Out of retries: record the failure and drop the client. *)
          log "No more retries. Ignoring request." ~tracker;
          HackEventLogger.send_fd_failure e;
          Unix.close client_fd
        (* NOTE(review): a closing ")" appears to be missing here in this
           copy. *)
    else if retries > 0 then (
      log "server socket not yet ready. Retrying." ~tracker;
      hand_off_client_connection_with_retries
        ~tracker
        server_fd
        (retries - 1)
        client_fd
    ) else (
      (* NOTE(review): the first line of this "log" call appears to be
         missing here in this copy. *)
        "server socket not yet ready. No more retries. Ignoring request."
        ~tracker;
      Unix.close client_fd
      (* NOTE(review): a closing ")" appears to be missing here in this
         copy. *)
  (* Notify the client of a build-id mismatch and log the event. (The
     monitor then exits in client_out_of_date below.) *)
  and client_out_of_date_ client_fd mismatch_info =
    msg_to_channel client_fd (Build_id_mismatch_ex mismatch_info);
    HackEventLogger.out_of_date ()

  (* Kills servers, sends build ID mismatch message to client, and exits.
   * Does not return. Exits after waiting for server processes to exit. So
   * the client can wait for socket closure as indication that both the monitor
   * and server have exited. *)
  and client_out_of_date env client_fd mismatch_info =
    Hh_logger.log "Client out of date. Killing server.";
    kill_server_with_check env.server;
    let kill_signal_time = Unix.gettimeofday () in
    (* If we detect out of date client, should always kill server and exit
     * monitor, even if messaging to channel or event logger fails. *)
    (try client_out_of_date_ client_fd mismatch_info
     with e ->
       Hh_logger.log
         "Handling client_out_of_date threw with: %s"
         (Exn.to_string e));
    wait_for_server_exit_with_check env.server kill_signal_time;
    Exit_status.exit Exit_status.Build_id_mismatch
  (* Send (possibly empty) sequences of messages before handing off to
   * server. Behavior depends on the server's lifecycle state:
   * - Alive: ack with Sentinel, pass the FD across (with retries), and
   *   record handoff timestamps.
   * - Died_unexpectedly: tell the client and exit the monitor.
   * - Died_config_changed: restart (unless this is a purgatory client
   *   being re-pushed) and re-run prehandoff.
   * - Not_yet_started: possibly force-start, else park the client in
   *   purgatory. *)
  and client_prehandoff
      ~tracker ~is_purgatory_client env handoff_options client_fd =
    let module PH = Prehandoff in
    match env.server with
    | Alive server ->
      (* Find the named pipe the client asked for by pipe_name. *)
      let server_fd =
        (* NOTE(review): a line (likely "snd") appears to be missing here in
           this copy. *)
        @@ List.find_exn server.out_fds ~f:(fun x ->
               String.equal (fst x) handoff_options.MonitorRpc.pipe_name)
      (* NOTE(review): an "in" line appears to be missing here. *)
      tracker.Connection_tracker.t_monitor_ready <- Unix.gettimeofday ();
      (* TODO: Send this to client so it is visible. *)
      (* NOTE(review): the first line of this "log" call appears to be
         missing here in this copy. *)
        "Got %s request for typechecker. Prior request %.1f seconds ago"
        ~tracker
        handoff_options.MonitorRpc.pipe_name
        ( tracker.Connection_tracker.t_monitor_ready
        -. !(server.last_request_handoff) );
      msg_to_channel client_fd (PH.Sentinel server.finale_file);
      tracker.Connection_tracker.t_sent_ack_to_client <- Unix.gettimeofday ();
      hand_off_client_connection_with_retries ~tracker server_fd 8 client_fd;
      log "handed off client fd to server" ~tracker;
      HackEventLogger.client_connection_sent ();
      server.last_request_handoff := Unix.time ();
      { env with server = Alive server }
    | Died_unexpectedly (status, was_oom) ->
      (* Server has died; notify the client *)
      msg_to_channel client_fd (PH.Server_died { PH.status; PH.was_oom });
      (* Next client to connect starts a new server. *)
      Exit_status.exit Exit_status.No_error
    | Died_config_changed ->
      if not is_purgatory_client then (
        let env = kill_and_maybe_restart_server env None in
        (* Assert that the restart succeeded, and then push prehandoff through again. *)
        match env.server with
        | Alive _ ->
          (* Server restarted. We want to re-run prehandoff, which will
           * actually do the prehandoff this time. *)
          client_prehandoff
            ~tracker
            ~is_purgatory_client
            (* NOTE(review): the "env" argument line appears to be missing
               here in this copy. *)
            handoff_options
            client_fd
        | Died_unexpectedly _
        | Died_config_changed
        | Not_yet_started ->
          Hh_logger.log
            ( "Unreachable state. Server should be alive after trying a restart"
            ^^ " from Died_config_changed state" );
          failwith
            "Failed starting server transitioning off Died_config_changed state"
      ) else (
        msg_to_channel client_fd PH.Server_died_config_change;
        (* NOTE(review): lines (likely a result value and ")") appear to be
           missing here in this copy. *)
    | Not_yet_started ->
      let env =
        if handoff_options.MonitorRpc.force_dormant_start then (
          msg_to_channel
            client_fd
            (PH.Server_not_alive_dormant
               "Warning - starting a server by force-dormant-start option...");
          kill_and_maybe_restart_server env None
        ) else (
          msg_to_channel
            client_fd
            (PH.Server_not_alive_dormant
               "Server killed by informant. Waiting for next server...");
          (* NOTE(review): lines (likely a result value, ")" and "in") appear
             to be missing here in this copy. *)
      if Queue.length env.purgatory_clients >= env.max_purgatory_clients then
        let () =
          msg_to_channel client_fd PH.Server_dormant_connections_limit_reached
        (* NOTE(review): lines (likely "in", a close of client_fd, and the
           result value) appear to be missing here in this copy. *)
      else
        let () =
          Queue.enqueue
            env.purgatory_clients
            (tracker, handoff_options, client_fd)
        (* NOTE(review): lines (likely "in" and the result value) appear to
           be missing here in this copy. *)
  (* Handshake with a newly-accepted client: read its build id and compare
     against ours (unless ignore_hh_version is set); on mismatch, refuse via
     client_out_of_date (which exits the monitor), otherwise ack with
     Connection_ok and dispatch the client's RPC command. *)
  and ack_and_handoff_client env client_fd =
    (* NOTE(review): a "try" line appears to be missing here in this copy. *)
    let client_version = read_version client_fd in
    (* NOTE(review): an "if" line appears to be missing here in this copy. *)
    (not env.ignore_hh_version)
    && not (String.equal client_version Build_id.build_revision)
    then
      client_out_of_date env client_fd ServerMonitorUtils.current_build_info
    else (
      msg_to_channel client_fd Connection_ok;
      handle_monitor_rpc env client_fd
    (* NOTE(review): a ")" line appears to be missing here in this copy. *)
    with Malformed_build_id as e ->
      (* Preserve the original backtrace when re-raising for the caller. *)
      let stack = Caml.Printexc.get_raw_backtrace () in
      HackEventLogger.malformed_build_id ();
      Hh_logger.log "Malformed Build ID";
      Caml.Printexc.raise_with_backtrace e stack
  (* Re-run prehandoff for every parked purgatory client, dropping any
     whose connection has gone away (EPIPE/EBADF). *)
  and push_purgatory_clients env =
    (* We create a queue and transfer all the purgatory clients to it before
     * processing to avoid repeatedly retrying the same client even after
     * an EBADF. Control flow is easier this way than trying to manage an
     * immutable env in the face of exceptions. *)
    let clients = Queue.create () in
    Queue.blit_transfer ~src:env.purgatory_clients ~dst:clients ();
    let env =
      Queue.fold
        (* NOTE(review): a line (likely "~f:") appears to be missing here in
           this copy. *)
        begin
          fun env (tracker, handoff_options, client_fd) ->
          (* NOTE(review): a "try" line appears to be missing here. *)
          client_prehandoff
            ~tracker
            ~is_purgatory_client:true
            (* NOTE(review): the "env" argument line appears to be missing
               here in this copy. *)
            handoff_options
            client_fd
          with
          | Unix.Unix_error (Unix.EPIPE, _, _)
          | Unix.Unix_error (Unix.EBADF, _, _) ->
            log "Purgatory client disconnected. Dropping." ~tracker;
            (* NOTE(review): lines (likely the "env" result and "end") appear
               to be missing here in this copy. *)
        ~init:env
        clients
    (* NOTE(review): lines (likely "in" and the "env" result) appear to be
       missing here in this copy. *)

  (* Push purgatory clients through prehandoff only in states where they can
     make progress (server alive, or a config-change state they should react
     to); otherwise leave them parked. *)
  and maybe_push_purgatory_clients env =
    match (env.server, Queue.length env.purgatory_clients) with
    | (Alive _, 0) -> env
    | (Died_config_changed, _) ->
      (* These clients are waiting for a server to be started. But this Monitor
       * is waiting for a new client to connect (which confirms to us that we
       * are running the correct version of the Monitor). So let them know
       * that they might want to do something. *)
      push_purgatory_clients env
    | (Alive _, _) -> push_purgatory_clients env
    | (Not_yet_started, _)
    | (Died_unexpectedly _, _) ->
      (* NOTE(review): the trailing "env" result line appears to be missing
         here in this copy. *)
  (* Drain PROGRESS/PROGRESS_WARNING messages from the server's pipe into
     env, recursing until no message is pending. *)
  let rec read_server_messages process env =
    let msg =
      ServerProgress.(make_pipe_from_server process.in_fd |> read_from_server)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    match msg with
    | None -> env
    | Some msg ->
      let env =
        match msg with
        | MonitorRpc.PROGRESS msg -> { env with server_progress = msg }
        | MonitorRpc.PROGRESS_WARNING msg ->
          { env with server_progress_warning = msg }
      (* NOTE(review): an "in" line appears to be missing here in this
         copy. *)
      read_server_messages process env
  (* Kill command from client is handled by the server, so the monitor
   * needs to check liveness of the server process to know whether
   * to stop itself. Returns the refreshed env plus the server's exit code
   * (if it exited) and its state as reported to the informant. *)
  let update_status_ (env : env) monitor_config =
    let env =
      match env.server with
      | Alive process ->
        let (pid, proc_stat) = SC.wait_pid process in
        (match (pid, proc_stat) with
        | (0, _) ->
          (* "pid=0" means the pid we waited for (i.e. process) hasn't yet died/stopped *)
          read_server_messages process env
        | (_, _) ->
          (* "pid<>0" means the pid has died or received a stop signal *)
          let oom_code = Exit_status.(exit_code Out_of_shared_memory) in
          let was_oom =
            match proc_stat with
            | Unix.WEXITED code when code = oom_code -> true
            | _ -> Sys_utils.check_dmesg_for_oom process.pid "hh_server"
          (* NOTE(review): an "in" line appears to be missing here in this
             copy. *)
          SC.on_server_exit monitor_config;
          ServerProcessTools.check_exit_status proc_stat process monitor_config;
          { env with server = Died_unexpectedly (proc_stat, was_oom) })
      | Not_yet_started ->
        (* NOTE(review): an opening "{" appears to be missing here. *)
          env with
          server_progress = "server is currently stopped";
          server_progress_warning = None;
        (* NOTE(review): a closing "}" appears to be missing here. *)
      | Died_config_changed ->
        (* NOTE(review): an opening "{" appears to be missing here. *)
          env with
          server_progress = "server stopped because its configuration changed";
          server_progress_warning = None;
        (* NOTE(review): a closing "}" appears to be missing here. *)
      | Died_unexpectedly _ ->
        (* NOTE(review): an opening "{" appears to be missing here. *)
          env with
          server_progress = "server stopped because of an error";
          server_progress_warning = None;
        (* NOTE(review): lines (likely "}" and "in") appear to be missing
           here in this copy. *)
    let (exit_status, server_state) =
      match env.server with
      | Alive _ -> (None, Informant_sig.Server_alive)
      | Died_unexpectedly (Unix.WEXITED c, _) ->
        (Some c, Informant_sig.Server_dead)
      | Not_yet_started -> (None, Informant_sig.Server_not_yet_started)
      | Died_unexpectedly ((Unix.WSIGNALED _ | Unix.WSTOPPED _), _)
      | Died_config_changed ->
        (None, Informant_sig.Server_dead)
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    (env, exit_status, server_state)

  (* Reset the server state to Not_yet_started (used before a restart). *)
  let server_not_started env = { env with server = Not_yet_started }
  (* One monitor heartbeat: refresh server liveness via update_status_,
     consult the informant, classify the server's exit code, and decide
     whether to restart the server (with bounded retries for watchman and
     sql failures). *)
  let update_status env monitor_config =
    let (env, exit_status, server_state) = update_status_ env monitor_config in
    let informant_report = Informant.report env.informant server_state in
    let is_watchman_fresh_instance =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Watchman_fresh_instance) -> true
      | _ -> false
    (* NOTE(review): an "in" line appears to be missing here — and likewise
       after each of the following "let is_..." bindings — in this copy. *)
    let is_watchman_failed =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Watchman_failed) -> true
      | _ -> false
    let is_config_changed =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Hhconfig_changed) -> true
      | _ -> false
    let is_heap_stale =
      match exit_status with
      | Some c
        when (c = Exit_status.(exit_code File_provider_stale))
             || c = Exit_status.(exit_code Decl_not_found) ->
        true
      | _ -> false
    let is_sql_assertion_failure =
      match exit_status with
      | Some c
        when (c = Exit_status.(exit_code Sql_assertion_failure))
             || (c = Exit_status.(exit_code Sql_cantopen))
             || (c = Exit_status.(exit_code Sql_corrupt))
             || c = Exit_status.(exit_code Sql_misuse) ->
        true
      | _ -> false
    let is_worker_error =
      match exit_status with
      | Some c
        when (c = Exit_status.(exit_code Worker_not_found_exception))
             || (c = Exit_status.(exit_code Worker_busy))
             || c = Exit_status.(exit_code Worker_failed_to_send_job) ->
        true
      | _ -> false
    let is_decl_heap_elems_bug =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Decl_heap_elems_bug) -> true
      | _ -> false
    let is_big_rebase =
      match exit_status with
      | Some c when c = Exit_status.(exit_code Big_rebase_detected) -> true
      | _ -> false
    let max_watchman_retries = 3 in
    let max_sql_retries = 3 in
    match (informant_report, env.server) with
    | (Informant_sig.Move_along, Died_config_changed) -> env
    | (Informant_sig.Restart_server _, Died_config_changed) ->
      Hh_logger.log "%s"
      @@ "Ignoring Informant directed restart - waiting for next client "
      ^ "connection to verify server version first";
      (* NOTE(review): the trailing "env" result line appears to be missing
         here in this copy. *)
    | (Informant_sig.Restart_server target_saved_state, _) ->
      Hh_logger.log "Informant directed server restart. Restarting server.";
      HackEventLogger.informant_induced_restart ();
      kill_and_maybe_restart_server ?target_saved_state env exit_status
    | (Informant_sig.Move_along, _) ->
      (* NOTE(review): an "if" line appears to be missing here in this
         copy. *)
      (is_watchman_failed || is_watchman_fresh_instance)
      && env.watchman_retries < max_watchman_retries
      then (
        Hh_logger.log
          "Watchman died. Restarting hh_server (attempt: %d)"
          (env.watchman_retries + 1);
        let env = { env with watchman_retries = env.watchman_retries + 1 } in
        server_not_started env
      ) else if is_decl_heap_elems_bug then (
        Hh_logger.log "hh_server died due to Decl_heap_elems_bug. Restarting";
        server_not_started env
      ) else if is_worker_error then (
        Hh_logger.log "hh_server died due to worker error. Restarting";
        server_not_started env
      ) else if is_config_changed then (
        Hh_logger.log "hh_server died from hh config change. Restarting";
        server_not_started env
      ) else if is_heap_stale then (
        Hh_logger.log
          "Several large rebases caused shared heap to be stale. Restarting";
        server_not_started env
      ) else if is_big_rebase then (
        Hh_logger.log "Server exited because of big rebase. Restarting";
        server_not_started env
      ) else if is_sql_assertion_failure && env.sql_retries < max_sql_retries
        then (
        Hh_logger.log
          "Sql failed. Restarting hh_server in fresh mode (attempt: %d)"
          (env.sql_retries + 1);
        let env = { env with sql_retries = env.sql_retries + 1 } in
        server_not_started env
      ) else
        (* NOTE(review): the trailing "env" result line appears to be
           missing here in this copy. *)
  (* Main monitor loop: run one iteration, convert known fatal exceptions
     into monitor exit codes, and give up after too many consecutive
     unexpected exceptions (tracked in [consecutive_throws]). *)
  let rec check_and_run_loop
      ?(consecutive_throws = 0) env monitor_config (socket : Unix.file_descr) =
    let (env, consecutive_throws) =
      try (check_and_run_loop_ env monitor_config socket, 0) with
      | Unix.Unix_error (Unix.ECHILD, _, _) ->
        let stack = Printexc.get_backtrace () in
        ignore
          (Hh_logger.log
             "check_and_run_loop_ threw with Unix.ECHILD. Exiting. - %s"
             stack);
        Exit_status.exit Exit_status.No_server_running_should_retry
      | Watchman.Watchman_restarted ->
        Exit_status.exit Exit_status.Watchman_fresh_instance
      | Exit_status.Exit_with _ as e -> raise e
      | e ->
        let stack = Printexc.get_backtrace () in
        if consecutive_throws > 500 then (
          Hh_logger.log "Too many consecutive exceptions.";
          Hh_logger.log
            "Probably an uncaught exception rethrown each retry. Exiting. %s"
            stack;
          HackEventLogger.uncaught_exception e;
          Exit_status.exit Exit_status.Uncaught_exception
        (* NOTE(review): a ") else (" line appears to be missing here in
           this copy. *)
        Hh_logger.log
          "check_and_run_loop_ threw with exception: %s - %s"
          (Exn.to_string e)
          stack;
        (env, consecutive_throws + 1)
    (* NOTE(review): lines (likely ")" and "in") appear to be missing here
       in this copy. *)
    check_and_run_loop ~consecutive_throws env monitor_config socket

  (* One loop iteration: verify we still hold the lock, push purgatory
     clients, garbage-collect deferred FDs, poll for a client, refresh
     server status, then accept and handle at most one client connection. *)
  and check_and_run_loop_ env monitor_config (socket : Unix.file_descr) =
    let lock_file = monitor_config.lock_file in
    if not (Lock.grab lock_file) then (
      Hh_logger.log "Lost lock; terminating.\n%!";
      HackEventLogger.lock_stolen lock_file;
      Exit_status.(exit Lock_stolen)
    (* NOTE(review): a ");" line appears to be missing here in this copy. *)
    let env = maybe_push_purgatory_clients env in
    let () = Sent_fds_collector.collect_garbage () in
    let has_client = sleep_and_check socket in
    let env = update_status env monitor_config in
    if not has_client then
      let () = EventLogger.recheck_disk_files () in
      (* NOTE(review): the "env" result line appears to be missing here. *)
    else
      (* NOTE(review): a "try" line appears to be missing here. *)
      let (fd, _) = Unix.accept socket in
      (* NOTE(review): a "try" line appears to be missing here. *)
      HackEventLogger.accepted_client_fd (fd_to_int fd);
      ack_and_handoff_client env fd
      with
      | Exit_status.Exit_with _ as e -> raise e
      | e ->
        let e = Exception.wrap e in
        HackEventLogger.ack_and_handoff_exception e;
        Hh_logger.log
          "Handling client connection failed. Ignoring connection attempt.\n%s\n"
          (Exception.to_string e |> Exception.clean_stack);
        Unix.close fd;
        (* NOTE(review): the "env" result line appears to be missing here. *)
      with
      | Exit_status.Exit_with _ as e -> raise e
      | e ->
        HackEventLogger.accepting_on_socket_exception e;
        Hh_logger.log
          "Accepting on socket failed. Ignoring client connection attempt.";
        (* NOTE(review): the "env" result line appears to be missing here. *)
766 let check_and_run_loop_once (env, monitor_config, socket) =
767 let env = check_and_run_loop_ env monitor_config socket in
768 (env, monitor_config, socket)
  (* Build the monitor: open the listening socket, wake any client waiting
     on the ready-FD, start the informant (deliberately before the server —
     see comment below), maybe start the first server, and assemble the
     initial env. *)
  let start_monitor
      ~current_version
      ~waiting_client
      ~max_purgatory_clients
      server_start_options
      informant_init_env
      monitor_config =
    let socket = Socket.init_unix_socket monitor_config.socket_file in
    (* If the client started the server, it opened an FD before forking, so it
     * can be notified when the monitor socket is ready. The FD number was
     * passed in program args. *)
    Option.iter waiting_client (fun fd ->
        let oc = Unix.out_channel_of_descr fd in
        (* NOTE(review): a "try" line appears to be missing here in this
           copy. *)
        Out_channel.output_string oc (ServerMonitorUtils.ready ^ "\n");
        Out_channel.close oc
        with
        | (Sys_error _ | Unix.Unix_error _) as e ->
          Printf.eprintf
            "Caught exception while waking client: %s\n%!"
            (Exn.to_string e));
    (* It is essential that we initiate the Informant before the server if we
     * want to give the opportunity for the Informant to truly take
     * ownership over the lifetime of the server.
     * This is because start_server won't actually start a server if it sees
     * a hg update sentinel file indicating an hg update is in-progress.
     * Starting the informant first ensures that its Watchman watch is started
     * before we check for the hgupdate sentinel file - this is required
     * for the informant to properly observe an update is complete without
     * hitting race conditions. *)
    let informant = Informant.init informant_init_env in
    let server_process =
      maybe_start_first_server server_start_options informant
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    let env =
      (* NOTE(review): an opening "{" appears to be missing here. *)
        informant;
        max_purgatory_clients;
        current_version;
        purgatory_clients = Queue.create ();
        server = server_process;
        server_start_options;
        retries = 0;
        sql_retries = 0;
        watchman_retries = 0;
        ignore_hh_version =
          Informant.should_ignore_hh_version informant_init_env;
        server_progress_warning = None;
        server_progress = "server status is unknown";
      (* NOTE(review): lines (likely "}" and "in") appear to be missing here
         in this copy. *)
    (env, monitor_config, socket)

  (* Entry point: build the monitor, then run its loop (which does not
     return except by exiting the process). *)
  let start_monitoring
      ~current_version
      ~waiting_client
      ~max_purgatory_clients
      server_start_options
      informant_init_env
      monitor_config =
    let (env, monitor_config, socket) =
      start_monitor
        ~current_version
        ~waiting_client
        ~max_purgatory_clients
        server_start_options
        informant_init_env
        monitor_config
    (* NOTE(review): an "in" line appears to be missing here in this copy. *)
    check_and_run_loop env monitor_config socket