hphp/hack/src/monitor/monitorConnection.ml
(*
 * Copyright (c) 2015, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the "hack" directory of this source tree.
 *
 *)

open Hh_prelude
open ServerMonitorUtils

let log s ~tracker =
  Hh_logger.log ("[%s] " ^^ s) (Connection_tracker.log_id tracker)

let server_exists lock_file = not (Lock.check lock_file)

let from_channel_without_buffering ?timeout tic =
  Marshal_tools.from_fd_with_preamble ?timeout (Timeout.descr_of_in_channel tic)

let wait_on_server_restart ic =
  try
    while true do
      let _ = Timeout.input_char ic in
      ()
    done
  with
  | End_of_file
  | Sys_error _ ->
    (* Server has exited and hung up on us *)
    ()

let send_version oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    Build_id.build_revision
  |> ignore;

  (* For backwards-compatibility, newline has always followed the version *)
  let (_ : int) =
    Unix.write (Unix.descr_of_out_channel oc) (Bytes.of_string "\n") 0 1
  in
  ()

let send_server_handoff_rpc ~tracker handoff_options oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.HANDOFF_TO_SERVER (tracker, handoff_options))
  |> ignore

let send_shutdown_rpc ~tracker oc =
  log "send_shutdown" ~tracker;
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.SHUT_DOWN tracker)
  |> ignore

let send_server_progress_rpc ~tracker oc =
  log "send_server_progress_rpc" ~tracker;
  let (_ : int) =
    Marshal_tools.to_fd_with_preamble
      (Unix.descr_of_out_channel oc)
      (MonitorRpc.SERVER_PROGRESS tracker)
  in
  ()

let read_server_progress ~tracker ic : string * string option =
  log "read_server_progress" ~tracker;
  from_channel_without_buffering ic

let establish_connection ~timeout config =
  let sock_name = Socket.get_path config.socket_file in
  let sockaddr =
    if Sys.win32 then (
      let ic = In_channel.create ~binary:true sock_name in
      let port = Option.value_exn (In_channel.input_binary_int ic) in
      In_channel.close ic;
      Unix.(ADDR_INET (inet_addr_loopback, port))
    ) else
      Unix.ADDR_UNIX sock_name
  in
  try Ok (Timeout.open_connection ~timeout sockaddr) with
  | (Unix.Unix_error (Unix.ECONNREFUSED, _, _) as e)
  | (Unix.Unix_error (Unix.ENOENT, _, _) as e) ->
    let e = Exception.wrap e in
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_socket_not_ready e)

let get_cstate
    ~(tracker : Connection_tracker.t)
    (config : ServerMonitorUtils.monitor_config)
    ((ic, oc) : Timeout.in_channel * Out_channel.t) :
    ( Timeout.in_channel
      * Out_channel.t
      * ServerMonitorUtils.connection_state
      * Connection_tracker.t,
      ServerMonitorUtils.connection_error )
    result =
  try
    send_version oc;
    let tracker = Connection_tracker.(track tracker ~key:Client_sent_version) in
    let cstate : connection_state = from_channel_without_buffering ic in
    let tracker = Connection_tracker.(track tracker ~key:Client_got_cstate) in
    Ok (ic, oc, cstate, tracker)
  with e ->
    let e = Exception.wrap e in
    log
      "error getting cstate; closing connection. %s"
      ~tracker
      (Exception.to_string e);
    Timeout.shutdown_connection ic;
    Timeout.close_in_noerr ic;
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_connection_failure e)

let verify_cstate ~tracker ic cstate =
  match cstate with
  | Connection_ok -> Ok ()
  | Build_id_mismatch_ex mismatch_info ->
    (* The server is out of date and is going to exit. Subsequent calls
     * to connect on the Unix Domain Socket might succeed, connecting to
     * the server that is about to die, and eventually we will be hung
     * up on while trying to read from our end.
     *
     * To avoid that fate, when we know the server is about to exit, we
     * wait for the connection to be closed, signaling that the server
     * has exited and the OS has cleaned up after it, then we try again.
     * (A sketch of this wait-then-retry pattern follows this function.)
     *
     * See also: ServerMonitor.client_out_of_date
     *)
    log "verify_cstate: waiting on server restart" ~tracker;
    wait_on_server_restart ic;
    log "verify_cstate: closing ic" ~tracker;
    Timeout.close_in_noerr ic;
    Error (Build_id_mismatched (Some mismatch_info))
  | Build_id_mismatch ->
    (* The server no longer ever sends this message, as of July 2017 *)
    failwith "Ancient version of server sent old Build_id_mismatch"
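
(* A minimal sketch of the wait-then-retry pattern described above, written as
 * hypothetical caller code (not part of this module): [connect] stands for any
 * function returning the same [connection_error] result, and the retry budget
 * of 3 is an arbitrary illustrative value.
 *
 *   let rec connect_with_retries n =
 *     match connect () with
 *     | Error (Build_id_mismatched _) when n > 0 ->
 *       (* the stale server has hung up by now; the next attempt should reach
 *        * a freshly started monitor and server *)
 *       connect_with_retries (n - 1)
 *     | result -> result
 *   in
 *   connect_with_retries 3
 *)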

(* Consume sequence of Prehandoff messages. *)
let rec consume_prehandoff_messages
    ~(timeout : Timeout.t) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel * Stdlib.out_channel * string,
      ServerMonitorUtils.connection_error )
    result =
  let module PH = Prehandoff in
  let m : PH.msg = from_channel_without_buffering ~timeout ic in
  match m with
  | PH.Sentinel finale_file -> Ok (ic, oc, finale_file)
  | PH.Server_dormant_connections_limit_reached ->
    Printf.eprintf
    @@ "Connections limit on dormant server reached."
    ^^ " Be patient waiting for a server to be started.";
    Error Server_dormant
  | PH.Server_not_alive_dormant _ ->
    Printf.eprintf
      "Waiting for a server to be started...%s\n%!"
      ClientMessages.waiting_for_server_to_be_started_doc;
    consume_prehandoff_messages ~timeout ic oc
  | PH.Server_died_config_change ->
    Printf.eprintf
      ( "Last server exited due to config change. Please re-run client"
      ^^ " to force discovery of the correct version of the client." );
    Error Server_died
  | PH.Server_died { PH.status; PH.was_oom } ->
    (match (was_oom, status) with
    | (true, _) -> Printf.eprintf "Last server killed by OOM Manager.\n%!"
    | (false, Unix.WEXITED exit_code) ->
      Printf.eprintf "Last server exited with code: %d.\n%!" exit_code
    | (false, Unix.WSIGNALED signal) ->
      Printf.eprintf "Last server killed by signal: %d.\n%!" signal
    | (false, Unix.WSTOPPED signal) ->
      Printf.eprintf "Last server stopped by signal: %d.\n%!" signal);

    (* Monitor will exit now that it has provided a client with a reason
     * for the last server dying. Wait for the Monitor to exit. *)
    wait_on_server_restart ic;
    Error Server_died

let consume_prehandoff_messages
    ~(timeout : int) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel * Stdlib.out_channel * string,
      ServerMonitorUtils.connection_error )
    result =
  Timeout.with_timeout
    ~timeout
    ~do_:(fun timeout -> consume_prehandoff_messages ~timeout ic oc)
    ~on_timeout:(fun _ ->
      Error ServerMonitorUtils.Server_dormant_out_of_retries)

let connect_to_monitor ~tracker ~timeout config =
  let open Result.Monad_infix in
  Timeout.with_timeout
    ~timeout
    ~on_timeout:(fun timings ->
      (*
       * Monitor should always readily accept connections. In theory, this will
       * only timeout if the Monitor is being very heavily DDOS'd, or the
       * Monitor has wedged itself (a bug).
       *
       * The DDOS occurs when the Monitor's queue of new connections (arriving
       * on the socket) grows faster than it is being processed. This can
       * happen in two scenarios:
       * 1) A malicious DDOSer fills up the new connection queue (incoming
       *    connections on the socket) quicker than the queue is being
       *    consumed.
       * 2) New client connections to the monitor are being created by the
       *    retry logic in hh_client faster than those cancelled connections
       *    (cancelled due to the timeout above) are being discarded by the
       *    monitor. This could happen from thousands of hh_clients being
       *    used to parallelize a job; it is effectively an inadvertent DDOS.
       *    In detail, suppose the timeout above is set to 1 second and that
       *    1000 hh_clients have timed out at the line above. Then these
       *    1000 clients will cancel the connection and retry. But the
       *    Monitor's connection queue still has these dead/cancelled
       *    connections waiting to be processed. Suppose it takes the monitor
       *    longer than 1 millisecond to handle and discard a dead connection.
       *    Then the 1000 retrying hh_clients will again add another 1000 dead
       *    connections while retrying, even though the monitor has discarded
       *    fewer than 1000 dead connections. Thus, no progress will be made
       *    on clearing out dead connections, and all new connection attempts
       *    will time out.
       *
       * We ameliorate this by having the timeout be quite large
       * (many seconds) and by not auto-retrying connections to the Monitor.
       * (A sketch of this no-auto-retry policy follows this function.)
       *)
      HackEventLogger.client_connect_to_monitor_timeout ();
      let exists_lock_file = server_exists config.lock_file in
      log
        "connect_to_monitor: lockfile=%b timeout=%s"
        ~tracker
        exists_lock_file
        (Timeout.show_timings timings);
      if not exists_lock_file then
        Error (Server_missing_timeout timings)
      else
        Error ServerMonitorUtils.Monitor_establish_connection_timeout)
    ~do_:
      begin
        fun timeout ->
        establish_connection ~timeout config >>= fun (ic, oc) ->
        let tracker =
          Connection_tracker.(track tracker ~key:Client_opened_socket)
        in
        get_cstate ~tracker config (ic, oc)
      end
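
(* A minimal sketch of the no-auto-retry policy described in the comment above,
 * written as hypothetical caller code (not part of this module): [config] and
 * [tracker] are assumed to be in scope, and the generous 30-second timeout is
 * an arbitrary illustrative value.
 *
 *   match connect_to_monitor ~tracker ~timeout:30 config with
 *   | Ok (_ic, _oc, _cstate, _tracker) -> Printf.printf "connected\n%!"
 *   | Error Monitor_establish_connection_timeout ->
 *     (* do NOT retry in a tight loop; that is exactly the inadvertent DDOS
 *      * scenario described above *)
 *     Printf.eprintf "monitor connection timed out\n%!"
 *   | Error _ -> Printf.eprintf "could not connect to the monitor\n%!"
 *)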

let connect_and_shut_down ~tracker config =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout:3 config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  send_shutdown_rpc ~tracker oc;
  Timeout.with_timeout
    ~timeout:3
    ~on_timeout:(fun timings ->
      if not (server_exists config.lock_file) then
        Error (Server_missing_timeout timings)
      else
        Ok ServerMonitorUtils.SHUTDOWN_UNVERIFIED)
    ~do_:
      begin
        fun _ ->
        wait_on_server_restart ic;
        Ok ServerMonitorUtils.SHUTDOWN_VERIFIED
      end

(** connect_once.
    1. OPEN SOCKET. After this point we have a working stdin/stdout to the
    process. Implemented in establish_connection.
      | catch EConnRefused/ENoEnt/Timeout 1s when lockfile present ->
        Error Monitor_socket_not_ready.
        This is unexpected! But it can happen if you manage to catch the
        monitor in the short timeframe after it has grabbed its lock but
        before it has started listening in on its socket.
        -> "hh_client check/ide" -> retry from step 1, up to 800 times.
           The number 800 is hard-coded in 9 places through the codebase.
        -> "hh_client start" -> print "replacing unresponsive server"
           kill_server; start_server; exit.
      | catch Timeout <retries>s when lockfile present ->
        Error Monitor_establish_connection_timeout
        This is unexpected! After all, the monitor is always responsive,
        and indeed start_server waits until responsive before returning.
        But this can happen during a DDOS.
        -> "hh_client check/ide" -> Its retry attempts are passed to the
           monitor connection attempt already. So in this timeout all
           the retries have already been consumed. Just exit.
        -> "hh_client start" -> print "replacing unresponsive server"
           kill_server; start_server; exit.
      | catch EConnRefused/ENoEnt/Timeout when lockfile absent ->
        Error Server_missing.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client check" -> start_server; retry step 1, up to 800x.
        -> "hh_client start" -> start_server; exit.
      | catch other exception -> unhandled.

    2. SEND VERSION; READ VERSION; CHECK VERSIONS. After this point we can
    safely marshal OCaml types back and forth. Implemented in get_cstate
    and verify_cstate.
      | catch any exception when lockfile present ->
        close_connection; Error Monitor_connection_failure.
        This is unexpected!
        -> "hh_client check/ide" -> retry from step 1, up to 800 times.
        -> "hh_client start" -> print "replacing unresponsive server"
           kill_server; start_server; exit.
      | catch any exception when lockfile absent ->
        close_connection; Error Server_missing.
        -> "hh_client ide" -> raise Exit_with IDE_no_server
        -> "hh_client check" -> start_server; retry step 1, up to 800x.
        -> "hh_client start" -> start_server; exit.
      | if version numbers differ ->
        Error Build_mismatch.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client check" -> close_log_tailer; retry from step 1.
        -> "hh_client start" -> start_server; exit.

    3. SEND HANDOFF; READ RESPONSE. After this point we have a working
    connection to a server who we believe is ready to handle our messages.
    Handoff is the stage of the protocol when we're speaking to the monitor
    rather than directly to the server process itself. Implemented in
    send_server_handoff_rpc and consume_prehandoff_message.
      | response Server_name_not_found ->
        raise Exit_with Server_name_not_found.
      | response Server_not_alive_dormant ->
        print "Waiting for server to start"; retry step 5, unlimited times.
      | response Server_dormant_connections_limit_reached ->
        Error Server_dormant.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client start" -> print "Server already exists but is
           dormant"; exit.
        -> "hh_client check" -> print "No server running, and connection
           limit reached for waiting on the next server to be started.
           Please wait patiently." raise Exit_with No_server_running.
      | response Server_died ->
        print "Last killed by OOM / signal / stopped by signal / exited";
        wait for server to close; Error Server_died.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client start" -> start_server.
        -> "hh_client check" -> retry from step 1, up to 800 times.
      | catch any exception -> unhandled.

    The following two steps aren't implemented inside connect_once but are
    typically done by callers after connect_once has succeeded...

    4. READ "HELLO" FROM SERVER. After this point we have evidence that the
    server is ready to handle our messages. We basically gobble whatever
    the server sends until it finally sends a line with just "hello".
    Implemented in wait_for_server_hello.
      | read anything other than "hello" -> retry from step 4, up to 800x.
      | catch Timeout 1s -> retry from step 4, up to 800 times.
      | catch exception EndOfFile/Sys_error ->
        raise ServerHungUp.
        -> "hh_client ide/check" -> program exit, code=No_server_running.
        -> clientStart never actually bothers to do step 4.
      | catch other exception -> unhandled.

    5. SEND CONNECTION TYPE; READ RESPONSE. After this point we have
    evidence that the server is able to handle our connection. The
    connection type indicates Persistent vs Non-persistent.
      | response Denied_due_to_existing_persistent_connection.
        -> "hh_client lsp" -> raise Lsp.Error_server_start.
      | catch any exception -> unhandled.

    (A usage sketch of connect_once follows the function below.) *)
let connect_once ~tracker ~timeout config handoff_options =
  let open Result.Monad_infix in
  let t_start = Unix.gettimeofday () in
  let tracker =
    Connection_tracker.(track tracker ~key:Client_start_connect ~time:t_start)
  in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  let tracker =
    Connection_tracker.(track tracker ~key:Client_ready_to_send_handoff)
  in
  send_server_handoff_rpc ~tracker handoff_options oc;
  let elapsed_t = int_of_float (Unix.gettimeofday () -. t_start) in
  let timeout = max (timeout - elapsed_t) 1 in
  consume_prehandoff_messages ~timeout ic oc
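
(* A minimal usage sketch of connect_once, written as hypothetical caller code
 * (not part of this module): [config] is a monitor_config, [tracker] a
 * Connection_tracker.t, [handoff_options] a MonitorRpc.handoff_options, and
 * the 30-second timeout is an arbitrary illustrative value. A real caller
 * would still perform steps 4 and 5 of the protocol described above.
 *
 *   match connect_once ~tracker ~timeout:30 config handoff_options with
 *   | Ok (_ic, _oc, finale_file) ->
 *     Printf.printf "handed off to server; finale file: %s\n%!" finale_file
 *   | Error (Server_missing_exn _ | Server_missing_timeout _) ->
 *     Printf.eprintf "no monitor is running for this root\n%!"
 *   | Error _ -> Printf.eprintf "could not connect to the server\n%!"
 *)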

let connect_to_monitor_and_get_server_progress ~tracker ~timeout config :
    (string * string option, ServerMonitorUtils.connection_error) result =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  (* This is similar to connect_once up to this point, except that instead of
   * being handed off to the server we just get our answer from the monitor.
   * (A usage sketch follows this function.) *)
  send_server_progress_rpc ~tracker oc;
  Ok (read_server_progress ~tracker ic)
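
(* A minimal usage sketch of the progress query above, written as hypothetical
 * caller code (not part of this module): [config] and [tracker] are assumed to
 * be in scope, the 3-second timeout is an arbitrary illustrative value, and
 * treating the second component of the pair as an optional warning message is
 * an assumption made for illustration.
 *
 *   match connect_to_monitor_and_get_server_progress ~tracker ~timeout:3 config with
 *   | Ok (progress, warning) ->
 *     Printf.printf
 *       "progress: %s%s\n%!"
 *       progress
 *       (match warning with
 *       | None -> ""
 *       | Some w -> " (warning: " ^ w ^ ")")
 *   | Error _ -> Printf.eprintf "could not reach the monitor\n%!"
 *)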