hphp/hack/src/monitor/monitorConnection.ml
(*
 * Copyright (c) 2015, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the "hack" directory of this source tree.
 *
 *)

open Hh_prelude
open ServerMonitorUtils

let log s ~tracker =
  Hh_logger.log ("[%s] " ^^ s) (Connection_tracker.log_id tracker)

let server_exists lock_file = not (Lock.check lock_file)

let from_channel_without_buffering ?timeout tic =
  Marshal_tools.from_fd_with_preamble ?timeout (Timeout.descr_of_in_channel tic)

let wait_on_server_restart ic =
  try
    while true do
      let _ = Timeout.input_char ic in
      ()
    done
  with
  | End_of_file
  | Sys_error _ ->
    (* Server has exited and hung up on us *)
    ()

let send_version oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    Build_id.build_revision
  |> ignore;

  (* For backwards-compatibility, newline has always followed the version *)
  let (_ : int) =
    Unix.write (Unix.descr_of_out_channel oc) (Bytes.of_string "\n") 0 1
  in
  ()

let send_server_handoff_rpc ~tracker handoff_options oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.HANDOFF_TO_SERVER (tracker, handoff_options))
  |> ignore

let send_shutdown_rpc ~tracker oc =
  log "send_shutdown" ~tracker;
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.SHUT_DOWN tracker)
  |> ignore

let send_server_progress_rpc ~tracker oc =
  log "send_server_progress_rpc" ~tracker;
  let (_ : int) =
    Marshal_tools.to_fd_with_preamble
      (Unix.descr_of_out_channel oc)
      (MonitorRpc.SERVER_PROGRESS tracker)
  in
  ()

let read_server_progress ~tracker ic : string * string option =
  log "read_server_progress" ~tracker;
  from_channel_without_buffering ic

let establish_connection ~timeout config =
  let sock_name = Socket.get_path config.socket_file in
  let sockaddr =
    if Sys.win32 then (
      let ic = In_channel.create ~binary:true sock_name in
      let port = Option.value_exn (In_channel.input_binary_int ic) in
      In_channel.close ic;
      Unix.(ADDR_INET (inet_addr_loopback, port))
    ) else
      Unix.ADDR_UNIX sock_name
  in
  try Ok (Timeout.open_connection ~timeout sockaddr) with
  | (Unix.Unix_error (Unix.ECONNREFUSED, _, _) as e)
  | (Unix.Unix_error (Unix.ENOENT, _, _) as e) ->
    let e = Exception.wrap e in
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_socket_not_ready e)

let get_cstate
    ~(tracker : Connection_tracker.t)
    (config : ServerMonitorUtils.monitor_config)
    ((ic, oc) : Timeout.in_channel * Out_channel.t) :
    ( Timeout.in_channel
      * Out_channel.t
      * ServerMonitorUtils.connection_state
      * Connection_tracker.t,
      ServerMonitorUtils.connection_error )
    result =
  try
    send_version oc;
    let tracker = Connection_tracker.(track tracker ~key:Client_sent_version) in
    let cstate : connection_state = from_channel_without_buffering ic in
    let tracker = Connection_tracker.(track tracker ~key:Client_got_cstate) in
    Ok (ic, oc, cstate, tracker)
  with e ->
    let e = Exception.wrap e in
    log
      "error getting cstate; closing connection. %s"
      ~tracker
      (Exception.to_string e);
    Timeout.shutdown_connection ic;
    Timeout.close_in_noerr ic;
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_connection_failure e)

let verify_cstate ~tracker ic cstate =
  match cstate with
  | Connection_ok
  | Connection_ok_v2 _ ->
    Ok ()
  | Build_id_mismatch_ex mismatch_info
  | Build_id_mismatch_v3 (mismatch_info, _) ->
    (* The server is out of date and is going to exit. Subsequent calls
     * to connect on the Unix Domain Socket might succeed, connecting to
     * the server that is about to die, and eventually we will be hung
     * up on while trying to read from our end.
     *
     * To avoid that fate, when we know the server is about to exit, we
     * wait for the connection to be closed, signaling that the server
     * has exited and the OS has cleaned up after it, then we try again.
     * A caller-side sketch of that retry follows this function.
     *
     * See also: ServerMonitor.client_out_of_date
     *)
    log "verify_cstate: waiting on server restart" ~tracker;
    wait_on_server_restart ic;
    log "verify_cstate: closing ic" ~tracker;
    Timeout.close_in_noerr ic;
    Error (Build_id_mismatched (Some mismatch_info))
  | Build_id_mismatch ->
    (* The server no longer ever sends this message, as of July 2017 *)
    failwith "Ancient version of server sent old Build_id_mismatch"

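(* A minimal caller-side sketch (not part of this module) of the retry
 * described in the comment above: verify_cstate has already waited for the
 * stale server to hang up, so on Build_id_mismatched the caller simply
 * re-opens the connection from scratch. The 5-second timeout and the retry
 * count are assumptions, as is the helper name itself. *)
let rec _example_verify_with_retry ~tracker ~retries config =
  let open Result.Monad_infix in
  let attempt =
    Timeout.with_timeout
      ~timeout:5
      ~on_timeout:(fun _ -> Error Monitor_establish_connection_timeout)
      ~do_:(fun timeout -> establish_connection ~timeout config)
    >>= fun (ic, oc) ->
    get_cstate ~tracker config (ic, oc) >>= fun (ic, _oc, cstate, tracker) ->
    verify_cstate ~tracker ic cstate
  in
  match attempt with
  | Error (Build_id_mismatched _) when retries > 0 ->
    (* The mismatched server has exited; try the whole handshake again. *)
    _example_verify_with_retry ~tracker ~retries:(retries - 1) config
  | result -> result
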
(* Consume sequence of Prehandoff messages. *)
let rec consume_prehandoff_messages
    ~(timeout : Timeout.t) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel
      * Stdlib.out_channel
      * ServerCommandTypes.server_specific_files,
      ServerMonitorUtils.connection_error )
    result =
  let module PH = Prehandoff in
  let m : PH.msg = from_channel_without_buffering ~timeout ic in
  match m with
  | PH.Sentinel server_specific_files -> Ok (ic, oc, server_specific_files)
  | PH.Server_dormant_connections_limit_reached ->
    Printf.eprintf
    @@ "Connections limit on dormant server reached."
    ^^ " Be patient waiting for a server to be started.";
    Error Server_dormant
  | PH.Server_not_alive_dormant _ ->
    Printf.eprintf
      "Waiting for a server to be started...%s\n%!"
      ClientMessages.waiting_for_server_to_be_started_doc;
    consume_prehandoff_messages ~timeout ic oc
  | PH.Server_died_config_change ->
    Printf.eprintf
      ( "Last server exited due to config change. Please re-run client"
      ^^ " to force discovery of the correct version of the client." );
    Error Server_died
  | PH.Server_died { PH.status; PH.was_oom } ->
    (match (was_oom, status) with
    | (true, _) -> Printf.eprintf "Last server killed by OOM Manager.\n%!"
    | (false, Unix.WEXITED exit_code) ->
      Printf.eprintf "Last server exited with code: %d.\n%!" exit_code
    | (false, Unix.WSIGNALED signal) ->
      Printf.eprintf "Last server killed by signal: %d.\n%!" signal
    | (false, Unix.WSTOPPED signal) ->
      Printf.eprintf "Last server stopped by signal: %d.\n%!" signal);

    (* Monitor will exit now that it has provided a client with a reason
     * for the last server dying. Wait for the Monitor to exit. *)
    wait_on_server_restart ic;
    Error Server_died

let consume_prehandoff_messages
    ~(timeout : int) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel
      * Stdlib.out_channel
      * ServerCommandTypes.server_specific_files,
      ServerMonitorUtils.connection_error )
    result =
  Timeout.with_timeout
    ~timeout
    ~do_:(fun timeout -> consume_prehandoff_messages ~timeout ic oc)
    ~on_timeout:(fun _ ->
      Error ServerMonitorUtils.Server_dormant_out_of_retries)

let connect_to_monitor ~tracker ~timeout config =
  let open Result.Monad_infix in
  Timeout.with_timeout
    ~timeout
    ~on_timeout:(fun timings ->
      (*
       * Monitor should always readily accept connections. In theory, this
       * will only timeout if the Monitor is being very heavily DDOS'd, or
       * the Monitor has wedged itself (a bug).
       *
       * The DDOS occurs when the Monitor's new connections (arriving on
       * the socket) queue grows faster than they are being processed. This
       * can happen in two scenarios:
       * 1) A malicious DDOSer fills up the new connection queue (incoming
       *    connections on the socket) quicker than the queue is being
       *    consumed.
       * 2) New client connections to the monitor are being created by the
       *    retry logic in hh_client faster than those cancelled connections
       *    (cancelled due to the timeout above) are being discarded by the
       *    monitor. This could happen from thousands of hh_clients being
       *    used to parallelize a job; it is effectively an inadvertent DDOS.
       *    In detail, suppose the timeout above is set to 1 second and that
       *    1000 hh_clients have timed out at the line above. Then these
       *    1000 clients will cancel the connection and retry. But the
       *    Monitor's connection queue still has these dead/cancelled
       *    connections waiting to be processed. Suppose it takes the monitor
       *    longer than 1 millisecond to handle and discard a dead connection.
       *    Then the 1000 retrying hh_clients will again add another 1000 dead
       *    connections during retrying even though the monitor has discarded
       *    fewer than 1000 dead connections. Thus, no progress will be made
       *    on clearing out dead connections and all new connection attempts
       *    will time out.
       *
       * We ameliorate this by having the timeout be quite large
       * (many seconds) and by not auto-retrying connections to the Monitor.
       * A caller-side sketch of this no-retry policy follows the function
       * below. *)
      HackEventLogger.client_connect_to_monitor_timeout ();
      let exists_lock_file = server_exists config.lock_file in
      log
        "connect_to_monitor: lockfile=%b timeout=%s"
        ~tracker
        exists_lock_file
        (Timeout.show_timings timings);
      if not exists_lock_file then
        Error (Server_missing_timeout timings)
      else
        Error ServerMonitorUtils.Monitor_establish_connection_timeout)
    ~do_:
      begin
        fun timeout ->
        establish_connection ~timeout config >>= fun (ic, oc) ->
        let tracker =
          Connection_tracker.(track tracker ~key:Client_opened_socket)
        in
        get_cstate ~tracker config (ic, oc)
      end

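(* A minimal caller-side sketch (not part of this module) of the policy in
 * the on_timeout comment above: make a single attempt with a generous
 * timeout and do not auto-retry on Monitor_establish_connection_timeout,
 * since retries only deepen the monitor's connection backlog. The 30-second
 * timeout, the helper name, and the way the connection is closed afterwards
 * are assumptions. *)
let _example_single_attempt ~tracker config =
  match connect_to_monitor ~tracker ~timeout:30 config with
  | Ok (ic, _oc, _cstate, _tracker) ->
    (* Connected; a real caller would keep using this connection. *)
    Timeout.shutdown_connection ic;
    Timeout.close_in_noerr ic;
    true
  | Error Monitor_establish_connection_timeout ->
    (* Give up rather than retry and add to the monitor's queue. *)
    false
  | Error _ -> false
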
let connect_and_shut_down ~tracker config =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout:3 config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  send_shutdown_rpc ~tracker oc;
  Timeout.with_timeout
    ~timeout:3
    ~on_timeout:(fun timings ->
      if not (server_exists config.lock_file) then
        Error (Server_missing_timeout timings)
      else
        Ok ServerMonitorUtils.SHUTDOWN_UNVERIFIED)
    ~do_:
      begin
        fun _ ->
        wait_on_server_restart ic;
        Ok ServerMonitorUtils.SHUTDOWN_VERIFIED
      end

(** connect_once.
    1. OPEN SOCKET. After this point we have a working stdin/stdout to the
       process. Implemented in establish_connection.
       | catch EConnRefused/ENoEnt/Timeout 1s when lockfile present ->
           Error Monitor_socket_not_ready.
           This is unexpected! But it can happen if you manage to catch the
           monitor in the short timeframe after it has grabbed its lock but
           before it has started listening in on its socket.
           -> "hh_client check/ide" -> retry from step 1, up to 800 times.
              The number 800 is hard-coded in 9 places through the codebase.
           -> "hh_client start" -> print "replacing unresponsive server"
              kill_server; start_server; exit.
       | catch Timeout <retries>s when lockfile present ->
           Error Monitor_establish_connection_timeout
           This is unexpected! After all, the monitor is always responsive,
           and indeed start_server waits until responsive before returning.
           But this can happen during a DDOS.
           -> "hh_client check/ide" -> Its retry attempts are passed to the
              monitor connection attempt already. So in this timeout all
              the retries have already been consumed. Just exit.
           -> "hh_client start" -> print "replacing unresponsive server"
              kill_server; start_server; exit.
       | catch EConnRefused/ENoEnt/Timeout when lockfile absent ->
           Error Server_missing.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client check" -> start_server; retry step 1, up to 800x.
           -> "hh_client start" -> start_server; exit.
       | catch other exception -> unhandled.

    2. SEND VERSION; READ VERSION; CHECK VERSIONS. After this point we can
       safely marshal OCaml types back and forth. Implemented in get_cstate
       and verify_cstate.
       | catch any exception when lockfile present ->
           close_connection; Error Monitor_connection_failure.
           This is unexpected!
           -> "hh_client check/ide" -> retry from step 1, up to 800 times.
           -> "hh_client start" -> print "replacing unresponsive server"
              kill_server; start_server; exit.
       | catch any exception when lockfile absent ->
           close_connection; Error Server_missing.
           -> "hh_client ide" -> raise Exit_with IDE_no_server
           -> "hh_client check" -> start_server; retry step 1, up to 800x.
           -> "hh_client start" -> start_server; exit.
       | if version numbers differ ->
           Error Build_mismatch.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client check" -> close_log_tailer; retry from step 1.
           -> "hh_client start" -> start_server; exit.

    3. SEND HANDOFF; READ RESPONSE. After this point we have a working
       connection to a server who we believe is ready to handle our messages.
       Handoff is the stage of the protocol when we're speaking to the
       monitor rather than directly to the server process itself. Implemented
       in send_server_handoff_rpc and consume_prehandoff_message.
       | response Server_name_not_found ->
           raise Exit_with Server_name_not_found.
       | response Server_not_alive_dormant ->
           print "Waiting for server to start"; retry step 5, unlimited times.
       | response Server_dormant_connections_limit_reached ->
           Error Server_dormant.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client start" -> print "Server already exists but is
              dormant"; exit.
           -> "hh_client check" -> print "No server running, and connection
              limit reached for waiting on the next server to be started.
              Please wait patiently." raise Exit_with No_server_running.
       | response Server_died ->
           print "Last killed by OOM / signal / stopped by signal / exited";
           wait for server to close; Error Server_died.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client start" -> start_server.
           -> "hh_client check" -> retry from step 1, up to 800 times.
       | catch any exception -> unhandled.

    The following two steps aren't implemented inside connect_once but are
    typically done by callers after connect_once has succeeded...

    4. READ "HELLO" FROM SERVER. After this point we have evidence that the
       server is ready to handle our messages. We basically gobble whatever
       the server sends until it finally sends a line with just "hello".
       Implemented in wait_for_server_hello.
       | read anything other than "hello" -> retry from step 4, up to 800x.
       | catch Timeout 1s -> retry from step 4, up to 800 times.
       | catch exception EndOfFile/Sys_error ->
           raise ServerHungUp.
           -> "hh_client ide/check" -> program exit, code=No_server_running.
           -> clientStart never actually bothers to do step 4.
       | catch other exception -> unhandled.

    5. SEND CONNECTION TYPE; READ RESPONSE. After this point we have
       evidence that the server is able to handle our connection. The
       connection type indicates Persistent vs Non-persistent.
       | response Denied_due_to_existing_persistent_connection.
           -> "hh_client lsp" -> raise Lsp.Error_server_start.
       | catch any exception -> unhandled.

    A hedged caller-side sketch of steps 1-3 follows connect_once below. *)
let connect_once ~tracker ~timeout config handoff_options =
  let open Result.Monad_infix in
  let t_start = Unix.gettimeofday () in
  let tracker =
    Connection_tracker.(track tracker ~key:Client_start_connect ~time:t_start)
  in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  let tracker =
    Connection_tracker.(track tracker ~key:Client_ready_to_send_handoff)
  in
  send_server_handoff_rpc ~tracker handoff_options oc;
  let elapsed_t = int_of_float (Unix.gettimeofday () -. t_start) in
  let timeout = max (timeout - elapsed_t) 1 in
  consume_prehandoff_messages ~timeout ic oc

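(* A minimal caller-side sketch (not part of this module) of steps 1-3 from
 * the comment above connect_once: the tracker, config, and handoff options
 * are assumed to come from the caller, and the messages printed for each
 * error branch are illustrative only. *)
let _example_connect_once ~tracker config handoff_options =
  match connect_once ~tracker ~timeout:30 config handoff_options with
  | Ok (_ic, _oc, _server_specific_files) ->
    Printf.eprintf "Connected to the server via monitor handoff.\n%!"
  | Error (Server_missing_exn _ | Server_missing_timeout _) ->
    Printf.eprintf "No monitor is running (lock file absent).\n%!"
  | Error (Build_id_mismatched _) ->
    Printf.eprintf "Monitor is a different build; it is restarting.\n%!"
  | Error Server_died -> Printf.eprintf "The last server died; see above.\n%!"
  | Error _ -> Printf.eprintf "Could not connect to the monitor.\n%!"
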
let connect_to_monitor_and_get_server_progress ~tracker ~timeout config :
    (string * string option, ServerMonitorUtils.connection_error) result =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  (* This is similar to connect_once up to this point, where instead of
   * being handed off to server we just get our answer from monitor *)
  send_server_progress_rpc ~tracker oc;
  Ok (read_server_progress ~tracker ic)

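(* A minimal caller-side sketch (not part of this module): poll the monitor
 * for progress and print it. The 3-second timeout, the helper name, and the
 * rendering of the optional second string are assumptions. *)
let _example_show_progress ~tracker config =
  match connect_to_monitor_and_get_server_progress ~tracker ~timeout:3 config with
  | Ok (progress, warning) ->
    Printf.eprintf
      "%s%s\n%!"
      progress
      (Option.value_map warning ~default:"" ~f:(fun w -> " (" ^ w ^ ")"))
  | Error _ -> Printf.eprintf "Could not read progress from the monitor.\n%!"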