hphp/hack/src/monitor/monitorConnection.ml
(*
 * Copyright (c) 2015, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the "hack" directory of this source tree.
 *
 *)

open Hh_prelude
open ServerMonitorUtils

let log s ~tracker =
  Hh_logger.log ("[%s] " ^^ s) (Connection_tracker.log_id tracker)

let server_exists lock_file = not (Lock.check lock_file)

let from_channel_without_buffering ?timeout tic =
  Marshal_tools.from_fd_with_preamble ?timeout (Timeout.descr_of_in_channel tic)

let wait_on_server_restart ic =
  try
    while true do
      let _ = Timeout.input_char ic in
      ()
    done
  with
  | End_of_file
  | Sys_error _ ->
    (* Server has exited and hung up on us *)
    ()

let send_version oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    Build_id.build_revision
  |> ignore;

  (* For backwards-compatibility, newline has always followed the version *)
  let (_ : int) =
    Unix.write (Unix.descr_of_out_channel oc) (Bytes.of_string "\n") 0 1
  in
  ()

let send_server_handoff_rpc ~tracker handoff_options oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.HANDOFF_TO_SERVER (tracker, handoff_options))
  |> ignore

let send_shutdown_rpc ~tracker oc =
  log "send_shutdown" ~tracker;
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.SHUT_DOWN tracker)
  |> ignore

let send_server_progress_rpc ~tracker oc =
  log "send_server_progress_rpc" ~tracker;
  let (_ : int) =
    Marshal_tools.to_fd_with_preamble
      (Unix.descr_of_out_channel oc)
      (MonitorRpc.SERVER_PROGRESS tracker)
  in
  ()

let read_server_progress ~tracker ic : string * string option =
  log "read_server_progress" ~tracker;
  from_channel_without_buffering ic

let establish_connection ~timeout config =
  let sock_name = Socket.get_path config.socket_file in
  let sockaddr =
    if Sys.win32 then (
      let ic = In_channel.create ~binary:true sock_name in
      let port = Option.value_exn (In_channel.input_binary_int ic) in
      In_channel.close ic;
      Unix.(ADDR_INET (inet_addr_loopback, port))
    ) else
      Unix.ADDR_UNIX sock_name
  in
  try Ok (Timeout.open_connection ~timeout sockaddr) with
  | (Unix.Unix_error (Unix.ECONNREFUSED, _, _) as e)
  | (Unix.Unix_error (Unix.ENOENT, _, _) as e) ->
    let e = Exception.wrap e in
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_socket_not_ready e)

let get_cstate
    ~(tracker : Connection_tracker.t)
    (config : ServerMonitorUtils.monitor_config)
    ((ic, oc) : Timeout.in_channel * Out_channel.t) :
    ( Timeout.in_channel
      * Out_channel.t
      * ServerMonitorUtils.connection_state
      * Connection_tracker.t,
      ServerMonitorUtils.connection_error )
    result =
  try
    send_version oc;
    let tracker = Connection_tracker.(track tracker ~key:Client_sent_version) in
    let cstate : connection_state = from_channel_without_buffering ic in
    let tracker = Connection_tracker.(track tracker ~key:Client_got_cstate) in
    Ok (ic, oc, cstate, tracker)
  with e ->
    let e = Exception.wrap e in
    log
      "error getting cstate; closing connection. %s"
      ~tracker
      (Exception.to_string e);
    Timeout.shutdown_connection ic;
    Timeout.close_in_noerr ic;
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_connection_failure e)

let verify_cstate ~tracker ic cstate =
  match cstate with
  | Connection_ok
  | Connection_ok_v2 _ ->
    Ok ()
  | Build_id_mismatch_ex mismatch_info
  | Build_id_mismatch_v3 (mismatch_info, _) ->
    (* The server is out of date and is going to exit. Subsequent calls
     * to connect on the Unix Domain Socket might succeed, connecting to
     * the server that is about to die, and eventually we will be hung
     * up on while trying to read from our end.
     *
     * To avoid that fate, when we know the server is about to exit, we
     * wait for the connection to be closed, signaling that the server
     * has exited and the OS has cleaned up after it, then we try again.
     * A caller-side sketch of that retry follows this function.
     *
     * See also: ServerMonitor.client_out_of_date
     *)
    log "verify_cstate: waiting on server restart" ~tracker;
    wait_on_server_restart ic;
    log "verify_cstate: closing ic" ~tracker;
    Timeout.close_in_noerr ic;
    Error (Build_id_mismatched (Some mismatch_info))
  | Build_id_mismatch ->
    (* The server no longer ever sends this message, as of July 2017 *)
    failwith "Ancient version of server sent old Build_id_mismatch"

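(* A minimal caller-side sketch (not part of this module) of the retry
 * described in the comment above: verify_cstate has already waited for the
 * stale server to hang up, so on Build_id_mismatched the caller simply
 * re-opens the connection from scratch. The 5-second timeout and the retry
 * count are assumptions, as is the helper name itself. *)
let rec _example_verify_with_retry ~tracker ~retries config =
  let open Result.Monad_infix in
  let attempt =
    Timeout.with_timeout
      ~timeout:5
      ~on_timeout:(fun _ -> Error Monitor_establish_connection_timeout)
      ~do_:(fun timeout -> establish_connection ~timeout config)
    >>= fun (ic, oc) ->
    get_cstate ~tracker config (ic, oc) >>= fun (ic, _oc, cstate, tracker) ->
    verify_cstate ~tracker ic cstate
  in
  match attempt with
  | Error (Build_id_mismatched _) when retries > 0 ->
    (* The mismatched server has exited; try the whole handshake again. *)
    _example_verify_with_retry ~tracker ~retries:(retries - 1) config
  | result -> result
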
(* Consume sequence of Prehandoff messages. *)
let rec consume_prehandoff_messages
    ~(timeout : Timeout.t) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel
      * Stdlib.out_channel
      * ServerCommandTypes.server_specific_files,
      ServerMonitorUtils.connection_error )
    result =
  let module PH = Prehandoff in
  let m : PH.msg = from_channel_without_buffering ~timeout ic in
  match m with
  | PH.Sentinel server_specific_files -> Ok (ic, oc, server_specific_files)
  | PH.Server_dormant_connections_limit_reached ->
    Printf.eprintf
    @@ "Connections limit on dormant server reached."
    ^^ " Be patient waiting for a server to be started.";
    Error Server_dormant
  | PH.Server_not_alive_dormant _ ->
    Printf.eprintf
      "Waiting for a server to be started...%s\n%!"
      ClientMessages.waiting_for_server_to_be_started_doc;
    consume_prehandoff_messages ~timeout ic oc
  | PH.Server_died_config_change ->
    Printf.eprintf
      ( "Last server exited due to config change. Please re-run client"
      ^^ " to force discovery of the correct version of the client." );
    Error Server_died
  | PH.Server_died { PH.status; PH.was_oom } ->
    (match (was_oom, status) with
    | (true, _) -> Printf.eprintf "Last server killed by OOM Manager.\n%!"
    | (false, Unix.WEXITED exit_code) ->
      Printf.eprintf "Last server exited with code: %d.\n%!" exit_code
    | (false, Unix.WSIGNALED signal) ->
      Printf.eprintf "Last server killed by signal: %d.\n%!" signal
    | (false, Unix.WSTOPPED signal) ->
      Printf.eprintf "Last server stopped by signal: %d.\n%!" signal);

    (* Monitor will exit now that it has provided a client with a reason
     * for the last server dying. Wait for the Monitor to exit. *)
    wait_on_server_restart ic;
    Error Server_died

let consume_prehandoff_messages
    ~(timeout : int) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel
      * Stdlib.out_channel
      * ServerCommandTypes.server_specific_files,
      ServerMonitorUtils.connection_error )
    result =
  Timeout.with_timeout
    ~timeout
    ~do_:(fun timeout -> consume_prehandoff_messages ~timeout ic oc)
    ~on_timeout:(fun _ ->
      Error ServerMonitorUtils.Server_dormant_out_of_retries)

let connect_to_monitor ~tracker ~timeout config =
  let open Result.Monad_infix in
  Timeout.with_timeout
    ~timeout
    ~on_timeout:(fun timings ->
      (*
       * Monitor should always readily accept connections. In theory, this
       * will only timeout if the Monitor is being very heavily DDOS'd, or
       * the Monitor has wedged itself (a bug).
       *
       * The DDOS occurs when the Monitor's new connections (arriving on
       * the socket) queue grows faster than they are being processed. This
       * can happen in two scenarios:
       * 1) A malicious DDOSer fills up the new connection queue (incoming
       *    connections on the socket) quicker than the queue is being
       *    consumed.
       * 2) New client connections to the monitor are being created by the
       *    retry logic in hh_client faster than those cancelled connections
       *    (cancelled due to the timeout above) are being discarded by the
       *    monitor. This could happen from thousands of hh_clients being
       *    used to parallelize a job; it is effectively an inadvertent DDOS.
       *    In detail, suppose the timeout above is set to 1 second and that
       *    1000 hh_clients have timed out at the line above. Then these
       *    1000 clients will cancel the connection and retry. But the
       *    Monitor's connection queue still has these dead/cancelled
       *    connections waiting to be processed. Suppose it takes the monitor
       *    longer than 1 millisecond to handle and discard a dead connection.
       *    Then the 1000 retrying hh_clients will again add another 1000 dead
       *    connections during retrying even though the monitor has discarded
       *    fewer than 1000 dead connections. Thus, no progress will be made
       *    on clearing out dead connections and all new connection attempts
       *    will time out.
       *
       * We ameliorate this by having the timeout be quite large
       * (many seconds) and by not auto-retrying connections to the Monitor.
       * A caller-side sketch of this no-retry policy follows the function
       * below. *)
      HackEventLogger.client_connect_to_monitor_timeout ();
      let exists_lock_file = server_exists config.lock_file in
      log
        "connect_to_monitor: lockfile=%b timeout=%s"
        ~tracker
        exists_lock_file
        (Timeout.show_timings timings);
      if not exists_lock_file then
        Error (Server_missing_timeout timings)
      else
        Error ServerMonitorUtils.Monitor_establish_connection_timeout)
    ~do_:
      begin
        fun timeout ->
        establish_connection ~timeout config >>= fun (ic, oc) ->
        let tracker =
          Connection_tracker.(track tracker ~key:Client_opened_socket)
        in
        get_cstate ~tracker config (ic, oc)
      end

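(* A minimal caller-side sketch (not part of this module) of the policy in
 * the on_timeout comment above: make a single attempt with a generous
 * timeout and do not auto-retry on Monitor_establish_connection_timeout,
 * since retries only deepen the monitor's connection backlog. The 30-second
 * timeout, the helper name, and the way the connection is closed afterwards
 * are assumptions. *)
let _example_single_attempt ~tracker config =
  match connect_to_monitor ~tracker ~timeout:30 config with
  | Ok (ic, _oc, _cstate, _tracker) ->
    (* Connected; a real caller would keep using this connection. *)
    Timeout.shutdown_connection ic;
    Timeout.close_in_noerr ic;
    true
  | Error Monitor_establish_connection_timeout ->
    (* Give up rather than retry and add to the monitor's queue. *)
    false
  | Error _ -> false
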
let connect_and_shut_down ~tracker config =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout:3 config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  send_shutdown_rpc ~tracker oc;
  Timeout.with_timeout
    ~timeout:3
    ~on_timeout:(fun timings ->
      if not (server_exists config.lock_file) then
        Error (Server_missing_timeout timings)
      else
        Ok ServerMonitorUtils.SHUTDOWN_UNVERIFIED)
    ~do_:
      begin
        fun _ ->
        wait_on_server_restart ic;
        Ok ServerMonitorUtils.SHUTDOWN_VERIFIED
      end

(** connect_once.
    1. OPEN SOCKET. After this point we have a working stdin/stdout to the
       process. Implemented in establish_connection.
       | catch EConnRefused/ENoEnt/Timeout 1s when lockfile present ->
           Error Monitor_socket_not_ready.
           This is unexpected! But it can happen if you manage to catch the
           monitor in the short timeframe after it has grabbed its lock but
           before it has started listening in on its socket.
           -> "hh_client check/ide" -> retry from step 1, up to 800 times.
              The number 800 is hard-coded in 9 places through the codebase.
           -> "hh_client start" -> print "replacing unresponsive server"
              kill_server; start_server; exit.
       | catch Timeout <retries>s when lockfile present ->
           Error Monitor_establish_connection_timeout
           This is unexpected! After all, the monitor is always responsive,
           and indeed start_server waits until responsive before returning.
           But this can happen during a DDOS.
           -> "hh_client check/ide" -> Its retry attempts are passed to the
              monitor connection attempt already. So in this timeout all
              the retries have already been consumed. Just exit.
           -> "hh_client start" -> print "replacing unresponsive server"
              kill_server; start_server; exit.
       | catch EConnRefused/ENoEnt/Timeout when lockfile absent ->
           Error Server_missing.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client check" -> start_server; retry step 1, up to 800x.
           -> "hh_client start" -> start_server; exit.
       | catch other exception -> unhandled.

    2. SEND VERSION; READ VERSION; CHECK VERSIONS. After this point we can
       safely marshal OCaml types back and forth. Implemented in get_cstate
       and verify_cstate.
       | catch any exception when lockfile present ->
           close_connection; Error Monitor_connection_failure.
           This is unexpected!
           -> "hh_client check/ide" -> retry from step 1, up to 800 times.
           -> "hh_client start" -> print "replacing unresponsive server"
              kill_server; start_server; exit.
       | catch any exception when lockfile absent ->
           close_connection; Error Server_missing.
           -> "hh_client ide" -> raise Exit_with IDE_no_server
           -> "hh_client check" -> start_server; retry step 1, up to 800x.
           -> "hh_client start" -> start_server; exit.
       | if version numbers differ ->
           Error Build_mismatch.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client check" -> close_log_tailer; retry from step 1.
           -> "hh_client start" -> start_server; exit.

    3. SEND HANDOFF; READ RESPONSE. After this point we have a working
       connection to a server who we believe is ready to handle our messages.
       Handoff is the stage of the protocol when we're speaking to the
       monitor rather than directly to the server process itself. Implemented
       in send_server_handoff_rpc and consume_prehandoff_message.
       | response Server_name_not_found ->
           raise Exit_with Server_name_not_found.
       | response Server_not_alive_dormant ->
           print "Waiting for server to start"; retry step 5, unlimited times.
       | response Server_dormant_connections_limit_reached ->
           Error Server_dormant.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client start" -> print "Server already exists but is
              dormant"; exit.
           -> "hh_client check" -> print "No server running, and connection
              limit reached for waiting on the next server to be started.
              Please wait patiently." raise Exit_with No_server_running.
       | response Server_died ->
           print "Last killed by OOM / signal / stopped by signal / exited";
           wait for server to close; Error Server_died.
           -> "hh_client ide" -> raise Exit_with IDE_no_server.
           -> "hh_client start" -> start_server.
           -> "hh_client check" -> retry from step 1, up to 800 times.
       | catch any exception -> unhandled.

    The following two steps aren't implemented inside connect_once but are
    typically done by callers after connect_once has succeeded...

    4. READ "HELLO" FROM SERVER. After this point we have evidence that the
       server is ready to handle our messages. We basically gobble whatever
       the server sends until it finally sends a line with just "hello".
       Implemented in wait_for_server_hello.
       | read anything other than "hello" -> retry from step 4, up to 800x.
       | catch Timeout 1s -> retry from step 4, up to 800 times.
       | catch exception EndOfFile/Sys_error ->
           raise ServerHungUp.
           -> "hh_client ide/check" -> program exit, code=No_server_running.
           -> clientStart never actually bothers to do step 4.
       | catch other exception -> unhandled.

    5. SEND CONNECTION TYPE; READ RESPONSE. After this point we have
       evidence that the server is able to handle our connection. The
       connection type indicates Persistent vs Non-persistent.
       | response Denied_due_to_existing_persistent_connection.
           -> "hh_client lsp" -> raise Lsp.Error_server_start.
       | catch any exception -> unhandled.

    A hedged caller-side sketch of steps 1-3 follows connect_once below. *)
let connect_once ~tracker ~timeout config handoff_options =
  let open Result.Monad_infix in
  let t_start = Unix.gettimeofday () in
  let tracker =
    Connection_tracker.(track tracker ~key:Client_start_connect ~time:t_start)
  in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  let tracker =
    Connection_tracker.(track tracker ~key:Client_ready_to_send_handoff)
  in
  send_server_handoff_rpc ~tracker handoff_options oc;
  let elapsed_t = int_of_float (Unix.gettimeofday () -. t_start) in
  let timeout = max (timeout - elapsed_t) 1 in
  consume_prehandoff_messages ~timeout ic oc

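(* A minimal caller-side sketch (not part of this module) of steps 1-3 from
 * the comment above connect_once: the tracker, config, and handoff options
 * are assumed to come from the caller, and the messages printed for each
 * error branch are illustrative only. *)
let _example_connect_once ~tracker config handoff_options =
  match connect_once ~tracker ~timeout:30 config handoff_options with
  | Ok (_ic, _oc, _server_specific_files) ->
    Printf.eprintf "Connected to the server via monitor handoff.\n%!"
  | Error (Server_missing_exn _ | Server_missing_timeout _) ->
    Printf.eprintf "No monitor is running (lock file absent).\n%!"
  | Error (Build_id_mismatched _) ->
    Printf.eprintf "Monitor is a different build; it is restarting.\n%!"
  | Error Server_died -> Printf.eprintf "The last server died; see above.\n%!"
  | Error _ -> Printf.eprintf "Could not connect to the monitor.\n%!"
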
let connect_to_monitor_and_get_server_progress ~tracker ~timeout config :
    (string * string option, ServerMonitorUtils.connection_error) result =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  (* This is similar to connect_once up to this point, where instead of
   * being handed off to server we just get our answer from monitor *)
  send_server_progress_rpc ~tracker oc;
  Ok (read_server_progress ~tracker ic)

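(* A minimal caller-side sketch (not part of this module): poll the monitor
 * for progress and print it. The 3-second timeout, the helper name, and the
 * rendering of the optional second string are assumptions. *)
let _example_show_progress ~tracker config =
  match connect_to_monitor_and_get_server_progress ~tracker ~timeout:3 config with
  | Ok (progress, warning) ->
    Printf.eprintf
      "%s%s\n%!"
      progress
      (Option.value_map warning ~default:"" ~f:(fun w -> " (" ^ w ^ ")"))
  | Error _ -> Printf.eprintf "Could not read progress from the monitor.\n%!"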