hphp/hack/src/monitor/monitorConnection.ml
(*
 * Copyright (c) 2015, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the "hack" directory of this source tree.
 *
 *)

open Hh_prelude
open ServerMonitorUtils

let log s ~tracker =
  Hh_logger.log ("[%s] " ^^ s) (Connection_tracker.log_id tracker)

let server_exists lock_file = not (Lock.check lock_file)

let from_channel_without_buffering ?timeout tic =
  Marshal_tools.from_fd_with_preamble ?timeout (Timeout.descr_of_in_channel tic)

let wait_on_server_restart ic =
  try
    while true do
      let _ = Timeout.input_char ic in
      ()
    done
  with
  | End_of_file
  | Sys_error _ ->
    (* Server has exited and hung up on us *)
    ()

let send_version oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    Build_id.build_revision
  |> ignore;

  (* For backwards-compatibility, newline has always followed the version *)
  let (_ : int) =
    Unix.write (Unix.descr_of_out_channel oc) (Bytes.of_string "\n") 0 1
  in
  ()

let send_server_handoff_rpc ~tracker handoff_options oc =
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.HANDOFF_TO_SERVER (tracker, handoff_options))
  |> ignore

let send_shutdown_rpc ~tracker oc =
  log "send_shutdown" ~tracker;
  Marshal_tools.to_fd_with_preamble
    (Unix.descr_of_out_channel oc)
    (MonitorRpc.SHUT_DOWN tracker)
  |> ignore

let send_server_progress_rpc ~tracker oc =
  log "send_server_progress_rpc" ~tracker;
  let (_ : int) =
    Marshal_tools.to_fd_with_preamble
      (Unix.descr_of_out_channel oc)
      (MonitorRpc.SERVER_PROGRESS tracker)
  in
  ()

let read_server_progress ~tracker ic : string * string option =
  log "read_server_progress" ~tracker;
  from_channel_without_buffering ic

let establish_connection ~timeout config =
  let sock_name = Socket.get_path config.socket_file in
  let sockaddr =
    if Sys.win32 then (
      let ic = In_channel.create ~binary:true sock_name in
      let port = Option.value_exn (In_channel.input_binary_int ic) in
      In_channel.close ic;
      Unix.(ADDR_INET (inet_addr_loopback, port))
    ) else
      Unix.ADDR_UNIX sock_name
  in
  try Ok (Timeout.open_connection ~timeout sockaddr) with
  | (Unix.Unix_error (Unix.ECONNREFUSED, _, _) as e)
  | (Unix.Unix_error (Unix.ENOENT, _, _) as e) ->
    let e = Exception.wrap e in
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_socket_not_ready e)

let get_cstate
    ~(tracker : Connection_tracker.t)
    (config : ServerMonitorUtils.monitor_config)
    ((ic, oc) : Timeout.in_channel * Out_channel.t) :
    ( Timeout.in_channel
      * Out_channel.t
      * ServerMonitorUtils.connection_state
      * Connection_tracker.t,
      ServerMonitorUtils.connection_error )
    result =
  try
    send_version oc;
    let tracker = Connection_tracker.(track tracker ~key:Client_sent_version) in
    let cstate : connection_state = from_channel_without_buffering ic in
    let tracker = Connection_tracker.(track tracker ~key:Client_got_cstate) in
    Ok (ic, oc, cstate, tracker)
  with e ->
    let e = Exception.wrap e in
    log
      "error getting cstate; closing connection. %s"
      ~tracker
      (Exception.to_string e);
    Timeout.shutdown_connection ic;
    Timeout.close_in_noerr ic;
    if not (server_exists config.lock_file) then
      Error (Server_missing_exn e)
    else
      Error (Monitor_connection_failure e)

let verify_cstate ~tracker ic cstate =
  match cstate with
  | Connection_ok -> Ok ()
  | Build_id_mismatch_ex mismatch_info ->
    (* The server is out of date and is going to exit. Subsequent calls
     * to connect on the Unix Domain Socket might succeed, connecting to
     * the server that is about to die, and eventually we will be hung
     * up on while trying to read from our end.
     *
     * To avoid that fate, when we know the server is about to exit, we
     * wait for the connection to be closed, signaling that the server
     * has exited and the OS has cleaned up after it, then we try again.
     * (A sketch of this wait-then-retry pattern follows this function.)
     *
     * See also: ServerMonitor.client_out_of_date
     *)
    log "verify_cstate: waiting on server restart" ~tracker;
    wait_on_server_restart ic;
    log "verify_cstate: closing ic" ~tracker;
    Timeout.close_in_noerr ic;
    Error (Build_id_mismatched (Some mismatch_info))
  | Build_id_mismatch ->
    (* The server no longer ever sends this message, as of July 2017 *)
    failwith "Ancient version of server sent old Build_id_mismatch"
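
(* A minimal sketch of the wait-then-retry pattern described above, written as
 * hypothetical caller code (not part of this module): [connect] stands for any
 * function returning the same [connection_error] result, and the retry budget
 * of 3 is an arbitrary illustrative value.
 *
 *   let rec connect_with_retries n =
 *     match connect () with
 *     | Error (Build_id_mismatched _) when n > 0 ->
 *       (* the stale server has hung up by now; the next attempt should reach
 *        * a freshly started monitor and server *)
 *       connect_with_retries (n - 1)
 *     | result -> result
 *   in
 *   connect_with_retries 3
 *)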

(* Consume sequence of Prehandoff messages. *)
let rec consume_prehandoff_messages
    ~(timeout : Timeout.t) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel * Stdlib.out_channel * string,
      ServerMonitorUtils.connection_error )
    result =
  let module PH = Prehandoff in
  let m : PH.msg = from_channel_without_buffering ~timeout ic in
  match m with
  | PH.Sentinel finale_file -> Ok (ic, oc, finale_file)
  | PH.Server_dormant_connections_limit_reached ->
    Printf.eprintf
    @@ "Connections limit on dormant server reached."
    ^^ " Be patient waiting for a server to be started.";
    Error Server_dormant
  | PH.Server_not_alive_dormant _ ->
    Printf.eprintf
      "Waiting for a server to be started...%s\n%!"
      ClientMessages.waiting_for_server_to_be_started_doc;
    consume_prehandoff_messages ~timeout ic oc
  | PH.Server_died_config_change ->
    Printf.eprintf
      ( "Last server exited due to config change. Please re-run client"
      ^^ " to force discovery of the correct version of the client." );
    Error Server_died
  | PH.Server_died { PH.status; PH.was_oom } ->
    (match (was_oom, status) with
    | (true, _) -> Printf.eprintf "Last server killed by OOM Manager.\n%!"
    | (false, Unix.WEXITED exit_code) ->
      Printf.eprintf "Last server exited with code: %d.\n%!" exit_code
    | (false, Unix.WSIGNALED signal) ->
      Printf.eprintf "Last server killed by signal: %d.\n%!" signal
    | (false, Unix.WSTOPPED signal) ->
      Printf.eprintf "Last server stopped by signal: %d.\n%!" signal);

    (* Monitor will exit now that it has provided a client with a reason
     * for the last server dying. Wait for the Monitor to exit. *)
    wait_on_server_restart ic;
    Error Server_died

let consume_prehandoff_messages
    ~(timeout : int) (ic : Timeout.in_channel) (oc : Stdlib.out_channel) :
    ( Timeout.in_channel * Stdlib.out_channel * string,
      ServerMonitorUtils.connection_error )
    result =
  Timeout.with_timeout
    ~timeout
    ~do_:(fun timeout -> consume_prehandoff_messages ~timeout ic oc)
    ~on_timeout:(fun _ ->
      Error ServerMonitorUtils.Server_dormant_out_of_retries)

let connect_to_monitor ~tracker ~timeout config =
  let open Result.Monad_infix in
  Timeout.with_timeout
    ~timeout
    ~on_timeout:(fun timings ->
      (*
       * Monitor should always readily accept connections. In theory, this will
       * only timeout if the Monitor is being very heavily DDOS'd, or the
       * Monitor has wedged itself (a bug).
       *
       * The DDOS occurs when the Monitor's queue of new connections (arriving
       * on the socket) grows faster than it is being processed. This can
       * happen in two scenarios:
       * 1) A malicious DDOSer fills up the new connection queue (incoming
       *    connections on the socket) quicker than the queue is being
       *    consumed.
       * 2) New client connections to the monitor are being created by the
       *    retry logic in hh_client faster than those cancelled connections
       *    (cancelled due to the timeout above) are being discarded by the
       *    monitor. This could happen from thousands of hh_clients being
       *    used to parallelize a job; it is effectively an inadvertent DDOS.
       *    In detail, suppose the timeout above is set to 1 second and that
       *    1000 hh_clients have timed out at the line above. Then these
       *    1000 clients will cancel the connection and retry. But the
       *    Monitor's connection queue still has these dead/cancelled
       *    connections waiting to be processed. Suppose it takes the monitor
       *    longer than 1 millisecond to handle and discard a dead connection.
       *    Then the 1000 retrying hh_clients will again add another 1000 dead
       *    connections while retrying, even though the monitor has discarded
       *    fewer than 1000 dead connections. Thus, no progress will be made
       *    on clearing out dead connections, and all new connection attempts
       *    will time out.
       *
       * We ameliorate this by having the timeout be quite large
       * (many seconds) and by not auto-retrying connections to the Monitor.
       * (A sketch of this no-auto-retry policy follows this function.)
       *)
      HackEventLogger.client_connect_to_monitor_timeout ();
      let exists_lock_file = server_exists config.lock_file in
      log
        "connect_to_monitor: lockfile=%b timeout=%s"
        ~tracker
        exists_lock_file
        (Timeout.show_timings timings);
      if not exists_lock_file then
        Error (Server_missing_timeout timings)
      else
        Error ServerMonitorUtils.Monitor_establish_connection_timeout)
    ~do_:
      begin
        fun timeout ->
        establish_connection ~timeout config >>= fun (ic, oc) ->
        let tracker =
          Connection_tracker.(track tracker ~key:Client_opened_socket)
        in
        get_cstate ~tracker config (ic, oc)
      end
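
(* A minimal sketch of the no-auto-retry policy described in the comment above,
 * written as hypothetical caller code (not part of this module): [config] and
 * [tracker] are assumed to be in scope, and the generous 30-second timeout is
 * an arbitrary illustrative value.
 *
 *   match connect_to_monitor ~tracker ~timeout:30 config with
 *   | Ok (_ic, _oc, _cstate, _tracker) -> Printf.printf "connected\n%!"
 *   | Error Monitor_establish_connection_timeout ->
 *     (* do NOT retry in a tight loop; that is exactly the inadvertent DDOS
 *      * scenario described above *)
 *     Printf.eprintf "monitor connection timed out\n%!"
 *   | Error _ -> Printf.eprintf "could not connect to the monitor\n%!"
 *)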

let connect_and_shut_down ~tracker config =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout:3 config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  send_shutdown_rpc ~tracker oc;
  Timeout.with_timeout
    ~timeout:3
    ~on_timeout:(fun timings ->
      if not (server_exists config.lock_file) then
        Error (Server_missing_timeout timings)
      else
        Ok ServerMonitorUtils.SHUTDOWN_UNVERIFIED)
    ~do_:
      begin
        fun _ ->
        wait_on_server_restart ic;
        Ok ServerMonitorUtils.SHUTDOWN_VERIFIED
      end

(** connect_once.
    1. OPEN SOCKET. After this point we have a working stdin/stdout to the
    process. Implemented in establish_connection.
      | catch EConnRefused/ENoEnt/Timeout 1s when lockfile present ->
        Error Monitor_socket_not_ready.
        This is unexpected! But it can happen if you manage to catch the
        monitor in the short timeframe after it has grabbed its lock but
        before it has started listening in on its socket.
        -> "hh_client check/ide" -> retry from step 1, up to 800 times.
           The number 800 is hard-coded in 9 places through the codebase.
        -> "hh_client start" -> print "replacing unresponsive server"
           kill_server; start_server; exit.
      | catch Timeout <retries>s when lockfile present ->
        Error Monitor_establish_connection_timeout
        This is unexpected! After all, the monitor is always responsive,
        and indeed start_server waits until responsive before returning.
        But this can happen during a DDOS.
        -> "hh_client check/ide" -> Its retry attempts are passed to the
           monitor connection attempt already. So in this timeout all
           the retries have already been consumed. Just exit.
        -> "hh_client start" -> print "replacing unresponsive server"
           kill_server; start_server; exit.
      | catch EConnRefused/ENoEnt/Timeout when lockfile absent ->
        Error Server_missing.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client check" -> start_server; retry step 1, up to 800x.
        -> "hh_client start" -> start_server; exit.
      | catch other exception -> unhandled.

    2. SEND VERSION; READ VERSION; CHECK VERSIONS. After this point we can
    safely marshal OCaml types back and forth. Implemented in get_cstate
    and verify_cstate.
      | catch any exception when lockfile present ->
        close_connection; Error Monitor_connection_failure.
        This is unexpected!
        -> "hh_client check/ide" -> retry from step 1, up to 800 times.
        -> "hh_client start" -> print "replacing unresponsive server"
           kill_server; start_server; exit.
      | catch any exception when lockfile absent ->
        close_connection; Error Server_missing.
        -> "hh_client ide" -> raise Exit_with IDE_no_server
        -> "hh_client check" -> start_server; retry step 1, up to 800x.
        -> "hh_client start" -> start_server; exit.
      | if version numbers differ ->
        Error Build_mismatch.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client check" -> close_log_tailer; retry from step 1.
        -> "hh_client start" -> start_server; exit.

    3. SEND HANDOFF; READ RESPONSE. After this point we have a working
    connection to a server who we believe is ready to handle our messages.
    Handoff is the stage of the protocol when we're speaking to the monitor
    rather than directly to the server process itself. Implemented in
    send_server_handoff_rpc and consume_prehandoff_message.
      | response Server_name_not_found ->
        raise Exit_with Server_name_not_found.
      | response Server_not_alive_dormant ->
        print "Waiting for server to start"; retry step 5, unlimited times.
      | response Server_dormant_connections_limit_reached ->
        Error Server_dormant.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client start" -> print "Server already exists but is
           dormant"; exit.
        -> "hh_client check" -> print "No server running, and connection
           limit reached for waiting on the next server to be started.
           Please wait patiently." raise Exit_with No_server_running.
      | response Server_died ->
        print "Last killed by OOM / signal / stopped by signal / exited";
        wait for server to close; Error Server_died.
        -> "hh_client ide" -> raise Exit_with IDE_no_server.
        -> "hh_client start" -> start_server.
        -> "hh_client check" -> retry from step 1, up to 800 times.
      | catch any exception -> unhandled.

    The following two steps aren't implemented inside connect_once but are
    typically done by callers after connect_once has succeeded...

    4. READ "HELLO" FROM SERVER. After this point we have evidence that the
    server is ready to handle our messages. We basically gobble whatever
    the server sends until it finally sends a line with just "hello".
    Implemented in wait_for_server_hello.
      | read anything other than "hello" -> retry from step 4, up to 800x.
      | catch Timeout 1s -> retry from step 4, up to 800 times.
      | catch exception EndOfFile/Sys_error ->
        raise ServerHungUp.
        -> "hh_client ide/check" -> program exit, code=No_server_running.
        -> clientStart never actually bothers to do step 4.
      | catch other exception -> unhandled.

    5. SEND CONNECTION TYPE; READ RESPONSE. After this point we have
    evidence that the server is able to handle our connection. The
    connection type indicates Persistent vs Non-persistent.
      | response Denied_due_to_existing_persistent_connection.
        -> "hh_client lsp" -> raise Lsp.Error_server_start.
      | catch any exception -> unhandled.

    (A usage sketch of connect_once follows the function below.) *)
let connect_once ~tracker ~timeout config handoff_options =
  let open Result.Monad_infix in
  let t_start = Unix.gettimeofday () in
  let tracker =
    Connection_tracker.(track tracker ~key:Client_start_connect ~time:t_start)
  in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  let tracker =
    Connection_tracker.(track tracker ~key:Client_ready_to_send_handoff)
  in
  send_server_handoff_rpc ~tracker handoff_options oc;
  let elapsed_t = int_of_float (Unix.gettimeofday () -. t_start) in
  let timeout = max (timeout - elapsed_t) 1 in
  consume_prehandoff_messages ~timeout ic oc
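
(* A minimal usage sketch of connect_once, written as hypothetical caller code
 * (not part of this module): [config] is a monitor_config, [tracker] a
 * Connection_tracker.t, [handoff_options] a MonitorRpc.handoff_options, and
 * the 30-second timeout is an arbitrary illustrative value. A real caller
 * would still perform steps 4 and 5 of the protocol described above.
 *
 *   match connect_once ~tracker ~timeout:30 config handoff_options with
 *   | Ok (_ic, _oc, finale_file) ->
 *     Printf.printf "handed off to server; finale file: %s\n%!" finale_file
 *   | Error (Server_missing_exn _ | Server_missing_timeout _) ->
 *     Printf.eprintf "no monitor is running for this root\n%!"
 *   | Error _ -> Printf.eprintf "could not connect to the server\n%!"
 *)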

let connect_to_monitor_and_get_server_progress ~tracker ~timeout config :
    (string * string option, ServerMonitorUtils.connection_error) result =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker) ->
  verify_cstate ~tracker ic cstate >>= fun () ->
  (* This is similar to connect_once up to this point, except that instead of
   * being handed off to the server we just get our answer from the monitor.
   * (A usage sketch follows this function.) *)
  send_server_progress_rpc ~tracker oc;
  Ok (read_server_progress ~tracker ic)
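
(* A minimal usage sketch of the progress query above, written as hypothetical
 * caller code (not part of this module): [config] and [tracker] are assumed to
 * be in scope, the 3-second timeout is an arbitrary illustrative value, and
 * treating the second component of the pair as an optional warning message is
 * an assumption made for illustration.
 *
 *   match connect_to_monitor_and_get_server_progress ~tracker ~timeout:3 config with
 *   | Ok (progress, warning) ->
 *     Printf.printf
 *       "progress: %s%s\n%!"
 *       progress
 *       (match warning with
 *       | None -> ""
 *       | Some w -> " (warning: " ^ w ^ ")")
 *   | Error _ -> Printf.eprintf "could not reach the monitor\n%!"
 *)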