2 * Copyright (c) 2015, Facebook, Inc.
5 * This source code is licensed under the MIT license found in the
6 * LICENSE file in the "hack" directory of this source tree.
11 open ServerMonitorUtils
(* NOTE(review): the enclosing [let log s ~tracker = ...] header (orig line 13)
   is missing from this chunk; this is the body. It prefixes the format string
   with the connection tracker's log id — confirm against the full file. *)
14 Hh_logger.log ("[%s] " ^^ s
) (Connection_tracker.log_id tracker
)
(* [server_exists lock_file] reports whether a server appears to be running:
   a lock file that cannot be acquired is treated as held by a live server. *)
let server_exists lock_file = Lock.check lock_file |> not
(* Reads one marshaled value straight from the file descriptor underlying
   [tic], bypassing the channel's own buffering. [?timeout] is forwarded
   unchanged to the descriptor read. *)
let from_channel_without_buffering ?timeout tic =
  let fd = Timeout.descr_of_in_channel tic in
  Marshal_tools.from_fd_with_preamble ?timeout fd
(* [wait_on_server_restart ic] repeatedly reads single characters from [ic]
   until the peer hangs up (EOF), signaling that the old server has exited and
   the OS has cleaned up its socket. NOTE(review): the surrounding try/with
   loop (orig lines 22-29) is missing from this chunk, so the control flow
   shown here is partial. *)
21 let wait_on_server_restart ic
=
24 let _ = Timeout.input_char ic
in
30 (* Server has exited and hung up on us *)
(* NOTE(review): the [let send_version oc = ...] header (orig line 33) for the
   body below is missing from this chunk. The visible body marshals
   [Build_id.build_revision] to the monitor and then writes a bare newline. *)
34 Marshal_tools.to_fd_with_preamble
35 (Unix.descr_of_out_channel oc
)
36 Build_id.build_revision
39 (* For backwards-compatibility, newline has always followed the version *)
41 Unix.write
(Unix.descr_of_out_channel oc
) (Bytes.of_string
"\n") 0 1
(* Marshals a [MonitorRpc.HANDOFF_TO_SERVER (tracker, handoff_options)]
   request to the monitor over [oc], asking it to hand this connection off to
   the typechecking server. NOTE(review): [to_fd_with_preamble] presumably
   returns a byte count; the line discarding that result (orig line 49) is not
   visible in this chunk — confirm against the full file. *)
45 let send_server_handoff_rpc ~tracker handoff_options oc
=
46 Marshal_tools.to_fd_with_preamble
47 (Unix.descr_of_out_channel oc
)
48 (MonitorRpc.HANDOFF_TO_SERVER
(tracker
, handoff_options
))
(* Logs, then marshals a [MonitorRpc.SHUT_DOWN tracker] request to the monitor
   over [oc]. NOTE(review): any trailing result-discard (orig lines 56-57) is
   not visible in this chunk. *)
51 let send_shutdown_rpc ~tracker oc
=
52 log "send_shutdown" ~tracker
;
53 Marshal_tools.to_fd_with_preamble
54 (Unix.descr_of_out_channel oc
)
55 (MonitorRpc.SHUT_DOWN tracker
)
(* Logs, then marshals a [MonitorRpc.SERVER_PROGRESS tracker] request to the
   monitor over [oc]. NOTE(review): the log string says
   "send_server_process_rpc" while the function is named
   [send_server_progress_rpc] — likely a typo, but it is a runtime string so
   it is deliberately left untouched here. Orig line 60 (probably the binding
   that discards the marshal's byte count) is missing from this chunk. *)
58 let send_server_progress_rpc ~tracker oc
=
59 log "send_server_process_rpc" ~tracker
;
61 Marshal_tools.to_fd_with_preamble
62 (Unix.descr_of_out_channel oc
)
63 (MonitorRpc.SERVER_PROGRESS tracker
)
(* Reads the monitor's progress reply from [ic]: a progress message plus an
   optional warning, exactly as marshaled by the monitor. *)
let read_server_progress ~tracker ic : string * string option =
  let () = log "read_server_progress" ~tracker in
  from_channel_without_buffering ic
(* Opens a Timeout-wrapped connection to the monitor's socket. One visible
   branch reads a TCP port out of the socket file and loopback-connects via
   ADDR_INET; the other uses the path directly as ADDR_UNIX — NOTE(review):
   the conditional selecting between them (orig lines 73-74, 77, 79, 81) is
   missing from this chunk; presumably it keys off the platform. On
   ECONNREFUSED/ENOENT the error is [Server_missing_exn] when the lock file is
   free (no monitor running), else [Monitor_socket_not_ready] (monitor holds
   its lock but is not yet listening). Orig line 88 (the [else]) is also
   missing. *)
71 let establish_connection ~timeout config
=
72 let sock_name = Socket.get_path config
.socket_file
in
75 let ic = In_channel.create ~binary
:true sock_name in
76 let port = Option.value_exn
(In_channel.input_binary_int
ic) in
78 Unix.(ADDR_INET
(inet_addr_loopback
, port))
80 Unix.ADDR_UNIX
sock_name
82 try Ok
(Timeout.open_connection ~timeout
sockaddr) with
83 | (Unix.Unix_error
(Unix.ECONNREFUSED
, _, _) as e
)
84 | (Unix.Unix_error
(Unix.ENOENT
, _, _) as e
) ->
85 let e = Exception.wrap
e in
86 if not
(server_exists config
.lock_file
) then
87 Error
(Server_missing_exn
e)
89 Error
(Monitor_socket_not_ready
e)
(* NOTE(review): the [let get_cstate] header (orig line 91) and the try/with
   skeleton (orig lines 95-96, 100-102, 107, 109, 111, 117) are missing from
   this chunk. The visible body: tracks Client_sent_version, reads the
   monitor's [connection_state] without buffering, tracks Client_got_cstate,
   and returns the channels + cstate + tracker. On exception it logs, shuts
   down and closes [ic], then returns [Server_missing_exn] when the lock file
   is free and [Monitor_connection_failure] otherwise. *)
92 ~
(tracker
: Connection_tracker.t
)
93 (config
: ServerMonitorUtils.monitor_config
)
94 ((ic, oc
) : Timeout.in_channel
* Out_channel.t
) :
97 * ServerMonitorUtils.connection_state
98 * Connection_tracker.t
,
99 ServerMonitorUtils.connection_error
)
103 let tracker = Connection_tracker.(track
tracker ~key
:Client_sent_version
) in
104 let cstate : connection_state
= from_channel_without_buffering ic in
105 let tracker = Connection_tracker.(track
tracker ~key
:Client_got_cstate
) in
106 Ok
(ic, oc
, cstate, tracker)
108 let e = Exception.wrap
e in
110 "error getting cstate; closing connection. %s"
112 (Exception.to_string
e);
113 Timeout.shutdown_connection
ic;
114 Timeout.close_in_noerr
ic;
115 if not
(server_exists config
.lock_file
) then
116 Error
(Server_missing_exn
e)
118 Error
(Monitor_connection_failure
e)
(* Validates the [connection_state] the monitor sent back. NOTE(review): the
   [match cstate with] line (orig 121) is missing from this chunk.
   Connection_ok succeeds; Build_id_mismatch_ex waits for the stale server to
   hang up before returning [Build_id_mismatched] (see comment below); the
   bare Build_id_mismatch constructor is a pre-2017 protocol relic and fails
   hard. *)
120 let verify_cstate ~
tracker ic cstate =
122 | Connection_ok
-> Ok
()
123 | Build_id_mismatch_ex mismatch_info
->
124 (* The server is out of date and is going to exit. Subsequent calls
125 * to connect on the Unix Domain Socket might succeed, connecting to
126 * the server that is about to die, and eventually we will be hung
127 * up on while trying to read from our end.
129 * To avoid that fate, when we know the server is about to exit, we
130 * wait for the connection to be closed, signaling that the server
131 * has exited and the OS has cleaned up after it, then we try again.
133 * See also: ServerMonitor.client_out_of_date
135 log "verify_cstate: waiting on server restart" ~
tracker;
136 wait_on_server_restart ic;
137 log "verify_cstate: closing ic" ~
tracker;
138 Timeout.close_in_noerr
ic;
139 Error
(Build_id_mismatched
(Some mismatch_info
))
140 | Build_id_mismatch
->
141 (* The server no longer ever sends this message, as of July 2017 *)
142 failwith
"Ancient version of server sent old Build_id_mismatch"
(* Reads Prehandoff messages from the monitor until a Sentinel (carrying the
   server's finale file) arrives; retries recursively while the server is
   dormant, and reports death / config-change notices to stderr.
   NOTE(review): several connective lines are missing from this chunk — e.g.
   the [match m with] (orig 152), the Error values after the
   limit-reached/config-change cases, and the final result after the
   Server_died case (orig 182) — so each printed branch's return value must be
   confirmed against the full file. *)
144 (* Consume sequence of Prehandoff messages. *)
145 let rec consume_prehandoff_messages
146 ~
(timeout
: Timeout.t
) (ic : Timeout.in_channel
) (oc
: Stdlib.out_channel
) :
147 ( Timeout.in_channel
* Stdlib.out_channel
* string,
148 ServerMonitorUtils.connection_error
)
150 let module PH
= Prehandoff
in
151 let m : PH.msg
= from_channel_without_buffering ~timeout
ic in
153 | PH.Sentinel finale_file
-> Ok
(ic, oc
, finale_file
)
154 | PH.Server_dormant_connections_limit_reached
->
156 @@ "Connections limit on dormant server reached."
157 ^^
" Be patient waiting for a server to be started.";
159 | PH.Server_not_alive_dormant
_ ->
161 "Waiting for a server to be started...%s\n%!"
162 ClientMessages.waiting_for_server_to_be_started_doc
;
163 consume_prehandoff_messages ~timeout
ic oc
164 | PH.Server_died_config_change
->
166 ( "Last server exited due to config change. Please re-run client"
167 ^^
" to force discovery of the correct version of the client." );
169 | PH.Server_died
{ PH.status
; PH.was_oom
} ->
170 (match (was_oom
, status
) with
171 | (true, _) -> Printf.eprintf
"Last server killed by OOM Manager.\n%!"
172 | (false, Unix.WEXITED exit_code
) ->
173 Printf.eprintf
"Last server exited with code: %d.\n%!" exit_code
174 | (false, Unix.WSIGNALED signal
) ->
175 Printf.eprintf
"Last server killed by signal: %d.\n%!" signal
176 | (false, Unix.WSTOPPED signal
) ->
177 Printf.eprintf
"Last server stopped by signal: %d.\n%!" signal
);
179 (* Monitor will exit now that it has provided a client with a reason
180 * for the last server dying. Wait for the Monitor to exit. *)
181 wait_on_server_restart ic;
(* Non-recursive shadowing wrapper: takes the timeout as a plain [int] and
   runs the recursive consumer inside a [Timeout.t] scope, mapping a timeout
   to [Server_dormant_out_of_retries]. NOTE(review): the
   [Timeout.with_timeout] call itself (orig lines 188-190) is not visible in
   this chunk — the [~do_]/[~on_timeout] labels below strongly suggest it. *)
184 let consume_prehandoff_messages
185 ~
(timeout
: int) (ic : Timeout.in_channel
) (oc
: Stdlib.out_channel
) :
186 ( Timeout.in_channel
* Stdlib.out_channel
* string,
187 ServerMonitorUtils.connection_error
)
191 ~do_
:(fun timeout
-> consume_prehandoff_messages ~timeout
ic oc
)
192 ~on_timeout
:(fun _ ->
193 Error
ServerMonitorUtils.Server_dormant_out_of_retries
)
(* Connects to the monitor's socket, then performs the version/cstate exchange
   via [get_cstate]. On timeout it logs to HackEventLogger and returns
   [Server_missing_timeout] when no lock file exists, else
   [Monitor_establish_connection_timeout]; the long comment below explains why
   the timeout is deliberately large and why there is no auto-retry.
   NOTE(review): structural lines (the [Timeout.with_timeout] wrapper, the
   Hh_logger call around orig 233-236, else-branches) are missing from this
   chunk. *)
195 let connect_to_monitor ~
tracker ~timeout config
=
196 let open Result.Monad_infix
in
199 ~on_timeout
:(fun timings
->
201 * Monitor should always readily accept connections. In theory, this will
202 * only timeout if the Monitor is being very heavily DDOS'd, or the Monitor
203 * has wedged itself (a bug).
205 * The DDOS occurs when the Monitor's new connections (arriving on
206 * the socket) queue grows faster than they are being processed. This can
207 * happen in two scenarios:
208 * 1) Malicious DDOSer fills up new connection queue (incoming
209 * connections on the socket) quicker than the queue is being
211 * 2) New client connections to the monitor are being created by the
212 * retry logic in hh_client faster than those cancelled connections
213 * (cancelled due to the timeout above) are being discarded by the
214 * monitor. This could happen from thousands of hh_clients being
215 * used to parallelize a job. This is effectively an inadvertent DDOS.
216 * In detail, suppose the timeout above is set to 1 second and that
217 * 1000 hh_clients have timed out at the line above. Then these
218 * 1000 clients will cancel the connection and retry. But the Monitor's
219 * connection queue still has these dead/canceled connections waiting
220 * to be processed. Suppose it takes the monitor longer than 1
221 * millisecond to handle and discard a dead connection. Then the
222 * 1000 retrying hh_clients will again add another 1000 dead
223 * connections during retrying even tho the monitor has discarded
224 * fewer than 1000 dead connections. Thus, no progress will be made
225 * on clearing out dead connections and all new connection attempts
228 * We ameliorate this by having the timeout be quite large
229 * (many seconds) and by not auto-retrying connections to the Monitor.
231 HackEventLogger.client_connect_to_monitor_timeout
();
232 let exists_lock_file = server_exists config
.lock_file
in
234 "connect_to_monitor: lockfile=%b timeout=%s"
237 (Timeout.show_timings timings
);
238 if not
exists_lock_file then
239 Error
(Server_missing_timeout timings
)
241 Error
ServerMonitorUtils.Monitor_establish_connection_timeout
)
245 establish_connection ~timeout config
>>= fun (ic, oc
) ->
247 Connection_tracker.(track
tracker ~key
:Client_opened_socket
)
249 get_cstate ~
tracker config
(ic, oc
)
(* Connects to the monitor with a 3s timeout, verifies the build id, sends the
   SHUT_DOWN rpc, then waits for the monitor to hang up: returns
   SHUTDOWN_VERIFIED on confirmed exit, SHUTDOWN_UNVERIFIED when the wait
   times out but the lock file is already gone is... NOTE(review): the
   [Timeout.with_timeout] wrapper lines (orig 258-259, 263, 265-267) are
   missing from this chunk, so the exact nesting of the timeout scope must be
   confirmed against the full file. *)
252 let connect_and_shut_down ~
tracker config
=
253 let open Result.Monad_infix
in
254 connect_to_monitor ~
tracker ~timeout
:3 config
255 >>= fun (ic, oc
, cstate, tracker) ->
256 verify_cstate ~
tracker ic cstate >>= fun () ->
257 send_shutdown_rpc ~
tracker oc
;
260 ~on_timeout
:(fun timings
->
261 if not
(server_exists config
.lock_file
) then
262 Error
(Server_missing_timeout timings
)
264 Ok
ServerMonitorUtils.SHUTDOWN_UNVERIFIED
)
268 wait_on_server_restart ic;
269 Ok
ServerMonitorUtils.SHUTDOWN_VERIFIED
273 1. OPEN SOCKET. After this point we have a working stdin/stdout to the
274 process. Implemented in establish_connection.
275 | catch EConnRefused/ENoEnt/Timeout 1s when lockfile present ->
276 Error Monitor_socket_not_ready.
277 This is unexpected! But can happen if you manage to catch the
278 monitor in the short timeframe after it has grabbed its lock but
279 before it has started listening in on its socket.
280 -> "hh_client check/ide" -> retry from step 1, up to 800 times.
281 The number 800 is hard-coded in 9 places through the codebase.
282 -> "hh_client start" -> print "replacing unresponsive server"
283 kill_server; start_server; exit.
284 | catch Timeout <retries>s when lockfile present ->
285 Error Monitor_establish_connection_timeout
286 This is unexpected! after all the monitor is always responsive,
287 and indeed start_server waits until responsive before returning.
288 But this can happen during a DDOS.
289 -> "hh_client check/ide" -> Its retry attempts are passed to the
290 monitor connection attempt already. So in this timeout all
291 the retries have already been consumed. Just exit.
292 -> "hh_client start" -> print "replacing unresponsive server"
293 kill_server; start_server; exit.
294 | catch EConnRefused/ENoEnt/Timeout when lockfile absent ->
295 Error Server_missing.
296 -> "hh_client ide" -> raise Exit_with IDE_no_server.
297 -> "hh_client check" -> start_server; retry step 1, up to 800x.
298 -> "hh_client start" -> start_server; exit.
299 | catch other exception -> unhandled.
301 2. SEND VERSION; READ VERSION; CHECK VERSIONS. After this point we can
302 safely marshal OCaml types back and forth. Implemented in get_cstate
304 | catch any exception when lockfile present ->
305 close_connection; Error Monitor_connection_failure.
307 -> "hh_client check/ide" -> retry from step 1, up to 800 times.
308 -> "hh_client start" -> print "replacing unresponsive server"
309 kill_server; start_server; exit.
310 | catch any exception when lockfile absent ->
311 close_connection; Error Server_missing.
312 -> "hh_client ide" -> raise Exit_with IDE_no_server
313 -> "hh_client check" -> start_server; retry step 1, up to 800x.
314 -> "hh_client start" -> start_server; exit.
315 | if version numbers differ ->
316 Error Build_mismatch.
317 -> "hh_client ide" -> raise Exit_with IDE_no_server.
318 -> "hh_client check" -> close_log_tailer; retry from step 1.
319 -> "hh_client start" -> start_server; exit.
321 3. SEND HANDOFF; READ RESPONSE. After this point we have a working
322 connection to a server who we believe is ready to handle our messages.
323 Handoff is the stage of the protocol when we're speaking to the monitor
324 rather than directly to the server process itself. Implemented in
325 send_server_handoff_rpc and consume_prehandoff_message.
326 | response Server_name_not_found ->
327 raise Exit_with Server_name_not_found.
328 | response Server_not_alive_dormant ->
329 print "Waiting for server to start"; retry step 5, unlimited times.
330 | response Server_dormant_connections_limit_reached ->
331 Error Server_dormant.
332 -> "hh_client ide" -> raise Exit_with IDE_no_server.
333 -> "hh_client start" -> print "Server already exists but is
335 -> "hh_client check" -> print "No server running, and connection
336 limit reached for waiting on the next server to be started.
337 Please wait patiently." raise Exit_with No_server_running.
338 | response Server_died ->
339 print "Last killed by OOM / signal / stopped by signal / exited";
340 wait for server to close; Error Server_died.
341 -> "hh_client ide" -> raise Exit_with IDE_no_server.
342 -> "hh_client start" -> start_server.
343 -> "hh_client check" -> retry from step 1, up to 800 times.
344 | catch any exception -> unhandled.
346 The following two steps aren't implemented inside connect_once but are
347 typically done by callers after connect_once has succeeded...
349 4. READ "HELLO" FROM SERVER. After this point we have evidence that the
350 server is ready to handle our messages. We basically gobble whatever
351 the server sends until it finally sends a line with just "hello".
352 Implemented in wait_for_server_hello.
353 | read anything other than "hello" -> retry from step 4, up to 800x.
354 | catch Timeout 1s -> retry from step 4, up to 800 times.
355 | catch exception EndOfFile/Sys_error ->
357 -> "hh_client ide/check" -> program exit, code=No_server_running.
358 -> clientStart never actually bothers to do step 4.
359 | catch other exception -> unhandled.
361 5. SEND CONNECTION TYPE; READ RESPONSE. After this point we have
362 evidence that the server is able to handle our connection. The
363 connection type indicates Persistent vs Non-persistent.
364 | response Denied_due_to_existing_persistent_connection.
365 -> "hh_client lsp" -> raise Lsp.Error_server_start.
366 | catch any exception -> unhandled.
(* Top-level connect: records the start time, connects to the monitor,
   verifies the build id, sends the handoff rpc, then consumes prehandoff
   messages using whatever portion of [timeout] remains after the connect
   (floored at 1 second). NOTE(review): a few connective lines (orig 371, 373,
   377, 379 — likely [let tracker = ... in] bindings around the tracked
   events) are missing from this chunk. *)
368 let connect_once ~
tracker ~timeout config handoff_options
=
369 let open Result.Monad_infix
in
370 let t_start = Unix.gettimeofday
() in
372 Connection_tracker.(track
tracker ~key
:Client_start_connect ~time
:t_start)
374 connect_to_monitor ~
tracker ~timeout config
375 >>= fun (ic, oc
, cstate, tracker) ->
376 verify_cstate ~
tracker ic cstate >>= fun () ->
378 Connection_tracker.(track
tracker ~key
:Client_ready_to_send_handoff
)
380 send_server_handoff_rpc ~
tracker handoff_options oc
;
381 let elapsed_t = int_of_float
(Unix.gettimeofday
() -. t_start) in
382 let timeout = max
(timeout - elapsed_t) 1 in
383 consume_prehandoff_messages ~
timeout ic oc
(* Connects to the monitor and asks it directly (no handoff to a server) for
   the current progress message; mirrors [connect_once] up to the point where
   the handoff would have happened. *)
let connect_to_monitor_and_get_server_progress ~tracker ~timeout config :
    (string * string option, ServerMonitorUtils.connection_error) result =
  let open Result.Monad_infix in
  connect_to_monitor ~tracker ~timeout config
  >>= fun (ic, oc, cstate, tracker') ->
  verify_cstate ~tracker:tracker' ic cstate >>= fun () ->
  (* This is similar to connect_once up to this point, where instead of
   * being handed off to server we just get our answer from monitor *)
  send_server_progress_rpc ~tracker:tracker' oc;
  Ok (read_server_progress ~tracker:tracker' ic)