2 * Copyright (c) 2015, Facebook, Inc.
5 * This source code is licensed under the MIT license found in the
6 * LICENSE file in the "hack" directory of this source tree.
13 +---------------------------------+
14 | Let's talk about naming tables! |
15 +---------------------------------+
17 Every typechecker or compiler that's more complicated than a toy project needs some way to look
18 up the location of a given symbol, especially one that operates at the same scale as Hack and
19 double especially as Hack moves towards generating data lazily on demand instead of generating
20 all of the data up front during strictly defined typechecking phases. In Hack, the data
21 structures that map from symbols to filenames and from filenames to symbol names are referred to
22 as the "naming table," with the "forward naming table" mapping from filenames to symbol names
23 and the "reverse naming table" mapping from symbol names to file names. We store the forward
24 naming table as a standard OCaml map since we only read and write it from the main process, and
25 we store the reverse naming table in shared memory since we want to allow rapid reads and writes
26 from worker processes.
29 - Forward naming table: OCaml map, maps filename -> symbols in file
30 - Reverse naming table: shared memory, maps symbol name to filename it's defined in
32 Seems simple enough, right? Unfortunately, this is where the real world and all of its attendant
33 complexity comes in. The naming table (forward + reverse) is big. And not only is it big, its
34 size is on the order of O(repo). This conflicts with our goal of making Hack use a flat amount
35 of memory regardless of the amount of code we're actually typechecking, so it needs to go. In
36 this case, we've chosen to use our existing saved state infrastructure and dump the naming
37 table to a SQLite file which we can use to serve queries without having to store anything in
38 memory other than just the differences between what's in the saved state and what's on disk.
40 Right now, only the reverse naming table supports falling back to SQLite, so let's go through
41 each operation that the reverse naming table supports and cover how it handles falling back to
45 +---------------------------------+
46 | Reverse naming table operations |
47 +---------------------------------+
49 The reverse naming table supports three operations: [put], [get], and [delete]. [Put] is pretty
50 simple, conceptually, but there's some trickiness with [delete] that would benefit from being
51 covered in more detail, and that trickiness bleeds into how [get] works as well. One important
52 consideration is that *the SQLite database is read-only.* The general idea of these operations
53 is that shared memory stores only entries that have changed.
56 [Put] is simple. Since the SQLite database is read-only and we use shared memory for storing any
57 changes, we just put the value in shared memory.
59 +------------------+ +------------+
61 | Naming Heap | | SQLite |
63 [put key value] -----> [put key value] | | |
65 +------------------+ +------------+
68 We're going to cover [delete] next because it's the only reason that [get] has any complexity
69 beyond "look for the value in shared memory and if it's not there look for it in SQLite." At
70 first glance, [delete] doesn't seem too difficult. Just delete the entry from shared memory,
71 right? Well, that's fine... unless that key also exists in SQLite. If the value is in SQLite,
72 then all that deleting it from shared memory does is make us start returning the stored SQLite
73 value! This is bad, because if we just deleted a key of course we want [get] operations for it
74 to return [None]. But okay, we can work around that. Instead of storing direct values in shared
75 memory, we'll store an enum of [[Added of 'a | Deleted]]. Easy peasy!
77 ...except what do we do if we want to delete a value in the main process, then add it again in a
78 worker process? That would require changing a key's value in shared memory, which is something
79 that we don't support due to data integrity concerns with multiple writers and readers. We could
80 remove and re-add, except removal can only be done by master because of the same integrity
81 concerns. So master would somehow need to know which entries will be added and prematurely
82 remove the [Deleted] sentinel from them. This is difficult enough as to be effectively
85 So what we're left needing is some way to delete values which can be undone by child processes,
86 which means that undeleting a value needs to consist only of [add] operations to shared memory,
87 and have no dependency on [remove]. Enter: the blocked entries heap.
89 For each of the reverse naming table heaps (types, functions, and constants) we also create a
90 heap that only stores values of a single-case enum: [Blocked]. The crucial difference between
91 [Blocked] and [[Added of 'a | Deleted]] is that *we only check for [Blocked] if the value is
92 not in the shared memory naming heap.* This means that we can effectively undelete an entry by
93 using an only an [add] operation. Exactly what we need! Thus, [delete] becomes:
95 +-----------------+ +---------------------+ +------------+
97 | Naming Heap | | Blocked Entries | | SQLite |
99 [delete key] --+-----> [delete key] | | | | |
101 +----------------------------> [add key Blocked] | | |
103 +-----------------+ +---------------------+ +------------+
106 Wow, what a ride, huh? Now that we know how to add and remove entries, let's make this actually
107 useful and talk about how to read them! The general idea is that we first check to see if the
108 value is in the shared memory naming heap. If so, we can return that immediately. If it's not
109 in the naming heap, then we check to see if it's blocked. If it is, we immediately return
110 [None]. If it's not in the naming heap, and it's not in the blocked entries, then and only then
111 do we read the value from SQLite:
113 +-----------------+ +---------------------+ +------------------+
115 | Naming Heap | | Blocked Entries | | SQLite |
117 [get key] -----------> [get key] is: | | | | |
118 [Some value] <---------- [Some value] | | | | |
119 | [None] -------+ | | | |
121 | | +--> [get key] is: | | |
122 [None] <--------------------------------------- [Some Blocked] | | |
123 | | | [None] -----------+ | |
125 | | | | +--> [get key] is: |
126 [Some value] <------------------------------------------------------------ [Some value] |
127 [None] <------------------------------------------------------------------ [None] |
129 +-----------------+ +---------------------+ +------------------+
131 And we're done! I hope this was easy to understand and as fun to read as it was to write :)
134 Q: When do we delete entries from the Blocked Entries heaps?
135 A: Never! Once an entry has been removed at least once we never want to retrieve the SQLite
136 version for the rest of the program execution.
138 Q: Won't we just end up with a heap full of [Blocked] entries now?
139 A: Not really. We only have to do deletions once to remove entries for dirty files, and after
140 that it's not a concern. Plus [Blocked] entries are incredibly small in shared memory
141 (although they do still occupy a hash slot).
144 (* Changes since baseline can be None if there was no baseline to begin with.
145 The scenario where we apply changes since baseline instead of relying on
146 the packaged local changes in the LOCAL_CHANGES table is this:
147 1) Load the naming table baseline (may include local changes if it was
148 incrementally updated at some point - not currently done in practice,
149 but possible and likely to happen in the future)
150 2) Load the changes since baseline that include naming changes processed
151 at another time (perhaps, on another host)
152 In the scenario where the naming table is saved to SQLite from an Unbacked
153 naming table, there is no baseline to speak of
155 type changes_since_baseline
= Naming_sqlite.local_changes
option
158 | Unbacked
of FileInfo.t
Relative_path.Map.t
159 | Backed
of Naming_sqlite.local_changes
* Naming_sqlite.db_path
162 type fast
= FileInfo.names
Relative_path.Map.t
164 type saved_state_info
= FileInfo.saved
Relative_path.Map.t
166 (*****************************************************************************)
167 (* Forward naming table functions *)
168 (*****************************************************************************)
170 let empty = Unbacked
Relative_path.Map.empty
174 | Unbacked a
-> Unbacked
(Relative_path.Map.filter a ~f
)
175 | Backed
(local_changes
, db_path
) ->
176 let file_deltas = local_changes
.Naming_sqlite.file_deltas in
180 Naming_sqlite.file_deltas =
190 Relative_path.Map.add
193 ~data
:Naming_sqlite.Deleted
199 let fold a ~init ~f
=
201 | Unbacked a
-> Relative_path.Map.fold a ~init ~f
202 | Backed
(local_changes
, db_path
) ->
207 ~
file_deltas:local_changes
.Naming_sqlite.file_deltas
211 | Unbacked a
-> Relative_path.Map.keys a
212 | Backed
(local_changes
, db_path
) ->
213 (* Reverse at the end to preserve ascending sort order. *)
217 ~f
:(fun path _ acc
-> path
:: acc
)
218 ~
file_deltas:local_changes
.Naming_sqlite.file_deltas
221 let get_files_changed_since_baseline
222 (changes_since_baseline
: changes_since_baseline
) : Relative_path.t list
=
223 match changes_since_baseline
with
224 | Some changes_since_baseline
->
225 Relative_path.Map.keys changes_since_baseline
.Naming_sqlite.file_deltas
228 let get_file_info a key
=
230 | Unbacked a
-> Relative_path.Map.find_opt a key
231 | Backed
(local_changes
, db_path
) ->
233 Relative_path.Map.find_opt local_changes
.Naming_sqlite.file_deltas key
235 | Some
(Naming_sqlite.Modified fi
) -> Some fi
236 | Some
Naming_sqlite.Deleted
-> None
237 | None
-> Naming_sqlite.get_file_info db_path key
)
239 exception File_info_not_found
241 let get_file_info_unsafe a key
=
242 match get_file_info a key
with
244 | None
-> raise File_info_not_found
246 let get_64bit_dep_set_files (naming_table
: t
) (deps
: Typing_deps.DepSet.t
) :
247 Relative_path.Set.t
=
248 match naming_table
with
251 "get_64bit_dep_set_files not supported for unbacked naming tables. Use Typing_deps.get_ifiles instead."
252 | Backed
(local_changes
, db_path
) ->
254 Typing_deps.DepSet.fold
256 ~init
:Relative_path.Set.empty
258 match Naming_sqlite.get_path_by_64bit_dep db_path dep
with
260 | Some
(file
, _namekind
) -> Relative_path.Set.add acc file
)
263 Relative_path.Map.fold
264 local_changes
.Naming_sqlite.file_deltas
266 ~f
:(fun path file_info acc
->
268 | Naming_sqlite.Deleted
-> Relative_path.Set.remove acc path
269 | Naming_sqlite.Modified file_info
->
271 Typing_deps.deps_of_file_info file_info
272 |> Typing_deps.DepSet.of_list
276 (Typing_deps.DepSet.is_empty
277 (Typing_deps.DepSet.inter deps
file_deps))
279 Relative_path.Set.add acc path
281 (* If this file happened to be present in the base changes, it
282 would be permissible to include it since we can return an
283 overestimate, but we may as well remove it. *)
284 Relative_path.Set.remove acc path
)
287 match get_file_info a key
with
293 | Unbacked a
-> Relative_path.Map.iter a ~f
294 | Backed
(local_changes
, db_path
) ->
298 ~f
:(fun path fi
() -> f path fi
)
299 ~
file_deltas:local_changes
.Naming_sqlite.file_deltas
303 | Unbacked a
-> Unbacked
(Relative_path.Map.remove a key
)
304 | Backed
(local_changes
, db_path
) ->
308 Naming_sqlite.file_deltas =
309 Relative_path.Map.add
310 local_changes
.Naming_sqlite.file_deltas
312 ~data
:Naming_sqlite.Deleted
;
316 let update a key data
=
318 | Unbacked a
-> Unbacked
(Relative_path.Map.add a ~key ~data
)
319 | Backed
(local_changes
, db_path
) ->
323 Naming_sqlite.file_deltas =
324 Relative_path.Map.add
325 local_changes
.Naming_sqlite.file_deltas
327 ~data
:(Naming_sqlite.Modified data
);
331 let update_many a updates
=
334 (* Reverse the order because union always takes the first value. *)
335 Unbacked
(Relative_path.Map.union updates a
)
336 | Backed
(local_changes
, db_path
) ->
338 Relative_path.Map.map updates ~f
:(fun data
-> Naming_sqlite.Modified data
)
343 Naming_sqlite.file_deltas =
344 Relative_path.Map.union
346 local_changes
.Naming_sqlite.file_deltas;
350 let update_from_deltas a
file_deltas =
352 | Unbacked file_infos
->
354 Relative_path.Map.fold
357 ~f
:(fun path file_delta acc
->
358 match file_delta
with
359 | Naming_sqlite.Modified file_info
->
360 Relative_path.Map.add acc ~key
:path ~data
:file_info
361 | Naming_sqlite.Deleted
-> Relative_path.Map.remove acc path
)
364 | Backed
(local_changes
, db_path
) ->
366 (* Reverse the order because union always takes the first value. *)
367 Relative_path.Map.union
369 local_changes
.Naming_sqlite.file_deltas
371 Backed
({ local_changes
with Naming_sqlite.file_deltas }, db_path
)
375 | Unbacked b
-> update_many a b
378 "SQLite-backed naming tables cannot be the second argument to combine."
381 This function saves the local changes structure as a binary blob.
382 Local changes represents the (forward) naming table changes since
383 the SQLite naming table baseline. Therefore, this function is most meaningful
384 when the naming table is backed by SQLite. If the naming table is NOT backed
385 by SQLite, meaning that it was computed from scratch by parsing the whole
386 repo, then this baseline-to-current difference is an empty set.
387 The resulting snapshot can be applied when loading a SQLite naming table.
389 let save_changes_since_baseline naming_table ~destination_path
=
391 match naming_table
with
393 | Backed
(local_changes
, _db_path
) -> Some local_changes
395 let contents = Marshal.to_string
snapshot [Marshal.No_sharing
] in
396 Disk.write_file ~file
:destination_path ~
contents
398 let save naming_table db_name
=
399 match naming_table
with
400 | Unbacked naming_table
->
401 let t = Unix.gettimeofday
() in
402 (* Ideally, we would have a commit hash, which would result in the same
403 content version given the same version of source files. However,
404 using a unique version every time we save the naming table from scratch
405 is good enough to enable us to check that the local changes diff
406 provided as an argument to load_from_sqlite_with_changes_since_baseline
407 is compatible with the underlying SQLite table. *)
408 let base_content_version =
409 Printf.sprintf
"%d-%s" (int_of_float
t) (Random_id.short_string
())
412 Naming_sqlite.save_file_infos db_name naming_table ~
base_content_version
414 Naming_sqlite.free_db_cache
();
416 let open Naming_sqlite
in
417 Hh_logger.log_duration
419 "Inserted %d symbols from %d files"
420 save_result.symbols_added
421 save_result.files_added
)
425 | Backed
(local_changes
, db_path
) ->
426 let t = Unix.gettimeofday
() in
428 Naming_sqlite.copy_and_update
430 ~new_db
:(Naming_sqlite.Db_path db_name
)
433 Naming_sqlite.free_db_cache
();
435 let open Naming_sqlite
in
436 Hh_logger.log_duration
438 "Inserted %d symbols from %d files"
439 save_result.symbols_added
440 save_result.files_added
)
445 (*****************************************************************************)
446 (* Conversion functions *)
447 (*****************************************************************************)
449 let from_saved saved
=
450 Hh_logger.log
"Loading naming table from marshalled blob...";
451 let t = Unix.gettimeofday
() in
454 (Relative_path.Map.fold
456 ~init
:Relative_path.Map.empty
457 ~f
:(fun fn saved acc
->
458 let file_info = FileInfo.from_saved fn saved
in
459 Relative_path.Map.add acc ~key
:fn ~data
:file_info))
461 let _t = Hh_logger.log_duration
"Loaded naming table from blob" t in
466 | Unbacked a
-> Relative_path.Map.map a ~f
:FileInfo.to_saved
468 fold a ~init
:Relative_path.Map.empty ~f
:(fun path fi acc
->
469 Relative_path.Map.add acc ~key
:path ~data
:(FileInfo.to_saved fi
))
473 | Unbacked a
-> Relative_path.Map.map a ~f
:FileInfo.simplify
475 fold a ~init
:Relative_path.Map.empty ~f
:(fun path fi acc
->
476 Relative_path.Map.add acc ~key
:path ~data
:(FileInfo.simplify fi
))
478 let saved_to_fast saved
= Relative_path.Map.map saved ~f
:FileInfo.saved_to_names
480 (*****************************************************************************)
481 (* Forward naming table creation functions *)
482 (*****************************************************************************)
484 let create a
= Unbacked a
486 (* Helper function to apply new files info to reverse naming table *)
487 let update_reverse_entries_helper
488 (ctx
: Provider_context.t)
489 (changed_file_infos
: (Relative_path.t * FileInfo.t option) list
) : unit =
490 let backend = Provider_context.get_backend ctx
in
491 let db_path_opt = Db_path_provider.get_naming_db_path
backend in
492 (* Remove all old file symbols first *)
494 ~f
:(fun (path
, _file_info
) ->
496 Option.bind
db_path_opt ~f
:(fun db_path
->
497 Naming_sqlite.get_file_info db_path path
)
501 Naming_provider.remove_type_batch
503 (fi
.FileInfo.classes
|> List.map ~f
:(fun (_
, x
, _
) -> x
));
504 Naming_provider.remove_type_batch
506 (fi
.FileInfo.typedefs
|> List.map ~f
:(fun (_
, x
, _
) -> x
));
507 Naming_provider.remove_fun_batch
509 (fi
.FileInfo.funs
|> List.map ~f
:(fun (_
, x
, _
) -> x
));
510 Naming_provider.remove_const_batch
512 (fi
.FileInfo.consts
|> List.map ~f
:(fun (_
, x
, _
) -> x
));
513 Naming_provider.remove_module_batch
515 (fi
.FileInfo.modules
|> List.map ~f
:(fun (_
, x
, _
) -> x
))
519 (* Add new file symbols after removing old files symbols *)
521 ~f
:(fun (_path
, new_file_info
) ->
522 match new_file_info
with
525 ~f
:(fun (pos
, name
, _
) -> Naming_provider.add_class
backend name pos
)
528 ~f
:(fun (pos
, name
, _
) ->
529 Naming_provider.add_typedef
backend name pos
)
530 fi
.FileInfo.typedefs
;
532 ~f
:(fun (pos
, name
, _
) -> Naming_provider.add_fun
backend name pos
)
535 ~f
:(fun (pos
, name
, _
) -> Naming_provider.add_const
backend name pos
)
538 ~f
:(fun (pos
, name
, _
) -> Naming_provider.add_module
backend name pos
)
543 let update_reverse_entries ctx
file_deltas =
544 let file_delta_list = Relative_path.Map.bindings
file_deltas in
545 let changed_files_info =
547 ~f
:(fun (path
, file_delta
) ->
548 match file_delta
with
549 | Naming_sqlite.Modified fi
-> (path
, Some fi
)
550 | Naming_sqlite.Deleted
-> (path
, None
))
553 update_reverse_entries_helper ctx
changed_files_info
555 let choose_local_changes ~local_changes ~changes_since_baseline
=
556 match changes_since_baseline
with
557 | None
-> local_changes
558 | Some changes_since_baseline
->
561 changes_since_baseline
.Naming_sqlite.base_content_version
562 local_changes
.Naming_sqlite.base_content_version
564 changes_since_baseline
568 "%s\nSQLite content version: %s\nLocal changes content version: %s"
569 "Incompatible local changes diff supplied."
570 local_changes
.Naming_sqlite.base_content_version
571 changes_since_baseline
.Naming_sqlite.base_content_version)
574 Loads the naming table from a SQLite database. The optional custom local
575 changes represents the naming table changes that occurred since the original
576 SQLite database was created.
577 To recap, the SQLite-based naming table, once loaded, is not mutated. All the
578 forward naming table changes are kept in an in-memory map. When we save an
579 update to a naming table that is backed by SQLite, we store the changes
580 since the baseline as a blob is a separate table and we rehydrate those
581 changes into an in-memory map when we load the updated naming table later.
582 The custom local changes is an alternative to the changes serialized as a
583 blob into the SQLite table. This enables scenarios that require repeatedly
584 loading the same base SQLite naming table with different versions of
587 let load_from_sqlite_for_type_checking
588 ~
(changes_since_baseline
: Naming_sqlite.local_changes
option)
589 (ctx
: Provider_context.t)
590 (db_path
: string) : t =
591 Hh_logger.log
"Loading naming table from SQLite...";
592 let t = Unix.gettimeofday
() in
593 let db_path = Naming_sqlite.Db_path
db_path in
594 Naming_sqlite.validate_can_open_db
db_path;
595 (* throw in master if anything's wrong *)
596 Db_path_provider.set_naming_db_path
597 (Provider_context.get_backend ctx
)
601 ~
local_changes:(Naming_sqlite.get_local_changes
db_path)
602 ~changes_since_baseline
604 let t = Hh_logger.log_duration
"Loaded local naming table changes" t in
605 update_reverse_entries ctx
local_changes.Naming_sqlite.file_deltas;
606 let _t = Hh_logger.log_duration
"Updated reverse naming table entries" t in
607 Backed
(local_changes, db_path)
609 let load_from_sqlite_with_changed_file_infos
610 (ctx
: Provider_context.t)
611 (changed_file_infos
: (Relative_path.t * FileInfo.t option) list
)
612 (db_path : string) : t =
613 Hh_logger.log
"Loading naming table from SQLite...";
614 let db_path = Naming_sqlite.Db_path
db_path in
615 Naming_sqlite.validate_can_open_db
db_path;
616 (* throw in master if anything's wrong *)
617 Db_path_provider.set_naming_db_path
618 (Provider_context.get_backend ctx
)
620 let t = Unix.gettimeofday
() in
621 (* Get changed files delta from file info *)
622 let changed_file_deltas =
624 ~f
:(fun acc_files_delta
(path
, changed_file_info_opt
) ->
626 match changed_file_info_opt
with
627 | Some
file_info -> Naming_sqlite.Modified
file_info
628 | None
-> Naming_sqlite.Deleted
630 Relative_path.Map.add acc_files_delta ~key
:path ~data
:file_delta)
631 ~init
:Relative_path.Map.empty
634 let base_local_changes = Naming_sqlite.get_local_changes
db_path in
638 file_deltas = changed_file_deltas;
639 base_content_version =
640 base_local_changes.Naming_sqlite.base_content_version;
643 let t = Hh_logger.log_duration
"Calculate changed files delta" t in
644 update_reverse_entries_helper ctx changed_file_infos
;
645 let _t = Hh_logger.log_duration
"Updated reverse naming table entries" t in
646 Backed
(local_changes, db_path)
648 let load_from_sqlite_with_changes_since_baseline
649 (ctx
: Provider_context.t)
650 (changes_since_baseline
: Naming_sqlite.local_changes option)
651 (db_path : string) : t =
652 load_from_sqlite_for_type_checking ~changes_since_baseline ctx
db_path
654 let load_from_sqlite (ctx
: Provider_context.t) (db_path : string) : t =
655 load_from_sqlite_for_type_checking ~changes_since_baseline
:None ctx
db_path
657 let get_forward_naming_fallback_path a
: string option =
660 | Backed
(_
, Naming_sqlite.Db_path
db_path) -> Some
db_path
662 module SaveAsync
= struct
663 (* The input record that gets passed from the main process to the daemon process *)
666 destination_path
: string;
671 (* The main entry point of the daemon process that saves the naming table
672 from a blob to the SQLite format. *)
673 let save { blob_path
; destination_path
; root
; init_id
} : unit =
674 HackEventLogger.init_batch_tool
676 ~root
:(Path.make root
)
677 ~time
:(Unix.gettimeofday
());
678 let chan = In_channel.create ~binary
:true blob_path
in
679 let (naming_table : t) = Marshal.from_channel
chan in
680 Sys_utils.close_in_no_fail blob_path
chan;
681 Sys_utils.rm_dir_tree
(Filename.dirname blob_path
);
682 let (_save_result
: Naming_sqlite.save_result) =
683 save naming_table destination_path
685 (* The intention here is to force the daemon process to exit with
686 an failed exit code so that the main process can detect
687 the condition and log the outcome. *)
688 assert (Disk.file_exists destination_path
)
690 (* The daemon entry registration - used by the main process *)
692 Process.register_entry_point
"naming_table_save_async_entry" save
695 let save_async naming_table ~init_id ~root ~destination_path
=
696 Hh_logger.log
"Saving naming table to %s" destination_path
;
697 let blob_dir = Tempfile.mkdtemp_with_dir
(Path.make
GlobalConfig.tmp_dir
) in
698 let blob_path = Path.(to_string
(concat
blob_dir "naming_bin")) in
699 let chan = Stdlib.open_out_bin
blob_path in
700 Marshal.to_channel
chan naming_table [];
701 Stdlib.close_out
chan;
703 let open SaveAsync
in
706 Process_types.Default
708 { blob_path; destination_path
; init_id
; root
})
709 (fun output
-> Hh_logger.log
"Output from out-of-proc: %s" output
)
711 (*****************************************************************************)
712 (* Testing functions *)
713 (*****************************************************************************)
715 let assert_is_backed a backed
=
717 | Unbacked _
-> assert (not backed
)
718 | Backed _
-> assert backed