hphp/hack/src/heap/hh_shared.c
1 /**
2 * Copyright (c) 2015, Facebook, Inc.
3 * All rights reserved.
5 * This source code is licensed under the MIT license found in the
6 * LICENSE file in the "hack" directory of this source tree.
8 */
10 #include "hh_shared.h"
11 /* For some reason this header file is not on path in OSS builds.
12 * But we only lose the ability to trim the OCaml heap after a GC
14 #if __has_include("malloc.h")
15 # define MALLOC_TRIM
16 # include <malloc.h>
17 #endif
19 /*****************************************************************************/
20 /* File Implementing the shared memory system for Hack.
22 * THIS CODE ONLY WORKS WITH HACK, IT MAY LOOK LIKE A GENERIC ATOMIC
23 * HASHTABLE FOR OCAML: IT IS NOT!
24 * BUT ... YOU WERE GOING TO SAY BUT? BUT ...
25 * THERE IS NO BUT! DONNY YOU'RE OUT OF YOUR ELEMENT!
27 * The lock-free data structures implemented here only work because of how
28 * the Hack phases are synchronized.
30 * There are 2 kinds of storage implemented in this file.
31 * I) The global storage. Used by the master to efficiently transfer a blob
32 * of data to the workers. This is used to share an environment in
33 * read-only mode with all the workers.
34 * The master stores, the workers read.
35 * Only concurrent reads allowed. No concurrent write/read and write/write.
36 * There are a few different OCaml modules that act as interfaces to this
37 * global storage. They all use the same area of memory, so only one can be
38 * active at any one time. The first word indicates the size of the global
39 * storage currently in use; callers are responsible for setting it to zero
40 * once they are done.
42 * II) The hashtable that maps string keys to string values. (The strings
43 * are really serialized / marshalled representations of OCaml structures.)
44 * Key observation of the table is that data with the same key are
45 * considered equivalent, and so you can arbitrarily get any copy of it;
46 * furthermore if data is missing it can be recomputed, so incorrectly
47 * saying data is missing when it is being written is only a potential perf
48 * loss. Note that "equivalent" doesn't necessarily mean "identical", e.g.,
49 * two alpha-converted types are "equivalent" though not literally byte-
50 * identical. (That said, I'm pretty sure the Hack typechecker actually does
51 * always write identical data, but the hashtable doesn't need quite that
52 * strong of an invariant.)
54 * The operations implemented, and their limitations:
56 * -) Concurrent writes: SUPPORTED
57 * One will win and the other will get dropped on the floor. There is no
58 * way to tell which happened. Only promise is that after a write, the
59 * one thread which did the write will see data in the table (though it
60 * may be slightly different data than what was written, see above about
61 * equivalent data).
63 * -) Concurrent reads: SUPPORTED
64 * If interleaved with a concurrent write, the read will arbitrarily
65 * say that there is no data at that slot or return the entire new data
66 * written by the concurrent writer.
68 * -) Concurrent removes: NOT SUPPORTED
69 * Only the master can remove, and can only do so if there are no other
70 * concurrent operations (reads or writes).
72 * Since the values are variably sized and can get quite large, they are
73 * stored separately from the hashes in a garbage-collected heap.
75  * Both the dependency table and the hashtable (II) resolve hash collisions via linear probing.
77 /*****************************************************************************/
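/*
 * Editor's sketch (not part of the original file): what "resolve hash
 * collisions via linear probing" means for a table of (hash, addr) slots
 * whose size is a power of two. The names probe_slot_t and probe_lookup are
 * hypothetical; the real table type (helt_t) and its lookup/write paths
 * appear further down in this file, and they never clear hashes, which is
 * why stopping at the first empty slot is sound.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hash; void* addr; } probe_slot_t;

static void* probe_lookup(probe_slot_t* tbl, size_t size_pow2, uint64_t hash) {
  size_t mask = size_pow2 - 1;
  for (size_t i = 0; i < size_pow2; i++) {
    probe_slot_t* s = &tbl[(hash + i) & mask];
    if (s->hash == 0) return NULL;       /* first empty slot: key is absent */
    if (s->hash == hash) return s->addr; /* equal hashes are treated as the same key */
  }
  return NULL; /* table is full and the key was not found */
}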
79 /* For printing uint64_t
80 * http://jhshi.me/2014/07/11/print-uint64-t-properly-in-c/index.html */
81 #define __STDC_FORMAT_MACROS
83 /* define CAML_NAME_SPACE to ensure all the caml imports are prefixed with
84 * 'caml_' */
85 #define CAML_NAME_SPACE
86 #include <caml/mlvalues.h>
87 #include <caml/callback.h>
88 #include <caml/memory.h>
89 #include <caml/alloc.h>
90 #include <caml/fail.h>
91 #include <caml/unixsupport.h>
92 #include <caml/intext.h>
94 #ifdef _WIN32
95 # include <windows.h>
96 #else
97 # include <fcntl.h>
98 # include <pthread.h>
99 # include <signal.h>
100 # include <stdint.h>
101 # include <stdio.h>
102 # include <string.h>
103 # include <sys/errno.h>
104 # include <sys/mman.h>
105 # include <sys/resource.h>
106 # include <sys/stat.h>
107 # include <sys/syscall.h>
108 # include <sys/types.h>
109 # include <unistd.h>
110 #endif
112 #include <inttypes.h>
113 #include <lz4.h>
114 #include <sys/time.h>
115 #include <time.h>
116 #include <zstd.h>
118 // Some OCaml utility functions (introduced only in 4.12.0)
120 // TODO(hverr): Remove these when we move to 4.12.0
121 value hh_shared_caml_alloc_some(value v) {
122 CAMLparam1(v);
123 value some = caml_alloc_small(1, 0);
124 Store_field(some, 0, v);
125 CAMLreturn(some);
127 # define Val_none Val_int(0)
129 #include "hh_assert.h"
131 #define UNUSED(x) \
132 ((void)(x))
133 #define UNUSED1 UNUSED
134 #define UNUSED2(a, b) \
135 (UNUSED(a), UNUSED(b))
136 #define UNUSED3(a, b, c) \
137 (UNUSED(a), UNUSED(b), UNUSED(c))
138 #define UNUSED4(a, b, c, d) \
139 (UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d))
140 #define UNUSED5(a, b, c, d, e) \
141 (UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d), UNUSED(e))
144 // Ideally these would live in a handle.h file but our internal build system
145 // can't support that at the moment. These are shared with handle_stubs.c
146 #ifdef _WIN32
147 # define Val_handle(fd) (win_alloc_handle(fd))
148 #else
149 # define Handle_val(fd) (Long_val(fd))
150 # define Val_handle(fd) (Val_long(fd))
151 #endif
154 #define HASHTBL_WRITE_IN_PROGRESS ((heap_entry_t*)1)
156 /****************************************************************************
157 * Quoting the linux manpage: memfd_create() creates an anonymous file
158 * and returns a file descriptor that refers to it. The file behaves
159 * like a regular file, and so can be modified, truncated,
160 * memory-mapped, and so on. However, unlike a regular file, it lives
161 * in RAM and has a volatile backing storage. Once all references to
162 * the file are dropped, it is automatically released. Anonymous
163 * memory is used for all backing pages of the file. Therefore, files
164 * created by memfd_create() have the same semantics as other
165 * anonymous memory allocations such as those allocated using mmap(2)
166 * with the MAP_ANONYMOUS flag. The memfd_create() system call first
167 * appeared in Linux 3.17.
168 ****************************************************************************/
169 #ifdef __linux__
170 # define MEMFD_CREATE 1
171 // glibc only added support for memfd_create in version 2.27.
172 # ifndef MFD_CLOEXEC
173 // The Linux kernel for this architecture must support the
174 // memfd_create syscall
175 # ifndef SYS_memfd_create
176 # if defined(__x86_64__)
177 # define SYS_memfd_create 319
178 # elif defined(__powerpc64__)
179 # define SYS_memfd_create 360
180 # elif defined(__aarch64__)
181 # define SYS_memfd_create 385
182 # else
183 # error "hh_shared.c requires an architecture that supports memfd_create"
184 # endif //#if defined(__x86_64__)
185 # endif //#ifndef SYS_memfd_create
187 # include <asm/unistd.h>
189 /* Originally this function would call uname(), parse the linux
190 * kernel release version and make a decision based on whether
191 * the kernel was >= 3.17 or not. However, syscall will return -1
192  * with strerror(errno) of "Function not implemented" if the
193 * kernel is < 3.17, and that's good enough.
195 static int memfd_create(const char *name, unsigned int flags) {
196 return syscall(SYS_memfd_create, name, flags);
198 # endif // #ifndef MFD_CLOEXEC
199 #endif //#ifdef __linux__
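/*
 * Editor's sketch (not part of the original file): the typical lifecycle of
 * the descriptor produced above -- create it, size it with ftruncate, then
 * mmap it. memfd_init/memfd_map below do the same with more error handling
 * and a fixed mapping address. Assumes Linux, and that memfd_create is
 * visible either from glibc (with _GNU_SOURCE) or via the syscall wrapper
 * defined above.
 */
#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

static char* example_anonymous_mapping(size_t size) {
  int fd = memfd_create("example", 0);
  if (fd < 0) return NULL;                      /* e.g. kernel < 3.17 */
  if (ftruncate(fd, (off_t)size) != 0) { close(fd); return NULL; }
  void* mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd);                                    /* the mapping keeps the file alive */
  return mem == MAP_FAILED ? NULL : (char*)mem;
}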
201 #ifndef MAP_NORESERVE
202 // This flag was unimplemented in FreeBSD and then later removed
203 # define MAP_NORESERVE 0
204 #endif
206 // The following 'typedef' won't be required anymore
207 // when dropping support for OCaml < 4.03
208 #ifdef __MINGW64__
209 typedef unsigned __int32 uint32_t;
210 typedef unsigned __int64 uint64_t;
211 #endif
213 #ifdef _WIN32
214 static int win32_getpagesize(void) {
215 SYSTEM_INFO siSysInfo;
216 GetSystemInfo(&siSysInfo);
217 return siSysInfo.dwPageSize;
219 # define getpagesize win32_getpagesize
220 #endif
223 /*****************************************************************************/
224 /* API to shmffi */
225 /*****************************************************************************/
227 extern void shmffi_init(void* mmap_address, size_t file_size);
228 extern void shmffi_attach(void* mmap_address, size_t file_size);
229 extern value shmffi_add(uint64_t hash, value data);
230 extern value shmffi_mem(uint64_t hash);
231 extern value shmffi_get_and_deserialize(uint64_t hash);
232 extern value shmffi_mem_status(uint64_t hash);
233 extern value shmffi_get_size(uint64_t hash);
234 extern void shmffi_move(uint64_t hash1, uint64_t hash2);
235 extern value shmffi_remove(uint64_t hash);
236 extern value shmffi_allocated_bytes();
237 extern value shmffi_num_entries();
240 /*****************************************************************************/
241 /* Config settings (essentially constants, so they don't need to live in shared
242 * memory), initialized in hh_shared_init */
243 /*****************************************************************************/
245 /* Convention: .*_b = Size in bytes. */
247 static size_t global_size_b;
248 static size_t global_size;
249 static size_t heap_size;
250 static size_t dep_table_pow;
251 static size_t hash_table_pow;
252 static size_t shm_use_sharded_hashtbl;
254 /* Used for the dependency hashtable */
255 static uint64_t dep_size;
256 static size_t dep_size_b;
257 static size_t bindings_size_b;
259 /* Used for the shared hashtable */
260 static uint64_t hashtbl_size;
261 static size_t hashtbl_size_b;
263 /* Used for worker-local data */
264 static size_t locals_size_b;
266 typedef enum {
267 KIND_STRING = 1,
268 KIND_SERIALIZED = !KIND_STRING
269 } storage_kind;
271 /* Too lazy to use getconf */
272 #define CACHE_LINE_SIZE (1 << 6)
274 #define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
275 #define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
276 #define CACHE_ALIGN(x) ALIGN(x,CACHE_LINE_SIZE)
278 /* Align heap entries on 64-bit boundaries */
279 #define HEAP_ALIGN(x) ALIGN(x,8)
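/* Editor's note: e.g. CACHE_ALIGN(1) == 64 and CACHE_ALIGN(64) == 64, while
 * HEAP_ALIGN(13) == 16; the macros round x up to the next multiple of the
 * alignment and leave exact multiples unchanged. */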
281 /* Fix the location of our shared memory so we can save and restore the
282 * hashtable easily */
283 #ifdef _WIN32
284 /* We have to set our shared memory location differently on Windows. */
285 # define SHARED_MEM_INIT ((char *) 0x48047e00000ll)
286 #elif defined __aarch64__
287 /* CentOS 7.3.1611 kernel does not support a full 48-bit VA space, so choose a
288  * value low enough that the 21 GB mmapped in does not interfere with anything
289 * growing down from the top. 1 << 36 works. */
290 # define SHARED_MEM_INIT ((char *) 0x1000000000ll)
291 #else
292 # define SHARED_MEM_INIT ((char *) 0x500000000000ll)
293 # define SHARDED_HASHTBL_MEM_ADDR ((char *) 0x510000000000ll)
294 # define SHARDED_HASHTBL_MEM_SIZE ((size_t)100 * 1024 * 1024 * 1024)
295 #endif
297 /* As a sanity check when loading from a file */
298 static const uint64_t MAGIC_CONSTANT = 0xfacefacefaceb000ull;
300 /* The VCS identifier (typically a git hash) of the build */
301 extern const char* const BuildInfo_kRevision;
303 /*****************************************************************************/
304 /* Types */
305 /*****************************************************************************/
307 /* Per-worker data which can be quickly updated non-atomically. Will be placed
308  * in a cache-aligned array in the first few pages of shared memory, indexed by
309 * worker id. */
310 typedef struct {
311 uint64_t counter;
312 } local_t;
314 // Every heap entry starts with a 64-bit header with the following layout:
316 // 6 3 3 3 0 0
317 // 3 3 2 1 1 0
318 // +----------------------------------+-+-----------------------------------+-+
319 // |11111111 11111111 11111111 1111111|0| 11111111 11111111 11111111 1111111|1|
320 // +----------------------------------+-+-----------------------------------+-+
321 // | | | |
322 // | | | * 0 tag
323 // | | |
324 // | | * 31-1 uncompressed size (0 if uncompressed)
325 // | |
326 // | * 32 kind (0 = serialized, 1 = string)
327 // |
328 // * 63-33 size of heap entry
330 // The tag bit is always 1 and is used to differentiate headers from pointers
331 // during garbage collection (see hh_collect).
332 typedef uint64_t hh_header_t;
334 #define Entry_size(x) ((x) >> 33)
335 #define Entry_kind(x) (((x) >> 32) & 1)
336 #define Entry_uncompressed_size(x) (((x) >> 1) & 0x7FFFFFFF)
337 #define Heap_entry_total_size(header) sizeof(heap_entry_t) + Entry_size(header)
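/*
 * Editor's note, a worked example of the header layout above (mirroring the
 * packing expression used later in hh_store_ocaml). For a 100-byte stored
 * payload that decompresses to 250 bytes and was serialized (kind 0):
 *
 *   hh_header_t h = (uint64_t)100 << 33   // size of the stored data
 *                 | (uint64_t)0   << 32   // kind: 0 = serialized
 *                 | (uint64_t)250 << 1    // uncompressed size
 *                 | 1;                    // tag bit, always 1 for a header
 *
 *   Entry_size(h) == 100, Entry_kind(h) == 0,
 *   Entry_uncompressed_size(h) == 250, and
 *   Heap_entry_total_size(h) == sizeof(heap_entry_t) + 100.
 */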
339 /* Shared memory structures. hh_shared.h typedefs this to heap_entry_t. */
340 typedef struct {
341 hh_header_t header;
342 char data[];
343 } heap_entry_t;
345 /* Cells of the Hashtable */
346 typedef struct {
347 uint64_t hash;
348 heap_entry_t* addr;
349 } helt_t;
351 /*****************************************************************************/
352 /* Globals */
353 /*****************************************************************************/
355 /* Total size of allocated shared memory */
356 static size_t shared_mem_size = 0;
358 /* Beginning of shared memory */
359 static char* shared_mem = NULL;
361 /* ENCODING: The first element is the size stored in bytes, the rest is
362 * the data. The size is set to zero when the storage is empty.
364 static value* global_storage = NULL;
366 /* A pair of a 31-bit unsigned number and a tag bit. */
367 typedef struct {
368 uint32_t num : 31;
369 uint32_t tag : 1;
370 } tagged_uint_t;
372 /* A deptbl_entry_t is one slot in the deptbl hash table.
374 * deptbl maps a 31-bit integer key to a linked list of 31-bit integer values.
375 * The key corresponds to a node in a graph and the values correspond to all
376 * nodes to which that node has an edge. List order does not matter, and there
377 * are no duplicates. Edges are only added, never removed.
379 * This data structure, while conceptually simple, is implemented in a
380 * complicated way because we store it in shared memory and update it from
381 * multiple processes without using any mutexes. In particular, both the
382 * traditional hash table entries and the storage for the linked lists to
383 * which they point are stored in the same shared memory array. A tag bit
384 * distinguishes the two cases so that hash lookups never accidentally match
385 * linked list nodes.
387 * Each slot s in deptbl is in one of three states:
389 * if s.raw == 0:
390 * empty (the initial state).
391 * elif s.key.tag == TAG_KEY:
392 * A traditional hash table entry, where s.key.num is the key
393 * used for hashing/equality and s.next is a "pointer" to a linked
394 * list of all the values for that key, as described below.
395 * else (s.key.tag == TAG_VAL):
396 * A node in a linked list of values. s.key.num contains one value and
397 * s.next "points" to the rest of the list, as described below.
398 * Such a slot is NOT matchable by any hash lookup due to the tag bit.
400 * To save space, a "next" entry can be one of two things:
402 * if next.tag == TAG_NEXT:
403 * next.num is the deptbl slot number of the next node in the linked list
404  * (i.e. just a traditional linked list "next" pointer, shared-memory style).
405 * else (next.tag == TAG_VAL):
406 * next.num actually holds the final value in the linked list, rather
407 * than a "pointer" to another entry or some kind of "NULL" sentinel.
408 * This space optimization provides the useful property that each edge
409 * in the graph takes up exactly one slot in deptbl.
411 * For example, a mapping from key K to one value V takes one slot S in deptbl:
413 * S = { .key = { K, TAG_KEY }, .val = { V, TAG_VAL } }
415 * Mapping K to two values V1 and V2 takes two slots, S1 and S2:
417 * S1 = { .key = { K, TAG_KEY }, .val = { &S2, TAG_NEXT } }
418 * S2 = { .key = { V1, TAG_VAL }, .val = { V2, TAG_VAL } }
420 * Mapping K to three values V1, V2 and V3 takes three slots:
422 * S1 = { .key = { K, TAG_KEY }, .val = { &S2, TAG_NEXT } }
423 * S2 = { .key = { V1, TAG_VAL }, .val = { &S3, TAG_NEXT } }
424 * S3 = { .key = { V2, TAG_VAL }, .val = { V3, TAG_VAL } }
426 * ...and so on.
428 * You can see that the final node in a linked list always contains
429 * two values.
431 * As an important invariant, we need to ensure that a non-empty hash table
432 * slot can never legally be encoded as all zero bits, because that would look
433 * just like an empty slot. How could this happen? Because TAG_VAL == 0,
434 * an all-zero slot would look like this:
436 * { .key = { 0, TAG_VAL }, .val = { 0, TAG_VAL } }
438 * But fortunately that is impossible -- this entry would correspond to
439 * having the same value (0) in the list twice, which is forbidden. Since
440 * one of the two values must be nonzero, the entire "raw" uint64_t must
441 * be nonzero, and thus distinct from "empty".
444 enum {
445 /* Valid for both the deptbl_entry_t 'key' and 'next' fields. */
446 TAG_VAL = 0,
448 /* Only valid for the deptbl_entry_t 'key' field (so != TAG_VAL). */
449 TAG_KEY = !TAG_VAL,
451 /* Only valid for the deptbl_entry_t 'next' field (so != TAG_VAL). */
452 TAG_NEXT = !TAG_VAL
455 typedef union {
456 struct {
457 /* Tag bit is either TAG_KEY or TAG_VAL. */
458 tagged_uint_t key;
460 /* Tag bit is either TAG_VAL or TAG_NEXT. */
461 tagged_uint_t next;
462 } s;
464 /* Raw 64 bits of this slot. Useful for atomic operations. */
465 uint64_t raw;
466 } deptbl_entry_t;
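/*
 * Editor's sketch (not part of the original file): building the two slots
 * that encode the mapping K -> {V1, V2} from the worked example above. The
 * helper and the explicit slot_of_s2 index are hypothetical; in the real
 * code the second slot's index is found by probing, and slots are installed
 * with atomic compare-and-swap rather than plain stores. Keys and values
 * must fit in 31 bits.
 */
static void example_encode_two_edges(
    deptbl_entry_t* s1, deptbl_entry_t* s2, uint32_t slot_of_s2,
    uint32_t k, uint32_t v1, uint32_t v2) {
  /* Head slot: key K; "next" points at the slot holding the value list. */
  s1->s.key.num = k;           s1->s.key.tag = TAG_KEY;
  s1->s.next.num = slot_of_s2; s1->s.next.tag = TAG_NEXT;
  /* List slot: carries V1, and (space optimization) the final value V2
   * directly in the "next" field instead of a pointer. */
  s2->s.key.num = v1;          s2->s.key.tag = TAG_VAL;
  s2->s.next.num = v2;         s2->s.next.tag = TAG_VAL;
}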
468 static deptbl_entry_t* deptbl = NULL;
469 static uint64_t* dcounter = NULL;
472 /* ENCODING:
473 * The highest 2 bits are unused.
474  * The next 31 bits encode the key; the lower 31 bits encode the value.
476 static uint64_t* deptbl_bindings = NULL;
478 /* The hashtable containing the shared values. */
479 static helt_t* hashtbl = NULL;
480 /* The number of nonempty slots in the hashtable. A nonempty slot has a
481 * non-zero hash. We never clear hashes so this monotonically increases */
482 static uint64_t* hcounter = NULL;
483 /* The number of nonempty filled slots in the hashtable. A nonempty filled slot
484 * has a non-zero hash AND a non-null addr. It increments when we write data
485 * into a slot with addr==NULL and decrements when we clear data from a slot */
486 static uint64_t* hcounter_filled = NULL;
488 /* A counter increasing globally across all forks. */
489 static uintptr_t* counter = NULL;
491 /* Each process reserves a range of values at a time from the shared counter.
492 * Should be a power of two for more efficient modulo calculation. */
493 #define COUNTER_RANGE 2048
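/* Editor's note: concretely, hh_counter_next below hands out values from a
 * per-process block. With COUNTER_RANGE 2048, a process whose local counter
 * sits at a multiple of 2048 refills by doing one atomic fetch-and-add of
 * 2048 on the shared counter, so the shared word is touched once per 2048
 * values rather than once per value. */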
495 /* Logging level for shared memory statistics
496 * 0 = nothing
497 * 1 = log totals, averages, min, max bytes marshalled and unmarshalled
499 static size_t* log_level = NULL;
501 static double* sample_rate = NULL;
503 static size_t* compression = NULL;
505 static size_t* workers_should_exit = NULL;
507 static size_t* allow_removes = NULL;
509 static size_t* allow_dependency_table_reads = NULL;
511 /* Worker-local storage is cache line aligned. */
512 static char* locals;
513 #define LOCAL(id) ((local_t *)(locals + id * CACHE_ALIGN(sizeof(local_t))))
515 /* This should only be used before forking */
516 static uintptr_t early_counter = 0;
518 /* The top of the heap */
519 static char** heap = NULL;
521 /* Useful to add assertions */
522 static pid_t* master_pid = NULL;
523 static pid_t my_pid = 0;
525 static size_t num_workers;
527 /* This is a process-local value. The master process is 0, workers are numbered
528 * starting at 1. This is an offset into the worker local values in the heap. */
529 static size_t worker_id;
531 static size_t allow_hashtable_writes_by_current_process = 1;
532 static size_t worker_can_exit = 1;
534 /* Where the heap started (bottom) */
535 static char* heap_init = NULL;
536 /* Where the heap will end (top) */
537 static char* heap_max = NULL;
539 static size_t* wasted_heap_size = NULL;
541 static size_t used_heap_size(void) {
542 return *heap - heap_init;
545 static long removed_count = 0;
547 /* Expose so we can display diagnostics */
548 CAMLprim value hh_used_heap_size(void) {
549 if (shm_use_sharded_hashtbl) {
550 return shmffi_allocated_bytes();
552 return Val_long(used_heap_size());
555 /* Part of the heap not reachable from hashtable entries. Can be reclaimed with
556 * hh_collect. */
557 CAMLprim value hh_wasted_heap_size(void) {
558 // TODO(hverr): Support sharded hash tables
559 assert(wasted_heap_size != NULL);
560 return Val_long(*wasted_heap_size);
563 CAMLprim value hh_log_level(void) {
564 return Val_long(*log_level);
567 CAMLprim value hh_sample_rate(void) {
568 CAMLparam0();
569 CAMLreturn(caml_copy_double(*sample_rate));
572 CAMLprim value hh_hash_used_slots(void) {
573 // TODO(hverr): For some reason this returns a tuple.
574 // Fix this when the migration is complete.
575 CAMLparam0();
576 CAMLlocal2(connector, num_entries);
578 connector = caml_alloc_tuple(2);
579 if (shm_use_sharded_hashtbl) {
580 num_entries = shmffi_num_entries();
581 Store_field(connector, 0, num_entries);
582 Store_field(connector, 1, num_entries);
583 } else {
584 Store_field(connector, 0, Val_long(*hcounter_filled));
585 Store_field(connector, 1, Val_long(*hcounter));
588 CAMLreturn(connector);
591 CAMLprim value hh_hash_slots(void) {
592 CAMLparam0();
593 if (shm_use_sharded_hashtbl) {
594 // In the sharded hash table implementation, we dynamically resize
595 // the tables. As such, this doesn't really make sense. Return the
596 // number of entries for now.
597 CAMLreturn(shmffi_num_entries());
600 CAMLreturn(Val_long(hashtbl_size));
603 #ifdef _WIN32
605 struct timeval log_duration(const char *prefix, struct timeval start_t) {
606 return start_t; // TODO
609 #else
611 struct timeval log_duration(const char *prefix, struct timeval start_t) {
612 struct timeval end_t = {0};
613 gettimeofday(&end_t, NULL);
614 time_t secs = end_t.tv_sec - start_t.tv_sec;
615 suseconds_t usecs = end_t.tv_usec - start_t.tv_usec;
616 double time_taken = secs + ((double)usecs / 1000000);
617 fprintf(stderr, "%s took %.2lfs\n", prefix, time_taken);
618 return end_t;
621 #endif
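/* Editor's note: log_duration returns the end time so calls can be chained,
 * e.g. (a hypothetical usage) t = log_duration("phase 1", t); followed by
 * t = log_duration("phase 2", t); to time consecutive phases. */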
623 #ifdef _WIN32
625 static HANDLE memfd;
627 /**************************************************************************
628 * We create an anonymous memory file, whose `handle` might be
629 * inherited by subprocesses.
631 * This memory file is tagged "reserved" but not "committed". This
632 * means that the memory space will be reserved in the virtual memory
633 * table but the pages will not be bound to any physical memory
634 * yet. Further calls to 'VirtualAlloc' will "commit" pages, meaning
635 * they will be bound to physical memory.
637  * This behavior mirrors the 'MAP_NORESERVE' flag of
638 * 'mmap' on Unix. But, on Unix, the "commit" is implicit.
640 * Committing the whole shared heap at once would require the same
641 * amount of free space in memory (or in swap file).
642 **************************************************************************/
643 void memfd_init(const char *shm_dir, size_t shared_mem_size, uint64_t minimum_avail) {
644 memfd = CreateFileMapping(
645 INVALID_HANDLE_VALUE,
646 NULL,
647 PAGE_READWRITE | SEC_RESERVE,
648 shared_mem_size >> 32, shared_mem_size & ((1ll << 32) - 1),
649 NULL);
650 if (memfd == NULL) {
651 win32_maperr(GetLastError());
652 uerror("CreateFileMapping", Nothing);
654 if (!SetHandleInformation(memfd, HANDLE_FLAG_INHERIT, HANDLE_FLAG_INHERIT)) {
655 win32_maperr(GetLastError());
656 uerror("SetHandleInformation", Nothing);
660 #else
662 static int memfd_shared_mem = -1;
663 static int memfd_shmffi = -1;
665 static void raise_failed_anonymous_memfd_init(void) {
666 static const value *exn = NULL;
667 if (!exn) exn = caml_named_value("failed_anonymous_memfd_init");
668 caml_raise_constant(*exn);
671 static void raise_less_than_minimum_available(uint64_t avail) {
672 value arg;
673 static const value *exn = NULL;
674 if (!exn) exn = caml_named_value("less_than_minimum_available");
675 arg = Val_long(avail);
676 caml_raise_with_arg(*exn, arg);
679 #include <sys/statvfs.h>
680 void assert_avail_exceeds_minimum(const char *shm_dir, uint64_t minimum_avail) {
681 struct statvfs stats;
682 uint64_t avail;
683 if (statvfs(shm_dir, &stats)) {
684 uerror("statvfs", caml_copy_string(shm_dir));
686 avail = stats.f_bsize * stats.f_bavail;
687 if (avail < minimum_avail) {
688 raise_less_than_minimum_available(avail);
692 int memfd_create_helper(const char *name, const char *shm_dir, size_t shared_mem_size, uint64_t minimum_avail) {
693 int memfd = -1;
695 if (shm_dir == NULL) {
696 // This means that we should try to use the anonymous-y system calls
697 #if defined(MEMFD_CREATE)
698 memfd = memfd_create(name, 0);
699 #endif
700 #if defined(__APPLE__)
701 if (memfd < 0) {
702 char memname[255];
703 snprintf(memname, sizeof(memname), "/%s.%d", name, getpid());
704 // the ftruncate below will fail with errno EINVAL if you try to
705 // ftruncate the same sharedmem fd more than once. We're seeing this in
706 // some tests, which might imply that two flow processes with the same
707 // pid are starting up. This shm_unlink should prevent that from
708 // happening. Here's a stackoverflow about it
709 // http://stackoverflow.com/questions/25502229/ftruncate-not-working-on-posix-shared-memory-in-mac-os-x
710 shm_unlink(memname);
711 memfd = shm_open(memname, O_CREAT | O_RDWR, 0666);
712 if (memfd < 0) {
713 uerror("shm_open", Nothing);
716 // shm_open sets FD_CLOEXEC automatically. This is undesirable, because
717 // we want this fd to be open for other processes, so that they can
718 // reconnect to the shared memory.
719 int fcntl_flags = fcntl(memfd, F_GETFD);
720 if (fcntl_flags == -1) {
721 printf("Error with fcntl(memfd): %s\n", strerror(errno));
722 uerror("fcntl", Nothing);
724 // Unset close-on-exec
725 fcntl(memfd, F_SETFD, fcntl_flags & ~FD_CLOEXEC);
727 #endif
728 if (memfd < 0) {
729 raise_failed_anonymous_memfd_init();
731 } else {
732 if (minimum_avail > 0) {
733 assert_avail_exceeds_minimum(shm_dir, minimum_avail);
735 if (memfd < 0) {
736 char template[1024];
737 if (!snprintf(template, 1024, "%s/%s-XXXXXX", shm_dir, name)) {
738 uerror("snprintf", Nothing);
740 memfd = mkstemp(template);
741 if (memfd < 0) {
742 uerror("mkstemp", caml_copy_string(template));
744 unlink(template);
747 if(ftruncate(memfd, shared_mem_size) == -1) {
748 uerror("ftruncate", Nothing);
750 return memfd;
753 /**************************************************************************
754  * The memfd_init function creates an anonymous memory file that might
755  * be inherited by `Daemon.spawned` processes (contrary to a simple
756 * anonymous mmap).
758  * The preferred mechanism is memfd_create(2) (see the description
759  * above). Then we try shm_open(2) (on Apple OS X). As a safe fallback,
760 * we use `mkstemp/unlink`.
762  * mkstemp is preferred over shm_open on Linux as it allows choosing
763  * a directory other than `/dev/shm` on systems where that
764  * partition is too small (e.g. the Travis containers).
766 * The resulting file descriptor should be mmaped with the memfd_map
767 * function (see below).
768 ****************************************************************************/
769 void memfd_init(const char *shm_dir, size_t shared_mem_size, uint64_t minimum_avail) {
770 memfd_shared_mem = memfd_create_helper("fb_heap", shm_dir, shared_mem_size, minimum_avail);
771 if (shm_use_sharded_hashtbl) {
772 memfd_shmffi = memfd_create_helper("fb_sharded_hashtbl", shm_dir, SHARDED_HASHTBL_MEM_SIZE, 0);
776 #endif
779 /*****************************************************************************/
780 /* Given a pointer to the shared memory address space, initializes all
781 * the globals that live in shared memory.
783 /*****************************************************************************/
785 #ifdef _WIN32
787 static char *memfd_map(HANDLE memfd, char *mem_addr, size_t shared_mem_size) {
788 char *mem = NULL;
789 mem = MapViewOfFileEx(
790 memfd,
791 FILE_MAP_ALL_ACCESS,
792 0, 0, 0,
793 (char *)mem_addr);
794 if (mem != mem_addr) {
795 win32_maperr(GetLastError());
796 uerror("MapViewOfFileEx", Nothing);
798 return mem;
801 #else
803 static char *memfd_map(int memfd, char *mem_addr, size_t shared_mem_size) {
804 char *mem = NULL;
805 /* MAP_NORESERVE is because we want a lot more virtual memory than what
806 * we are actually going to use.
808 int flags = MAP_SHARED | MAP_NORESERVE | MAP_FIXED;
809 int prot = PROT_READ | PROT_WRITE;
810 mem =
811 (char*)mmap((void *)mem_addr, shared_mem_size, prot,
812 flags, memfd, 0);
813 if(mem == MAP_FAILED) {
814 printf("Error initializing: %s\n", strerror(errno));
815 exit(2);
817 return mem;
820 #endif
822 /****************************************************************************
823  * The function memfd_reserve forces allocation of (mem -> mem+sz) in
824  * the shared heap. This is mandatory on Windows. It is optional on
825  * Linux, but it lets us report explicit "Out of memory" error
826 * messages. Otherwise, the kernel might terminate the process with
827 * `SIGBUS`.
828 ****************************************************************************/
831 static void raise_out_of_shared_memory(void)
833 static const value *exn = NULL;
834 if (!exn) exn = caml_named_value("out_of_shared_memory");
835 caml_raise_constant(*exn);
838 #ifdef _WIN32
840 /* Reserves memory. This is required on Windows */
841 static void win_reserve(char * mem, size_t sz) {
842 if (!VirtualAlloc(mem, sz, MEM_COMMIT, PAGE_READWRITE)) {
843 win32_maperr(GetLastError());
844 raise_out_of_shared_memory();
848 /* On Linux, memfd_reserve is only used to reserve memory that is mmap'd to the
849 * memfd file. Memory outside of that mmap does not need to be reserved, so we
850 * don't call memfd_reserve on things like the temporary mmap used by
851 * hh_collect. Instead, they use win_reserve() */
852 static void memfd_reserve(int memfd, char * mem, size_t sz) {
853 (void)memfd;
854 win_reserve(mem, sz);
857 #elif defined(__APPLE__)
859 /* So OSX lacks fallocate, but in general you can do
860 * fcntl(fd, F_PREALLOCATE, &store)
861 * however it doesn't seem to work for a shm_open fd, so this function is
862 * currently a no-op. This means that our OOM handling for OSX is a little
863 * weaker than the other OS's */
864 static void memfd_reserve(int memfd, char * mem, size_t sz) {
865 (void)memfd;
866 (void)mem;
867 (void)sz;
870 #else
872 static void memfd_reserve(int memfd, char *mem, size_t sz) {
873 off_t offset = (off_t)(mem - shared_mem);
874 int err;
875 do {
876 err = posix_fallocate(memfd, offset, sz);
877 } while (err == EINTR);
878 if (err) {
879 raise_out_of_shared_memory();
883 #endif
885 // DON'T WRITE TO THE SHARED MEMORY IN THIS FUNCTION!!! This function just
886 // calculates where the memory is and sets local globals. The shared memory
887 // might not be ready for writing yet! If you want to initialize a bit of
888 // shared memory, check out init_shared_globals
889 static void define_globals(char * shared_mem_init) {
890 size_t page_size = getpagesize();
891 char *mem = shared_mem_init;
893 // Beginning of the shared memory
894 shared_mem = mem;
896 #ifdef MADV_DONTDUMP
897 // We are unlikely to get much useful information out of the shared heap in
898 // a core file. Moreover, it can be HUGE, and the extensive work done dumping
899 // it once for each CPU can mean that the user will reboot their machine
900 // before the much more useful stack gets dumped!
901 madvise(shared_mem, shared_mem_size, MADV_DONTDUMP);
902 #endif
904 /* BEGINNING OF THE SMALL OBJECTS PAGE
905 * We keep all the small objects in this page.
906 * They are on different cache lines because we modify them atomically.
909 /* The pointer to the top of the heap.
910 * We will atomically increment *heap every time we want to allocate.
912 heap = (char**)mem;
914 // The number of elements in the hashtable
915 assert(CACHE_LINE_SIZE >= sizeof(uint64_t));
916 hcounter = (uint64_t*)(mem + CACHE_LINE_SIZE);
918 // The number of elements in the deptable
919 assert(CACHE_LINE_SIZE >= sizeof(uint64_t));
920 dcounter = (uint64_t*)(mem + 2*CACHE_LINE_SIZE);
922 assert (CACHE_LINE_SIZE >= sizeof(uintptr_t));
923 counter = (uintptr_t*)(mem + 3*CACHE_LINE_SIZE);
925 assert (CACHE_LINE_SIZE >= sizeof(pid_t));
926 master_pid = (pid_t*)(mem + 4*CACHE_LINE_SIZE);
928 assert (CACHE_LINE_SIZE >= sizeof(size_t));
929 log_level = (size_t*)(mem + 5*CACHE_LINE_SIZE);
931 assert (CACHE_LINE_SIZE >= sizeof(double));
932 sample_rate = (double*)(mem + 6*CACHE_LINE_SIZE);
934 assert (CACHE_LINE_SIZE >= sizeof(size_t));
935 compression = (size_t*)(mem + 7*CACHE_LINE_SIZE);
937 assert (CACHE_LINE_SIZE >= sizeof(size_t));
938 workers_should_exit = (size_t*)(mem + 8*CACHE_LINE_SIZE);
940 assert (CACHE_LINE_SIZE >= sizeof(size_t));
941 wasted_heap_size = (size_t*)(mem + 9*CACHE_LINE_SIZE);
943 assert (CACHE_LINE_SIZE >= sizeof(size_t));
944 allow_removes = (size_t*)(mem + 10*CACHE_LINE_SIZE);
946 assert (CACHE_LINE_SIZE >= sizeof(size_t));
947 allow_dependency_table_reads = (size_t*)(mem + 11*CACHE_LINE_SIZE);
949 assert (CACHE_LINE_SIZE >= sizeof(size_t));
950 hcounter_filled = (size_t*)(mem + 12*CACHE_LINE_SIZE);
952 mem += page_size;
953 // Just checking that the page is large enough.
954 assert(page_size > 13*CACHE_LINE_SIZE + (int)sizeof(int));
956 assert (CACHE_LINE_SIZE >= sizeof(local_t));
957 locals = mem;
958 mem += locals_size_b;
960 /* END OF THE SMALL OBJECTS PAGE */
962 /* Global storage initialization */
963 global_storage = (value*)mem;
964 mem += global_size_b;
966 /* Dependencies */
967 deptbl = (deptbl_entry_t*)mem;
968 mem += dep_size_b;
970 deptbl_bindings = (uint64_t*)mem;
971 mem += bindings_size_b;
973 /* Hashtable */
974 hashtbl = (helt_t*)mem;
975 mem += hashtbl_size_b;
977 /* Heap */
978 heap_init = mem;
979 heap_max = heap_init + heap_size;
981 #ifdef _WIN32
982 /* Reserve all memory space except the "huge" `global_size_b`. This is
983 * required for Windows but we don't do this for Linux since it lets us run
984 * more processes in parallel without running out of memory immediately
985 * (though we do risk it later on) */
986 memfd_reserve(memfd_shared_mem, (char *)global_storage, sizeof(global_storage[0]));
987 memfd_reserve(memfd_shared_mem, (char *)heap, heap_init - (char *)heap);
988 #endif
992 /* The total size of the shared memory. Most of it is going to remain
993 * virtual. */
994 static size_t get_shared_mem_size(void) {
995 size_t page_size = getpagesize();
996 return (global_size_b + dep_size_b + bindings_size_b + hashtbl_size_b +
997 heap_size + page_size + locals_size_b);
1000 static void init_shared_globals(
1001 size_t config_log_level,
1002 double config_sample_rate,
1003 size_t config_compression
1005   // The initial size of the global storage is zero
1006 global_storage[0] = 0;
1007   // Initialize the number of elements in the table
1008 *hcounter = 0;
1009 *hcounter_filled = 0;
1010 *dcounter = 0;
1011 // Ensure the global counter starts on a COUNTER_RANGE boundary
1012 *counter = ALIGN(early_counter + 1, COUNTER_RANGE);
1013 *log_level = config_log_level;
1014 *sample_rate = config_sample_rate;
1015 *compression = config_compression;
1016 *workers_should_exit = 0;
1017 *wasted_heap_size = 0;
1018 *allow_removes = 1;
1019 *allow_dependency_table_reads = 1;
1021 for (uint64_t i = 0; i <= num_workers; i++) {
1022 LOCAL(i)->counter = 0;
1025 // Initialize top heap pointers
1026 *heap = heap_init;
1029 static void set_sizes(
1030 uint64_t config_global_size,
1031 uint64_t config_heap_size,
1032 uint64_t config_dep_table_pow,
1033 uint64_t config_hash_table_pow,
1034 uint64_t config_num_workers) {
1036 size_t page_size = getpagesize();
1038 global_size = config_global_size;
1039 global_size_b = sizeof(global_storage[0]) + config_global_size;
1040 heap_size = config_heap_size;
1041 dep_table_pow = config_dep_table_pow;
1042 hash_table_pow = config_hash_table_pow;
1044 dep_size = 1ul << config_dep_table_pow;
1045 dep_size_b = dep_size * sizeof(deptbl[0]);
1046 bindings_size_b = dep_size * sizeof(deptbl_bindings[0]);
1047 hashtbl_size = 1ul << config_hash_table_pow;
1048 hashtbl_size_b = hashtbl_size * sizeof(hashtbl[0]);
1050 // We will allocate a cache line for the master process and each worker
1051 // process, then pad that out to the nearest page.
1052 num_workers = config_num_workers;
1053 locals_size_b = ALIGN((1 + num_workers) * CACHE_LINE_SIZE, page_size);
1055 shared_mem_size = get_shared_mem_size();
1058 /*****************************************************************************/
1059 /* Must be called by the master BEFORE forking the workers! */
1060 /*****************************************************************************/
1061 CAMLprim value hh_shared_init(
1062 value config_val,
1063 value shm_dir_val,
1064 value num_workers_val
1066 CAMLparam3(config_val, shm_dir_val, num_workers_val);
1067 CAMLlocal1(connector);
1068 CAMLlocal5(
1069 config_global_size_val,
1070 config_heap_size_val,
1071 config_dep_table_pow_val,
1072 config_hash_table_pow_val,
1073 config_shm_use_sharded_hashtbl
1076 config_global_size_val = Field(config_val, 0);
1077 config_heap_size_val = Field(config_val, 1);
1078 config_dep_table_pow_val = Field(config_val, 2);
1079 config_hash_table_pow_val = Field(config_val, 3);
1080 config_shm_use_sharded_hashtbl = Field(config_val, 4);
1082 set_sizes(
1083 Long_val(config_global_size_val),
1084 Long_val(config_heap_size_val),
1085 Long_val(config_dep_table_pow_val),
1086 Long_val(config_hash_table_pow_val),
1087 Long_val(num_workers_val)
1089 shm_use_sharded_hashtbl = Bool_val(config_shm_use_sharded_hashtbl);
1091 // None -> NULL
1092 // Some str -> String_val(str)
1093 const char *shm_dir = NULL;
1094 if (shm_dir_val != Val_int(0)) {
1095 shm_dir = String_val(Field(shm_dir_val, 0));
1098 memfd_init(
1099 shm_dir,
1100 shared_mem_size,
1101 Long_val(Field(config_val, 6))
1103 assert(memfd_shared_mem >= 0);
1104 char *shared_mem_init = memfd_map(memfd_shared_mem, SHARED_MEM_INIT, shared_mem_size);
1105 define_globals(shared_mem_init);
1107 if (shm_use_sharded_hashtbl) {
1108 assert(memfd_shmffi >= 0);
1109 assert(SHARED_MEM_INIT + shared_mem_size <= SHARDED_HASHTBL_MEM_ADDR);
1110 char *mem_addr = memfd_map(memfd_shmffi, SHARDED_HASHTBL_MEM_ADDR, SHARDED_HASHTBL_MEM_SIZE);
1111 shmffi_init(mem_addr, SHARDED_HASHTBL_MEM_SIZE);
1114 // Keeping the pids around to make asserts.
1115 #ifdef _WIN32
1116 *master_pid = 0;
1117 my_pid = *master_pid;
1118 #else
1119 *master_pid = getpid();
1120 my_pid = *master_pid;
1121 #endif
1123 init_shared_globals(
1124 Long_val(Field(config_val, 7)),
1125 Double_val(Field(config_val, 8)),
1126 Long_val(Field(config_val, 9))
1128 // Checking that we did the maths correctly.
1129 assert(*heap + heap_size == shared_mem + shared_mem_size);
1131 #ifndef _WIN32
1132 // Uninstall ocaml's segfault handler. It's supposed to throw an exception on
1133 // stack overflow, but we don't actually handle that exception, so what
1134 // happens in practice is we terminate at toplevel with an unhandled exception
1135 // and a useless ocaml backtrace. A core dump is actually more useful. Sigh.
1136 struct sigaction sigact = { 0 };
1137 sigact.sa_handler = SIG_DFL;
1138 sigemptyset(&sigact.sa_mask);
1139 sigact.sa_flags = 0;
1140 sigaction(SIGSEGV, &sigact, NULL);
1141 #endif
1143 connector = caml_alloc_tuple(8);
1144 Store_field(connector, 0, Val_handle(memfd_shared_mem));
1145 Store_field(connector, 1, config_global_size_val);
1146 Store_field(connector, 2, config_heap_size_val);
1147 Store_field(connector, 3, config_dep_table_pow_val);
1148 Store_field(connector, 4, config_hash_table_pow_val);
1149 Store_field(connector, 5, num_workers_val);
1150 Store_field(connector, 6, config_shm_use_sharded_hashtbl);
1151 Store_field(connector, 7, Val_handle(memfd_shmffi));
1153 CAMLreturn(connector);
1156 /* Must be called by every worker before any operation is performed */
1157 value hh_connect(value connector, value worker_id_val) {
1158 CAMLparam2(connector, worker_id_val);
1159 memfd_shared_mem = Handle_val(Field(connector, 0));
1160 set_sizes(
1161 Long_val(Field(connector, 1)),
1162 Long_val(Field(connector, 2)),
1163 Long_val(Field(connector, 3)),
1164 Long_val(Field(connector, 4)),
1165 Long_val(Field(connector, 5))
1167 shm_use_sharded_hashtbl = Bool_val(Field(connector, 6));
1168 memfd_shmffi = Handle_val(Field(connector, 7));
1169 worker_id = Long_val(worker_id_val);
1170 #ifdef _WIN32
1171 my_pid = 1; // Trick
1172 #else
1173 my_pid = getpid();
1174 #endif
1175 assert(memfd_shared_mem >= 0);
1176 char *shared_mem_init = memfd_map(memfd_shared_mem, SHARED_MEM_INIT, shared_mem_size);
1177 define_globals(shared_mem_init);
1179 if (shm_use_sharded_hashtbl) {
1180 assert(memfd_shmffi >= 0);
1181 char *mem_addr = memfd_map(memfd_shmffi, SHARDED_HASHTBL_MEM_ADDR, SHARDED_HASHTBL_MEM_SIZE);
1182 shmffi_attach(mem_addr, SHARDED_HASHTBL_MEM_SIZE);
1185 CAMLreturn(Val_unit);
1188 /* Can only be called after init or after earlier connect. */
1189 value hh_get_handle(void) {
1190 CAMLparam0();
1191 CAMLlocal1(
1192 connector
1194 connector = caml_alloc_tuple(8);
1195 Store_field(connector, 0, Val_handle(memfd_shared_mem));
1196 Store_field(connector, 1, Val_long(global_size));
1197 Store_field(connector, 2, Val_long(heap_size));
1198 Store_field(connector, 3, Val_long(dep_table_pow));
1199 Store_field(connector, 4, Val_long(hash_table_pow));
1200 Store_field(connector, 5, Val_long(num_workers));
1201 Store_field(connector, 6, Val_bool(shm_use_sharded_hashtbl));
1202 Store_field(connector, 7, Val_bool(memfd_shmffi));
1204 CAMLreturn(connector);
1207 /*****************************************************************************/
1208 /* Counter
1210 * Provides a counter intended to be increasing over the lifetime of the program
1211 * including all forks. Uses a global variable until hh_shared_init is called,
1212 * so it's safe to use in the early init stages of the program (as long as you
1213 * fork after hh_shared_init of course). Wraps around at the maximum value of an
1214  * ocaml int, i.e. 2^30 - 1 on 32-bit architectures and 2^62 - 1 on 64-bit
1215  * architectures.
1217 /*****************************************************************************/
1219 CAMLprim value hh_counter_next(void) {
1220 CAMLparam0();
1221 CAMLlocal1(result);
1223 uintptr_t v = 0;
1224 if (counter) {
1225 v = LOCAL(worker_id)->counter;
1226 if (v % COUNTER_RANGE == 0) {
1227 v = __atomic_fetch_add(counter, COUNTER_RANGE, __ATOMIC_RELAXED);
1229 ++v;
1230 LOCAL(worker_id)->counter = v;
1231 } else {
1232 v = ++early_counter;
1235 result = Val_long(v % Max_long); // Wrap around.
1236 CAMLreturn(result);
1239 /*****************************************************************************/
1240 /* There are a bunch of operations that only the designated master process is
1241 * allowed to do. This assert will fail if the current process is not the master
1242 * process
1244 /*****************************************************************************/
1245 void assert_master(void) {
1246 assert(my_pid == *master_pid);
1249 void assert_not_master(void) {
1250 assert(my_pid != *master_pid);
1253 void assert_allow_removes(void) {
1254 assert(*allow_removes);
1257 void assert_allow_hashtable_writes_by_current_process(void) {
1258 assert(allow_hashtable_writes_by_current_process);
1261 void assert_allow_dependency_table_reads(void) {
1262 assert(*allow_dependency_table_reads);
1265 /*****************************************************************************/
1267 CAMLprim value hh_stop_workers(void) {
1268 CAMLparam0();
1269 assert_master();
1270 *workers_should_exit = 1;
1271 CAMLreturn(Val_unit);
1274 CAMLprim value hh_resume_workers(void) {
1275 CAMLparam0();
1276 assert_master();
1277 *workers_should_exit = 0;
1278 CAMLreturn(Val_unit);
1281 CAMLprim value hh_set_can_worker_stop(value val) {
1282 CAMLparam1(val);
1283 worker_can_exit = Bool_val(val);
1284 CAMLreturn(Val_unit);
1287 CAMLprim value hh_set_allow_removes(value val) {
1288 CAMLparam1(val);
1289 *allow_removes = Bool_val(val);
1290 CAMLreturn(Val_unit);
1293 CAMLprim value hh_set_allow_hashtable_writes_by_current_process(value val) {
1294 CAMLparam1(val);
1295 allow_hashtable_writes_by_current_process = Bool_val(val);
1296 CAMLreturn(Val_unit);
1299 void check_should_exit(void) {
1300 if (workers_should_exit == NULL) {
1301 caml_failwith(
1302 "`check_should_exit` failed: `workers_should_exit` was uninitialized. "
1303 "Did you forget to call one of `hh_connect` or `hh_shared_init` "
1304 "to initialize shared memory before accessing it?"
1306 } else if (*workers_should_exit) {
1307 static const value *exn = NULL;
1308 if (!exn) exn = caml_named_value("worker_should_exit");
1309 caml_raise_constant(*exn);
1313 CAMLprim value hh_check_should_exit (void) {
1314 CAMLparam0();
1315 check_should_exit();
1316 CAMLreturn(Val_unit);
1319 /*****************************************************************************/
1320 /* Global storage */
1321 /*****************************************************************************/
1323 void hh_shared_store(value data) {
1324 CAMLparam1(data);
1325 size_t size = caml_string_length(data);
1327 assert_master(); // only the master can store
1328 assert(global_storage[0] == 0); // Is it clear?
1329 assert(size < global_size_b - sizeof(global_storage[0])); // Do we have enough space?
1331 global_storage[0] = size;
1332 memfd_reserve(memfd_shared_mem, (char *)&global_storage[1], size);
1333 memcpy(&global_storage[1], &Field(data, 0), size);
1335 CAMLreturn0;
1338 /*****************************************************************************/
1339 /* We are allocating ocaml values. The OCaml GC must know about them.
1340  * caml_alloc_string might trigger the GC; when that happens, the GC needs
1341 * to scan the stack to find the OCaml roots. The macros CAMLparam0 and
1342 * CAMLlocal1 register the roots.
1344 /*****************************************************************************/
1346 CAMLprim value hh_shared_load(void) {
1347 CAMLparam0();
1348 CAMLlocal1(result);
1350 size_t size = global_storage[0];
1351 assert(size != 0);
1352 result = caml_alloc_string(size);
1353 memcpy(&Field(result, 0), &global_storage[1], size);
1355 CAMLreturn(result);
1358 void hh_shared_clear(void) {
1359 assert_master();
1360 global_storage[0] = 0;
1363 value hh_check_heap_overflow(void) {
1364 if (shm_use_sharded_hashtbl) {
1365 return Val_bool(0);
1368 if (*heap >= shared_mem + shared_mem_size) {
1369 return Val_bool(1);
1371 return Val_bool(0);
1374 /*****************************************************************************/
1375 /* We compact the heap when it gets twice as large as its initial size.
1376 * Step one, copy the live values in a new heap.
1377 * Step two, memcopy the values back into the shared heap.
1378 * We could probably use something smarter, but this is fast enough.
1380 * The collector should only be called by the master.
1382 /*****************************************************************************/
1384 CAMLprim value hh_collect(void) {
1385 if (shm_use_sharded_hashtbl != 0) {
1386 return Val_unit;
1389 // NOTE: explicitly do NOT call CAMLparam or any of the other functions/macros
1390 // defined in caml/memory.h .
1391 // This function takes a boolean and returns unit.
1392 // Those are both immediates in the OCaml runtime.
1393 assert_master();
1394 assert_allow_removes();
1396 // Step 1: Walk the hashtbl entries, which are the roots of our marking pass.
1398 for (size_t i = 0; i < hashtbl_size; i++) {
1399 // Skip empty slots
1400 if (hashtbl[i].addr == NULL) { continue; }
1402 // No workers should be writing at the moment. If a worker died in the
1403 // middle of a write, that is also very bad
1404 assert(hashtbl[i].addr != HASHTBL_WRITE_IN_PROGRESS);
1406 // The hashtbl addr will be wrong after we relocate the heap entry, but we
1407 // don't know where the heap entry will relocate to yet. We need to first
1408 // move the heap entry, then fix up the hashtbl addr.
1410 // We accomplish this by storing the heap header in the now useless addr
1411 // field and storing a pointer to the addr field where the header used to
1412 // be. Then, after moving the heap entry, we can follow the pointer to
1413 // restore our original header and update the addr field to our relocated
1414 // address.
1416 // This is all super unsafe and only works because we constrain the size of
1417 // an hh_header_t struct to the size of a pointer.
1419 // Location of the addr field (8 bytes) in the hashtable
1420 char **hashtbl_addr = (char **)&hashtbl[i].addr;
1422 // Location of the header (8 bytes) in the heap
1423 char *heap_addr = (char *)hashtbl[i].addr;
1425 // Swap
1426 hh_header_t header = *(hh_header_t *)heap_addr;
1427 *(hh_header_t *)hashtbl_addr = header;
1428 *(uintptr_t *)heap_addr = (uintptr_t)hashtbl_addr;
1431 // Step 2: Walk the heap and relocate entries, updating the hashtbl to point
1432 // to relocated addresses.
1434 // Pointer to free space in the heap where moved values will move to.
1435 char *dest = heap_init;
1437 // Pointer that walks the heap from bottom to top.
1438 char *src = heap_init;
1440 size_t aligned_size;
1441 hh_header_t header;
1442 while (src < *heap) {
1443 if (*(uint64_t *)src & 1) {
1444 // If the lsb is set, this is a header. If it's a header, that means the
1445 // entry was not marked in the first pass and should be collected. Don't
1446 // move dest pointer, but advance src pointer to next heap entry.
1447 header = *(hh_header_t *)src;
1448 aligned_size = HEAP_ALIGN(Heap_entry_total_size(header));
1449 } else {
1450 // If the lsb is 0, this is a pointer to the addr field of the hashtable
1451 // element, which holds the header bytes. This entry is live.
1452 char *hashtbl_addr = *(char **)src;
1453 header = *(hh_header_t *)hashtbl_addr;
1454 aligned_size = HEAP_ALIGN(Heap_entry_total_size(header));
1456 // Fix the hashtbl addr field to point to our new location and restore the
1457 // heap header data temporarily stored in the addr field bits.
1458 *(uintptr_t *)hashtbl_addr = (uintptr_t)dest;
1459 *(hh_header_t *)src = header;
1461 // Move the entry as far to the left as possible.
1462 memmove(dest, src, aligned_size);
1463 dest += aligned_size;
1466 src += aligned_size;
1469 // TODO: Space between dest and *heap is unused, but will almost certainly
1470 // become used again soon. Currently we will never decommit, which may cause
1471 // issues when there is memory pressure.
1473 // If the kernel supports it, we might consider using madvise(MADV_FREE),
1474 // which allows the kernel to reclaim the memory lazily under pressure, but
1475 // would not force page faults under healthy operation.
1477 *heap = dest;
1478 *wasted_heap_size = 0;
1480 return Val_unit;
1483 CAMLprim value hh_malloc_trim(void) {
1484 #ifdef MALLOC_TRIM
1485 malloc_trim(0);
1486 #endif
1487 return Val_unit;
1490 static void raise_heap_full(void) {
1491 static const value *exn = NULL;
1492 if (!exn) exn = caml_named_value("heap_full");
1493 caml_raise_constant(*exn);
1496 /*****************************************************************************/
1497 /* Allocates in the shared heap. The chunks are 8-byte aligned. */
1498 /*****************************************************************************/
1500 static heap_entry_t* hh_alloc(hh_header_t header, /*out*/size_t *total_size) {
1501 // the size of this allocation needs to be kept in sync with wasted_heap_size
1502 // modification in hh_remove
1503 size_t slot_size = HEAP_ALIGN(Heap_entry_total_size(header));
1504 *total_size = slot_size;
1505 char *chunk = __sync_fetch_and_add(heap, (char*) slot_size);
1506 if (chunk + slot_size > heap_max) {
1507 raise_heap_full();
1509 memfd_reserve(memfd_shared_mem, chunk, slot_size);
1510 ((heap_entry_t *)chunk)->header = header;
1511 return (heap_entry_t *)chunk;
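/* Editor's note: hh_alloc above is a lock-free bump allocator. The single
 * __sync_fetch_and_add reserves a private [chunk, chunk + slot_size) range
 * for the caller, so concurrent writers never receive overlapping chunks;
 * the heap-full check and memfd_reserve then act only on that private range.
 * Space freed logically (see hh_remove) is not reused here -- it is
 * reclaimed by hh_collect. */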
1514 /*****************************************************************************/
1515 /* Serializes an ocaml value into an Ocaml raw heap_entry (bytes) */
1516 /*****************************************************************************/
1517 value hh_serialize_raw(value data) {
1518 CAMLparam1(data);
1519 CAMLlocal1(result);
1520 char* data_value = NULL;
1521 size_t size = 0;
1522 size_t uncompressed_size = 0;
1523 storage_kind kind = 0;
1524 if (shm_use_sharded_hashtbl != 0) {
1525 raise_assertion_failure(LOCATION": shm_use_sharded_hashtbl not implemented");
1528 // If the data is an Ocaml string it is more efficient to copy its contents
1529 // directly instead of serializing it.
1530 if (Is_block(data) && Tag_val(data) == String_tag) {
1531 size = caml_string_length(data);
1532 data_value = malloc(size);
1533 memcpy(data_value, String_val(data), size);
1534 kind = KIND_STRING;
1535 } else {
1536 intnat serialized_size;
1537 // We are responsible for freeing the memory allocated by this function
1538 // After copying data_value we need to make sure to free data_value
1539 caml_output_value_to_malloc(
1540 data, Val_int(0)/*flags*/, &data_value, &serialized_size);
1542 assert(serialized_size >= 0);
1543 size = (size_t) serialized_size;
1544 kind = KIND_SERIALIZED;
1547 // We limit the size of elements we will allocate to our heap to ~2GB
1548 assert(size < 0x80000000);
1550 size_t max_compression_size = 0;
1551 char* compressed_data = NULL;
1552 size_t compressed_size = 0;
1554 if (*compression) {
1555 max_compression_size = ZSTD_compressBound(size);
1556 compressed_data = malloc(max_compression_size);
1558 compressed_size = ZSTD_compress(compressed_data, max_compression_size, data_value, size, *compression);
1560 else {
1561 max_compression_size = LZ4_compressBound(size);
1562 compressed_data = malloc(max_compression_size);
1563 compressed_size = LZ4_compress_default(
1564 data_value,
1565 compressed_data,
1566 size,
1567 max_compression_size);
1570 if (compressed_size != 0 && compressed_size < size) {
1571 uncompressed_size = size;
1572 size = compressed_size;
1575 // Both size and uncompressed_size will certainly fit in 31 bits, as the
1576 // original size fits per the assert above and we check that the compressed
1577 // size is less than the original size.
1578 hh_header_t header
1579 = size << 33
1580 | (uint64_t)kind << 32
1581 | uncompressed_size << 1
1582 | 1;
1584 size_t ocaml_size = Heap_entry_total_size(header);
1585 result = caml_alloc_string(ocaml_size);
1586 heap_entry_t *addr = (heap_entry_t *)Bytes_val(result);
1587 addr->header = header;
1588 memcpy(&addr->data,
1589 uncompressed_size ? compressed_data : data_value,
1590 size);
1592 free(compressed_data);
1593 // We temporarily allocate memory using malloc to serialize the Ocaml object.
1594 // When we have finished copying the serialized data we need to free the
1595 // memory we allocated to avoid a leak.
1596 free(data_value);
1598 CAMLreturn(result);
1601 /*****************************************************************************/
1602 /* Allocates an ocaml value in the shared heap.
1603 * Any ocaml value is valid, except closures. It returns the address of
1604 * the allocated chunk.
1606 /*****************************************************************************/
1607 static heap_entry_t* hh_store_ocaml(
1608 value data,
1609 /*out*/size_t *alloc_size,
1610 /*out*/size_t *orig_size,
1611 /*out*/size_t *total_size
1613 char* data_value = NULL;
1614 size_t size = 0;
1615 size_t uncompressed_size = 0;
1616 storage_kind kind = 0;
1618 // If the data is an Ocaml string it is more efficient to copy its contents
1619 // directly in our heap instead of serializing it.
1620 if (Is_block(data) && Tag_val(data) == String_tag) {
1621 size = caml_string_length(data);
1622 data_value = malloc(size);
1623 memcpy(data_value, String_val(data), size);
1624 kind = KIND_STRING;
1625 } else {
1626 intnat serialized_size;
1627 // We are responsible for freeing the memory allocated by this function
1628 // After copying data_value into our object heap we need to make sure to free
1629 // data_value
1630 caml_output_value_to_malloc(
1631 data, Val_int(0)/*flags*/, &data_value, &serialized_size);
1633 assert(serialized_size >= 0);
1634 size = (size_t) serialized_size;
1635 kind = KIND_SERIALIZED;
1638 // We limit the size of elements we will allocate to our heap to ~2GB
1639 assert(size < 0x80000000);
1640 *orig_size = size;
1642 size_t max_compression_size = 0;
1643 char* compressed_data = NULL;
1644 size_t compressed_size = 0;
1646 if (*compression) {
1647 max_compression_size = ZSTD_compressBound(size);
1648 compressed_data = malloc(max_compression_size);
1650 compressed_size = ZSTD_compress(compressed_data, max_compression_size, data_value, size, *compression);
1651 }
1652 else {
1653 max_compression_size = LZ4_compressBound(size);
1654 compressed_data = malloc(max_compression_size);
1655 compressed_size = LZ4_compress_default(
1656 data_value,
1657 compressed_data,
1658 size,
1659 max_compression_size);
1660 }
1662 if (compressed_size != 0 && compressed_size < size) {
1663 uncompressed_size = size;
1664 size = compressed_size;
1665 }
1667 *alloc_size = size;
1669 // Both size and uncompressed_size will certainly fit in 31 bits, as the
1670 // original size fits per the assert above and we check that the compressed
1671 // size is less than the original size.
1672 hh_header_t header
1673 = size << 33
1674 | (uint64_t)kind << 32
1675 | uncompressed_size << 1
1676 | 1;
1678 heap_entry_t* addr = hh_alloc(header, total_size);
1679 memcpy(&addr->data,
1680 uncompressed_size ? compressed_data : data_value,
1681 size);
1683 free(compressed_data);
1684 // We temporarily allocate memory using malloc to serialize the Ocaml object.
1685 // When we have finished copying the serialized data into our heap we need
1686 // to free the memory we allocated to avoid a leak.
1687 free(data_value);
1689 return addr;
1690 }
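// Minimal self-contained sketch of the LZ4 contract relied on by
// hh_store_ocaml above and hh_deserialize below: compress into an
// LZ4_compressBound()-sized scratch buffer, keep the result only if it is
// actually smaller, and decompress with the original size as the destination
// capacity. All names below are hypothetical; only the LZ4 calls match the
// usage in this file.
static void example_lz4_round_trip(const char *payload, int payload_size) {
  int bound = LZ4_compressBound(payload_size);
  char *scratch = malloc(bound);
  int compressed_size =
    LZ4_compress_default(payload, scratch, payload_size, bound);
  if (compressed_size != 0 && compressed_size < payload_size) {
    // Worth storing compressed; the original size must be remembered so the
    // payload can be restored later.
    char *restored = malloc(payload_size);
    int restored_size =
      LZ4_decompress_safe(scratch, restored, compressed_size, payload_size);
    assert(restored_size == payload_size);
    free(restored);
  }
  // Otherwise the payload would be stored as-is (uncompressed_size stays 0).
  free(scratch);
}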
1692 /*****************************************************************************/
1693 /* Given an OCaml string, returns its first 8 bytes as an unsigned 64-bit integer.
1694 * The key is generated using MD5, but we only use the first 8 bytes because
1695 * it allows us to use atomic operations.
1696 */
1697 /*****************************************************************************/
1698 static uint64_t get_hash(value key) {
1699 return *((uint64_t*)String_val(key));
1700 }
1702 CAMLprim value get_hash_ocaml(value key) {
1703 return caml_copy_int64(*((uint64_t*)String_val(key)));
1704 }
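// Sketch of the key convention get_hash assumes: per the comment above, the
// OCaml caller passes an MD5 digest of the real key rather than the key
// itself, and only the first 8 bytes participate in hashing. The helper below
// is hypothetical and just expresses the same 8-byte read portably via memcpy.
static uint64_t example_first_8_bytes(const unsigned char digest[16]) {
  uint64_t h;
  memcpy(&h, digest, sizeof h);
  return h;
}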
1706 /*****************************************************************************/
1707 /* Writes the data in one of the slots of the hashtable. There might be
1708 * concurrent writers; when that happens, the first writer wins.
1710 * Returns the number of bytes allocated in the shared heap. If the slot
1711 * was already written to, a negative value is returned to indicate no new
1712 * memory was allocated.
1713 */
1714 /*****************************************************************************/
1715 static value write_at(unsigned int slot, value data) {
1716 CAMLparam1(data);
1717 CAMLlocal1(result);
1718 result = caml_alloc_tuple(3);
1719 // Try to write in a value to indicate that the data is being written.
1720 if(
1721 __sync_bool_compare_and_swap(
1722 &(hashtbl[slot].addr),
1723 NULL,
1724 HASHTBL_WRITE_IN_PROGRESS
1725 )
1726 ) {
1727 assert_allow_hashtable_writes_by_current_process();
1728 size_t alloc_size = 0;
1729 size_t orig_size = 0;
1730 size_t total_size = 0;
1731 hashtbl[slot].addr = hh_store_ocaml(data, &alloc_size, &orig_size, &total_size);
1732 Store_field(result, 0, Val_long(alloc_size));
1733 Store_field(result, 1, Val_long(orig_size));
1734 Store_field(result, 2, Val_long(total_size));
1735 __sync_fetch_and_add(hcounter_filled, 1);
1736 } else {
1737 Store_field(result, 0, Min_long);
1738 Store_field(result, 1, Min_long);
1739 Store_field(result, 2, Min_long);
1740 }
1741 CAMLreturn(result);
1742 }
1744 static void raise_hash_table_full(void) {
1745 static const value *exn = NULL;
1746 if (!exn) exn = caml_named_value("hash_table_full");
1747 caml_raise_constant(*exn);
1748 }
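// Standalone sketch of the "claim with CAS, then publish" pattern that
// write_at and write_raw_at apply to hashtbl[slot].addr: the first writer
// swings the pointer from NULL to a sentinel, performs the slow heap
// allocation, then publishes the real address; later writers observe a
// non-NULL value and drop their copy. The type, macro and names below are
// hypothetical stand-ins for the real slot and HASHTBL_WRITE_IN_PROGRESS.
typedef struct { void *addr; } example_slot_t;
#define EXAMPLE_WRITE_IN_PROGRESS ((void *)1)
static void example_claim_then_publish(example_slot_t *slot, void *payload) {
  if (__sync_bool_compare_and_swap(
        &slot->addr, NULL, EXAMPLE_WRITE_IN_PROGRESS)) {
    // We won the race: store the payload and make it visible to readers.
    slot->addr = payload;
  }
  // Otherwise another writer is (or was) storing an equivalent payload.
}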
1750 /*****************************************************************************/
1751 /* Adds a key value to the hashtable. This code is perf sensitive, please
1752 * check the perf before modifying.
1754 * Returns the number of bytes allocated into the shared heap, or a negative
1755 * number if no new memory was allocated.
1756 */
1757 /*****************************************************************************/
1758 value hh_add(value key, value data) {
1759 CAMLparam2(key, data);
1760 uint64_t hash = get_hash(key);
1761 if (shm_use_sharded_hashtbl != 0) {
1762 CAMLreturn(shmffi_add(hash, data));
1763 }
1764 check_should_exit();
1765 unsigned int slot = hash & (hashtbl_size - 1);
1766 unsigned int init_slot = slot;
1767 while(1) {
1768 uint64_t slot_hash = hashtbl[slot].hash;
1770 if(slot_hash == hash) {
1771 // overwrite previous value for this hash
1772 CAMLreturn(write_at(slot, data));
1773 }
1775 if (*hcounter >= hashtbl_size) {
1776 // We're never going to find a spot
1777 raise_hash_table_full();
1778 }
1780 if(slot_hash == 0) {
1781 // We think we might have a free slot, try to atomically grab it.
1782 if(__sync_bool_compare_and_swap(&(hashtbl[slot].hash), 0, hash)) {
1783 uint64_t size = __sync_fetch_and_add(hcounter, 1);
1784 // Sanity check
1785 assert(size < hashtbl_size);
1786 CAMLreturn(write_at(slot, data));
1787 }
1789 // Grabbing it failed -- why? If someone else is trying to insert
1790 // the data we were about to, try to insert it ourselves too.
1791 // Otherwise, keep going.
1792 // Note that this read relies on the __sync call above preventing the
1793 // compiler from caching the value read out of memory. (And of course
1794 // isn't safe on any arch that requires memory barriers.)
1795 if(hashtbl[slot].hash == hash) {
1796 // Some other thread already grabbed this slot to write this
1797 // key, but they might not have written the address (or even
1798 // the sigil value) yet. We can't return from hh_add until we
1799 // know that hh_mem would succeed, which is to say that addr is
1800 // no longer null. To make sure hh_mem will work, we try
1801 // writing the value ourselves; either we insert it ourselves or
1802 // we know the address is now non-NULL.
1803 CAMLreturn(write_at(slot, data));
1804 }
1805 }
1807 slot = (slot + 1) & (hashtbl_size - 1);
1808 if (slot == init_slot) {
1809 // We're never going to find a spot
1810 raise_hash_table_full();
1811 }
1812 }
1813 }
1815 /*****************************************************************************/
1816 /* Stores a raw bytes representation of a heap_entry in the shared heap. It
1817 * returns the address of the allocated chunk.
1818 */
1819 /*****************************************************************************/
1820 static heap_entry_t* hh_store_raw_entry(
1821 value data
1822 ) {
1823 size_t size = caml_string_length(data) - sizeof(heap_entry_t);
1824 size_t total_size = 0;
1825 heap_entry_t* entry = (heap_entry_t*)Bytes_val(data);
1827 hh_header_t header = entry->header;
1828 heap_entry_t* addr = hh_alloc(header, &total_size);
1829 memcpy(&addr->data,
1830 entry->data,
1831 size);
1832 return addr;
1833 }
1835 /*****************************************************************************/
1836 /* Writes the raw serialized data in one of the slots of the hashtable. There
1837 * might be concurrent writers; when that happens, the first writer wins.
1839 */
1840 /*****************************************************************************/
1841 static value write_raw_at(unsigned int slot, value data) {
1842 CAMLparam1(data);
1843 // Try to write in a value to indicate that the data is being written.
1844 if(
1845 __sync_bool_compare_and_swap(
1846 &(hashtbl[slot].addr),
1847 NULL,
1848 HASHTBL_WRITE_IN_PROGRESS
1849 )
1850 ) {
1851 assert_allow_hashtable_writes_by_current_process();
1852 hashtbl[slot].addr = hh_store_raw_entry(data);
1853 __sync_fetch_and_add(hcounter_filled, 1);
1854 }
1855 CAMLreturn(Val_unit);
1856 }
1858 /*****************************************************************************/
1859 /* Adds a key and raw heap_entry (represented as bytes) to the hashtable. Used
1860 * for over-the-network proxying.
1862 * Returns unit.
1863 */
1864 /*****************************************************************************/
1865 CAMLprim value hh_add_raw(value key, value data) {
1866 CAMLparam2(key, data);
1867 if (shm_use_sharded_hashtbl != 0) {
1868 raise_assertion_failure(LOCATION": shm_use_sharded_hashtbl not implemented");
1869 }
1870 check_should_exit();
1871 uint64_t hash = get_hash(key);
1872 unsigned int slot = hash & (hashtbl_size - 1);
1873 unsigned int init_slot = slot;
1874 while(1) {
1875 uint64_t slot_hash = hashtbl[slot].hash;
1877 if(slot_hash == hash) {
1878 CAMLreturn(write_raw_at(slot, data));
1879 }
1881 if (*hcounter >= hashtbl_size) {
1882 // We're never going to find a spot
1883 raise_hash_table_full();
1884 }
1886 if(slot_hash == 0) {
1887 // We think we might have a free slot, try to atomically grab it.
1888 if(__sync_bool_compare_and_swap(&(hashtbl[slot].hash), 0, hash)) {
1889 uint64_t size = __sync_fetch_and_add(hcounter, 1);
1890 // Sanity check
1891 assert(size < hashtbl_size);
1892 CAMLreturn(write_raw_at(slot, data));
1893 }
1895 // Grabbing it failed -- why? If someone else is trying to insert
1896 // the data we were about to, try to insert it ourselves too.
1897 // Otherwise, keep going.
1898 // Note that this read relies on the __sync call above preventing the
1899 // compiler from caching the value read out of memory. (And of course
1900 // isn't safe on any arch that requires memory barriers.)
1901 if(hashtbl[slot].hash == hash) {
1902 // Some other thread already grabbed this slot to write this
1903 // key, but they might not have written the address (or even
1904 // the sigil value) yet. We can't return from hh_add until we
1905 // know that hh_mem would succeed, which is to say that addr is
1906 // no longer null. To make sure hh_mem will work, we try
1907 // writing the value ourselves; either we insert it ourselves or
1908 // we know the address is now non-NULL.
1909 CAMLreturn(write_raw_at(slot, data));
1910 }
1911 }
1913 slot = (slot + 1) & (hashtbl_size - 1);
1914 if (slot == init_slot) {
1915 // We're never going to find a spot
1916 raise_hash_table_full();
1917 }
1918 }
1919 CAMLreturn(Val_unit);
1920 }
1922 /*****************************************************************************/
1923 /* Finds the slot corresponding to the key in a hash table. The returned slot
1924 * is either free or points to the key.
1925 */
1926 /*****************************************************************************/
1927 static unsigned int find_slot(value key) {
1928 uint64_t hash = get_hash(key);
1929 unsigned int slot = hash & (hashtbl_size - 1);
1930 unsigned int init_slot = slot;
1931 while(1) {
1932 if(hashtbl[slot].hash == hash) {
1933 return slot;
1934 }
1935 if(hashtbl[slot].hash == 0) {
1936 return slot;
1937 }
1938 slot = (slot + 1) & (hashtbl_size - 1);
1940 if (slot == init_slot) {
1941 raise_hash_table_full();
1942 }
1943 }
1944 }
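// Sketch of the probe order shared by find_slot, hh_add and hh_add_raw.
// Because hashtbl_size is a power of two, `hash & (hashtbl_size - 1)` is a
// cheap modulo, and adding 1 before re-masking visits every slot exactly once
// before wrapping back to init_slot. The table size below is a hypothetical
// stand-in for the real hashtbl_size.
static void example_probe_order(uint64_t hash) {
  const unsigned int table_size = 8;            // must be a power of two
  unsigned int slot = hash & (table_size - 1);  // e.g. hash 13 starts at slot 5
  unsigned int init_slot = slot;
  do {
    // visits 5, 6, 7, 0, 1, 2, 3, 4 for the example hash above
    slot = (slot + 1) & (table_size - 1);
  } while (slot != init_slot);
}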
1946 _Bool hh_is_slot_taken_for_key(unsigned int slot, value key) {
1947 _Bool good_hash = hashtbl[slot].hash == get_hash(key);
1948 _Bool non_null_addr = hashtbl[slot].addr != NULL;
1949 if (good_hash && non_null_addr) {
1950 // The data is currently in the process of being written; wait until it
1951 // actually is ready to be used before returning.
1952 time_t start = 0;
1953 while (hashtbl[slot].addr == HASHTBL_WRITE_IN_PROGRESS) {
1954 #if defined(__aarch64__) || defined(__powerpc64__)
1955 asm volatile("yield" : : : "memory");
1956 #else
1957 asm volatile("pause" : : : "memory");
1958 #endif
1959 // if the worker writing the data dies, we can get stuck. Timeout check
1960 // to prevent it.
1961 time_t now = time(0);
1962 if (start == 0 || start > now) {
1963 start = now;
1964 } else if (now - start > 60) {
1965 caml_failwith("hh_mem busy-wait loop stuck for 60s");
1966 }
1967 }
1968 return 1;
1969 }
1970 return 0;
1971 }
1973 _Bool hh_mem_inner(value key) {
1974 check_should_exit();
1975 unsigned int slot = find_slot(key);
1976 return hh_is_slot_taken_for_key(slot, key);
1977 }
1979 /*****************************************************************************/
1980 /* Returns true if the key is present. We need to check both the hash and
1981 * the address of the data. This is due to the fact that we remove by setting
1982 * the address slot to NULL (we never remove a hash from the table, outside
1983 * of garbage collection).
1984 */
1985 /*****************************************************************************/
1986 value hh_mem(value key) {
1987 CAMLparam1(key);
1988 if (shm_use_sharded_hashtbl != 0) {
1989 CAMLreturn(shmffi_mem(get_hash(key)));
1990 }
1991 CAMLreturn(Val_bool(hh_mem_inner(key) == 1));
1992 }
1994 /*****************************************************************************/
1995 /* Deserializes the value pointed to by elt. */
1996 /*****************************************************************************/
1997 CAMLprim value hh_deserialize(heap_entry_t *elt) {
1998 CAMLparam0();
1999 CAMLlocal1(result);
2000 size_t size = Entry_size(elt->header);
2001 size_t uncompressed_size_exp = Entry_uncompressed_size(elt->header);
2002 char *src = elt->data;
2003 char *data = elt->data;
2004 if (uncompressed_size_exp) {
2005 data = malloc(uncompressed_size_exp);
2006 size_t uncompressed_size = 0;
2007 if (*compression) {
2008 uncompressed_size = ZSTD_decompress(data, uncompressed_size_exp, src, size);
2009 }
2010 else {
2011 uncompressed_size = LZ4_decompress_safe(
2012 src,
2013 data,
2014 size,
2015 uncompressed_size_exp);
2016 }
2017 assert(uncompressed_size == uncompressed_size_exp);
2018 size = uncompressed_size;
2019 }
2021 if (Entry_kind(elt->header) == KIND_STRING) {
2022 result = caml_alloc_initialized_string(size, data);
2023 } else {
2024 result = caml_input_value_from_block(data, size);
2025 }
2027 if (data != src) {
2028 free(data);
2029 }
2030 CAMLreturn(result);
2031 }
2033 /*****************************************************************************/
2034 /* Returns the value associated to a given key, and deserialize it. */
2035 /* Returns [None] if the slot for the key is empty. */
2036 /*****************************************************************************/
2037 CAMLprim value hh_get_and_deserialize(value key) {
2038 CAMLparam1(key);
2039 check_should_exit();
2040 CAMLlocal2(deserialized_value, result);
2041 if (shm_use_sharded_hashtbl != 0) {
2042 CAMLreturn(shmffi_get_and_deserialize(get_hash(key)));
2043 }
2045 unsigned int slot = find_slot(key);
2046 if (!hh_is_slot_taken_for_key(slot, key)) {
2047 CAMLreturn(Val_none);
2048 }
2049 deserialized_value = hh_deserialize(hashtbl[slot].addr);
2050 result = hh_shared_caml_alloc_some(deserialized_value);
2051 CAMLreturn(result);
2052 }
2054 /*****************************************************************************/
2055 /* Returns Ocaml bytes representing the raw heap_entry. */
2056 /* Returns [None] if the slot for the key is empty. */
2057 /*****************************************************************************/
2058 CAMLprim value hh_get_raw(value key) {
2059 CAMLparam1(key);
2060 if (shm_use_sharded_hashtbl != 0) {
2061 raise_assertion_failure(LOCATION": shm_use_sharded_hashtbl not implemented");
2062 }
2063 check_should_exit();
2064 CAMLlocal2(result, bytes);
2066 unsigned int slot = find_slot(key);
2067 if (!hh_is_slot_taken_for_key(slot, key)) {
2068 CAMLreturn(Val_none);
2069 }
2071 heap_entry_t *elt = hashtbl[slot].addr;
2072 size_t size = Heap_entry_total_size(elt->header);
2073 char *data = (char *)elt;
2074 bytes = caml_alloc_string(size);
2075 memcpy(Bytes_val(bytes), data, size);
2076 result = hh_shared_caml_alloc_some(bytes);
2077 CAMLreturn(result);
2078 }
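// Sketch of the raw format exchanged for over-the-network proxying: the bytes
// returned above are the heap_entry exactly as stored, i.e. its header
// followed by the (possibly still compressed) payload, which hh_add_raw or
// hh_deserialize_raw can consume on the receiving side. Assuming, as the
// casts in hh_store_raw_entry suggest, that the header sits at the start of
// the entry, a hypothetical reader could peek at it like this:
static hh_header_t example_peek_raw_header(const char *raw_bytes) {
  hh_header_t header;
  memcpy(&header, raw_bytes, sizeof header);
  return header;
}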
2080 /*****************************************************************************/
2081 /* Returns result of deserializing and possibly uncompressing a raw heap_entry
2082 * passed in as Ocaml bytes. */
2083 /*****************************************************************************/
2084 CAMLprim value hh_deserialize_raw(value heap_entry) {
2085 CAMLparam1(heap_entry);
2086 CAMLlocal1(result);
2087 if (shm_use_sharded_hashtbl != 0) {
2088 raise_assertion_failure(LOCATION": shm_use_sharded_hashtbl not implemented");
2089 }
2091 heap_entry_t* entry = (heap_entry_t*)Bytes_val(heap_entry);
2092 result = hh_deserialize(entry);
2093 CAMLreturn(result);
2094 }
2096 /*****************************************************************************/
2097 /* Returns the size of the value associated to a given key. */
2098 /* The key MUST be present. */
2099 /*****************************************************************************/
2100 CAMLprim value hh_get_size(value key) {
2101 CAMLparam1(key);
2102 if (shm_use_sharded_hashtbl != 0) {
2103 CAMLreturn(shmffi_get_size(get_hash(key)));
2104 }
2106 unsigned int slot = find_slot(key);
2107 assert(hashtbl[slot].hash == get_hash(key));
2108 CAMLreturn(Val_long(Entry_size(hashtbl[slot].addr->header)));
2109 }
2111 /*****************************************************************************/
2112 /* Moves the data associated to key1 to key2.
2113 * key1 must be present.
2114 * key2 must be free.
2115 * Only the master can perform this operation.
2116 */
2117 /*****************************************************************************/
2118 void hh_move(value key1, value key2) {
2119 if (shm_use_sharded_hashtbl != 0) {
2120 shmffi_move(get_hash(key1), get_hash(key2));
2121 return;
2122 }
2124 unsigned int slot1 = find_slot(key1);
2125 unsigned int slot2 = find_slot(key2);
2127 assert_master();
2128 assert_allow_removes();
2129 assert(hashtbl[slot1].hash == get_hash(key1));
2130 assert(hashtbl[slot2].addr == NULL);
2131 // We are taking up a previously empty slot. Let's increment the counter.
2132 // hcounter_filled doesn't change, since slot1 becomes empty and slot2 becomes
2133 // filled.
2134 if (hashtbl[slot2].hash == 0) {
2135 __sync_fetch_and_add(hcounter, 1);
2136 }
2137 hashtbl[slot2].hash = get_hash(key2);
2138 hashtbl[slot2].addr = hashtbl[slot1].addr;
2139 hashtbl[slot1].addr = NULL;
2140 }
2142 /*****************************************************************************/
2143 /* Removes a key from the hash table.
2144 * Only the master can perform this operation.
2145 */
2146 /*****************************************************************************/
2147 CAMLprim value hh_remove(value key) {
2148 CAMLparam1(key);
2149 if (shm_use_sharded_hashtbl != 0) {
2150 CAMLreturn(shmffi_remove(get_hash(key)));
2151 }
2153 unsigned int slot = find_slot(key);
2155 assert_master();
2156 assert_allow_removes();
2157 assert(hashtbl[slot].hash == get_hash(key));
2158 size_t entry_size = Entry_size(hashtbl[slot].addr->header);
2159 // see hh_alloc for the source of this size
2160 size_t slot_size =
2161 HEAP_ALIGN(Heap_entry_total_size(hashtbl[slot].addr->header));
2162 __sync_fetch_and_add(wasted_heap_size, slot_size);
2163 hashtbl[slot].addr = NULL;
2164 removed_count += 1;
2165 __sync_fetch_and_sub(hcounter_filled, 1);
2166 CAMLreturn(Val_long(entry_size));
2167 }
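// Sketch of the accounting hh_remove performs: nothing is freed eagerly, the
// slot's addr simply becomes NULL (its hash stays so probe chains still
// terminate), and the entry's aligned footprint is credited to
// wasted_heap_size so a later collection pass knows how much space compaction
// would reclaim. The rounding below is a hypothetical stand-in for HEAP_ALIGN.
static size_t example_wasted_bytes(size_t entry_total_size, size_t alignment) {
  // e.g. a 13-byte entry with 8-byte alignment wastes 16 bytes of heap
  return (entry_total_size + alignment - 1) & ~(alignment - 1);
}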
2169 CAMLprim value hh_removed_count(value ml_unit) {
2170 // TODO(hverr): Support sharded hash tables
2171 CAMLparam1(ml_unit);
2172 UNUSED(ml_unit);
2173 return Val_long(removed_count);