/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
16 #include "hphp/util/alloc.h"
27 #include <sys/sysctl.h>
30 #include <folly/portability/SysMman.h>
31 #include <folly/portability/SysResource.h>
33 #include "hphp/util/address-range.h"
34 #include "hphp/util/bump-mapper.h"
35 #include "hphp/util/extent-hooks.h"
36 #include "hphp/util/hugetlb.h"
37 #include "hphp/util/kernel-version.h"
38 #include "hphp/util/managed-arena.h"
39 #include "hphp/util/numa.h"
40 #include "hphp/util/slab-manager.h"
///////////////////////////////////////////////////////////////////////////////
void flush_thread_caches() {
#ifdef USE_JEMALLOC
  mallctlCall<true>("thread.tcache.flush");
#if USE_JEMALLOC_EXTENT_HOOKS
  arenas_thread_flush();
#endif
#endif
}
__thread int32_t s_numaNode;

__thread uintptr_t s_stackLimit;
__thread size_t s_stackSize;
const size_t s_pageSize = sysconf(_SC_PAGESIZE);

__thread MemBlock s_tlSpace;
__thread MemBlock s_hugeRange;

__thread TLStaticArena* tl_static_arena;
bool s_enable_static_arena = false;
static NEVER_INLINE uintptr_t get_stack_top() {
  using ActRec = char; // only needed so DECLARE_FRAME_POINTER has a type to use
  DECLARE_FRAME_POINTER(fp);
  return uintptr_t(fp) - s_pageSize;
}
void init_stack_limits(pthread_attr_t* attr) {
  size_t stacksize, guardsize;
  void* stackaddr;

#ifndef __APPLE__
  if (pthread_attr_getstack(attr, &stackaddr, &stacksize) != 0) {
    always_assert(false);
  }
#else
  // We must use the following (undocumented) APIs because pthread_attr_getstack
  // returns incorrect values on OSX.
  pthread_t self = pthread_self();
  stackaddr = pthread_get_stackaddr_np(self);
  stacksize = pthread_get_stacksize_np(self);

  // On OSX 10.9, we are lied to about the main thread's stack size. Set it to
  // the minimum stack size, which is set earlier by execute_program_impl.
  if (pthread_main_np() == 1) {
    if (s_stackSize < kStackSizeMinimum) {
      char osRelease[256]; // buffer size chosen generously
      size_t osReleaseSize = sizeof(osRelease);
      if (sysctlbyname("kern.osrelease", osRelease, &osReleaseSize,
                       nullptr, 0) == 0) {
        if (atoi(osRelease) >= 13) {
          stacksize = kStackSizeMinimum;
        }
      }
    }
  }

  // stackaddr is not base, but top of the stack. Yes, really.
  stackaddr = ((char*) stackaddr) - stacksize;
#endif

  // Get the guard page's size, because the stack address returned
  // above starts at the guard page, so the thread's stack limit is
  // stackaddr + guardsize.
  if (pthread_attr_getguardsize(attr, &guardsize) != 0) {
    guardsize = 0;
  }

  assert(stackaddr != nullptr);
  assert(stacksize >= PTHREAD_STACK_MIN);
  s_stackLimit = uintptr_t(stackaddr) + guardsize;
  s_stackSize = stacksize - guardsize;

  // The main thread's native stack may be larger than desired if
  // set_stack_size() failed. Make sure that even if the native stack is
  // extremely large (in which case anonymous mmap() could map some of the
  // "stack space"), we can differentiate between the part of the native stack
  // that could conceivably be used in practice and all anonymous mmap() memory.
  struct rlimit rlim;
  if (getrlimit(RLIMIT_STACK, &rlim) == 0 && rlim.rlim_cur == RLIM_INFINITY &&
      s_stackSize > kStackSizeMinimum) {
    s_stackLimit += s_stackSize - kStackSizeMinimum;
    s_stackSize = kStackSizeMinimum;
  }
}
void flush_thread_stack() {
  uintptr_t top = get_stack_top() & ~(s_pageSize - 1);
  auto const hugeBase = reinterpret_cast<uintptr_t>(s_hugeRange.ptr);
  if (top > hugeBase) top = hugeBase;
  if (top <= s_stackLimit) return;
  size_t len = top - s_stackLimit;
  if (madvise((void*)s_stackLimit, len, MADV_DONTNEED) != 0 &&
      errno != EAGAIN) {          // EAGAIN is assumed benign here
    fprintf(stderr, "%s failed to madvise with error %d\n", __func__, errno);
  }
}
ssize_t purgeable_bytes() {
  return s_pageSize * mallctl_all_pdirty();
}
#if !defined USE_JEMALLOC || !defined HAVE_NUMA
void set_numa_binding(int node) {}
void* mallocx_on_node(size_t size, int node, size_t align) {
  void* ret = nullptr;
  posix_memalign(&ret, align, size);
  return ret;
}
#endif
unsigned low_arena = 0;
unsigned lower_arena = 0;
unsigned low_cold_arena = 0;
unsigned high_arena = 0;
unsigned high_cold_arena = 0;
__thread unsigned local_arena = 0;

int low_arena_flags = 0;
int lower_arena_flags = 0;
int low_cold_arena_flags = 0;
int high_cold_arena_flags = 0;
__thread int high_arena_flags = 0;
__thread int local_arena_flags = 0;
#if USE_JEMALLOC_EXTENT_HOOKS
// Keep track of the size of recently freed memory that might be in the high1g
// arena when it is disabled, so that we know when to reenable it.
std::atomic_uint g_highArenaRecentlyFreed;

alloc::BumpFileMapper* cold_file_mapper = nullptr;

// Customized hooks to use 1g pages for jemalloc metadata.
static extent_hooks_t huge_page_metadata_hooks;
static extent_alloc_t* orig_alloc = nullptr;

static bool enableArenaMetadata1GPage = false;
static bool enableNumaArenaMetadata1GPage = false;
// jemalloc metadata is allocated through the internal base allocator, which
// expands memory with an increasingly larger sequence. The default reserved
// space (216MB) is a sum of the sequence, from 2MB to 40MB.
static size_t a0MetadataReservedSize = 0;
static std::atomic<bool> jemallocMetadataCanUseHuge(false);
static void* a0ReservedBase = nullptr;
static std::atomic<size_t> a0ReservedLeft(0);

// Explicit per-thread tcaches for the arenas that need them.
// In jemalloc/include/jemalloc/jemalloc_macros.h.in, we have
// #define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1)
__thread int high_arena_tcache = -1;
__thread int local_arena_tcache = -1;
#endif

static unsigned base_arena;
#if defined USE_JEMALLOC && defined HAVE_NUMA
void set_numa_binding(int node) {
  if (node < 0) return;                 // thread not created from JobQueue
  s_numaNode = node;
  unsigned arena = base_arena + node;
  mallctlWrite("thread.arena", arena);

  numa_sched_setaffinity(0, node_to_cpu_mask[node]);
  numa_set_interleave_mask(numa_no_nodes_ptr);
  bitmask* nodes = numa_allocate_nodemask();
  numa_bitmask_setbit(nodes, node);
  numa_set_membind(nodes);
  numa_bitmask_free(nodes);
}

void* mallocx_on_node(size_t size, int node, size_t align) {
  assert((align & (align - 1)) == 0);
  int flags = MALLOCX_ALIGN(align);
  if (node < 0) return mallocx(size, flags);
  int arena = base_arena + node;
  flags |= MALLOCX_ARENA(arena) | MALLOCX_TCACHE_NONE;
  return mallocx(size, flags);
}
#endif
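
// A minimal usage sketch (hypothetical caller, not part of this file): a
// worker pinned to NUMA node 1 could request a node-local, cache-line-aligned
// buffer and release it with free(), since jemalloc's mallocx() memory is
// free()-able:
//
//   void* buf = mallocx_on_node(4096 /* bytes */, 1 /* node */, 64 /* align */);
//   // ... use buf ...
//   free(buf);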
#if USE_JEMALLOC_EXTENT_HOOKS
using namespace alloc;

RangeMapper* getMapperChain(RangeState& range, unsigned n1GPages,
                            bool use2MPages, unsigned n2MPages,
                            bool useNormalPages,
                            int numaMask, short nextNode) {
  RangeMapper* head = nullptr;
  RangeMapper** ptail = &head;
  if (n1GPages) {
    RangeMapper::append(ptail,
                        new Bump1GMapper(range, n1GPages, numaMask, nextNode));
  }
  if (use2MPages) {
    RangeMapper::append(ptail, new Bump2MMapper(range, n2MPages, numaMask));
  }
  if (useNormalPages) {
    RangeMapper::append(ptail, new BumpNormalMapper(range, 0, numaMask));
  }
  return head;
}
// Find the first 2M mapper for the range, and grant it some 2M page budget.
// Return the actual number of pages granted. The actual number can be
// different from the input, because some part of the range may have already
// been mapped in.
unsigned allocate2MPagesToRange(AddrRangeClass c, unsigned pages) {
  auto& range = getRange(c);
  auto mapper = range.getLowMapper();
  if (!mapper) return 0;
  // Search for the first 2M mapper.
  do {
    if (auto mapper2m = dynamic_cast<Bump2MMapper*>(mapper)) {
      const unsigned maxPages = (range.capacity() - range.mapped()) / size2m;
      auto const assigned = std::min(pages, maxPages);
      mapper2m->setMaxPages(assigned);
      return assigned;
    }
    mapper = mapper->next();
  } while (mapper);
  return 0;
}
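
// Usage sketch (hypothetical numbers): grant up to 128 2M pages (256MB) of
// huge-page budget to the Low range; the return value is the number of pages
// actually granted, which may be smaller if part of the range is already
// mapped:
//
//   unsigned granted = allocate2MPagesToRange(AddrRangeClass::Low, 128);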
void setup_low_arena(PageSpec s) {
  auto const lowArenaStart = lowArenaMinAddr();
  assert(reinterpret_cast<uintptr_t>(sbrk(0)) <= lowArenaStart);
  always_assert_flog(lowArenaStart <= (2ull << 30),
                     "low arena min addr ({}) must be <= 2GB",
                     lowArenaStart);
  // Initialize mappers for the VeryLow and Low address ranges.
  auto& veryLowRange = getRange(AddrRangeClass::VeryLow);
  auto& lowRange = getRange(AddrRangeClass::Low);
  auto& emergencyRange = getRange(AddrRangeClass::LowEmergency);
  auto veryLowMapper =
    getMapperChain(veryLowRange,
                   (s.n1GPages != 0) ? 1 : 0,
                   true, s.n2MPages,  // 2M
                   true,              // 4K pages as fallback (assumed)
                   numa_node_set, 0); // NUMA mask / next node (assumed)
  auto lowMapper =
    getMapperChain(lowRange,
                   (s.n1GPages > 1) ? (s.n1GPages - 1) : 0,
                   true, 0,           // 2M pages can be added later (assumed)
                   true,              // 4K pages as fallback (assumed)
                   numa_node_set, 1); // NUMA mask / next node (assumed)
  auto emergencyMapper =
    new BumpEmergencyMapper([]{kill(getpid(), SIGTERM);}, emergencyRange);
  veryLowRange.setLowMapper(veryLowMapper);
  lowRange.setLowMapper(lowMapper);
  emergencyRange.setLowMapper(emergencyMapper);

  auto veryLowColdMapper =
    new BumpNormalMapper<Direction::HighToLow>(veryLowRange, 0, numa_node_set);
  auto lowColdMapper =
    new BumpNormalMapper<Direction::HighToLow>(lowRange, 0, numa_node_set);
  veryLowRange.setHighMapper(veryLowColdMapper);
  lowRange.setHighMapper(lowColdMapper);

  auto ma = LowArena::CreateAt(&g_lowArena);
  ma->appendMapper(lowMapper);
  ma->appendMapper(veryLowMapper);
  ma->appendMapper(emergencyMapper);
  low_arena = ma->id();
  low_arena_flags = MALLOCX_ARENA(low_arena) | MALLOCX_TCACHE_NONE;

  ma = LowArena::CreateAt(&g_lowerArena);
  ma->appendMapper(veryLowMapper);
  ma->appendMapper(lowMapper);
  ma->appendMapper(emergencyMapper);
  lower_arena = ma->id();
  lower_arena_flags = MALLOCX_ARENA(lower_arena) | MALLOCX_TCACHE_NONE;

  ma = LowArena::CreateAt(&g_lowColdArena);
  ma->appendMapper(lowColdMapper);
  ma->appendMapper(veryLowColdMapper);
  ma->appendMapper(emergencyMapper);
  low_cold_arena = ma->id();
  low_cold_arena_flags = MALLOCX_ARENA(low_cold_arena) | MALLOCX_TCACHE_NONE;
}
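
// Sketch of how the computed flags are meant to be consumed (hypothetical
// caller): once setup_low_arena() has run, an allocation below 4GB can be made
// directly through jemalloc with the arena-selecting flags; tcache is bypassed
// by design for these arenas:
//
//   void* p = mallocx(64, low_arena_flags);
//   dallocx(p, low_arena_flags);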
void setup_high_arena(PageSpec s) {
  auto& range = getRange(AddrRangeClass::Uncounted);
  auto mapper = getMapperChain(range, s.n1GPages,
                               true, s.n2MPages, // 2M pages can be added later
                               true, // use normal pages
                               numa_node_set,
                               num_numa_nodes() / 2 + 1);
  range.setLowMapper(mapper);

  auto arena = HighArena::CreateAt(&g_highArena);
  arena->appendMapper(range.getLowMapper());
  high_arena = arena->id();

  auto& fileRange = getRange(AddrRangeClass::UncountedCold);
  cold_file_mapper = new BumpFileMapper(fileRange);
  fileRange.setLowMapper(cold_file_mapper);
  auto coldMapper =
    new BumpNormalMapper<Direction::HighToLow>(range, 0, numa_node_set);
  range.setHighMapper(coldMapper);
  auto coldArena = HighArena::CreateAt(&g_coldArena);
  coldArena->appendMapper(cold_file_mapper);
  coldArena->appendMapper(coldMapper);
  high_cold_arena = coldArena->id();
  high_cold_arena_flags = MALLOCX_ARENA(high_cold_arena) | MALLOCX_TCACHE_NONE;
}
void setup_arena0(PageSpec s) {
  size_t size = size1g * s.n1GPages + size2m * s.n2MPages;
  if (size == 0) return;
  // Give arena 0 some huge pages, starting at 2TB.
  auto ret = mmap(reinterpret_cast<void*>(kArena0Base),
                  size + size1g, PROT_NONE,
                  MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE,
                  -1, 0);
  auto base = reinterpret_cast<uintptr_t>(ret);
  if (auto r = base % size1g) { // align to 1G boundary
    base = base + size1g - r;
  }
  assertx(base % size1g == 0);

  auto a0 = PreMappedArena::AttachTo(low_malloc(sizeof(PreMappedArena)), 0,
                                     base, base + size, Reserved{});
  auto mapper = getMapperChain(*a0, s.n1GPages,
                               s.n2MPages, s.n2MPages,
                               false,             // no normal pages (assumed)
                               numa_node_set, 0); // NUMA mask / next node (assumed)
  a0->setLowMapper(mapper);
}
// Set up extra arenas for use in non-VM threads, when we have short bursts of
// worker threads running, e.g., during deserialization of profile data.
static std::vector<std::pair<std::vector<DefaultArena*>,
                             std::atomic_uint*>> s_extra_arenas;
static unsigned s_extra_arena_per_node;
bool setup_extra_arenas(unsigned count) {
  if (count == 0) return false;
  // This may be called when we have many other threads running. So hold the
  // lock while making changes.
  static std::mutex lock;
  std::lock_guard<std::mutex> g(lock);
  // Only the first call allocates the arenas.
  if (!s_extra_arenas.empty()) {
    return count <= s_extra_arenas.size() * s_extra_arenas[0].first.size();
  }
  // `count` needs to be a multiple of `num_numa_nodes()`; if it isn't, we
  // round it up to make it easy to balance across nodes.
  auto const nNodes = std::max(1u, num_numa_nodes());
  s_extra_arena_per_node = (count + nNodes - 1) / nNodes;
  assert(s_extra_arena_per_node >= 1);
  s_extra_arenas.resize(nNodes);
  for (unsigned n = 0; n < nNodes; ++n) {
    s_extra_arenas[n].first.resize(s_extra_arena_per_node);
    auto constexpr kArenaSize =
      (sizeof(DefaultArena) + alignof(DefaultArena) - 1)
      / alignof(DefaultArena) * alignof(DefaultArena);
    auto const allocSize = kArenaSize * s_extra_arena_per_node
                           + sizeof(std::atomic_uint);
    void* addr = mallocx_on_node(allocSize, n, alignof(DefaultArena));
    memset(addr, 0, allocSize);
    for (unsigned i = 0; i < s_extra_arena_per_node; ++i) {
      s_extra_arenas[n].first[i] = DefaultArena::CreateAt(addr);
      addr = (char*)addr + kArenaSize;
    }
    s_extra_arenas[n].second = static_cast<std::atomic_uint*>(addr);
  }
  return true;
}
DefaultArena* next_extra_arena(int node) {
  if (s_extra_arena_per_node == 0) return nullptr;
  if (node >= s_extra_arenas.size()) return nullptr;
  if (node < 0) node = 0;
  auto const n = static_cast<unsigned>(node);
  auto counter = s_extra_arenas[n].second;
  auto const next = counter->fetch_add(1, std::memory_order_relaxed);
  return s_extra_arenas[n].first[next % s_extra_arena_per_node];
}
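
// Usage sketch (hypothetical caller): a short-lived worker thread can spread
// its allocations across the extra arenas in round-robin fashion, feeding the
// arena id back to jemalloc via MALLOCX_ARENA (an id() accessor is assumed to
// exist on DefaultArena, as it does on the other arena types used above):
//
//   if (auto a = next_extra_arena(s_numaNode)) {
//     int flags = MALLOCX_ARENA(a->id()) | MALLOCX_TCACHE_NONE;
//     void* p = mallocx(1024, flags);
//     // ... use p ...
//     dallocx(p, flags);
//   }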
void* huge_page_extent_alloc(extent_hooks_t* extent_hooks, void* addr,
                             size_t size, size_t alignment, bool* zero,
                             bool* commit, unsigned arena_ind) {
  // This is used for arena 0's extent_alloc. No malloc / free allowed within
  // this function since reentrancy is not supported for a0's extent hooks.

  // Note that only metadata will use 2M alignment (size will be a multiple of
  // 2M as well). Aligned allocation doesn't require alignment by default,
  // because of the way virtual memory is expanded with opt.retain (which is
  // the default). The current extent hook API has no other way to tell if the
  // allocation is for metadata. The next major jemalloc release will include
  // this information in the API.
  if (!jemallocMetadataCanUseHuge.load() || alignment != size2m) {
    goto default_alloc;
  }

  assert(a0ReservedBase != nullptr && (size & (size2m - 1)) == 0);
  if (arena_ind == 0) {
    size_t oldValue;
    while (size <= (oldValue = a0ReservedLeft.load())) {
      // Try placing a0 metadata on 1G huge pages.
      if (a0ReservedLeft.compare_exchange_weak(oldValue, oldValue - size)) {
        assert((oldValue & (size2m - 1)) == 0);
        return
          reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(a0ReservedBase) +
                                  (a0MetadataReservedSize - oldValue));
      }
    }
  } else if (auto ma = alloc::highArena()) {
    // For non arena 0: malloc / free allowed in this branch.
    void* ret = ma->extent_alloc(extent_hooks, addr, size, alignment, zero,
                                 commit, high_arena);
    if (ret != nullptr) return ret;
  }

default_alloc:
  return orig_alloc(extent_hooks, addr, size, alignment, zero,
                    commit, arena_ind);
}
/*
 * Customize arena 0's extent hook to use 1g pages for metadata.
 */
void setup_jemalloc_metadata_extent_hook(bool enable, bool enable_numa_arena,
                                         size_t reserved) {
#if !JEMALLOC_METADATA_1G_PAGES
  return;
#endif
  assert(!jemallocMetadataCanUseHuge.load());
  enableArenaMetadata1GPage = enable;
  enableNumaArenaMetadata1GPage = enable_numa_arena;
  a0MetadataReservedSize = reserved;

  auto ma = alloc::highArena();
  if (!ma) return;

  bool retain_enabled = false;
  mallctlRead("opt.retain", &retain_enabled);
  if (!enableArenaMetadata1GPage || !retain_enabled) return;

  bool zero = true, commit = true;
  void* ret = ma->extent_alloc(nullptr, nullptr, a0MetadataReservedSize, size2m,
                               &zero, &commit, high_arena);
  if (ret == nullptr) return;

  a0ReservedBase = ret;
  a0ReservedLeft.store(a0MetadataReservedSize);

  extent_hooks_t* orig_hooks;
  int err = mallctlRead<extent_hooks_t*, true>("arena.0.extent_hooks",
                                               &orig_hooks);
  if (err) return;

  orig_alloc = orig_hooks->alloc;
  huge_page_metadata_hooks = *orig_hooks;
  huge_page_metadata_hooks.alloc = &huge_page_extent_alloc;

  err = mallctlWrite<extent_hooks_t*, true>("arena.0.extent_hooks",
                                            &huge_page_metadata_hooks);
  if (err) return;

  jemallocMetadataCanUseHuge.store(true);
}
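
// Usage sketch (hypothetical values): reserve 256MB of huge-page-backed space
// for arena 0 metadata early in process startup, before heavy allocation:
//
//   setup_jemalloc_metadata_extent_hook(true  /* enable */,
//                                       true  /* enable_numa_arena */,
//                                       256u << 20 /* reserved bytes */);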
void arenas_thread_init() {
  if (high_arena_tcache == -1) {
    mallctlRead<int, true>("tcache.create", &high_arena_tcache);
    high_arena_flags =
      MALLOCX_ARENA(high_arena) | MALLOCX_TCACHE(high_arena_tcache);
  }
  if (local_arena_tcache == -1) {
    local_arena = get_local_arena(s_numaNode);
    if (local_arena) {
      mallctlRead<int, true>("tcache.create", &local_arena_tcache);
      local_arena_flags =
        MALLOCX_ARENA(local_arena) | MALLOCX_TCACHE(local_arena_tcache);
    }
  }
  if (s_enable_static_arena) {
    assertx(!tl_static_arena);
    constexpr size_t kStaticArenaChunkSize = 256 * 1024;
    static TaggedSlabList s_static_pool;
    tl_static_arena = new TLStaticArena(kStaticArenaChunkSize, &s_static_pool);
  }
}
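
// Usage sketch (hypothetical caller): after arenas_thread_init() has run on a
// thread, uncounted allocations can be routed to the high arena with a
// per-thread tcache via the flags computed above:
//
//   void* p = mallocx(4096, high_arena_flags);
//   dallocx(p, high_arena_flags);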
void arenas_thread_flush() {
  // It is OK if flushing fails.
  if (high_arena_tcache != -1) {
    mallctlWrite<int, true>("tcache.flush", high_arena_tcache);
  }
  if (local_arena_tcache != -1) {
    mallctlWrite<int, true>("tcache.flush", local_arena_tcache);
  }
}
void arenas_thread_exit() {
  if (high_arena_tcache != -1) {
    mallctlWrite<int, true>("tcache.destroy", high_arena_tcache);
    high_arena_tcache = -1;
    // Ideally we shouldn't read high_arena_flags any more, but just in case.
    high_arena_flags = MALLOCX_ARENA(high_arena) | MALLOCX_TCACHE_NONE;
  }
  if (local_arena_tcache != -1) {
    mallctlWrite<int, true>("tcache.destroy", local_arena_tcache);
    local_arena_tcache = -1;
    // Ideally we shouldn't read local_arena_flags any more, but just in case.
    local_arena_flags = MALLOCX_ARENA(local_arena) | MALLOCX_TCACHE_NONE;
  }
  if (tl_static_arena) {
    delete tl_static_arena;
    tl_static_arena = nullptr;
  }
}

#endif // USE_JEMALLOC_EXTENT_HOOKS
std::vector<SlabManager*> s_slab_managers;

void setup_local_arenas(PageSpec spec, unsigned slabs) {
  s_slab_managers.reserve(num_numa_nodes());
  slabs /= num_numa_nodes();

  mallctlRead<unsigned>("arenas.narenas", &base_arena); // throw upon failure
  // The default arenas, one per NUMA node.
  for (int i = 0; i < num_numa_nodes(); i++) {
    unsigned arena = 0;
    mallctlRead<unsigned>("arenas.create", &arena);
    always_assert(arena == base_arena + i);
    if (slabs) {
      auto mem = low_malloc(sizeof(SlabManager));
      s_slab_managers.push_back(new (mem) SlabManager);
    } else {
      s_slab_managers.push_back(nullptr);
    }
  }
#if USE_JEMALLOC_EXTENT_HOOKS
  spec.n1GPages = std::min(spec.n1GPages, get_huge1g_info().nr_hugepages);
  spec.n1GPages /= num_numa_nodes();
  spec.n2MPages = std::min(spec.n2MPages, get_huge2m_info().nr_hugepages);
  spec.n2MPages /= num_numa_nodes();
  const size_t reserveSize =
    spec.n1GPages * size1g + spec.n2MPages * size2m;
  if (reserveSize == 0) return;

  g_local_arenas.resize(num_numa_nodes(), 0);
  for (unsigned i = 0; i < num_numa_nodes(); ++i) {
    static_assert(kLocalArenaMinAddr % size1g == 0, "");
    auto const desiredBase = kLocalArenaMinAddr + i * kLocalArenaSizeLimit;
    // Try to get the desired address range, but don't use MAP_FIXED.
    auto ret = mmap(reinterpret_cast<void*>(desiredBase),
                    reserveSize + size1g, PROT_NONE,
                    MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE,
                    -1, 0);
    if (ret == MAP_FAILED) {
      throw std::runtime_error{"mmap() failed to reserve address range"};
    }
    auto base = reinterpret_cast<uintptr_t>(ret);
    if (base % size1g) { // adjust to start at 1GB boundary
      auto const newBase = (base + size1g - 1) & ~(size1g - 1);
      munmap(reinterpret_cast<void*>(base), newBase - base);
      base = newBase;
    }
    assert(base % size1g == 0);
    auto arena = PreMappedArena::CreateAt(low_malloc(sizeof(PreMappedArena)),
                                          base, base + reserveSize, Reserved{});
    auto mapper = getMapperChain(*arena,
                                 spec.n1GPages,
                                 true,     // allow 2M mappings (assumed)
                                 spec.n2MPages,
                                 false,    // don't use normal pages
                                 1u << i,  // per-node NUMA mask (assumed)
                                 i);       // next node (assumed)
    // Allocate some slabs first, which are not given to the arena, but managed
    // separately by the slab manager.
    auto const totalSlabSize = std::min(slabs * kSlabSize, reserveSize);
    if (totalSlabSize) {
      auto slabRange = mapper->alloc(totalSlabSize, kSlabAlign);
      if (slabRange.ptr) {
        s_slab_managers[i]->addRange<true>(slabRange, totalSlabSize);
      }
    }
    if (totalSlabSize == reserveSize) continue;
    arena->setLowMapper(mapper);
    g_local_arenas[i] = arena;
  }
#endif
}
unsigned get_local_arena(uint32_t node) {
#if USE_JEMALLOC_EXTENT_HOOKS
  if (node >= g_local_arenas.size()) return 0;
  auto const arena = g_local_arenas[node];
  if (arena == nullptr) return 0;
  return arena->id();
#else
  return 0;
#endif
}
SlabManager* get_local_slab_manager(uint32_t node) {
  if (node >= s_slab_managers.size()) return nullptr;
  return s_slab_managers[node];
}

void shutdown_slab_managers() {
  for (auto slab_manager : s_slab_managers) {
    if (slab_manager) slab_manager->shutdown();
  }
}

#endif // USE_JEMALLOC
ssize_t get_free_slab_bytes() {
  ssize_t bytes = 0;
#ifdef USE_JEMALLOC
  for (auto const slabManager : s_slab_managers) {
    if (slabManager) {
      bytes += slabManager->bytes();
    }
  }
#endif // USE_JEMALLOC
  return bytes;
}
struct JEMallocInitializer {
  JEMallocInitializer() {
    // The following comes from malloc_extension.cc in google-perftools.
#ifdef __GLIBC__
    // GNU libc++ versions 3.3 and 3.4 obey the environment variables
    // GLIBCPP_FORCE_NEW and GLIBCXX_FORCE_NEW respectively. Setting
    // one of these variables forces the STL default allocator to call
    // new() or delete() for each allocation or deletion. Otherwise
    // the STL allocator tries to avoid the high cost of doing
    // allocations by pooling memory internally.
    setenv("GLIBCPP_FORCE_NEW", "1", false /* no overwrite */);
    setenv("GLIBCXX_FORCE_NEW", "1", false /* no overwrite */);

    // Now we need to make the setenv 'stick', which it may not do since
    // the env is flaky before main() is called. But luckily stl only
    // looks at this env var the first time it tries to do an alloc, and
    // caches what it finds. So we just cause an stl alloc here.
    std::string dummy("I need to be allocated");
    dummy += "!"; // so the definition of dummy isn't optimized out
#endif /* __GLIBC__ */

    // Enable backtracing through PHP frames (t9814472).
    setenv("UNW_RBP_ALWAYS_VALID", "1", false);
#if !USE_JEMALLOC_EXTENT_HOOKS
    // Create the legacy low arena that uses brk() instead of mmap(). When
    // using newer versions of jemalloc, we use extent hooks to get more
    // control. If the mallctl fails, it will always_assert in mallctlHelper.
    if (mallctlRead<unsigned, true>("arenas.create", &low_arena)) {
      return;
    }
    char buf[32];
    snprintf(buf, sizeof(buf), "arena.%u.dss", low_arena);
    if (mallctlWrite<const char*, true>(buf, "primary") != 0) {
      return;
    }
    low_arena_flags = MALLOCX_ARENA(low_arena) | MALLOCX_TCACHE_NONE;
    lower_arena = low_arena;
    lower_arena_flags = low_arena_flags;
    low_cold_arena = low_arena;
    low_cold_arena_flags = low_arena_flags;

    // We normally maintain the invariant that the region surrounding the
    // current brk is mapped huge, but we don't know yet whether huge pages
    // are enabled for low memory. Round up to the start of a huge page,
    // and set the high water mark to one below.
    constexpr size_t kHugePageSize = size2m;
    constexpr size_t kHugePageMask = kHugePageSize - 1;
    unsigned leftInPage = kHugePageSize - (uintptr_t(sbrk(0)) & kHugePageMask);
    (void) sbrk(leftInPage);
    assert((uintptr_t(sbrk(0)) & kHugePageMask) == 0);
#else // USE_JEMALLOC_EXTENT_HOOKS
    unsigned low_1g_pages = 0;
    if (char* buffer = getenv("HHVM_LOW_1G_PAGE")) {
      if (!sscanf(buffer, "%u", &low_1g_pages)) {
        fprintf(stderr,
                "Bad environment variable HHVM_LOW_1G_PAGE: %s\n", buffer);
        abort();
      }
    }
    unsigned high_1g_pages = 0;
    if (char* buffer = getenv("HHVM_HIGH_1G_PAGE")) {
      if (!sscanf(buffer, "%u", &high_1g_pages)) {
        fprintf(stderr,
                "Bad environment variable HHVM_HIGH_1G_PAGE: %s\n", buffer);
        abort();
      }
    }
    unsigned low_2m_pages = 0;
    if (char* buffer = getenv("HHVM_LOW_2M_PAGE")) {
      if (!sscanf(buffer, "%u", &low_2m_pages)) {
        fprintf(stderr,
                "Bad environment variable HHVM_LOW_2M_PAGE: %s\n", buffer);
        abort();
      }
    }
    unsigned high_2m_pages = 0;
    if (char* buffer = getenv("HHVM_HIGH_2M_PAGE")) {
      if (!sscanf(buffer, "%u", &high_2m_pages)) {
        fprintf(stderr,
                "Bad environment variable HHVM_HIGH_2M_PAGE: %s\n", buffer);
        abort();
      }
    }
    HugePageInfo info = get_huge1g_info();
    unsigned remaining = static_cast<unsigned>(info.nr_hugepages);
    if (remaining == 0) {
      low_1g_pages = high_1g_pages = 0;
    } else if (low_1g_pages > 0 || high_1g_pages > 0) {
      KernelVersion version;
      if (version.m_major < 3 ||
          (version.m_major == 3 && version.m_minor < 9)) {
        // Older kernels need an explicit hugetlbfs mount point.
        find_hugetlbfs_path() || auto_mount_hugetlbfs();
      }
    }
    // Do some allocation between low and high 1G arenas. We use at most 2 1G
    // pages for the low 1G arena; usually 1 is good enough.
    auto const origLow1G = low_1g_pages;
    auto const origHigh1G = high_1g_pages;
    if (low_1g_pages > 0) {
      if (low_1g_pages > 2) {
        low_1g_pages = 2;
      }
      if (low_1g_pages + high_1g_pages > remaining) {
        low_1g_pages = 1;
      }
      assert(remaining >= low_1g_pages);
      remaining -= low_1g_pages;
    }
    if (origLow1G) {
      fprintf(stderr,
              "using %u (specified %u) 1G huge pages for low arena\n",
              low_1g_pages, origLow1G);
    }
    setup_low_arena({low_1g_pages, low_2m_pages});

    if (high_1g_pages > remaining) {
      high_1g_pages = remaining;
    }
    if (origHigh1G) {
      fprintf(stderr,
              "using %u (specified %u) 1G huge pages for high arena\n",
              high_1g_pages, origHigh1G);
    }
    setup_high_arena({high_1g_pages, high_2m_pages});
    // Make sure high/low arenas are available to the current thread.
    arenas_thread_init();
#endif
    // Initialize global mibs.
    init_mallctl_mibs();
  }
};
#if defined(__GNUC__) && !defined(__APPLE__)
// Construct this object before any others.
// 101 is the highest priority allowed by the init_priority attribute.
// http://gcc.gnu.org/onlinedocs/gcc-4.0.4/gcc/C_002b_002b-Attributes.html
#define MAX_CONSTRUCTOR_PRIORITY __attribute__((__init_priority__(101)))
#else
// init_priority is a gcc extension, so we can't use it on other compilers.
// However, since constructor ordering is only known to be an issue with
// GNU libc++, we're probably OK on other compilers, so let the situation pass
// silently instead of issuing a warning.
#define MAX_CONSTRUCTOR_PRIORITY
#endif

static JEMallocInitializer initJEMalloc MAX_CONSTRUCTOR_PRIORITY;
void low_2m_pages(uint32_t pages) {
#if USE_JEMALLOC_EXTENT_HOOKS
  pages -= allocate2MPagesToRange(AddrRangeClass::VeryLow, pages);
  allocate2MPagesToRange(AddrRangeClass::Low, pages);
#endif
}
void high_2m_pages(uint32_t pages) {
#if USE_JEMALLOC_EXTENT_HOOKS
  allocate2MPagesToRange(AddrRangeClass::Uncounted, pages);
#endif
}
void enable_high_cold_file() {
#if USE_JEMALLOC_EXTENT_HOOKS
  if (cold_file_mapper) {
    cold_file_mapper->enable();
  }
#endif
}
void set_cold_file_dir(const char* dir) {
#if USE_JEMALLOC_EXTENT_HOOKS
  if (cold_file_mapper) {
    cold_file_mapper->setDirectory(dir);
  }
#endif
}
///////////////////////////////////////////////////////////////////////////////
const char* malloc_conf = "narenas:1,lg_tcache_max:16"
#if (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR >= 1) || \
    (JEMALLOC_VERSION_MAJOR > 5) // requires jemalloc >= 5.1
    ",metadata_thp:disabled"
#endif
    ",prof:true,prof_active:false,prof_thread_active_init:false"