Ignore BC size budget on very cheap regions
[hiphop-php.git] / hphp / util / hugetlb.cpp
blob3607bc60fbcccd317ab63d64d882444b0c1d54b0
1 /*
2 +----------------------------------------------------------------------+
3 | HipHop for PHP |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
17 #include "hphp/util/hugetlb.h"
19 // Techniques used here are Linux-specific, so don't bother to be portable.
20 #ifdef __linux__
21 #include <sys/mman.h>
22 #include <sys/mount.h>
23 #include <sys/param.h>
24 #include <sys/stat.h>
25 #include <sys/types.h>
26 #include <sys/vfs.h>
27 #include <fcntl.h>
28 #ifdef HAVE_NUMA
29 #include <numaif.h>
30 #endif
31 #include <unistd.h>
33 #include "hphp/util/kernel-version.h"
34 #include "hphp/util/numa.h"
35 #include "hphp/util/portability.h"
37 #endif
39 #include <assert.h>
40 #include <errno.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
45 #include <atomic>
46 #include <stdexcept>
48 namespace HPHP {
// Path of the 1G hugetlbfs mount point; always stored with a trailing '/'
// (set_hugetlbfs_path() appends one).  Empty when no mount point is set.
static char s_hugePath[256];

// Buffer for the most recent error message; exposed via get_hugetlb_err_msg().
constexpr size_t maxErrorMsgLen = 512;
static char s_errorMsg[maxErrorMsgLen];

// 1G pages mapped through mmap_1g(), recorded so mprotect_1g_pages() can
// walk them later.  At most kMaxNum1GPages are tracked.
static unsigned s_num1GPages;
constexpr unsigned kMaxNum1GPages = 16;
static void* s_1GPages[kMaxNum1GPages];

// Count of 2M pages obtained via mmap_2m()/remap_2m().
// NOTE(review): updated without synchronization — presumably only touched
// during single-threaded startup; confirm against callers.
static unsigned s_num2MPages;
60 // Record error message based on errno, with an optional message.
61 static void record_err_msg(const char* msg = nullptr) {
62 size_t len = 0;
63 if (msg) {
64 len = strlen(msg);
65 if (len > maxErrorMsgLen / 2) {
66 len = maxErrorMsgLen / 2;
68 memcpy(s_errorMsg, msg, len);
69 s_errorMsg[len] = 0;
70 } else {
71 len = strlen(s_errorMsg);
73 #ifdef __linux__
74 #ifdef _GNU_SOURCE
75 char* err = strerror_r(errno, s_errorMsg + len, maxErrorMsgLen - len);
76 if (len == strlen(s_errorMsg)) {
77 size_t appendLen = strlen(err);
78 if (appendLen + len >= maxErrorMsgLen) {
79 appendLen = maxErrorMsgLen - 1 - len;
81 memcpy(s_errorMsg + len, err, appendLen);
82 s_errorMsg[len + appendLen] = 0;
84 #else
85 strerror_r(errno, s_errorMsg + len, maxErrorMsgLen - len);
86 #endif
87 #endif
90 const char* get_hugetlb_err_msg() {
91 return s_errorMsg;
94 // Return the page size for hugetlbfs mount point, or 0 if anything goes wrong:
95 // e.g., mount point doesn't exist, mount point isn't hugetlbfs.
96 static size_t get_hugepage_size(const char* path) {
97 #ifdef __linux__
98 struct statfs64 sb;
99 if (statfs64(path, &sb) == 0) {
100 // Magic number defined in Linux kernel: include/uapi/linux/magic.h
101 auto constexpr HUGETLBFS_MAGIC = 0x958458f6;
102 if (sb.f_type == HUGETLBFS_MAGIC) {
103 return sb.f_bsize;
104 } else {
105 snprintf(s_errorMsg, maxErrorMsgLen,
106 "path %s isn't mounted as hugetlbfs", path);
108 } else {
109 snprintf(s_errorMsg, maxErrorMsgLen,
110 "statfs64() for %s failed: ", path);
111 record_err_msg();
113 #endif
114 return 0;
117 bool set_hugetlbfs_path(const char* path) {
118 if (get_hugepage_size(path) != size1g) return false;
119 size_t len = strlen(path);
120 if (len + 8 >= sizeof(s_hugePath)) return false;
121 memcpy(s_hugePath, path, len);
122 *reinterpret_cast<int*>(s_hugePath + len) = 0;
123 if (s_hugePath[len - 1] != '/') {
124 s_hugePath[len] = '/';
126 return true;
129 bool find_hugetlbfs_path() {
130 #ifdef __linux__
131 auto mounts = fopen("/proc/mounts", "r");
132 if (!mounts) return false;
133 // Search the file for lines like the following
134 // none /dev/hugepages hugetlbfs seclabel,relatime...
135 char line[4096];
136 char path[4096];
137 char option[4096];
138 while (fgets(line, sizeof(line), mounts)) {
139 if (sscanf(line, "%*s %s hugetlbfs %s", path, option) == 2) {
140 // It matches hugetlbfs, check page size and save results.
141 if (set_hugetlbfs_path(path)) {
142 fclose(mounts);
143 return true;
147 fclose(mounts);
148 #endif
149 return false;
152 HugePageInfo read_hugepage_info(size_t pagesize, int node /* = -1 */) {
153 unsigned nr_huge = 0, free_huge = 0;
154 if (pagesize != size2m && pagesize != size1g) { // only 2M and 1G supported
155 return HugePageInfo{0, 0};
157 #ifdef __linux__
158 if (node >= 0) {
159 auto const readNumFrom = [] (const char* path) {
160 unsigned result = 0;
161 char buffer[32];
162 memset(buffer, 0, sizeof(buffer));
163 int fd = open(path, O_RDONLY);
164 if (fd < 0) return result;
165 bool done = false;
166 do {
167 ssize_t bytes = read(fd, buffer, 20);
168 if (bytes == 0) break; // EOF
169 if (bytes < 0) {
170 if (errno == EINTR) continue; // try again
171 break; // totally failed
173 for (ssize_t i = 0; i < bytes; ++i) {
174 char c = buffer[i];
175 // only read numbers, and stop on white space, etc.
176 if (c < '0' || c > '9') {
177 done = true;
178 break;
180 result = result * 10 + c - '0';
182 } while (!done);
183 close(fd);
184 return result;
187 char fileName[256];
188 memcpy(fileName, "/sys/devices/system/node/node", 29);
189 assert(strlen("/sys/devices/system/node/node") == 29);
190 char* p = fileName + 29;
191 // We support at most 32 NUMA node, so at most two bytes.
192 if (node >= 10) *p++ = '0' + node / 10;
193 *p++ = '0' + node % 10;
194 if (pagesize == size2m) {
195 memcpy(p, "/hugepages/hugepages-2048kB/", 28);
196 assert(strlen("/hugepages/hugepages-2048kB/") == 28);
197 p += 28;
198 } else {
199 memcpy(p, "/hugepages/hugepages-1048576kB/", 31);
200 assert(strlen("/hugepages/hugepages-1048576kB/") == 31);
201 p += 31;
204 memcpy(p, "nr_hugepages", 13);
205 assert(strlen("nr_hugepages") == 12); // extra \0 byte
206 nr_huge = readNumFrom(fileName);
208 memcpy(p, "free_hugepages", 15);
209 assert(strlen("free_hugepages") == 14); // extra \0 byte
210 free_huge = readNumFrom(fileName);
212 return HugePageInfo{nr_huge, free_huge};
214 // All nodes
215 #ifdef HAVE_NUMA
216 const int MAX_NUMA_NODE = numa_max_node();
217 #else
218 constexpr int MAX_NUMA_NODE = 0;
219 #endif
220 for (int i = 0; i <= MAX_NUMA_NODE; ++i) {
221 // Skip nodes we are not allowed to allocate on.
222 if (!numa_node_allowed(i)) continue;
223 auto const info = read_hugepage_info(pagesize, i);
224 nr_huge += info.nr_hugepages;
225 free_huge += info.free_hugepages;
227 #endif
228 return HugePageInfo{nr_huge, free_huge};
231 HugePageInfo get_huge1g_info(int node /* = -1 */) {
232 return read_hugepage_info(size1g, node);
235 HugePageInfo get_huge2m_info(int node /* = -1 */) {
236 return read_hugepage_info(size2m, node);
239 bool auto_mount_hugetlbfs() {
240 #ifdef __linux__
241 auto const info = get_huge1g_info();
242 if (info.nr_hugepages <= 0) return false; // No page reserved.
244 const char* hugePath = "/tmp/huge1g";
245 if (mkdir(hugePath, 0777)) {
246 if (errno != EEXIST) {
247 snprintf(s_errorMsg, maxErrorMsgLen, "Failed to mkdir %s: ", hugePath);
248 record_err_msg();
249 return false;
252 if (mount("none", hugePath, "hugetlbfs", 0, "pagesize=1G,mode=0777")) {
253 record_err_msg("Failed to mount hugetlbfs with 1G page size: ");
254 return false;
256 return set_hugetlbfs_path(hugePath);
257 #else
258 return false;
259 #endif
262 #ifdef __linux__
263 // Beware that MAP_FIXED overrides existing mapping silently. If the specified
264 // memory was mapped in, it may no longer be after this function fails.
265 // mincore() can be used to check if a memory region is stilled mapped in.
266 NEVER_INLINE void* mmap_2m_impl(void* addr, bool fixed) {
267 void* ret = MAP_FAILED;
268 int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB;
269 if (fixed) {
270 flags |= MAP_FIXED;
271 assert(addr != nullptr);
273 // MAP_HUGE_2MB can be specified after 3.8 kernel.
274 static KernelVersion version;
275 if (version.m_major > 3 || (version.m_major == 3 && version.m_minor >= 8)) {
276 #ifndef MAP_HUGE_2MB
277 #define MAP_HUGE_2MB (21 << 26)
278 #endif
279 flags |= MAP_HUGE_2MB;
281 ret = mmap(addr, size2m, PROT_READ | PROT_WRITE, flags, -1, 0);
282 if (ret == MAP_FAILED) {
283 record_err_msg("mmap() with MAP_HUGE_2MB failed: ");
284 return nullptr;
286 if (addr && ret != addr) {
287 assert(fixed == false);
288 // Didn't get the intended address.
289 munmap(ret, size2m);
290 return nullptr;
293 // Fault the page in. This guarantees availablility of memory, and avoids
294 // subsequent errors when the huge page isn't really available. Ideally the
295 // kernel should've failed mmap() in such a case, but it doesn't seem to even
296 // with MAP_LOCKED | MAP_POPULATE.
297 if (mlock(ret, 1)) {
298 snprintf(s_errorMsg, maxErrorMsgLen, "mlock() failed for %p: ", ret);
299 record_err_msg();
300 munmap(ret, size2m);
301 return nullptr;
304 return ret;
307 inline void* mmap_1g_impl(void* addr, bool map_fixed) {
308 void* ret = MAP_FAILED;
309 if (s_hugePath[0] != 0) {
310 int fd = -1;
311 size_t dirNameLen = strlen(s_hugePath);
312 assert(dirNameLen > 0 && s_hugePath[dirNameLen - 1] == '/');
313 for (char i = '0'; i <= '9'; ++i) {
314 s_hugePath[dirNameLen] = i;
315 // We don't put code on 1G huge pages, so no execute permission.
316 fd = open(s_hugePath, O_CREAT | O_EXCL | O_RDWR, 0666);
317 // Retry a few times if the file already exists.
318 if (fd < 0) {
319 if (errno == EEXIST) {
320 errno = 0;
321 continue;
322 } else {
323 snprintf(s_errorMsg, maxErrorMsgLen,
324 "Failed to create hugetlbfs file %s: ", s_hugePath);
325 record_err_msg();
326 s_hugePath[dirNameLen] = 0;
327 return nullptr;
329 } else {
330 unlink(s_hugePath);
332 break;
335 s_hugePath[dirNameLen] = 0;
336 if (fd < 0) {
337 snprintf(s_errorMsg, maxErrorMsgLen,
338 "Failed to create a hugetlbfs file in %s: "
339 "it seems already full of files", s_hugePath);
340 return nullptr;
343 ret = mmap(addr, size1g, PROT_READ | PROT_WRITE,
344 MAP_SHARED | (map_fixed ? MAP_FIXED : 0),
345 fd, 0);
346 if (ret == MAP_FAILED) {
347 snprintf(s_errorMsg, maxErrorMsgLen,
348 "mmap() for hugetlbfs file failed: ");
349 record_err_msg();
351 close(fd);
354 if (ret == MAP_FAILED) {
355 // MAP_HUGE_1GB is available in 3.9 and later kernels
356 KernelVersion version;
357 if (version.m_major > 3 || (version.m_major == 3 && version.m_minor >= 9)) {
358 #ifndef MAP_HUGE_1GB
359 #define MAP_HUGE_1GB (30 << 26)
360 #endif
361 int flags = MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB |
362 (map_fixed ? MAP_FIXED : 0);
363 ret = mmap(addr, size1g, PROT_READ | PROT_WRITE, flags, -1, 0);
364 if (ret == MAP_FAILED) {
365 record_err_msg("mmap() with MAP_HUGE_1GB failed: ");
366 return nullptr;
368 } else {
369 return nullptr;
373 // Didn't get the desired address. This can happen is map_fixed is false.
374 if (addr != nullptr && ret != addr) {
375 snprintf(s_errorMsg, maxErrorMsgLen,
376 "mmap() for huge page returned %p, desired %p", ret, addr);
377 munmap(ret, size1g);
378 return nullptr;
381 // Fault the page in. This guarantees availablility of memory, and avoids
382 // SIGBUS when the huge page isn't really available. In many cases
383 // RLIMIT_MEMLOCK isn't big enough for us to lock 1G. Fortunately that
384 // is unnecessary here; a byte should work equally well.
385 if (mlock(ret, 1)) {
386 snprintf(s_errorMsg, maxErrorMsgLen, "mlock() failed for %p: ", ret);
387 record_err_msg();
388 munmap(ret, size1g);
389 return nullptr;
392 return ret;
394 #endif
#ifdef HAVE_NUMA
namespace {
// We support at most 32 NUMA nodes (numa_node_set in 32-bit), so a single
// unsigned long is more than enough for the mask. This can be used in jemalloc
// allocation hooks, so it is wise to avoid calling malloc/free here, even
// though jemalloc might still be able to handle reentrance correctly. Thus, we
// bypass libnuma and do the syscalls directly here.
//
// RAII guard: save() snapshots the calling thread's memory policy, and the
// destructor restores it -- but only when the snapshot actually succeeded.
struct SavedNumaPolicy {
  bool needRestore{false};
  int oldPolicy{0};
  unsigned long oldMask{0};

  // Save NUMA policy for the current thread (get_mempolicy() returns 0 on
  // success, hence the negation).
  void save() {
    needRestore = !get_mempolicy(&oldPolicy, &oldMask, sizeof(oldMask),
                                 nullptr, 0);
  }

  ~SavedNumaPolicy() {
    if (needRestore) {
      set_mempolicy(oldPolicy, &oldMask, sizeof(oldMask));
    }
  }
};
}
#endif
422 void* mmap_2m(int node) {
423 #ifdef __linux__
424 if (get_huge2m_info(node).free_hugepages <= 0) return nullptr;
425 if (node >= 0 && !numa_node_allowed(node)) return nullptr;
426 #ifdef HAVE_NUMA
427 SavedNumaPolicy numaPolicy;
428 if (node >= 0 && numa_num_nodes > 1) {
429 numaPolicy.save();
430 unsigned long singleNodeMask = 1ul << node;
431 set_mempolicy(MPOL_BIND, &singleNodeMask, sizeof(singleNodeMask));
433 #endif
434 void* ret = mmap_2m_impl(nullptr, /* fixed */ false);
435 s_num2MPages += (ret != nullptr);
436 return ret;
437 #else // not linux
438 return nullptr;
439 #endif
442 void* remap_2m(void* addr, int node) {
443 assert(addr != nullptr);
444 assert(reinterpret_cast<uintptr_t>(addr) % size2m == 0);
445 #ifdef __linux__
446 if (node >= 0 && !numa_node_allowed(node)) return nullptr;
447 if (get_huge2m_info(node).free_hugepages <= 0) return nullptr;
448 #ifdef HAVE_NUMA
449 SavedNumaPolicy numaPolicy;
450 unsigned long singleNodeMask = (1ull << 32) - 1;
451 if (node >= 0 && numa_num_nodes > 1) {
452 numaPolicy.save();
453 singleNodeMask = 1ul << node;
454 set_mempolicy(MPOL_BIND, &singleNodeMask, sizeof(singleNodeMask));
456 #endif
457 void* ret = mmap_2m_impl(addr, /* fixed */ true);
458 if (!ret) {
459 // When mmap_2m_impl() fails, pages in the range [addr, addr + size2m) may
460 // have been unmapped, depending on the implementation of the kernel. Remap
461 // the range in that case.
462 unsigned char v[size2m / size4k];
463 if (mincore(addr, size2m, v) == -1 && errno == ENOMEM) {
464 // [addr, addr + size2m) contains an unmapped page.
465 int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED;
466 int prot = PROT_READ | PROT_WRITE;
467 void* normalPages = mmap(addr, size2m, prot, flags, -1, 0);
468 if (normalPages != addr) {
469 // Either the mmap() failed again without trying to get huge pages, or
470 // it has returned something other than addr even with MAP_FIXED. In
471 // either case, we need to bail out.
472 throw std::runtime_error{"mmap() failure with MAP_FIXED"};
474 #ifdef HAVE_NUMA
475 // Enforce the NUMA node spec.
476 if (node >= 0 && numa_num_nodes > 1) {
477 mbind(normalPages, size2m, MPOL_BIND,
478 &singleNodeMask, 32 /* maxnode */, 0 /* flags */);
480 #endif
481 // Since hugetlb pages are not available, try transparent huge pages.
482 madvise(normalPages, size2m, MADV_HUGEPAGE);
484 } else {
485 ++s_num2MPages;
488 return ret;
489 #else // not linux
490 return nullptr;
491 #endif
494 int remap_interleaved_2m_pages(void* addr, size_t pages) {
495 #ifdef __linux__
496 assert(reinterpret_cast<uintptr_t>(addr) % size2m == 0);
497 assert(addr != nullptr);
498 int count = 0;
499 std::atomic<uint32_t> node{0};
500 while (pages > 0) {
501 auto const curr_node = next_numa_node(node);
502 count += (remap_2m(addr, curr_node) != nullptr);
503 addr = (char*)addr + size2m;
504 --pages;
506 return count;
507 #else // not linux
508 return 0;
509 #endif
512 void* mmap_1g(void* addr, int node, bool map_fixed) {
513 #ifdef __linux__
514 if (s_num1GPages >= kMaxNum1GPages) return nullptr;
515 if (get_huge1g_info(node).free_hugepages <= 0) return nullptr;
516 if (node >= 0 && !numa_node_allowed(node)) return nullptr;
517 #ifdef HAVE_NUMA
518 SavedNumaPolicy numaPolicy;
519 if (node >= 0 && numa_num_nodes > 1) {
520 numaPolicy.save();
521 unsigned long singleNodeMask = 1ul << node;
522 set_mempolicy(MPOL_BIND, &singleNodeMask, sizeof(singleNodeMask));
524 #endif
525 void* ret = mmap_1g_impl(addr, map_fixed);
526 if (ret != nullptr) {
527 s_1GPages[s_num1GPages++] = ret;
529 return ret;
530 #else
531 return nullptr;
532 #endif
535 unsigned num_1g_pages() {
536 return s_num1GPages;
539 unsigned num_2m_pages() {
540 return s_num2MPages;
543 int mprotect_1g_pages(int prot) {
544 #ifdef __linux__
545 for (unsigned i = 0; i < s_num1GPages; ++i) {
546 void* p = s_1GPages[i];
547 assert(p != nullptr &&
548 (reinterpret_cast<uintptr_t>(p) & (size1g - 1)) == 0);
549 if (auto ret = mprotect(p, size1g, prot)) {
550 // mprotect() failed for this page, callers should check errno if they
551 // care.
552 return ret;
555 #endif
556 return 0;