2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
#include "hphp/util/hugetlb.h"

// Techniques used here are Linux-specific, so don't bother to be portable.
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/vfs.h>

#include <atomic>
#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <stdexcept>

#include "hphp/util/kernel-version.h"
#include "hphp/util/numa.h"
#include "hphp/util/portability.h"
// State is kept in fixed-size static buffers: this code can run very early in
// process startup, so it avoids dynamic allocation entirely.

// Path of the hugetlbfs mount point used for 1G pages; always ends with '/'
// when set (see set_hugetlbfs_path()).
static char s_hugePath[256];

// Buffer holding the most recent error message, returned by
// get_hugetlb_err_msg().
constexpr size_t maxErrorMsgLen = 512;
static char s_errorMsg[maxErrorMsgLen];

// Bookkeeping for 1G pages we have mapped, used by mprotect_1g_pages() and
// num_1g_pages().
static unsigned s_num1GPages;
constexpr unsigned kMaxNum1GPages = 16;
static void* s_1GPages[kMaxNum1GPages];

// Count of 2M pages successfully mapped (mmap_2m()/remap_2m()).
static unsigned s_num2MPages;
60 // Record error message based on errno, with an optional message.
61 static void record_err_msg(const char* msg
= nullptr) {
65 if (len
> maxErrorMsgLen
/ 2) {
66 len
= maxErrorMsgLen
/ 2;
68 memcpy(s_errorMsg
, msg
, len
);
71 len
= strlen(s_errorMsg
);
75 char* err
= strerror_r(errno
, s_errorMsg
+ len
, maxErrorMsgLen
- len
);
76 if (len
== strlen(s_errorMsg
)) {
77 size_t appendLen
= strlen(err
);
78 if (appendLen
+ len
>= maxErrorMsgLen
) {
79 appendLen
= maxErrorMsgLen
- 1 - len
;
81 memcpy(s_errorMsg
+ len
, err
, appendLen
);
82 s_errorMsg
[len
+ appendLen
] = 0;
85 strerror_r(errno
, s_errorMsg
+ len
, maxErrorMsgLen
- len
);
90 const char* get_hugetlb_err_msg() {
94 // Return the page size for hugetlbfs mount point, or 0 if anything goes wrong:
95 // e.g., mount point doesn't exist, mount point isn't hugetlbfs.
96 static size_t get_hugepage_size(const char* path
) {
99 if (statfs64(path
, &sb
) == 0) {
100 // Magic number defined in Linux kernel: include/uapi/linux/magic.h
101 auto constexpr HUGETLBFS_MAGIC
= 0x958458f6;
102 if (sb
.f_type
== HUGETLBFS_MAGIC
) {
105 snprintf(s_errorMsg
, maxErrorMsgLen
,
106 "path %s isn't mounted as hugetlbfs", path
);
109 snprintf(s_errorMsg
, maxErrorMsgLen
,
110 "statfs64() for %s failed: ", path
);
117 bool set_hugetlbfs_path(const char* path
) {
118 if (get_hugepage_size(path
) != size1g
) return false;
119 size_t len
= strlen(path
);
120 if (len
+ 8 >= sizeof(s_hugePath
)) return false;
121 memcpy(s_hugePath
, path
, len
);
122 *reinterpret_cast<int*>(s_hugePath
+ len
) = 0;
123 if (s_hugePath
[len
- 1] != '/') {
124 s_hugePath
[len
] = '/';
129 bool find_hugetlbfs_path() {
131 auto mounts
= fopen("/proc/mounts", "r");
132 if (!mounts
) return false;
133 // Search the file for lines like the following
134 // none /dev/hugepages hugetlbfs seclabel,relatime...
138 while (fgets(line
, sizeof(line
), mounts
)) {
139 if (sscanf(line
, "%*s %s hugetlbfs %s", path
, option
) == 2) {
140 // It matches hugetlbfs, check page size and save results.
141 if (set_hugetlbfs_path(path
)) {
152 HugePageInfo
read_hugepage_info(size_t pagesize
, int node
/* = -1 */) {
153 unsigned nr_huge
= 0, free_huge
= 0;
154 if (pagesize
!= size2m
&& pagesize
!= size1g
) { // only 2M and 1G supported
155 return HugePageInfo
{0, 0};
159 auto const readNumFrom
= [] (const char* path
) {
162 memset(buffer
, 0, sizeof(buffer
));
163 int fd
= open(path
, O_RDONLY
);
164 if (fd
< 0) return result
;
167 ssize_t bytes
= read(fd
, buffer
, 20);
168 if (bytes
== 0) break; // EOF
170 if (errno
== EINTR
) continue; // try again
171 break; // totally failed
173 for (ssize_t i
= 0; i
< bytes
; ++i
) {
175 // only read numbers, and stop on white space, etc.
176 if (c
< '0' || c
> '9') {
180 result
= result
* 10 + c
- '0';
188 memcpy(fileName
, "/sys/devices/system/node/node", 29);
189 assert(strlen("/sys/devices/system/node/node") == 29);
190 char* p
= fileName
+ 29;
191 // We support at most 32 NUMA node, so at most two bytes.
192 if (node
>= 10) *p
++ = '0' + node
/ 10;
193 *p
++ = '0' + node
% 10;
194 if (pagesize
== size2m
) {
195 memcpy(p
, "/hugepages/hugepages-2048kB/", 28);
196 assert(strlen("/hugepages/hugepages-2048kB/") == 28);
199 memcpy(p
, "/hugepages/hugepages-1048576kB/", 31);
200 assert(strlen("/hugepages/hugepages-1048576kB/") == 31);
204 memcpy(p
, "nr_hugepages", 13);
205 assert(strlen("nr_hugepages") == 12); // extra \0 byte
206 nr_huge
= readNumFrom(fileName
);
208 memcpy(p
, "free_hugepages", 15);
209 assert(strlen("free_hugepages") == 14); // extra \0 byte
210 free_huge
= readNumFrom(fileName
);
212 return HugePageInfo
{nr_huge
, free_huge
};
216 const int MAX_NUMA_NODE
= numa_max_node();
218 constexpr int MAX_NUMA_NODE
= 0;
220 for (int i
= 0; i
<= MAX_NUMA_NODE
; ++i
) {
221 // Skip nodes we are not allowed to allocate on.
222 if (!numa_node_allowed(i
)) continue;
223 auto const info
= read_hugepage_info(pagesize
, i
);
224 nr_huge
+= info
.nr_hugepages
;
225 free_huge
+= info
.free_hugepages
;
228 return HugePageInfo
{nr_huge
, free_huge
};
231 HugePageInfo
get_huge1g_info(int node
/* = -1 */) {
232 return read_hugepage_info(size1g
, node
);
235 HugePageInfo
get_huge2m_info(int node
/* = -1 */) {
236 return read_hugepage_info(size2m
, node
);
239 bool auto_mount_hugetlbfs() {
241 auto const info
= get_huge1g_info();
242 if (info
.nr_hugepages
<= 0) return false; // No page reserved.
244 const char* hugePath
= "/tmp/huge1g";
245 if (mkdir(hugePath
, 0777)) {
246 if (errno
!= EEXIST
) {
247 snprintf(s_errorMsg
, maxErrorMsgLen
, "Failed to mkdir %s: ", hugePath
);
252 if (mount("none", hugePath
, "hugetlbfs", 0, "pagesize=1G,mode=0777")) {
253 record_err_msg("Failed to mount hugetlbfs with 1G page size: ");
256 return set_hugetlbfs_path(hugePath
);
263 // Beware that MAP_FIXED overrides existing mapping silently. If the specified
264 // memory was mapped in, it may no longer be after this function fails.
265 // mincore() can be used to check if a memory region is stilled mapped in.
266 NEVER_INLINE
void* mmap_2m_impl(void* addr
, bool fixed
) {
267 void* ret
= MAP_FAILED
;
268 int flags
= MAP_ANONYMOUS
| MAP_PRIVATE
| MAP_HUGETLB
;
271 assert(addr
!= nullptr);
273 // MAP_HUGE_2MB can be specified after 3.8 kernel.
274 static KernelVersion version
;
275 if (version
.m_major
> 3 || (version
.m_major
== 3 && version
.m_minor
>= 8)) {
277 #define MAP_HUGE_2MB (21 << 26)
279 flags
|= MAP_HUGE_2MB
;
281 ret
= mmap(addr
, size2m
, PROT_READ
| PROT_WRITE
, flags
, -1, 0);
282 if (ret
== MAP_FAILED
) {
283 record_err_msg("mmap() with MAP_HUGE_2MB failed: ");
286 if (addr
&& ret
!= addr
) {
287 assert(fixed
== false);
288 // Didn't get the intended address.
293 // Fault the page in. This guarantees availablility of memory, and avoids
294 // subsequent errors when the huge page isn't really available. Ideally the
295 // kernel should've failed mmap() in such a case, but it doesn't seem to even
296 // with MAP_LOCKED | MAP_POPULATE.
298 snprintf(s_errorMsg
, maxErrorMsgLen
, "mlock() failed for %p: ", ret
);
307 inline void* mmap_1g_impl(void* addr
, bool map_fixed
) {
308 void* ret
= MAP_FAILED
;
309 if (s_hugePath
[0] != 0) {
311 size_t dirNameLen
= strlen(s_hugePath
);
312 assert(dirNameLen
> 0 && s_hugePath
[dirNameLen
- 1] == '/');
313 for (char i
= '0'; i
<= '9'; ++i
) {
314 s_hugePath
[dirNameLen
] = i
;
315 // We don't put code on 1G huge pages, so no execute permission.
316 fd
= open(s_hugePath
, O_CREAT
| O_EXCL
| O_RDWR
, 0666);
317 // Retry a few times if the file already exists.
319 if (errno
== EEXIST
) {
323 snprintf(s_errorMsg
, maxErrorMsgLen
,
324 "Failed to create hugetlbfs file %s: ", s_hugePath
);
326 s_hugePath
[dirNameLen
] = 0;
335 s_hugePath
[dirNameLen
] = 0;
337 snprintf(s_errorMsg
, maxErrorMsgLen
,
338 "Failed to create a hugetlbfs file in %s: "
339 "it seems already full of files", s_hugePath
);
343 ret
= mmap(addr
, size1g
, PROT_READ
| PROT_WRITE
,
344 MAP_SHARED
| (map_fixed
? MAP_FIXED
: 0),
346 if (ret
== MAP_FAILED
) {
347 snprintf(s_errorMsg
, maxErrorMsgLen
,
348 "mmap() for hugetlbfs file failed: ");
354 if (ret
== MAP_FAILED
) {
355 // MAP_HUGE_1GB is available in 3.9 and later kernels
356 KernelVersion version
;
357 if (version
.m_major
> 3 || (version
.m_major
== 3 && version
.m_minor
>= 9)) {
359 #define MAP_HUGE_1GB (30 << 26)
361 int flags
= MAP_SHARED
| MAP_ANONYMOUS
| MAP_HUGETLB
| MAP_HUGE_1GB
|
362 (map_fixed
? MAP_FIXED
: 0);
363 ret
= mmap(addr
, size1g
, PROT_READ
| PROT_WRITE
, flags
, -1, 0);
364 if (ret
== MAP_FAILED
) {
365 record_err_msg("mmap() with MAP_HUGE_1GB failed: ");
373 // Didn't get the desired address. This can happen is map_fixed is false.
374 if (addr
!= nullptr && ret
!= addr
) {
375 snprintf(s_errorMsg
, maxErrorMsgLen
,
376 "mmap() for huge page returned %p, desired %p", ret
, addr
);
381 // Fault the page in. This guarantees availablility of memory, and avoids
382 // SIGBUS when the huge page isn't really available. In many cases
383 // RLIMIT_MEMLOCK isn't big enough for us to lock 1G. Fortunately that
384 // is unnecessary here; a byte should work equally well.
386 snprintf(s_errorMsg
, maxErrorMsgLen
, "mlock() failed for %p: ", ret
);
398 // We support at most 32 NUMA nodes (numa_node_set in 32-bit), so a single
399 // unsigned long is more than enough for the mask. This can be used in jemalloc
400 // allocation hooks, so it is wise to avoid calling malloc/free here, even
401 // though jemalloc might still be able to handle reentrance correctly. Thus, we
402 // bypass libnuma and do the syscalls directly here.
403 struct SavedNumaPolicy
{
404 bool needRestore
{false};
406 unsigned long oldMask
{0};
408 // Save NUMA policy for the current thread.
410 needRestore
= !get_mempolicy(&oldPolicy
, &oldMask
, sizeof(oldMask
),
415 set_mempolicy(oldPolicy
, &oldMask
, sizeof(oldMask
));
422 void* mmap_2m(int node
) {
424 if (get_huge2m_info(node
).free_hugepages
<= 0) return nullptr;
425 if (node
>= 0 && !numa_node_allowed(node
)) return nullptr;
427 SavedNumaPolicy numaPolicy
;
428 if (node
>= 0 && numa_num_nodes
> 1) {
430 unsigned long singleNodeMask
= 1ul << node
;
431 set_mempolicy(MPOL_BIND
, &singleNodeMask
, sizeof(singleNodeMask
));
434 void* ret
= mmap_2m_impl(nullptr, /* fixed */ false);
435 s_num2MPages
+= (ret
!= nullptr);
442 void* remap_2m(void* addr
, int node
) {
443 assert(addr
!= nullptr);
444 assert(reinterpret_cast<uintptr_t>(addr
) % size2m
== 0);
446 if (node
>= 0 && !numa_node_allowed(node
)) return nullptr;
447 if (get_huge2m_info(node
).free_hugepages
<= 0) return nullptr;
449 SavedNumaPolicy numaPolicy
;
450 unsigned long singleNodeMask
= (1ull << 32) - 1;
451 if (node
>= 0 && numa_num_nodes
> 1) {
453 singleNodeMask
= 1ul << node
;
454 set_mempolicy(MPOL_BIND
, &singleNodeMask
, sizeof(singleNodeMask
));
457 void* ret
= mmap_2m_impl(addr
, /* fixed */ true);
459 // When mmap_2m_impl() fails, pages in the range [addr, addr + size2m) may
460 // have been unmapped, depending on the implementation of the kernel. Remap
461 // the range in that case.
462 unsigned char v
[size2m
/ size4k
];
463 if (mincore(addr
, size2m
, v
) == -1 && errno
== ENOMEM
) {
464 // [addr, addr + size2m) contains an unmapped page.
465 int flags
= MAP_ANONYMOUS
| MAP_PRIVATE
| MAP_FIXED
;
466 int prot
= PROT_READ
| PROT_WRITE
;
467 void* normalPages
= mmap(addr
, size2m
, prot
, flags
, -1, 0);
468 if (normalPages
!= addr
) {
469 // Either the mmap() failed again without trying to get huge pages, or
470 // it has returned something other than addr even with MAP_FIXED. In
471 // either case, we need to bail out.
472 throw std::runtime_error
{"mmap() failure with MAP_FIXED"};
475 // Enforce the NUMA node spec.
476 if (node
>= 0 && numa_num_nodes
> 1) {
477 mbind(normalPages
, size2m
, MPOL_BIND
,
478 &singleNodeMask
, 32 /* maxnode */, 0 /* flags */);
481 // Since hugetlb pages are not available, try transparent huge pages.
482 madvise(normalPages
, size2m
, MADV_HUGEPAGE
);
494 int remap_interleaved_2m_pages(void* addr
, size_t pages
) {
496 assert(reinterpret_cast<uintptr_t>(addr
) % size2m
== 0);
497 assert(addr
!= nullptr);
499 std::atomic
<uint32_t> node
{0};
501 auto const curr_node
= next_numa_node(node
);
502 count
+= (remap_2m(addr
, curr_node
) != nullptr);
503 addr
= (char*)addr
+ size2m
;
512 void* mmap_1g(void* addr
, int node
, bool map_fixed
) {
514 if (s_num1GPages
>= kMaxNum1GPages
) return nullptr;
515 if (get_huge1g_info(node
).free_hugepages
<= 0) return nullptr;
516 if (node
>= 0 && !numa_node_allowed(node
)) return nullptr;
518 SavedNumaPolicy numaPolicy
;
519 if (node
>= 0 && numa_num_nodes
> 1) {
521 unsigned long singleNodeMask
= 1ul << node
;
522 set_mempolicy(MPOL_BIND
, &singleNodeMask
, sizeof(singleNodeMask
));
525 void* ret
= mmap_1g_impl(addr
, map_fixed
);
526 if (ret
!= nullptr) {
527 s_1GPages
[s_num1GPages
++] = ret
;
535 unsigned num_1g_pages() {
539 unsigned num_2m_pages() {
543 int mprotect_1g_pages(int prot
) {
545 for (unsigned i
= 0; i
< s_num1GPages
; ++i
) {
546 void* p
= s_1GPages
[i
];
547 assert(p
!= nullptr &&
548 (reinterpret_cast<uintptr_t>(p
) & (size1g
- 1)) == 0);
549 if (auto ret
= mprotect(p
, size1g
, prot
)) {
550 // mprotect() failed for this page, callers should check errno if they