2 +----------------------------------------------------------------------+
4 +----------------------------------------------------------------------+
5 | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
#include "hphp/util/hugetlb.h"

// Techniques used here are Linux-specific, so don't bother to be portable.
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/vfs.h>

#include <atomic>
#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <stdexcept>

#include "hphp/util/kernel-version.h"
#include "hphp/util/numa.h"
#include "hphp/util/portability.h"
// State is kept in fixed-size static buffers: this code can run very early in
// process startup, so it avoids dynamic allocation entirely.

// Path of the hugetlbfs mount point used for 1G pages; always ends with '/'
// when set (see set_hugetlbfs_path()).
static char s_hugePath[256];

// Buffer holding the most recent error message, returned by
// get_hugetlb_err_msg().
constexpr size_t maxErrorMsgLen = 512;
static char s_errorMsg[maxErrorMsgLen];

// Bookkeeping for 1G pages we have mapped, used by mprotect_1g_pages() and
// num_1g_pages().
static unsigned s_num1GPages;
constexpr unsigned kMaxNum1GPages = 16;
static void* s_1GPages[kMaxNum1GPages];

// Count of 2M pages successfully mapped (mmap_2m()/remap_2m()).
static unsigned s_num2MPages;
60 // Record error message based on errno, with an optional message.
61 static void record_err_msg(const char* msg
= nullptr) {
65 if (len
> maxErrorMsgLen
/ 2) {
66 len
= maxErrorMsgLen
/ 2;
68 memcpy(s_errorMsg
, msg
, len
);
71 len
= strlen(s_errorMsg
);
75 char* err
= strerror_r(errno
, s_errorMsg
+ len
, maxErrorMsgLen
- len
);
76 if (len
== strlen(s_errorMsg
)) {
77 size_t appendLen
= strlen(err
);
78 if (appendLen
+ len
>= maxErrorMsgLen
) {
79 appendLen
= maxErrorMsgLen
- 1 - len
;
81 memcpy(s_errorMsg
+ len
, err
, appendLen
);
82 s_errorMsg
[len
+ appendLen
] = 0;
85 strerror_r(errno
, s_errorMsg
+ len
, maxErrorMsgLen
- len
);
90 const char* get_hugetlb_err_msg() {
94 // Return the page size for hugetlbfs mount point, or 0 if anything goes wrong:
95 // e.g., mount point doesn't exist, mount point isn't hugetlbfs.
96 static size_t get_hugepage_size(const char* path
) {
99 if (statfs64(path
, &sb
) == 0) {
100 // Magic number defined in Linux kernel: include/uapi/linux/magic.h
101 auto constexpr HUGETLBFS_MAGIC
= 0x958458f6;
102 if (sb
.f_type
== HUGETLBFS_MAGIC
) {
105 snprintf(s_errorMsg
, maxErrorMsgLen
,
106 "path %s isn't mounted as hugetlbfs", path
);
109 snprintf(s_errorMsg
, maxErrorMsgLen
,
110 "statfs64() for %s failed: ", path
);
117 bool set_hugetlbfs_path(const char* path
) {
118 if (get_hugepage_size(path
) != size1g
) return false;
119 size_t len
= strlen(path
);
120 if (len
+ 8 >= sizeof(s_hugePath
)) return false;
121 memcpy(s_hugePath
, path
, len
);
122 *reinterpret_cast<int*>(s_hugePath
+ len
) = 0;
123 if (s_hugePath
[len
- 1] != '/') {
124 s_hugePath
[len
] = '/';
129 bool find_hugetlbfs_path() {
131 auto mounts
= fopen("/proc/mounts", "r");
132 if (!mounts
) return false;
133 // Search the file for lines like the following
134 // none /dev/hugepages hugetlbfs seclabel,relatime...
138 while (fgets(line
, sizeof(line
), mounts
)) {
139 if (sscanf(line
, "%*s %s hugetlbfs %s", path
, option
) == 2) {
140 // It matches hugetlbfs, check page size and save results.
141 if (set_hugetlbfs_path(path
)) {
152 HugePageInfo
read_hugepage_info(size_t pagesize
, int node
/* = -1 */) {
153 unsigned nr_huge
= 0, free_huge
= 0;
154 if (pagesize
!= size2m
&& pagesize
!= size1g
) { // only 2M and 1G supported
155 return HugePageInfo
{0, 0};
159 auto const readNumFrom
= [] (const char* path
) {
162 memset(buffer
, 0, sizeof(buffer
));
163 int fd
= open(path
, O_RDONLY
);
164 if (fd
< 0) return result
;
167 ssize_t bytes
= read(fd
, buffer
, 20);
168 if (bytes
== 0) break; // EOF
170 if (errno
== EINTR
) continue; // try again
171 break; // totally failed
173 for (ssize_t i
= 0; i
< bytes
; ++i
) {
175 // only read numbers, and stop on white space, etc.
176 if (c
< '0' || c
> '9') {
180 result
= result
* 10 + c
- '0';
188 memcpy(fileName
, "/sys/devices/system/node/node", 29);
189 assert(strlen("/sys/devices/system/node/node") == 29);
190 char* p
= fileName
+ 29;
191 // We support at most 32 NUMA node, so at most two bytes.
192 if (node
>= 10) *p
++ = '0' + node
/ 10;
193 *p
++ = '0' + node
% 10;
194 if (pagesize
== size2m
) {
195 memcpy(p
, "/hugepages/hugepages-2048kB/", 28);
196 assert(strlen("/hugepages/hugepages-2048kB/") == 28);
199 memcpy(p
, "/hugepages/hugepages-1048576kB/", 31);
200 assert(strlen("/hugepages/hugepages-1048576kB/") == 31);
204 memcpy(p
, "nr_hugepages", 13);
205 assert(strlen("nr_hugepages") == 12); // extra \0 byte
206 nr_huge
= readNumFrom(fileName
);
208 memcpy(p
, "free_hugepages", 15);
209 assert(strlen("free_hugepages") == 14); // extra \0 byte
210 free_huge
= readNumFrom(fileName
);
212 return HugePageInfo
{nr_huge
, free_huge
};
216 const int MAX_NUMA_NODE
= numa_max_node();
218 constexpr int MAX_NUMA_NODE
= 0;
220 for (int i
= 0; i
<= MAX_NUMA_NODE
; ++i
) {
221 // Skip nodes we are not allowed to allocate on.
222 if (!numa_node_allowed(i
)) continue;
223 auto const info
= read_hugepage_info(pagesize
, i
);
224 nr_huge
+= info
.nr_hugepages
;
225 free_huge
+= info
.free_hugepages
;
228 return HugePageInfo
{nr_huge
, free_huge
};
231 HugePageInfo
get_huge1g_info(int node
/* = -1 */) {
232 return read_hugepage_info(size1g
, node
);
235 HugePageInfo
get_huge2m_info(int node
/* = -1 */) {
236 return read_hugepage_info(size2m
, node
);
239 bool auto_mount_hugetlbfs() {
241 auto const info
= get_huge1g_info();
242 if (info
.nr_hugepages
<= 0) return false; // No page reserved.
244 const char* hugePath
= "/tmp/huge1g";
245 if (mkdir(hugePath
, 0777)) {
246 if (errno
!= EEXIST
) {
247 snprintf(s_errorMsg
, maxErrorMsgLen
, "Failed to mkdir %s: ", hugePath
);
252 if (mount("none", hugePath
, "hugetlbfs", 0, "pagesize=1G,mode=0777")) {
253 record_err_msg("Failed to mount hugetlbfs with 1G page size: ");
256 return set_hugetlbfs_path(hugePath
);
263 // Beware that MAP_FIXED overrides existing mapping silently. If the specified
264 // memory was mapped in, it may no longer be after this function fails.
265 // mincore() can be used to check if a memory region is stilled mapped in.
266 NEVER_INLINE
void* mmap_2m_impl(void* addr
, bool fixed
) {
267 void* ret
= MAP_FAILED
;
268 int flags
= MAP_ANONYMOUS
| MAP_PRIVATE
| MAP_HUGETLB
;
271 assert(addr
!= nullptr);
273 // MAP_HUGE_2MB can be specified after 3.8 kernel.
274 static KernelVersion version
;
275 if (version
.m_major
> 3 || (version
.m_major
== 3 && version
.m_minor
>= 8)) {
277 #define MAP_HUGE_2MB (21 << 26)
279 flags
|= MAP_HUGE_2MB
;
281 ret
= mmap(addr
, size2m
, PROT_READ
| PROT_WRITE
, flags
, -1, 0);
282 if (ret
== MAP_FAILED
) {
283 record_err_msg("mmap() with MAP_HUGE_2MB failed: ");
286 if (addr
&& ret
!= addr
) {
287 assert(fixed
== false);
288 // Didn't get the intended address.
293 // Fault the page in. This guarantees availablility of memory, and avoids
294 // subsequent errors when the huge page isn't really available. Ideally the
295 // kernel should've failed mmap() in such a case, but it doesn't seem to even
296 // with MAP_LOCKED | MAP_POPULATE.
298 snprintf(s_errorMsg
, maxErrorMsgLen
, "mlock() failed for %p: ", ret
);
307 inline void* mmap_1g_impl(void* addr
, bool map_fixed
) {
308 void* ret
= MAP_FAILED
;
309 if (s_hugePath
[0] != 0) {
311 size_t dirNameLen
= strlen(s_hugePath
);
312 assert(dirNameLen
> 0 && s_hugePath
[dirNameLen
- 1] == '/');
313 for (char i
= '0'; i
<= '9'; ++i
) {
314 s_hugePath
[dirNameLen
] = i
;
315 // We don't put code on 1G huge pages, so no execute permission.
316 fd
= open(s_hugePath
, O_CREAT
| O_EXCL
| O_RDWR
, 0666);
317 // Retry a few times if the file already exists.
319 if (errno
== EEXIST
) {
323 snprintf(s_errorMsg
, maxErrorMsgLen
,
324 "Failed to create hugetlbfs file %s: ", s_hugePath
);
326 s_hugePath
[dirNameLen
] = 0;
335 s_hugePath
[dirNameLen
] = 0;
337 snprintf(s_errorMsg
, maxErrorMsgLen
,
338 "Failed to create a hugetlbfs file in %s: "
339 "it seems already full of files", s_hugePath
);
343 ret
= mmap(addr
, size1g
, PROT_READ
| PROT_WRITE
,
344 MAP_SHARED
| (map_fixed
? MAP_FIXED
: 0),
346 if (ret
== MAP_FAILED
) {
347 snprintf(s_errorMsg
, maxErrorMsgLen
,
348 "mmap() for hugetlbfs file failed: ");
354 if (ret
== MAP_FAILED
) {
355 // MAP_HUGE_1GB is available in 3.9 and later kernels
356 KernelVersion version
;
357 if (version
.m_major
> 3 || (version
.m_major
== 3 && version
.m_minor
>= 9)) {
359 #define MAP_HUGE_1GB (30 << 26)
361 int flags
= MAP_SHARED
| MAP_ANONYMOUS
| MAP_HUGETLB
| MAP_HUGE_1GB
|
362 (map_fixed
? MAP_FIXED
: 0);
363 ret
= mmap(addr
, size1g
, PROT_READ
| PROT_WRITE
, flags
, -1, 0);
364 if (ret
== MAP_FAILED
) {
365 record_err_msg("mmap() with MAP_HUGE_1GB failed: ");
373 // Didn't get the desired address. This can happen is map_fixed is false.
374 if (addr
!= nullptr && ret
!= addr
) {
375 snprintf(s_errorMsg
, maxErrorMsgLen
,
376 "mmap() for huge page returned %p, desired %p", ret
, addr
);
381 // Fault the page in. This guarantees availablility of memory, and avoids
382 // SIGBUS when the huge page isn't really available. In many cases
383 // RLIMIT_MEMLOCK isn't big enough for us to lock 1G. Fortunately that
384 // is unnecessary here; a byte should work equally well.
386 snprintf(s_errorMsg
, maxErrorMsgLen
, "mlock() failed for %p: ", ret
);
398 // We support at most 32 NUMA nodes (numa_node_set in 32-bit), so a single
399 // unsigned long is more than enough for the mask. This can be used in jemalloc
400 // allocation hooks, so it is wise to avoid calling malloc/free here, even
401 // though jemalloc might still be able to handle reentrance correctly. Thus, we
402 // bypass libnuma and do the syscalls directly here.
403 struct SavedNumaPolicy
{
404 bool needRestore
{false};
406 unsigned long oldMask
{0};
408 // Save NUMA policy for the current thread.
410 needRestore
= !get_mempolicy(&oldPolicy
, &oldMask
, sizeof(oldMask
),
415 set_mempolicy(oldPolicy
, &oldMask
, sizeof(oldMask
));
422 void* mmap_2m(int node
) {
424 if (get_huge2m_info(node
).free_hugepages
<= 0) return nullptr;
425 if (node
>= 0 && !numa_node_allowed(node
)) return nullptr;
427 SavedNumaPolicy numaPolicy
;
428 if (node
>= 0 && numa_num_nodes
> 1) {
430 unsigned long singleNodeMask
= 1ul << node
;
431 set_mempolicy(MPOL_BIND
, &singleNodeMask
, sizeof(singleNodeMask
));
434 void* ret
= mmap_2m_impl(nullptr, /* fixed */ false);
435 s_num2MPages
+= (ret
!= nullptr);
442 void* remap_2m(void* addr
, int node
) {
443 assert(addr
!= nullptr);
444 assert(reinterpret_cast<uintptr_t>(addr
) % size2m
== 0);
446 if (node
>= 0 && !numa_node_allowed(node
)) return nullptr;
447 if (get_huge2m_info(node
).free_hugepages
<= 0) return nullptr;
449 SavedNumaPolicy numaPolicy
;
450 unsigned long singleNodeMask
= (1ull << 32) - 1;
451 if (node
>= 0 && numa_num_nodes
> 1) {
453 singleNodeMask
= 1ul << node
;
454 set_mempolicy(MPOL_BIND
, &singleNodeMask
, sizeof(singleNodeMask
));
457 void* ret
= mmap_2m_impl(addr
, /* fixed */ true);
459 // When mmap_2m_impl() fails, pages in the range [addr, addr + size2m) may
460 // have been unmapped, depending on the implementation of the kernel. Remap
461 // the range in that case.
462 unsigned char v
[size2m
/ size4k
];
463 if (mincore(addr
, size2m
, v
) == -1 && errno
== ENOMEM
) {
464 // [addr, addr + size2m) contains an unmapped page.
465 int flags
= MAP_ANONYMOUS
| MAP_PRIVATE
| MAP_FIXED
;
466 int prot
= PROT_READ
| PROT_WRITE
;
467 void* normalPages
= mmap(addr
, size2m
, prot
, flags
, -1, 0);
468 if (normalPages
!= addr
) {
469 // Either the mmap() failed again without trying to get huge pages, or
470 // it has returned something other than addr even with MAP_FIXED. In
471 // either case, we need to bail out.
472 throw std::runtime_error
{"mmap() failure with MAP_FIXED"};
475 // Enforce the NUMA node spec.
476 if (node
>= 0 && numa_num_nodes
> 1) {
477 mbind(normalPages
, size2m
, MPOL_BIND
,
478 &singleNodeMask
, 32 /* maxnode */, 0 /* flags */);
481 // Since hugetlb pages are not available, try transparent huge pages.
482 madvise(normalPages
, size2m
, MADV_HUGEPAGE
);
494 int remap_interleaved_2m_pages(void* addr
, size_t pages
) {
496 assert(reinterpret_cast<uintptr_t>(addr
) % size2m
== 0);
497 assert(addr
!= nullptr);
499 std::atomic
<uint32_t> node
{0};
501 auto const curr_node
= next_numa_node(node
);
502 count
+= (remap_2m(addr
, curr_node
) != nullptr);
503 addr
= (char*)addr
+ size2m
;
512 void* mmap_1g(void* addr
, int node
, bool map_fixed
) {
514 if (s_num1GPages
>= kMaxNum1GPages
) return nullptr;
515 if (get_huge1g_info(node
).free_hugepages
<= 0) return nullptr;
516 if (node
>= 0 && !numa_node_allowed(node
)) return nullptr;
518 SavedNumaPolicy numaPolicy
;
519 if (node
>= 0 && numa_num_nodes
> 1) {
521 unsigned long singleNodeMask
= 1ul << node
;
522 set_mempolicy(MPOL_BIND
, &singleNodeMask
, sizeof(singleNodeMask
));
525 void* ret
= mmap_1g_impl(addr
, map_fixed
);
526 if (ret
!= nullptr) {
527 s_1GPages
[s_num1GPages
++] = ret
;
535 unsigned num_1g_pages() {
539 unsigned num_2m_pages() {
543 int mprotect_1g_pages(int prot
) {
545 for (unsigned i
= 0; i
< s_num1GPages
; ++i
) {
546 void* p
= s_1GPages
[i
];
547 assert(p
!= nullptr &&
548 (reinterpret_cast<uintptr_t>(p
) & (size1g
- 1)) == 0);
549 if (auto ret
= mprotect(p
, size1g
, prot
)) {
550 // mprotect() failed for this page, callers should check errno if they