 * Support for RAM backed by mmapped host memory.
 * Copyright (c) 2015 Red Hat, Inc.
 * Michael S. Tsirkin <mst@redhat.com>
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.

#include <linux/mman.h>
#else /* !CONFIG_LINUX */
#define MAP_SHARED_VALIDATE 0x0
#endif /* CONFIG_LINUX */

#include "qemu/osdep.h"
#include "qemu/mmap-alloc.h"
#include "qemu/host-utils.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"

#define HUGETLBFS_MAGIC 0x958458f6
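/*
 * HUGETLBFS_MAGIC is the hugetlbfs superblock magic (the same value Linux
 * exposes in <linux/magic.h>); qemu_fd_getpagesize() below compares it
 * against statfs()'s f_type to detect file descriptors backed by hugetlbfs.
 */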
size_t qemu_fd_getpagesize(int fd)
            ret = fstatfs(fd, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
    return qemu_real_host_page_size();
#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
#if defined(__linux__)
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    gchar *content = NULL;
    const char *endptr;
    unsigned int tmp;
     * hugetlb accounting is different from ordinary swap reservation:
     * a) Hugetlb pages from the pool are reserved for both private and
     *    shared mappings. For shared mappings, all mappers have to specify
     *    MAP_NORESERVE.
     * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
    if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size()) {
     * Accountable mappings in the kernel that can be affected by MAP_NORESERVE
     * are private writable mappings (see mm/mmap.c:accountable_mapping() in
     * Linux). For all shared or readonly mappings, MAP_NORESERVE is always
     * implicitly active -- no reservation; this includes shmem. The only
     * exception is shared anonymous memory, which is accounted like private
     * anonymous memory.
    if (readonly || (shared && fd >= 0)) {
     * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
     * memory overcommit is set to "never". Sparse memory regions aren't really
     * possible in this system configuration.
     *
     * Bail out now instead of silently committing way more memory than
     * currently desired by the user.
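     *
     * (On Linux, /proc/sys/vm/overcommit_memory is 0 for heuristic
     * overcommit, 1 for "always overcommit", and 2 for "never overcommit";
     * the last mode is the one in which the kernel ignores MAP_NORESERVE.)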
    if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
        !qemu_strtoui(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
            error_report("Skipping reservation of swap space is not supported:"
                         " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
    /* this interface has been around since Linux 2.6 */
    error_report("Skipping reservation of swap space is not supported:"
                 " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
     * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
     * and removed it a while ago.
    error_report("Skipping reservation of swap space is not supported");
 * Reserve a new memory region of the requested size to be used for mapping
 * from the given fd (if any).
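 *
 * The reservation is mapped PROT_NONE, so it only claims virtual address
 * space; mmap_activate() later maps the usable part over it with MAP_FIXED
 * and the requested protection bits.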
static void *mmap_reserve(size_t size, int fd)
    int flags = MAP_PRIVATE;

#if defined(__powerpc64__) && defined(__linux__)
     * On ppc64 mappings in the same segment (aka slice) must share the same
     * page size. Since we will be re-allocating part of this segment
     * from the supplied fd, we should make sure to use the same page size; to
     * this end we mmap the supplied fd. In this case, set MAP_NORESERVE to
     * avoid allocating backing store memory.
     * We do this unless we are using the system page size, in which case
     * anonymous memory is OK.
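     *
     * (With the hash MMU, ppc64 tracks the page size per slice of the
     * address space, in units of 256 MiB in the low range and 1 TiB above,
     * so every mapping within one slice must use the same page size.)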
    if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size()) {
        flags |= MAP_ANONYMOUS;
    } else {
        flags |= MAP_NORESERVE;
    }
#else
    flags |= MAP_ANONYMOUS;
#endif

    return mmap(0, size, PROT_NONE, flags, fd, 0);
 * Activate memory in a reserved region from the given fd (if any), to make
 * it accessible.
static void *mmap_activate(void *ptr, size_t size, int fd,
                           uint32_t qemu_map_flags, off_t map_offset)
    const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
    const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
    int map_sync_flags = 0;
    int flags = MAP_FIXED;
    void *activated_ptr;
    if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
    flags |= fd == -1 ? MAP_ANONYMOUS : 0;
    flags |= shared ? MAP_SHARED : MAP_PRIVATE;
    flags |= noreserve ? MAP_NORESERVE : 0;
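    /*
     * MAP_SYNC is only valid together with MAP_SHARED_VALIDATE and is only
     * honored for files on DAX-capable filesystems; the ENOTSUP handling and
     * the retry below deal with hosts or backing files that reject it.
     */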
    if (shared && sync) {
        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
    activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
                         map_offset);
    if (activated_ptr == MAP_FAILED && map_sync_flags) {
        if (errno == ENOTSUP) {
            char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
            char *file_name = g_malloc0(PATH_MAX);
            int len = readlink(proc_link, file_name, PATH_MAX - 1);

            if (len < 0) {
                len = 0;
            }
            file_name[len] = '\0';
            fprintf(stderr, "Warning: requesting persistence across crashes "
                    "for backend file %s failed. Proceeding without "
                    "persistence, data might become corrupted in case of host "
                    "crash.\n", file_name);
            warn_report("Using non-DAX backing file with 'pmem=on' option"
         * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try
         * again without these flags to handle backwards compatibility.
        activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
    return activated_ptr;
static inline size_t mmap_guard_pagesize(int fd)
#if defined(__powerpc64__) && defined(__linux__)
    /* Mappings in the same segment must share the same page size */
    return qemu_fd_getpagesize(fd);
#else
    return qemu_real_host_page_size();
#endif
void *qemu_ram_mmap(int fd,
                    size_t size,
                    size_t align,
                    uint32_t qemu_map_flags,
                    off_t map_offset)
    const size_t guard_pagesize = mmap_guard_pagesize(fd);
    size_t offset, total;
    void *ptr, *guardptr;
     * Note: this always allocates at least one extra page of virtual address
     * space, even if size is already aligned.
    total = size + align;
    guardptr = mmap_reserve(total, fd);
    if (guardptr == MAP_FAILED) {
    assert(is_power_of_2(align));
    /* Always align to host page size */
    assert(align >= guard_pagesize);
    offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
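    /*
     * For example, with align = 2 MiB and guardptr = 0x7f0000201000,
     * QEMU_ALIGN_UP yields 0x7f0000400000, so offset = 0x1ff000 and the
     * usable block starts at the first 2 MiB boundary inside the reservation.
     */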
    ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags,
                        map_offset);
    if (ptr == MAP_FAILED) {
        munmap(guardptr, total);
        munmap(guardptr, offset);
     * Leave a single PROT_NONE page allocated after the RAM block, to serve as
     * a guard page against potential buffer overflows.
    total -= offset;
    if (total > size + guard_pagesize) {
        munmap(ptr + size + guard_pagesize, total - size - guard_pagesize);
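    /*
     * The remaining mapping is now [ptr, ptr + size + guard_pagesize): the
     * RAM block followed by one inaccessible guard page, which
     * qemu_ram_munmap() later unmaps together with the block.
     */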
void qemu_ram_munmap(int fd, void *ptr, size_t size)
        /* Unmap both the RAM block and the guard page */
        munmap(ptr, size + mmap_guard_pagesize(fd));
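
/*
 * Illustrative usage sketch, not taken from an actual caller; the flag
 * choice, alignment and error handling below are assumptions made for the
 * example:
 *
 *     size_t align = MAX(qemu_fd_getpagesize(fd), qemu_real_host_page_size());
 *     void *block = qemu_ram_mmap(fd, size, align, QEMU_MAP_SHARED, 0);
 *     if (block == MAP_FAILED) {
 *         error_report("qemu_ram_mmap failed: %s", strerror(errno));
 *     } else {
 *         ... use the block as guest RAM ...
 *         qemu_ram_munmap(fd, block, size);
 *     }
 */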