/*
 * Support for RAM backed by mmaped host memory.
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_LINUX
#include <linux/mman.h>
#else /* !CONFIG_LINUX */
#define MAP_SYNC              0x0
#define MAP_SHARED_VALIDATE   0x0
#endif /* CONFIG_LINUX */

#include "qemu/osdep.h"
#include "qemu/mmap-alloc.h"
#include "qemu/host-utils.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"

#define HUGETLBFS_MAGIC       0x958458f6

#ifdef CONFIG_LINUX
#include <sys/vfs.h>
#endif

size_t qemu_fd_getpagesize(int fd)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (fd != -1) {
        do {
            ret = fstatfs(fd, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
            return fs.f_bsize;
        }
    }
#ifdef __sparc__
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
#endif
#endif

    return qemu_real_host_page_size();
}

size_t qemu_mempath_getpagesize(const char *mem_path)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (mem_path) {
        do {
            ret = statfs(mem_path, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret != 0) {
            fprintf(stderr, "Couldn't statfs() memory path: %s\n",
                    strerror(errno));
            exit(1);
        }

        if (fs.f_type == HUGETLBFS_MAGIC) {
            /* It's hugepage, return the huge page size */
            return fs.f_bsize;
        }
    }
#ifdef __sparc__
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
#endif
#endif

    return qemu_real_host_page_size();
}

#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
{
#if defined(__linux__)
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    gchar *content = NULL;
    const char *endptr;
    unsigned int tmp;

    /*
     * hugetlb accounting is different than ordinary swap reservation:
     *  a) Hugetlb pages from the pool are reserved for both private and
     *     shared mappings. For shared mappings, all mappers have to specify
     *     MAP_NORESERVE.
     *  b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
     */
    if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size()) {
        return true;
    }

    /*
     * Accountable mappings in the kernel that can be affected by MAP_NORESERVE
     * are private writable mappings (see mm/mmap.c:accountable_mapping() in
     * Linux). For all shared or readonly mappings, MAP_NORESERVE is always
     * implicitly active -- no reservation; this includes shmem. The only
     * exception is shared anonymous memory, it is accounted like private
     * anonymous memory.
     */
    if (readonly || (shared && fd >= 0)) {
        return true;
    }

    /*
     * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
     * memory overcommit is set to "never". Sparse memory regions aren't really
     * possible in this system configuration.
     *
     * Bail out now instead of silently committing way more memory than
     * currently desired by the user.
     */
    if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
        !qemu_strtoui(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
        if (tmp == 2) {
            error_report("Skipping reservation of swap space is not supported:"
                         " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
            return false;
        }
        return true;
    }
    /* this interface has been around since Linux 2.6 */
    error_report("Skipping reservation of swap space is not supported:"
                 " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
    return false;
#else
    /*
     * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
     * and removed it a while ago.
     */
    error_report("Skipping reservation of swap space is not supported");
    return false;
#endif
}
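
/*
 * Illustrative note: /proc/sys/vm/overcommit_memory holds a single digit --
 * 0 (heuristic overcommit), 1 (always overcommit) or 2 (never overcommit).
 * Only mode 2 makes MAP_NORESERVE ineffective for ordinary (non-hugetlb)
 * private writable mappings, which is why the check above only refuses when
 * the file reads "2".
 */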

/*
 * Reserve a new memory region of the requested size to be used for mapping
 * from the given fd (if any).
 */
static void *mmap_reserve(size_t size, int fd)
{
    int flags = MAP_PRIVATE;

#if defined(__powerpc64__) && defined(__linux__)
    /*
     * On ppc64 mappings in the same segment (aka slice) must share the same
     * page size. Since we will be re-allocating part of this segment
     * from the supplied fd, we should make sure to use the same page size;
     * to this end we mmap the supplied fd. In this case, set MAP_NORESERVE to
     * avoid allocating backing store memory.
     * We do this unless we are using the system page size, in which case
     * anonymous memory is OK.
     */
    if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size()) {
        fd = -1;
        flags |= MAP_ANONYMOUS;
    } else {
        flags |= MAP_NORESERVE;
    }
#else
    fd = -1;
    flags |= MAP_ANONYMOUS;
#endif

    return mmap(0, size, PROT_NONE, flags, fd, 0);
}
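
/*
 * Illustrative note: mmap_reserve() only carves out PROT_NONE address space,
 * so nothing can be accessed and (thanks to MAP_NORESERVE / anonymous
 * mappings) no backing store is committed yet; mmap_activate() below then
 * maps the usable range on top of it with MAP_FIXED.
 */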

/*
 * Activate memory in a reserved region from the given fd (if any), to make
 * it accessible.
 */
static void *mmap_activate(void *ptr, size_t size, int fd,
                           uint32_t qemu_map_flags, off_t map_offset)
{
    const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
    const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
    int map_sync_flags = 0;
    int flags = MAP_FIXED;
    void *activated_ptr;

    if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
        return MAP_FAILED;
    }

    flags |= fd == -1 ? MAP_ANONYMOUS : 0;
    flags |= shared ? MAP_SHARED : MAP_PRIVATE;
    flags |= noreserve ? MAP_NORESERVE : 0;
    if (shared && sync) {
        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
    }

    activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
                         map_offset);
    if (activated_ptr == MAP_FAILED && map_sync_flags) {
        if (errno == ENOTSUP) {
            char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
            char *file_name = g_malloc0(PATH_MAX);
            int len = readlink(proc_link, file_name, PATH_MAX - 1);

            if (len < 0) {
                len = 0;
            }
            file_name[len] = '\0';
            fprintf(stderr, "Warning: requesting persistence across crashes "
                    "for backend file %s failed. Proceeding without "
                    "persistence, data might become corrupted in case of host "
                    "crash.\n", file_name);
            g_free(proc_link);
            g_free(file_name);
            warn_report("Using non DAX backing file with 'pmem=on' option"
                        " is deprecated");
        }
        /*
         * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try
         * again without these flags to handle backwards compatibility.
         */
        activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
    }
    return activated_ptr;
}

static inline size_t mmap_guard_pagesize(int fd)
{
#if defined(__powerpc64__) && defined(__linux__)
    /* Mappings in the same segment must share the same page size */
    return qemu_fd_getpagesize(fd);
#else
    return qemu_real_host_page_size();
#endif
}

void *qemu_ram_mmap(int fd,
                    size_t size,
                    size_t align,
                    uint32_t qemu_map_flags,
                    off_t map_offset)
{
    const size_t guard_pagesize = mmap_guard_pagesize(fd);
    size_t offset, total;
    void *ptr, *guardptr;

    /*
     * Note: this always allocates at least one extra page of virtual address
     * space, even if size is already aligned.
     */
    total = size + align;

    guardptr = mmap_reserve(total, fd);
    if (guardptr == MAP_FAILED) {
        return MAP_FAILED;
    }

    assert(is_power_of_2(align));
    /* Always align to host page size */
    assert(align >= guard_pagesize);

    offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
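
    /*
     * Worked example (illustrative numbers only): with align = 2 MiB and
     * guardptr = 0x7f1234567000, QEMU_ALIGN_UP() yields 0x7f1234600000, so
     * offset = 0x99000. The aligned RAM block starts at guardptr + offset;
     * the unused [guardptr, guardptr + offset) prefix is unmapped below.
     */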

    ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags,
                        map_offset);
    if (ptr == MAP_FAILED) {
        munmap(guardptr, total);
        return MAP_FAILED;
    }

    if (offset > 0) {
        munmap(guardptr, offset);
    }

    /*
     * Leave a single PROT_NONE page allocated after the RAM block, to serve as
     * a guard page guarding against potential buffer overflows.
     */
    total -= offset;
    if (total > size + guard_pagesize) {
        munmap(ptr + size + guard_pagesize, total - size - guard_pagesize);
    }

    return ptr;
}

void qemu_ram_munmap(int fd, void *ptr, size_t size)
{
    if (ptr) {
        /* Unmap both the RAM block and the guard page */
        munmap(ptr, size + mmap_guard_pagesize(fd));
    }
}
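
/*
 * Usage sketch (illustrative only; values and error handling are assumptions):
 * a caller pairs the two entry points with the same fd and size, e.g.
 *
 *     void *ptr = qemu_ram_mmap(fd, size, qemu_fd_getpagesize(fd),
 *                               QEMU_MAP_SHARED, 0);
 *     if (ptr == MAP_FAILED) {
 *         ...handle error...
 *     }
 *     ...
 *     qemu_ram_munmap(fd, ptr, size);
 *
 * so that the trailing guard page sized by mmap_guard_pagesize() is unmapped
 * together with the RAM block.
 */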