kvm userspace: ksm support
[qemu-kvm/fedora.git] / linux-user / mmap.c
blobe05caa0a1121cd43fad2f7d5f86d63530b775f4c
/*
 *  mmap support for qemu
 *
 *  Copyright (c) 2003 Fabrice Bellard
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <stdarg.h>
22 #include <string.h>
23 #include <unistd.h>
24 #include <errno.h>
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <sys/mman.h>
28 #include <linux/mman.h>
29 #include <linux/unistd.h>
31 #include "qemu.h"
32 #include "qemu-common.h"
34 //#define DEBUG_MMAP
#if defined(USE_NPTL)
/* Mutex taken by the outermost mmap_lock() call of a thread. */
pthread_mutex_t mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Per-thread nesting depth of mmap_lock(); __thread must directly follow
   the storage class specifier.  */
static __thread int mmap_lock_count;

/* Take the mmap lock.  Calls nest per thread: only the outermost call
   acquires mmap_mutex, inner calls just bump the per-thread counter.  */
void mmap_lock(void)
{
    if (mmap_lock_count++ == 0) {
        pthread_mutex_lock(&mmap_mutex);
    }
}

/* Undo one mmap_lock(); the mutex is released when the per-thread nesting
   count drops back to zero.  */
void mmap_unlock(void)
{
    if (--mmap_lock_count == 0) {
        pthread_mutex_unlock(&mmap_mutex);
    }
}

/* Grab lock to make sure things are in a consistent state after fork().
   Aborts if the calling thread already holds the mmap lock.  */
void mmap_fork_start(void)
{
    if (mmap_lock_count) {
        abort();
    }
    pthread_mutex_lock(&mmap_mutex);
}

/* Counterpart of mmap_fork_start().  When 'child' is non-zero the mutex is
   re-initialised, otherwise it is simply unlocked.  */
void mmap_fork_end(int child)
{
    if (child) {
        pthread_mutex_init(&mmap_mutex, NULL);
    } else {
        pthread_mutex_unlock(&mmap_mutex);
    }
}
#else
/* We aren't threadsafe to start with, so no need to worry about locking.  */
void mmap_lock(void)
{
}

void mmap_unlock(void)
{
}
#endif
80 void *qemu_vmalloc(size_t size)
82 void *p;
83 unsigned long addr;
84 mmap_lock();
85 /* Use map and mark the pages as used. */
86 p = mmap(NULL, size, PROT_READ | PROT_WRITE,
87 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
89 addr = (unsigned long)p;
90 if (addr == (target_ulong) addr) {
91 /* Allocated region overlaps guest address space.
92 This may recurse. */
93 page_set_flags(addr & TARGET_PAGE_MASK, TARGET_PAGE_ALIGN(addr + size),
94 PAGE_RESERVED);
97 mmap_unlock();
98 return p;
/*
 * Allocate 'size' bytes on top of qemu_vmalloc().
 *
 * A 16-byte header is placed in front of the returned pointer; its first
 * size_t holds the total mapping length (payload + 16), which qemu_free()
 * and qemu_realloc() read back.
 *
 * Aborts if the request overflows or the underlying mapping fails, so the
 * returned pointer is always usable.
 */
void *qemu_malloc(size_t size)
{
    char *p;

    /* Adding the header must not wrap around.  */
    if (size > (size_t)-1 - 16) {
        abort();
    }
    size += 16;
    p = qemu_vmalloc(size);
    /* qemu_vmalloc() hands back mmap()'s result; never write the header
       through MAP_FAILED.  */
    if (p == MAP_FAILED) {
        abort();
    }
    *(size_t *)p = size;
    return p + 16;
}
/* Zero-filled allocation.  We use map, which is always zero initialized,
   so this is simply qemu_malloc().  */
void *qemu_mallocz(size_t size)
{
    return qemu_malloc(size);
}
/*
 * Release a block whose 16-byte header (located just before 'ptr') stores
 * the total mapping length in its first size_t.  A NULL 'ptr' is ignored.
 */
void qemu_free(void *ptr)
{
    /* FIXME: We should unmark the reserved pages here.  However this gets
       complicated when one target page spans multiple host pages, so we
       don't bother.  */
    size_t *p;

    /* Without this guard the header would be read from NULL - 16.  */
    if (!ptr) {
        return;
    }
    p = (size_t *)((char *)ptr - 16);
    munmap(p, *p);
}
/*
 * Resize a block obtained from qemu_malloc(): allocate a new block of 'size'
 * bytes, copy the smaller of the old and new payload sizes, free the old
 * block and return the new one.  A NULL 'ptr' behaves like qemu_malloc().
 */
void *qemu_realloc(void *ptr, size_t size)
{
    size_t old_size, copy;
    void *new_ptr;

    if (!ptr) {
        return qemu_malloc(size);
    }
    /* The header stores payload + 16; strip the header so that we never
       copy bytes located past the end of the old payload.  */
    old_size = *(size_t *)((char *)ptr - 16) - 16;
    copy = old_size < size ? old_size : size;
    new_ptr = qemu_malloc(size);
    memcpy(new_ptr, ptr, copy);
    qemu_free(ptr);
    return new_ptr;
}
141 /* NOTE: all the constants are the HOST ones, but addresses are target. */
142 int target_mprotect(abi_ulong start, abi_ulong len, int prot)
144 abi_ulong end, host_start, host_end, addr;
145 int prot1, ret;
147 #ifdef DEBUG_MMAP
148 printf("mprotect: start=0x" TARGET_ABI_FMT_lx
149 "len=0x" TARGET_ABI_FMT_lx " prot=%c%c%c\n", start, len,
150 prot & PROT_READ ? 'r' : '-',
151 prot & PROT_WRITE ? 'w' : '-',
152 prot & PROT_EXEC ? 'x' : '-');
153 #endif
155 if ((start & ~TARGET_PAGE_MASK) != 0)
156 return -EINVAL;
157 len = TARGET_PAGE_ALIGN(len);
158 end = start + len;
159 if (end < start)
160 return -EINVAL;
161 prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
162 if (len == 0)
163 return 0;
165 mmap_lock();
166 host_start = start & qemu_host_page_mask;
167 host_end = HOST_PAGE_ALIGN(end);
168 if (start > host_start) {
169 /* handle host page containing start */
170 prot1 = prot;
171 for(addr = host_start; addr < start; addr += TARGET_PAGE_SIZE) {
172 prot1 |= page_get_flags(addr);
174 if (host_end == host_start + qemu_host_page_size) {
175 for(addr = end; addr < host_end; addr += TARGET_PAGE_SIZE) {
176 prot1 |= page_get_flags(addr);
178 end = host_end;
180 ret = mprotect(g2h(host_start), qemu_host_page_size, prot1 & PAGE_BITS);
181 if (ret != 0)
182 goto error;
183 host_start += qemu_host_page_size;
185 if (end < host_end) {
186 prot1 = prot;
187 for(addr = end; addr < host_end; addr += TARGET_PAGE_SIZE) {
188 prot1 |= page_get_flags(addr);
190 ret = mprotect(g2h(host_end - qemu_host_page_size), qemu_host_page_size,
191 prot1 & PAGE_BITS);
192 if (ret != 0)
193 goto error;
194 host_end -= qemu_host_page_size;
197 /* handle the pages in the middle */
198 if (host_start < host_end) {
199 ret = mprotect(g2h(host_start), host_end - host_start, prot);
200 if (ret != 0)
201 goto error;
203 page_set_flags(start, start + len, prot | PAGE_VALID);
204 mmap_unlock();
205 return 0;
206 error:
207 mmap_unlock();
208 return ret;
211 /* map an incomplete host page */
212 static int mmap_frag(abi_ulong real_start,
213 abi_ulong start, abi_ulong end,
214 int prot, int flags, int fd, abi_ulong offset)
216 abi_ulong real_end, addr;
217 void *host_start;
218 int prot1, prot_new;
220 real_end = real_start + qemu_host_page_size;
221 host_start = g2h(real_start);
223 /* get the protection of the target pages outside the mapping */
224 prot1 = 0;
225 for(addr = real_start; addr < real_end; addr++) {
226 if (addr < start || addr >= end)
227 prot1 |= page_get_flags(addr);
230 if (prot1 == 0) {
231 /* no page was there, so we allocate one */
232 void *p = mmap(host_start, qemu_host_page_size, prot,
233 flags | MAP_ANONYMOUS, -1, 0);
234 if (p == MAP_FAILED)
235 return -1;
236 prot1 = prot;
238 prot1 &= PAGE_BITS;
240 prot_new = prot | prot1;
241 if (!(flags & MAP_ANONYMOUS)) {
242 /* msync() won't work here, so we return an error if write is
243 possible while it is a shared mapping */
244 if ((flags & MAP_TYPE) == MAP_SHARED &&
245 (prot & PROT_WRITE))
246 return -EINVAL;
248 /* adjust protection to be able to read */
249 if (!(prot1 & PROT_WRITE))
250 mprotect(host_start, qemu_host_page_size, prot1 | PROT_WRITE);
252 /* read the corresponding file data */
253 pread(fd, g2h(start), end - start, offset);
255 /* put final protection */
256 if (prot_new != (prot1 | PROT_WRITE))
257 mprotect(host_start, qemu_host_page_size, prot_new);
258 } else {
259 /* just update the protection */
260 if (prot_new != prot1) {
261 mprotect(host_start, qemu_host_page_size, prot_new);
264 return 0;
267 #if defined(__CYGWIN__)
268 /* Cygwin doesn't have a whole lot of address space. */
269 static abi_ulong mmap_next_start = 0x18000000;
270 #else
271 static abi_ulong mmap_next_start = 0x40000000;
272 #endif
274 unsigned long last_brk;
276 /* find a free memory area of size 'size'. The search starts at
277 'start'. If 'start' == 0, then a default start address is used.
278 Return -1 if error.
280 /* page_init() marks pages used by the host as reserved to be sure not
281 to use them. */
282 abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size)
284 abi_ulong addr, addr1, addr_start;
285 int prot;
286 unsigned long new_brk;
288 new_brk = (unsigned long)sbrk(0);
289 if (last_brk && last_brk < new_brk && last_brk == (target_ulong)last_brk) {
290 /* This is a hack to catch the host allocating memory with brk().
291 If it uses mmap then we loose.
292 FIXME: We really want to avoid the host allocating memory in
293 the first place, and maybe leave some slack to avoid switching
294 to mmap. */
295 page_set_flags(last_brk & TARGET_PAGE_MASK,
296 TARGET_PAGE_ALIGN(new_brk),
297 PAGE_RESERVED);
299 last_brk = new_brk;
301 size = HOST_PAGE_ALIGN(size);
302 start = start & qemu_host_page_mask;
303 addr = start;
304 if (addr == 0)
305 addr = mmap_next_start;
306 addr_start = addr;
307 for(;;) {
308 prot = 0;
309 for(addr1 = addr; addr1 < (addr + size); addr1 += TARGET_PAGE_SIZE) {
310 prot |= page_get_flags(addr1);
312 if (prot == 0)
313 break;
314 addr += qemu_host_page_size;
315 /* we found nothing */
316 if (addr == addr_start)
317 return (abi_ulong)-1;
319 if (start == 0)
320 mmap_next_start = addr + size;
321 return addr;
324 /* NOTE: all the constants are the HOST ones */
325 abi_long target_mmap(abi_ulong start, abi_ulong len, int prot,
326 int flags, int fd, abi_ulong offset)
328 abi_ulong ret, end, real_start, real_end, retaddr, host_offset, host_len;
329 unsigned long host_start;
331 mmap_lock();
332 #ifdef DEBUG_MMAP
334 printf("mmap: start=0x" TARGET_ABI_FMT_lx
335 " len=0x" TARGET_ABI_FMT_lx " prot=%c%c%c flags=",
336 start, len,
337 prot & PROT_READ ? 'r' : '-',
338 prot & PROT_WRITE ? 'w' : '-',
339 prot & PROT_EXEC ? 'x' : '-');
340 if (flags & MAP_FIXED)
341 printf("MAP_FIXED ");
342 if (flags & MAP_ANONYMOUS)
343 printf("MAP_ANON ");
344 switch(flags & MAP_TYPE) {
345 case MAP_PRIVATE:
346 printf("MAP_PRIVATE ");
347 break;
348 case MAP_SHARED:
349 printf("MAP_SHARED ");
350 break;
351 default:
352 printf("[MAP_TYPE=0x%x] ", flags & MAP_TYPE);
353 break;
355 printf("fd=%d offset=" TARGET_ABI_FMT_lx "\n", fd, offset);
357 #endif
359 if (offset & ~TARGET_PAGE_MASK) {
360 errno = EINVAL;
361 goto fail;
364 len = TARGET_PAGE_ALIGN(len);
365 if (len == 0)
366 goto the_end;
367 real_start = start & qemu_host_page_mask;
369 /* When mapping files into a memory area larger than the file, accesses
370 to pages beyond the file size will cause a SIGBUS.
372 For example, if mmaping a file of 100 bytes on a host with 4K pages
373 emulating a target with 8K pages, the target expects to be able to
374 access the first 8K. But the host will trap us on any access beyond
375 4K.
377 When emulating a target with a larger page-size than the hosts, we
378 may need to truncate file maps at EOF and add extra anonymous pages
379 up to the targets page boundary. */
381 if ((qemu_real_host_page_size < TARGET_PAGE_SIZE)
382 && !(flags & MAP_ANONYMOUS)) {
383 struct stat sb;
385 if (fstat (fd, &sb) == -1)
386 goto fail;
388 /* Are we trying to create a map beyond EOF?. */
389 if (offset + len > sb.st_size) {
390 /* If so, truncate the file map at eof aligned with
391 the hosts real pagesize. Additional anonymous maps
392 will be created beyond EOF. */
393 len = (sb.st_size - offset);
394 len += qemu_real_host_page_size - 1;
395 len &= ~(qemu_real_host_page_size - 1);
399 if (!(flags & MAP_FIXED)) {
400 abi_ulong mmap_start;
401 void *p;
402 host_offset = offset & qemu_host_page_mask;
403 host_len = len + offset - host_offset;
404 host_len = HOST_PAGE_ALIGN(host_len);
405 mmap_start = mmap_find_vma(real_start, host_len);
406 if (mmap_start == (abi_ulong)-1) {
407 errno = ENOMEM;
408 goto fail;
410 /* Note: we prefer to control the mapping address. It is
411 especially important if qemu_host_page_size >
412 qemu_real_host_page_size */
413 p = mmap(g2h(mmap_start),
414 host_len, prot, flags | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
415 if (p == MAP_FAILED)
416 goto fail;
417 /* update start so that it points to the file position at 'offset' */
418 host_start = (unsigned long)p;
419 if (!(flags & MAP_ANONYMOUS)) {
420 p = mmap(g2h(mmap_start), len, prot,
421 flags | MAP_FIXED, fd, host_offset);
422 host_start += offset - host_offset;
424 start = h2g(host_start);
425 } else {
426 int flg;
427 target_ulong addr;
429 if (start & ~TARGET_PAGE_MASK) {
430 errno = EINVAL;
431 goto fail;
433 end = start + len;
434 real_end = HOST_PAGE_ALIGN(end);
437 * Test if requested memory area fits target address space
438 * It can fail only on 64-bit host with 32-bit target.
439 * On any other target/host host mmap() handles this error correctly.
441 if ((unsigned long)start + len - 1 > (abi_ulong) -1) {
442 errno = EINVAL;
443 goto fail;
446 for(addr = real_start; addr < real_end; addr += TARGET_PAGE_SIZE) {
447 flg = page_get_flags(addr);
448 if (flg & PAGE_RESERVED) {
449 errno = ENXIO;
450 goto fail;
454 /* worst case: we cannot map the file because the offset is not
455 aligned, so we read it */
456 if (!(flags & MAP_ANONYMOUS) &&
457 (offset & ~qemu_host_page_mask) != (start & ~qemu_host_page_mask)) {
458 /* msync() won't work here, so we return an error if write is
459 possible while it is a shared mapping */
460 if ((flags & MAP_TYPE) == MAP_SHARED &&
461 (prot & PROT_WRITE)) {
462 errno = EINVAL;
463 goto fail;
465 retaddr = target_mmap(start, len, prot | PROT_WRITE,
466 MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
467 -1, 0);
468 if (retaddr == -1)
469 goto fail;
470 pread(fd, g2h(start), len, offset);
471 if (!(prot & PROT_WRITE)) {
472 ret = target_mprotect(start, len, prot);
473 if (ret != 0) {
474 start = ret;
475 goto the_end;
478 goto the_end;
481 /* handle the start of the mapping */
482 if (start > real_start) {
483 if (real_end == real_start + qemu_host_page_size) {
484 /* one single host page */
485 ret = mmap_frag(real_start, start, end,
486 prot, flags, fd, offset);
487 if (ret == -1)
488 goto fail;
489 goto the_end1;
491 ret = mmap_frag(real_start, start, real_start + qemu_host_page_size,
492 prot, flags, fd, offset);
493 if (ret == -1)
494 goto fail;
495 real_start += qemu_host_page_size;
497 /* handle the end of the mapping */
498 if (end < real_end) {
499 ret = mmap_frag(real_end - qemu_host_page_size,
500 real_end - qemu_host_page_size, real_end,
501 prot, flags, fd,
502 offset + real_end - qemu_host_page_size - start);
503 if (ret == -1)
504 goto fail;
505 real_end -= qemu_host_page_size;
508 /* map the middle (easier) */
509 if (real_start < real_end) {
510 void *p;
511 unsigned long offset1;
512 if (flags & MAP_ANONYMOUS)
513 offset1 = 0;
514 else
515 offset1 = offset + real_start - start;
516 p = mmap(g2h(real_start), real_end - real_start,
517 prot, flags, fd, offset1);
518 if (p == MAP_FAILED)
519 goto fail;
522 the_end1:
523 page_set_flags(start, start + len, prot | PAGE_VALID);
524 the_end:
525 #ifdef DEBUG_MMAP
526 printf("ret=0x" TARGET_ABI_FMT_lx "\n", start);
527 page_dump(stdout);
528 printf("\n");
529 #endif
530 mmap_unlock();
531 return start;
532 fail:
533 mmap_unlock();
534 return -1;
537 int target_munmap(abi_ulong start, abi_ulong len)
539 abi_ulong end, real_start, real_end, addr;
540 int prot, ret;
542 #ifdef DEBUG_MMAP
543 printf("munmap: start=0x" TARGET_ABI_FMT_lx " len=0x"
544 TARGET_ABI_FMT_lx "\n",
545 start, len);
546 #endif
547 if (start & ~TARGET_PAGE_MASK)
548 return -EINVAL;
549 len = TARGET_PAGE_ALIGN(len);
550 if (len == 0)
551 return -EINVAL;
552 mmap_lock();
553 end = start + len;
554 real_start = start & qemu_host_page_mask;
555 real_end = HOST_PAGE_ALIGN(end);
557 if (start > real_start) {
558 /* handle host page containing start */
559 prot = 0;
560 for(addr = real_start; addr < start; addr += TARGET_PAGE_SIZE) {
561 prot |= page_get_flags(addr);
563 if (real_end == real_start + qemu_host_page_size) {
564 for(addr = end; addr < real_end; addr += TARGET_PAGE_SIZE) {
565 prot |= page_get_flags(addr);
567 end = real_end;
569 if (prot != 0)
570 real_start += qemu_host_page_size;
572 if (end < real_end) {
573 prot = 0;
574 for(addr = end; addr < real_end; addr += TARGET_PAGE_SIZE) {
575 prot |= page_get_flags(addr);
577 if (prot != 0)
578 real_end -= qemu_host_page_size;
581 ret = 0;
582 /* unmap what we can */
583 if (real_start < real_end) {
584 ret = munmap(g2h(real_start), real_end - real_start);
587 if (ret == 0)
588 page_set_flags(start, start + len, 0);
589 mmap_unlock();
590 return ret;
593 abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
594 abi_ulong new_size, unsigned long flags,
595 abi_ulong new_addr)
597 int prot;
598 void *host_addr;
600 mmap_lock();
602 if (flags & MREMAP_FIXED)
603 host_addr = (void *) syscall(__NR_mremap, g2h(old_addr),
604 old_size, new_size,
605 flags,
606 new_addr);
607 else if (flags & MREMAP_MAYMOVE) {
608 abi_ulong mmap_start;
610 mmap_start = mmap_find_vma(0, new_size);
612 if (mmap_start == -1) {
613 errno = ENOMEM;
614 host_addr = MAP_FAILED;
615 } else
616 host_addr = (void *) syscall(__NR_mremap, g2h(old_addr),
617 old_size, new_size,
618 flags | MREMAP_FIXED,
619 g2h(mmap_start));
620 } else {
621 host_addr = mremap(g2h(old_addr), old_size, new_size, flags);
622 /* Check if address fits target address space */
623 if ((unsigned long)host_addr + new_size > (abi_ulong)-1) {
624 /* Revert mremap() changes */
625 host_addr = mremap(g2h(old_addr), new_size, old_size, flags);
626 errno = ENOMEM;
627 host_addr = MAP_FAILED;
631 if (host_addr == MAP_FAILED) {
632 new_addr = -1;
633 } else {
634 new_addr = h2g(host_addr);
635 prot = page_get_flags(old_addr);
636 page_set_flags(old_addr, old_addr + old_size, 0);
637 page_set_flags(new_addr, new_addr + new_size, prot | PAGE_VALID);
639 mmap_unlock();
640 return new_addr;
643 int target_msync(abi_ulong start, abi_ulong len, int flags)
645 abi_ulong end;
647 if (start & ~TARGET_PAGE_MASK)
648 return -EINVAL;
649 len = TARGET_PAGE_ALIGN(len);
650 end = start + len;
651 if (end < start)
652 return -EINVAL;
653 if (end == start)
654 return 0;
656 start &= qemu_host_page_mask;
657 return msync(g2h(start), end - start, flags);