block/amend: separate amend and create options for qemu-img
[qemu/ar7.git] / util / oslib-posix.c
blob39ddc77c85bf4490623180052ffe5c40b3de621e
/*
 * oslib-posix.c
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2010 Red Hat, Inc.
 *
 * QEMU library functions on POSIX which are shared between QEMU and
 * the QEMU tools.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
29 #include "qemu/osdep.h"
30 #include <termios.h>
32 #include <glib/gprintf.h>
34 #include "qemu-common.h"
35 #include "sysemu/sysemu.h"
36 #include "trace.h"
37 #include "qapi/error.h"
38 #include "qemu/sockets.h"
39 #include "qemu/thread.h"
40 #include <libgen.h>
41 #include <sys/signal.h>
42 #include "qemu/cutils.h"
44 #ifdef CONFIG_LINUX
45 #include <sys/syscall.h>
46 #endif
48 #ifdef __FreeBSD__
49 #include <sys/sysctl.h>
50 #include <sys/user.h>
51 #include <sys/thr.h>
52 #include <libutil.h>
53 #endif
55 #ifdef __NetBSD__
56 #include <sys/sysctl.h>
57 #include <lwp.h>
58 #endif
60 #ifdef __APPLE__
61 #include <mach-o/dyld.h>
62 #endif
64 #include "qemu/mmap-alloc.h"
66 #ifdef CONFIG_DEBUG_STACK_USAGE
67 #include "qemu/error-report.h"
68 #endif
70 #define MAX_MEM_PREALLOC_THREAD_COUNT 16
72 struct MemsetThread {
73 char *addr;
74 size_t numpages;
75 size_t hpagesize;
76 QemuThread pgthread;
77 sigjmp_buf env;
79 typedef struct MemsetThread MemsetThread;
81 static MemsetThread *memset_thread;
82 static int memset_num_threads;
83 static bool memset_thread_failed;
85 static QemuMutex page_mutex;
86 static QemuCond page_cond;
87 static bool threads_created_flag;
89 int qemu_get_thread_id(void)
91 #if defined(__linux__)
92 return syscall(SYS_gettid);
93 #elif defined(__FreeBSD__)
94 /* thread id is up to INT_MAX */
95 long tid;
96 thr_self(&tid);
97 return (int)tid;
98 #elif defined(__NetBSD__)
99 return _lwp_self();
100 #else
101 return getpid();
102 #endif
/*
 * Thin wrapper around daemon(3); the arguments and the return value are
 * passed through unchanged.
 */
int qemu_daemon(int nochdir, int noclose)
{
    int rc = daemon(nochdir, noclose);

    return rc;
}
110 bool qemu_write_pidfile(const char *path, Error **errp)
112 int fd;
113 char pidstr[32];
115 while (1) {
116 struct stat a, b;
117 struct flock lock = {
118 .l_type = F_WRLCK,
119 .l_whence = SEEK_SET,
120 .l_len = 0,
123 fd = qemu_open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
124 if (fd == -1) {
125 error_setg_errno(errp, errno, "Cannot open pid file");
126 return false;
129 if (fstat(fd, &b) < 0) {
130 error_setg_errno(errp, errno, "Cannot stat file");
131 goto fail_close;
134 if (fcntl(fd, F_SETLK, &lock)) {
135 error_setg_errno(errp, errno, "Cannot lock pid file");
136 goto fail_close;
140 * Now make sure the path we locked is the same one that now
141 * exists on the filesystem.
143 if (stat(path, &a) < 0) {
145 * PID file disappeared, someone else must be racing with
146 * us, so try again.
148 close(fd);
149 continue;
152 if (a.st_ino == b.st_ino) {
153 break;
157 * PID file was recreated, someone else must be racing with
158 * us, so try again.
160 close(fd);
163 if (ftruncate(fd, 0) < 0) {
164 error_setg_errno(errp, errno, "Failed to truncate pid file");
165 goto fail_unlink;
168 snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
169 if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
170 error_setg(errp, "Failed to write pid file");
171 goto fail_unlink;
174 return true;
176 fail_unlink:
177 unlink(path);
178 fail_close:
179 close(fd);
180 return false;
/*
 * Pass @ptr through if it is non-NULL; otherwise report the current errno
 * on stderr and abort the process.
 */
void *qemu_oom_check(void *ptr)
{
    if (ptr != NULL) {
        return ptr;
    }

    fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
    abort();
}
/*
 * Allocate @size bytes aligned to at least @alignment bytes.
 *
 * Alignments smaller than a pointer are raised to sizeof(void *).
 * Returns NULL on failure; in the posix_memalign() variant errno is set
 * to the error code that call returned.
 */
void *qemu_try_memalign(size_t alignment, size_t size)
{
    const size_t min_align = sizeof(void *);
    void *mem;

    if (alignment < min_align) {
        alignment = min_align;
    }

#if defined(CONFIG_POSIX_MEMALIGN)
    int rc = posix_memalign(&mem, alignment, size);

    if (rc != 0) {
        /* posix_memalign() reports failure via its return value */
        errno = rc;
        mem = NULL;
    }
#elif defined(CONFIG_BSD)
    mem = valloc(size);
#else
    mem = memalign(alignment, size);
#endif

    trace_qemu_memalign(alignment, size, mem);
    return mem;
}
/*
 * Like qemu_try_memalign(), but the result is passed through
 * qemu_oom_check(), which aborts on a NULL pointer.
 */
void *qemu_memalign(size_t alignment, size_t size)
{
    void *mem = qemu_try_memalign(alignment, size);

    return qemu_oom_check(mem);
}
221 /* alloc shared memory pages */
222 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
224 size_t align = QEMU_VMALLOC_ALIGN;
225 void *ptr = qemu_ram_mmap(-1, size, align, shared, false);
227 if (ptr == MAP_FAILED) {
228 return NULL;
231 if (alignment) {
232 *alignment = align;
235 trace_qemu_anon_ram_alloc(size, ptr);
236 return ptr;
/*
 * Trace the release and hand @ptr to free().
 */
void qemu_vfree(void *ptr)
{
    trace_qemu_vfree(ptr);

    free(ptr);
}
/*
 * Trace the release and unmap the @size bytes at @ptr through
 * qemu_ram_munmap() (no backing fd).
 */
void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);

    qemu_ram_munmap(-1, ptr, size);
}
/*
 * Clear O_NONBLOCK on @fd.  Both fcntl() calls are asserted to succeed.
 */
void qemu_set_block(int fd)
{
    int flags = fcntl(fd, F_GETFL);

    assert(flags != -1);

    flags = fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
    assert(flags != -1);
}
/*
 * Set O_NONBLOCK on @fd.  Reading the flags is asserted to succeed; so is
 * setting them, except on OpenBSD where an ENODEV failure is tolerated.
 */
void qemu_set_nonblock(int fd)
{
    int flags = fcntl(fd, F_GETFL);

    assert(flags != -1);

    flags = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
#ifdef __OpenBSD__
    if (flags == -1) {
        /*
         * Previous to OpenBSD 6.3, fcntl(F_SETFL) is not permitted on
         * memory devices and sets errno to ENODEV.
         * It's OK if we fail to set O_NONBLOCK on devices like /dev/null,
         * because they will never block anyway.
         */
        assert(errno == ENODEV);
    }
#else
    assert(flags != -1);
#endif
}
/*
 * Enable SO_REUSEADDR on socket @fd.  The setsockopt() call is asserted
 * to succeed; its result is returned.
 */
int socket_set_fast_reuse(int fd)
{
    int enable = 1;
    int rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                        (const char *)&enable, sizeof(enable));

    assert(rc == 0);

    return rc;
}
/*
 * Set FD_CLOEXEC on @fd.  Both fcntl() calls are asserted to succeed.
 */
void qemu_set_cloexec(int fd)
{
    int fdflags = fcntl(fd, F_GETFD);

    assert(fdflags != -1);

    fdflags = fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC);
    assert(fdflags != -1);
}
/*
 * Creates a pipe with FD_CLOEXEC set on both file descriptors.
 *
 * When CONFIG_PIPE2 is defined pipe2() is tried first; only an ENOSYS
 * failure falls through to pipe() followed by qemu_set_cloexec().
 * Returns the result of the pipe call that was used.
 */
int qemu_pipe(int pipefd[2])
{
    int rc;

#ifdef CONFIG_PIPE2
    rc = pipe2(pipefd, O_CLOEXEC);
    if (!(rc == -1 && errno == ENOSYS)) {
        /* success, or a failure other than "pipe2 not implemented" */
        return rc;
    }
#endif

    rc = pipe(pipefd);
    if (rc != 0) {
        return rc;
    }

    qemu_set_cloexec(pipefd[0]);
    qemu_set_cloexec(pipefd[1]);
    return rc;
}
324 char *
325 qemu_get_local_state_pathname(const char *relative_pathname)
327 return g_strdup_printf("%s/%s", CONFIG_QEMU_LOCALSTATEDIR,
328 relative_pathname);
/*
 * Switch the ECHO, ECHONL, ICANON and IEXTEN local-mode flags of the
 * terminal referred to by @fd on (@echo true) or off (@echo false).
 *
 * If the current attributes cannot be read (for instance because @fd is
 * not a terminal) nothing is changed: the termios structure would be
 * uninitialised and must not be written back.
 */
void qemu_set_tty_echo(int fd, bool echo)
{
    const tcflag_t echo_bits = ECHO | ECHONL | ICANON | IEXTEN;
    struct termios tty;

    if (tcgetattr(fd, &tty) != 0) {
        return;
    }

    if (echo) {
        tty.c_lflag |= echo_bits;
    } else {
        tty.c_lflag &= ~echo_bits;
    }

    tcsetattr(fd, TCSANOW, &tty);
}
346 static char exec_dir[PATH_MAX];
348 void qemu_init_exec_dir(const char *argv0)
350 char *dir;
351 char *p = NULL;
352 char buf[PATH_MAX];
354 assert(!exec_dir[0]);
356 #if defined(__linux__)
358 int len;
359 len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
360 if (len > 0) {
361 buf[len] = 0;
362 p = buf;
365 #elif defined(__FreeBSD__) \
366 || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
368 #if defined(__FreeBSD__)
369 static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
370 #else
371 static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
372 #endif
373 size_t len = sizeof(buf) - 1;
375 *buf = '\0';
376 if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
377 *buf) {
378 buf[sizeof(buf) - 1] = '\0';
379 p = buf;
382 #elif defined(__APPLE__)
384 char fpath[PATH_MAX];
385 uint32_t len = sizeof(fpath);
386 if (_NSGetExecutablePath(fpath, &len) == 0) {
387 p = realpath(fpath, buf);
388 if (!p) {
389 return;
393 #endif
394 /* If we don't have any way of figuring out the actual executable
395 location then try argv[0]. */
396 if (!p) {
397 if (!argv0) {
398 return;
400 p = realpath(argv0, buf);
401 if (!p) {
402 return;
405 dir = g_path_get_dirname(p);
407 pstrcpy(exec_dir, sizeof(exec_dir), dir);
409 g_free(dir);
412 char *qemu_get_exec_dir(void)
414 return g_strdup(exec_dir);
417 static void sigbus_handler(int signal)
419 int i;
420 if (memset_thread) {
421 for (i = 0; i < memset_num_threads; i++) {
422 if (qemu_thread_is_self(&memset_thread[i].pgthread)) {
423 siglongjmp(memset_thread[i].env, 1);
429 static void *do_touch_pages(void *arg)
431 MemsetThread *memset_args = (MemsetThread *)arg;
432 sigset_t set, oldset;
435 * On Linux, the page faults from the loop below can cause mmap_sem
436 * contention with allocation of the thread stacks. Do not start
437 * clearing until all threads have been created.
439 qemu_mutex_lock(&page_mutex);
440 while(!threads_created_flag){
441 qemu_cond_wait(&page_cond, &page_mutex);
443 qemu_mutex_unlock(&page_mutex);
445 /* unblock SIGBUS */
446 sigemptyset(&set);
447 sigaddset(&set, SIGBUS);
448 pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
450 if (sigsetjmp(memset_args->env, 1)) {
451 memset_thread_failed = true;
452 } else {
453 char *addr = memset_args->addr;
454 size_t numpages = memset_args->numpages;
455 size_t hpagesize = memset_args->hpagesize;
456 size_t i;
457 for (i = 0; i < numpages; i++) {
459 * Read & write back the same value, so we don't
460 * corrupt existing user/app data that might be
461 * stored.
463 * 'volatile' to stop compiler optimizing this away
464 * to a no-op
466 * TODO: get a better solution from kernel so we
467 * don't need to write at all so we don't cause
468 * wear on the storage backing the region...
470 *(volatile char *)addr = *addr;
471 addr += hpagesize;
474 pthread_sigmask(SIG_SETMASK, &oldset, NULL);
475 return NULL;
/*
 * Choose how many worker threads to use for pre-touching memory.
 *
 * The result is the smallest of the number of online host processors,
 * MAX_MEM_PREALLOC_THREAD_COUNT and @smp_cpus, but never less than 1:
 * the caller divides by this value, so a zero or negative @smp_cpus must
 * not leak through.  If sysconf() fails we fall back to a single thread.
 */
static inline int get_memset_num_threads(int smp_cpus)
{
    long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
    long count;

    if (host_procs <= 0) {
        /* In case sysconf() fails, we fall back to single threaded */
        return 1;
    }

    count = host_procs;
    if (count > MAX_MEM_PREALLOC_THREAD_COUNT) {
        count = MAX_MEM_PREALLOC_THREAD_COUNT;
    }
    if (count > smp_cpus) {
        count = smp_cpus;
    }
    if (count < 1) {
        count = 1;
    }

    return (int)count;
}
490 static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
491 int smp_cpus)
493 static gsize initialized = 0;
494 size_t numpages_per_thread, leftover;
495 char *addr = area;
496 int i = 0;
498 if (g_once_init_enter(&initialized)) {
499 qemu_mutex_init(&page_mutex);
500 qemu_cond_init(&page_cond);
501 g_once_init_leave(&initialized, 1);
504 memset_thread_failed = false;
505 threads_created_flag = false;
506 memset_num_threads = get_memset_num_threads(smp_cpus);
507 memset_thread = g_new0(MemsetThread, memset_num_threads);
508 numpages_per_thread = numpages / memset_num_threads;
509 leftover = numpages % memset_num_threads;
510 for (i = 0; i < memset_num_threads; i++) {
511 memset_thread[i].addr = addr;
512 memset_thread[i].numpages = numpages_per_thread + (i < leftover);
513 memset_thread[i].hpagesize = hpagesize;
514 qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
515 do_touch_pages, &memset_thread[i],
516 QEMU_THREAD_JOINABLE);
517 addr += memset_thread[i].numpages * hpagesize;
520 qemu_mutex_lock(&page_mutex);
521 threads_created_flag = true;
522 qemu_cond_broadcast(&page_cond);
523 qemu_mutex_unlock(&page_mutex);
525 for (i = 0; i < memset_num_threads; i++) {
526 qemu_thread_join(&memset_thread[i].pgthread);
528 g_free(memset_thread);
529 memset_thread = NULL;
531 return memset_thread_failed;
534 void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
535 Error **errp)
537 int ret;
538 struct sigaction act, oldact;
539 size_t hpagesize = qemu_fd_getpagesize(fd);
540 size_t numpages = DIV_ROUND_UP(memory, hpagesize);
542 memset(&act, 0, sizeof(act));
543 act.sa_handler = &sigbus_handler;
544 act.sa_flags = 0;
546 ret = sigaction(SIGBUS, &act, &oldact);
547 if (ret) {
548 error_setg_errno(errp, errno,
549 "os_mem_prealloc: failed to install signal handler");
550 return;
553 /* touch pages simultaneously */
554 if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
555 error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
556 "pages available to allocate guest RAM");
559 ret = sigaction(SIGBUS, &oldact, NULL);
560 if (ret) {
561 /* Terminate QEMU since it can't recover from error */
562 perror("os_mem_prealloc: failed to reinstall signal handler");
563 exit(1);
567 char *qemu_get_pid_name(pid_t pid)
569 char *name = NULL;
571 #if defined(__FreeBSD__)
572 /* BSDs don't have /proc, but they provide a nice substitute */
573 struct kinfo_proc *proc = kinfo_getproc(pid);
575 if (proc) {
576 name = g_strdup(proc->ki_comm);
577 free(proc);
579 #else
580 /* Assume a system with reasonable procfs */
581 char *pid_path;
582 size_t len;
584 pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
585 g_file_get_contents(pid_path, &name, &len, NULL);
586 g_free(pid_path);
587 #endif
589 return name;
593 pid_t qemu_fork(Error **errp)
595 sigset_t oldmask, newmask;
596 struct sigaction sig_action;
597 int saved_errno;
598 pid_t pid;
601 * Need to block signals now, so that child process can safely
602 * kill off caller's signal handlers without a race.
604 sigfillset(&newmask);
605 if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
606 error_setg_errno(errp, errno,
607 "cannot block signals");
608 return -1;
611 pid = fork();
612 saved_errno = errno;
614 if (pid < 0) {
615 /* attempt to restore signal mask, but ignore failure, to
616 * avoid obscuring the fork failure */
617 (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
618 error_setg_errno(errp, saved_errno,
619 "cannot fork child process");
620 errno = saved_errno;
621 return -1;
622 } else if (pid) {
623 /* parent process */
625 /* Restore our original signal mask now that the child is
626 * safely running. Only documented failures are EFAULT (not
627 * possible, since we are using just-grabbed mask) or EINVAL
628 * (not possible, since we are using correct arguments). */
629 (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
630 } else {
631 /* child process */
632 size_t i;
634 /* Clear out all signal handlers from parent so nothing
635 * unexpected can happen in our child once we unblock
636 * signals */
637 sig_action.sa_handler = SIG_DFL;
638 sig_action.sa_flags = 0;
639 sigemptyset(&sig_action.sa_mask);
641 for (i = 1; i < NSIG; i++) {
642 /* Only possible errors are EFAULT or EINVAL The former
643 * won't happen, the latter we expect, so no need to check
644 * return value */
645 (void)sigaction(i, &sig_action, NULL);
648 /* Unmask all signals in child, since we've no idea what the
649 * caller's done with their signal mask and don't want to
650 * propagate that to children */
651 sigemptyset(&newmask);
652 if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
653 Error *local_err = NULL;
654 error_setg_errno(&local_err, errno,
655 "cannot unblock signals");
656 error_report_err(local_err);
657 _exit(1);
660 return pid;
663 void *qemu_alloc_stack(size_t *sz)
665 void *ptr, *guardpage;
666 int flags;
667 #ifdef CONFIG_DEBUG_STACK_USAGE
668 void *ptr2;
669 #endif
670 size_t pagesz = qemu_real_host_page_size;
671 #ifdef _SC_THREAD_STACK_MIN
672 /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
673 long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
674 *sz = MAX(MAX(min_stack_sz, 0), *sz);
675 #endif
676 /* adjust stack size to a multiple of the page size */
677 *sz = ROUND_UP(*sz, pagesz);
678 /* allocate one extra page for the guard page */
679 *sz += pagesz;
681 flags = MAP_PRIVATE | MAP_ANONYMOUS;
682 #if defined(MAP_STACK) && defined(__OpenBSD__)
683 /* Only enable MAP_STACK on OpenBSD. Other OS's such as
684 * Linux/FreeBSD/NetBSD have a flag with the same name
685 * but have differing functionality. OpenBSD will SEGV
686 * if it spots execution with a stack pointer pointing
687 * at memory that was not allocated with MAP_STACK.
689 flags |= MAP_STACK;
690 #endif
692 ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
693 if (ptr == MAP_FAILED) {
694 perror("failed to allocate memory for stack");
695 abort();
698 #if defined(HOST_IA64)
699 /* separate register stack */
700 guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
701 #elif defined(HOST_HPPA)
702 /* stack grows up */
703 guardpage = ptr + *sz - pagesz;
704 #else
705 /* stack grows down */
706 guardpage = ptr;
707 #endif
708 if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
709 perror("failed to set up stack guard page");
710 abort();
713 #ifdef CONFIG_DEBUG_STACK_USAGE
714 for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
715 *(uint32_t *)ptr2 = 0xdeadbeaf;
717 #endif
719 return ptr;
#ifdef CONFIG_DEBUG_STACK_USAGE
/* Largest stack usage reported so far by the current thread. */
static __thread unsigned int max_stack_usage;
#endif

/*
 * Unmap the @sz bytes at @stack.
 *
 * With CONFIG_DEBUG_STACK_USAGE, first scan upwards from just past the
 * first page for the first word that no longer holds the 0xdeadbeaf
 * pattern, and report when the resulting usage exceeds the previous
 * per-thread maximum.
 */
void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    void *scan = stack + qemu_real_host_page_size;
    void *end = stack + sz;
    unsigned int usage;

    while (scan < end && *(uint32_t *)scan == 0xdeadbeaf) {
        scan += sizeof(uint32_t);
    }

    usage = sz - (uintptr_t) (scan - stack);
    if (usage > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, usage);
        max_stack_usage = usage;
    }
#endif

    munmap(stack, sz);
}
749 void sigaction_invoke(struct sigaction *action,
750 struct qemu_signalfd_siginfo *info)
752 siginfo_t si = {};
753 si.si_signo = info->ssi_signo;
754 si.si_errno = info->ssi_errno;
755 si.si_code = info->ssi_code;
757 /* Convert the minimal set of fields defined by POSIX.
758 * Positive si_code values are reserved for kernel-generated
759 * signals, where the valid siginfo fields are determined by
760 * the signal number. But according to POSIX, it is unspecified
761 * whether SI_USER and SI_QUEUE have values less than or equal to
762 * zero.
764 if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
765 info->ssi_code <= 0) {
766 /* SIGTERM, etc. */
767 si.si_pid = info->ssi_pid;
768 si.si_uid = info->ssi_uid;
769 } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
770 info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
771 si.si_addr = (void *)(uintptr_t)info->ssi_addr;
772 } else if (info->ssi_signo == SIGCHLD) {
773 si.si_pid = info->ssi_pid;
774 si.si_status = info->ssi_status;
775 si.si_uid = info->ssi_uid;
777 action->sa_sigaction(info->ssi_signo, &si, NULL);