nspawn: mount a clean instance of sysfs
[systemd_ALT/systemd_imz.git] / src / nspawn / nspawn.c
bloba8c8b7caf455e815186f927b7c0b6e68490ff3b9
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
3 /***
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
/* How the journal directory of the container is tied to the one of the
 * host. Selected with --link-journal= (or -j) and acted upon in
 * setup_journal(). */
typedef enum LinkJournal {
        LINK_NO,        /* setup_journal() returns without doing anything */
        LINK_AUTO,      /* default; missing machine ID or existing mounts are tolerated */
        LINK_HOST,      /* host directory is created and bind mounted into the guest */
        LINK_GUEST      /* host path is made a symlink to the guest directory */
} LinkJournal;
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
73 static uint64_t arg_retain =
74 (1ULL << CAP_CHOWN) |
75 (1ULL << CAP_DAC_OVERRIDE) |
76 (1ULL << CAP_DAC_READ_SEARCH) |
77 (1ULL << CAP_FOWNER) |
78 (1ULL << CAP_FSETID) |
79 (1ULL << CAP_IPC_OWNER) |
80 (1ULL << CAP_KILL) |
81 (1ULL << CAP_LEASE) |
82 (1ULL << CAP_LINUX_IMMUTABLE) |
83 (1ULL << CAP_NET_BIND_SERVICE) |
84 (1ULL << CAP_NET_BROADCAST) |
85 (1ULL << CAP_NET_RAW) |
86 (1ULL << CAP_SETGID) |
87 (1ULL << CAP_SETFCAP) |
88 (1ULL << CAP_SETPCAP) |
89 (1ULL << CAP_SETUID) |
90 (1ULL << CAP_SYS_ADMIN) |
91 (1ULL << CAP_SYS_CHROOT) |
92 (1ULL << CAP_SYS_NICE) |
93 (1ULL << CAP_SYS_PTRACE) |
94 (1ULL << CAP_SYS_TTY_CONFIG) |
95 (1ULL << CAP_SYS_RESOURCE);
97 static int help(void) {
99 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
100 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
101 " -h --help Show this help\n"
102 " -D --directory=NAME Root directory for the container\n"
103 " -b --boot Boot up full system (i.e. invoke init)\n"
104 " -u --user=USER Run the command under specified user or uid\n"
105 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
106 " --uuid=UUID Set a specific machine UUID for the container\n"
107 " --private-network Disable network in container\n"
108 " --read-only Mount the root directory read-only\n"
109 " --capability=CAP In addition to the default, retain specified capability\n"
110 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
111 " -j Equivalent to --link-journal=host\n",
112 program_invocation_short_name);
114 return 0;
117 static int parse_argv(int argc, char *argv[]) {
119 enum {
120 ARG_PRIVATE_NETWORK = 0x100,
121 ARG_UUID,
122 ARG_READ_ONLY,
123 ARG_CAPABILITY,
124 ARG_LINK_JOURNAL
127 static const struct option options[] = {
128 { "help", no_argument, NULL, 'h' },
129 { "directory", required_argument, NULL, 'D' },
130 { "user", required_argument, NULL, 'u' },
131 { "controllers", required_argument, NULL, 'C' },
132 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
133 { "boot", no_argument, NULL, 'b' },
134 { "uuid", required_argument, NULL, ARG_UUID },
135 { "read-only", no_argument, NULL, ARG_READ_ONLY },
136 { "capability", required_argument, NULL, ARG_CAPABILITY },
137 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
138 { NULL, 0, NULL, 0 }
141 int c;
143 assert(argc >= 0);
144 assert(argv);
146 while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
148 switch (c) {
150 case 'h':
151 help();
152 return 0;
154 case 'D':
155 free(arg_directory);
156 arg_directory = canonicalize_file_name(optarg);
157 if (!arg_directory) {
158 log_error("Failed to canonicalize root directory.");
159 return -ENOMEM;
162 break;
164 case 'u':
165 free(arg_user);
166 if (!(arg_user = strdup(optarg))) {
167 log_error("Failed to duplicate user name.");
168 return -ENOMEM;
171 break;
173 case 'C':
174 strv_free(arg_controllers);
175 arg_controllers = strv_split(optarg, ",");
176 if (!arg_controllers) {
177 log_error("Failed to split controllers list.");
178 return -ENOMEM;
180 strv_uniq(arg_controllers);
182 break;
184 case ARG_PRIVATE_NETWORK:
185 arg_private_network = true;
186 break;
188 case 'b':
189 arg_boot = true;
190 break;
192 case ARG_UUID:
193 arg_uuid = optarg;
194 break;
196 case ARG_READ_ONLY:
197 arg_read_only = true;
198 break;
200 case ARG_CAPABILITY: {
201 char *state, *word;
202 size_t length;
204 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
205 cap_value_t cap;
206 char *t;
208 t = strndup(word, length);
209 if (!t)
210 return log_oom();
212 if (cap_from_name(t, &cap) < 0) {
213 log_error("Failed to parse capability %s.", t);
214 free(t);
215 return -EINVAL;
218 free(t);
219 arg_retain |= 1ULL << (uint64_t) cap;
222 break;
225 case 'j':
226 arg_link_journal = LINK_GUEST;
227 break;
229 case ARG_LINK_JOURNAL:
230 if (streq(optarg, "auto"))
231 arg_link_journal = LINK_AUTO;
232 else if (streq(optarg, "no"))
233 arg_link_journal = LINK_NO;
234 else if (streq(optarg, "guest"))
235 arg_link_journal = LINK_GUEST;
236 else if (streq(optarg, "host"))
237 arg_link_journal = LINK_HOST;
238 else {
239 log_error("Failed to parse link journal mode %s", optarg);
240 return -EINVAL;
243 break;
245 case '?':
246 return -EINVAL;
248 default:
249 log_error("Unknown option code %c", c);
250 return -EINVAL;
254 return 1;
/* Establishes the API file systems below the container root "dest".
 *
 * Walks the whole table even if an entry fails and returns the first error
 * that was seen (0 if there was none). Failing mounts of entries that are
 * not marked fatal are ignored silently. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND,                                true  },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,               true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                                true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int ret = 0;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *m = mount_table + idx;
                char *target;
                int probe;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_oom();

                        if (ret == 0)
                                ret = -ENOMEM;

                        break;
                }

                /* Only a failure of the check matters here; whether the
                 * target already is a mount point is not acted upon. */
                probe = path_is_mount_point(target, true);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (ret == 0)
                                ret = probe;

                        continue;
                }

                mkdir_p_label(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0 &&
                    m->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (ret == 0)
                                ret = -errno;
                }

                free(target);
        }

        return ret;
}
/* Makes the time zone configuration of the host visible in the container
 * by bind mounting /etc/localtime and /etc/timezone read-only over the
 * files of the same name below "dest".
 *
 * Strictly best effort: failing mounts are ignored, only an allocation
 * failure is reported (via log_oom()). Returns 0 otherwise. */
static int setup_timezone(const char *dest) {
        static const char * const files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        unsigned i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < ELEMENTSOF(files); i++) {
                char *target;

                if (asprintf(&target, "%s%s", dest, files[i]) < 0)
                        return log_oom();

                /* Bind first, then remount the bind read-only */
                if (mount(files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
355 static int setup_resolv_conf(const char *dest) {
356 char *where;
358 assert(dest);
360 if (arg_private_network)
361 return 0;
363 /* Fix resolv.conf, if possible */
364 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
365 return log_oom();
368 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
369 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
371 free(where);
373 return 0;
376 static int copy_devnodes(const char *dest) {
378 static const char devnodes[] =
379 "null\0"
380 "zero\0"
381 "full\0"
382 "random\0"
383 "urandom\0"
384 "tty\0"
385 "ptmx\0"
386 "rtc0\0";
388 const char *d;
389 int r = 0;
390 mode_t u;
392 assert(dest);
394 u = umask(0000);
396 NULSTR_FOREACH(d, devnodes) {
397 struct stat st;
398 char *from = NULL, *to = NULL;
400 asprintf(&from, "/dev/%s", d);
401 asprintf(&to, "%s/dev/%s", dest, d);
403 if (!from || !to) {
404 log_error("Failed to allocate devnode path");
406 free(from);
407 free(to);
409 from = to = NULL;
411 if (r == 0)
412 r = -ENOMEM;
414 break;
417 if (stat(from, &st) < 0) {
419 if (errno != ENOENT) {
420 log_error("Failed to stat %s: %m", from);
421 if (r == 0)
422 r = -errno;
425 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
427 log_error("%s is not a char or block device, cannot copy.", from);
428 if (r == 0)
429 r = -EIO;
431 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
433 log_error("mknod(%s) failed: %m", dest);
434 if (r == 0)
435 r = -errno;
438 free(from);
439 free(to);
442 umask(u);
444 return r;
/* Provides "dest"/dev/console by bind mounting the TTY "console" over a
 * freshly created device node.
 *
 * The TTY is set to mode 0600, owner 0:0 first. Returns a negative
 * errno-style code on failure; on success the (non-negative) result of
 * chmod_and_chown() is passed through. The umask is cleared during the
 * call and restored before returning. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        /* PTYs only exist on pts file systems, hence the right TTY has to
         * be bind mounted to /dev/console. The device node created here
         * merely serves as mount point; its major/minor is irrelevant as
         * it is mounted over right away. */

        if (mknod(node, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
506 static int setup_kmsg(const char *dest, int kmsg_socket) {
507 char *from = NULL, *to = NULL;
508 int r, fd, k;
509 mode_t u;
510 union {
511 struct cmsghdr cmsghdr;
512 uint8_t buf[CMSG_SPACE(sizeof(int))];
513 } control;
514 struct msghdr mh;
515 struct cmsghdr *cmsg;
517 assert(dest);
518 assert(kmsg_socket >= 0);
520 u = umask(0000);
522 /* We create the kmsg FIFO as /dev/kmsg, but immediately
523 * delete it after bind mounting it to /proc/kmsg. While FIFOs
524 * on the reading side behave very similar to /proc/kmsg,
525 * their writing side behaves differently from /dev/kmsg in
526 * that writing blocks when nothing is reading. In order to
527 * avoid any problems with containers deadlocking due to this
528 * we simply make /dev/kmsg unavailable to the container. */
529 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
530 r = log_oom();
531 goto finish;
534 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
535 r = log_oom();
536 goto finish;
539 if (mkfifo(from, 0600) < 0) {
540 log_error("mkfifo() for /dev/kmsg failed: %m");
541 r = -errno;
542 goto finish;
545 r = chmod_and_chown(from, 0600, 0, 0);
546 if (r < 0) {
547 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
548 goto finish;
551 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
552 log_error("Bind mount for /proc/kmsg failed: %m");
553 r = -errno;
554 goto finish;
557 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
558 if (fd < 0) {
559 log_error("Failed to open fifo: %m");
560 r = -errno;
561 goto finish;
564 zero(mh);
565 zero(control);
567 mh.msg_control = &control;
568 mh.msg_controllen = sizeof(control);
570 cmsg = CMSG_FIRSTHDR(&mh);
571 cmsg->cmsg_level = SOL_SOCKET;
572 cmsg->cmsg_type = SCM_RIGHTS;
573 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
574 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
576 mh.msg_controllen = cmsg->cmsg_len;
578 /* Store away the fd in the socket, so that it stays open as
579 * long as we run the child */
580 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
581 close_nointr_nofail(fd);
583 if (k < 0) {
584 log_error("Failed to send FIFO fd: %m");
585 r = -errno;
586 goto finish;
589 /* And now make the FIFO unavailable as /dev/kmsg... */
590 unlink(from);
592 finish:
593 free(from);
594 free(to);
595 umask(u);
597 return r;
600 static int setup_hostname(void) {
601 char *hn;
602 int r = 0;
604 hn = path_get_file_name(arg_directory);
605 if (hn) {
606 hn = strdup(hn);
607 if (!hn)
608 return -ENOMEM;
610 hostname_cleanup(hn);
612 if (!isempty(hn))
613 if (sethostname(hn, strlen(hn)) < 0)
614 r = -errno;
616 free(hn);
619 return r;
622 static int setup_journal(const char *directory) {
623 sd_id128_t machine_id;
624 char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
625 int r;
627 if (arg_link_journal == LINK_NO)
628 return 0;
630 p = strappend(directory, "/etc/machine-id");
631 if (!p) {
632 r = log_oom();
633 goto finish;
636 r = read_one_line_file(p, &b);
637 if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
638 r = 0;
639 goto finish;
640 } else if (r < 0) {
641 log_error("Failed to read machine ID: %s", strerror(-r));
642 return r;
645 l = strstrip(b);
646 if (isempty(l) && arg_link_journal == LINK_AUTO) {
647 r = 0;
648 goto finish;
651 /* Verify validaty */
652 r = sd_id128_from_string(l, &machine_id);
653 if (r < 0) {
654 log_error("Failed to parse machine ID: %s", strerror(-r));
655 goto finish;
658 free(p);
659 p = strappend("/var/log/journal/", l);
660 q = strjoin(directory, "/var/log/journal/", l, NULL);
661 if (!p || !q) {
662 r = log_oom();
663 goto finish;
666 if (path_is_mount_point(p, false) > 0 ||
667 path_is_mount_point(q, false) > 0) {
668 if (arg_link_journal != LINK_AUTO) {
669 log_error("Journal already a mount point, refusing.");
670 r = -EEXIST;
671 goto finish;
674 r = 0;
675 goto finish;
678 r = readlink_and_make_absolute(p, &d);
679 if (r >= 0) {
680 if ((arg_link_journal == LINK_GUEST ||
681 arg_link_journal == LINK_AUTO) &&
682 path_equal(d, q)) {
684 mkdir_p(q, 0755);
686 r = 0;
687 goto finish;
690 if (unlink(p) < 0) {
691 log_error("Failed to remove symlink %s: %m", p);
692 r = -errno;
693 goto finish;
695 } else if (r == -EINVAL) {
697 if (arg_link_journal == LINK_GUEST &&
698 rmdir(p) < 0) {
700 if (errno == ENOTDIR)
701 log_error("%s already exists and is neither symlink nor directory.", p);
702 else {
703 log_error("Failed to remove %s: %m", p);
704 r = -errno;
707 goto finish;
709 } else if (r != -ENOENT) {
710 log_error("readlink(%s) failed: %m", p);
711 goto finish;
714 if (arg_link_journal == LINK_GUEST) {
716 if (symlink(q, p) < 0) {
717 log_error("Failed to symlink %s to %s: %m", q, p);
718 r = -errno;
719 goto finish;
722 mkdir_p(q, 0755);
724 r = 0;
725 goto finish;
728 if (arg_link_journal == LINK_HOST) {
729 r = mkdir_p(p, 0755);
730 if (r < 0) {
731 log_error("Failed to create %s: %m", p);
732 goto finish;
735 } else if (access(p, F_OK) < 0) {
736 r = 0;
737 goto finish;
740 if (dir_is_empty(q) == 0) {
741 log_error("%s not empty.", q);
742 r = -ENOTEMPTY;
743 goto finish;
746 r = mkdir_p(q, 0755);
747 if (r < 0) {
748 log_error("Failed to create %s: %m", q);
749 goto finish;
752 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
753 log_error("Failed to bind mount journal from host into guest: %m");
754 r = -errno;
755 goto finish;
758 r = 0;
760 finish:
761 free(p);
762 free(q);
763 free(d);
764 free(b);
765 return r;
769 static int drop_capabilities(void) {
770 return capability_bounding_set_drop(~arg_retain, false);
/* Checks whether "path" looks like the root directory of an OS, with the
 * existence of <path>/bin/sh serving as flag.
 *
 * Returns 1 if it exists, 0 if it cannot be accessed (for whatever
 * reason), -ENOMEM if the path to probe cannot be allocated. */
static int is_os_tree(const char *path) {
        static const char suffix[] = "/bin/sh";
        size_t prefix_len;
        char *probe;
        int found;

        prefix_len = strlen(path);

        /* sizeof(suffix) accounts for the trailing NUL */
        probe = malloc(prefix_len + sizeof(suffix));
        if (!probe)
                return -ENOMEM;

        memcpy(probe, path, prefix_len);
        memcpy(probe + prefix_len, suffix, sizeof(suffix));

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found ? 1 : 0;
}
787 static int process_pty(int master, sigset_t *mask) {
789 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
790 size_t in_buffer_full = 0, out_buffer_full = 0;
791 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
792 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
793 int ep = -1, signal_fd = -1, r;
795 fd_nonblock(STDIN_FILENO, 1);
796 fd_nonblock(STDOUT_FILENO, 1);
797 fd_nonblock(master, 1);
799 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
800 if (signal_fd < 0) {
801 log_error("signalfd(): %m");
802 r = -errno;
803 goto finish;
806 ep = epoll_create1(EPOLL_CLOEXEC);
807 if (ep < 0) {
808 log_error("Failed to create epoll: %m");
809 r = -errno;
810 goto finish;
813 zero(stdin_ev);
814 stdin_ev.events = EPOLLIN|EPOLLET;
815 stdin_ev.data.fd = STDIN_FILENO;
817 zero(stdout_ev);
818 stdout_ev.events = EPOLLOUT|EPOLLET;
819 stdout_ev.data.fd = STDOUT_FILENO;
821 zero(master_ev);
822 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
823 master_ev.data.fd = master;
825 zero(signal_ev);
826 signal_ev.events = EPOLLIN;
827 signal_ev.data.fd = signal_fd;
829 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
830 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
831 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
832 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
833 log_error("Failed to regiser fds in epoll: %m");
834 r = -errno;
835 goto finish;
838 for (;;) {
839 struct epoll_event ev[16];
840 ssize_t k;
841 int i, nfds;
843 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
844 if (nfds < 0) {
846 if (errno == EINTR || errno == EAGAIN)
847 continue;
849 log_error("epoll_wait(): %m");
850 r = -errno;
851 goto finish;
854 assert(nfds >= 1);
856 for (i = 0; i < nfds; i++) {
857 if (ev[i].data.fd == STDIN_FILENO) {
859 if (ev[i].events & (EPOLLIN|EPOLLHUP))
860 stdin_readable = true;
862 } else if (ev[i].data.fd == STDOUT_FILENO) {
864 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
865 stdout_writable = true;
867 } else if (ev[i].data.fd == master) {
869 if (ev[i].events & (EPOLLIN|EPOLLHUP))
870 master_readable = true;
872 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
873 master_writable = true;
875 } else if (ev[i].data.fd == signal_fd) {
876 struct signalfd_siginfo sfsi;
877 ssize_t n;
879 n = read(signal_fd, &sfsi, sizeof(sfsi));
880 if (n != sizeof(sfsi)) {
882 if (n >= 0) {
883 log_error("Failed to read from signalfd: invalid block size");
884 r = -EIO;
885 goto finish;
888 if (errno != EINTR && errno != EAGAIN) {
889 log_error("Failed to read from signalfd: %m");
890 r = -errno;
891 goto finish;
893 } else {
895 if (sfsi.ssi_signo == SIGWINCH) {
896 struct winsize ws;
898 /* The window size changed, let's forward that. */
899 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
900 ioctl(master, TIOCSWINSZ, &ws);
901 } else {
902 r = 0;
903 goto finish;
909 while ((stdin_readable && in_buffer_full <= 0) ||
910 (master_writable && in_buffer_full > 0) ||
911 (master_readable && out_buffer_full <= 0) ||
912 (stdout_writable && out_buffer_full > 0)) {
914 if (stdin_readable && in_buffer_full < LINE_MAX) {
916 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
917 if (k < 0) {
919 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
920 stdin_readable = false;
921 else {
922 log_error("read(): %m");
923 r = -errno;
924 goto finish;
926 } else
927 in_buffer_full += (size_t) k;
930 if (master_writable && in_buffer_full > 0) {
932 k = write(master, in_buffer, in_buffer_full);
933 if (k < 0) {
935 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
936 master_writable = false;
937 else {
938 log_error("write(): %m");
939 r = -errno;
940 goto finish;
943 } else {
944 assert(in_buffer_full >= (size_t) k);
945 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
946 in_buffer_full -= k;
950 if (master_readable && out_buffer_full < LINE_MAX) {
952 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
953 if (k < 0) {
955 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
956 master_readable = false;
957 else {
958 log_error("read(): %m");
959 r = -errno;
960 goto finish;
962 } else
963 out_buffer_full += (size_t) k;
966 if (stdout_writable && out_buffer_full > 0) {
968 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
969 if (k < 0) {
971 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
972 stdout_writable = false;
973 else {
974 log_error("write(): %m");
975 r = -errno;
976 goto finish;
979 } else {
980 assert(out_buffer_full >= (size_t) k);
981 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
982 out_buffer_full -= k;
988 finish:
989 if (ep >= 0)
990 close_nointr_nofail(ep);
992 if (signal_fd >= 0)
993 close_nointr_nofail(signal_fd);
995 return r;
998 int main(int argc, char *argv[]) {
999 pid_t pid = 0;
1000 int r = EXIT_FAILURE, k;
1001 char *oldcg = NULL, *newcg = NULL;
1002 char **controller = NULL;
1003 int master = -1;
1004 const char *console = NULL;
1005 struct termios saved_attr, raw_attr;
1006 sigset_t mask;
1007 bool saved_attr_valid = false;
1008 struct winsize ws;
1009 int kmsg_socket_pair[2] = { -1, -1 };
1011 log_parse_environment();
1012 log_open();
1014 r = parse_argv(argc, argv);
1015 if (r <= 0)
1016 goto finish;
1018 if (arg_directory) {
1019 char *p;
1021 p = path_make_absolute_cwd(arg_directory);
1022 free(arg_directory);
1023 arg_directory = p;
1024 } else
1025 arg_directory = get_current_dir_name();
1027 if (!arg_directory) {
1028 log_error("Failed to determine path");
1029 goto finish;
1032 path_kill_slashes(arg_directory);
1034 if (geteuid() != 0) {
1035 log_error("Need to be root.");
1036 goto finish;
1039 if (sd_booted() <= 0) {
1040 log_error("Not running on a systemd system.");
1041 goto finish;
1044 if (path_equal(arg_directory, "/")) {
1045 log_error("Spawning container on root directory not supported.");
1046 goto finish;
1049 if (is_os_tree(arg_directory) <= 0) {
1050 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1051 goto finish;
1054 k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1055 if (k < 0) {
1056 log_error("Failed to determine current cgroup: %s", strerror(-k));
1057 goto finish;
1060 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1061 log_error("Failed to allocate cgroup path.");
1062 goto finish;
1065 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1066 if (k < 0) {
1067 log_error("Failed to create cgroup: %s", strerror(-k));
1068 goto finish;
1071 STRV_FOREACH(controller, arg_controllers) {
1072 k = cg_create_and_attach(*controller, newcg, 0);
1073 if (k < 0)
1074 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1077 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1078 if (master < 0) {
1079 log_error("Failed to acquire pseudo tty: %m");
1080 goto finish;
1083 console = ptsname(master);
1084 if (!console) {
1085 log_error("Failed to determine tty name: %m");
1086 goto finish;
1089 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1091 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1092 ioctl(master, TIOCSWINSZ, &ws);
1094 if (unlockpt(master) < 0) {
1095 log_error("Failed to unlock tty: %m");
1096 goto finish;
1099 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1100 log_error("Failed to get terminal attributes: %m");
1101 goto finish;
1104 saved_attr_valid = true;
1106 raw_attr = saved_attr;
1107 cfmakeraw(&raw_attr);
1108 raw_attr.c_lflag &= ~ECHO;
1110 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1111 log_error("Failed to set terminal attributes: %m");
1112 goto finish;
1115 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1116 log_error("Failed to create kmsg socket pair");
1117 goto finish;
1120 assert_se(sigemptyset(&mask) == 0);
1121 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1122 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1124 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1125 if (pid < 0) {
1126 if (errno == EINVAL)
1127 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1128 else
1129 log_error("clone() failed: %m");
1131 goto finish;
1134 if (pid == 0) {
1135 /* child */
1137 const char *home = NULL;
1138 uid_t uid = (uid_t) -1;
1139 gid_t gid = (gid_t) -1;
1140 const char *envp[] = {
1141 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1142 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1143 NULL, /* TERM */
1144 NULL, /* HOME */
1145 NULL, /* USER */
1146 NULL, /* LOGNAME */
1147 NULL, /* container_uuid */
1148 NULL
1151 envp[2] = strv_find_prefix(environ, "TERM=");
1153 close_nointr_nofail(master);
1155 close_nointr(STDIN_FILENO);
1156 close_nointr(STDOUT_FILENO);
1157 close_nointr(STDERR_FILENO);
1159 close_all_fds(&kmsg_socket_pair[1], 1);
1161 reset_all_signal_handlers();
1163 assert_se(sigemptyset(&mask) == 0);
1164 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1166 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1167 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1168 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1169 goto child_fail;
1171 if (setsid() < 0) {
1172 log_error("setsid() failed: %m");
1173 goto child_fail;
1176 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1177 log_error("PR_SET_PDEATHSIG failed: %m");
1178 goto child_fail;
1181 /* Mark everything as slave, so that we still
1182 * receive mounts from the real root, but don't
1183 * propagate mounts to the real root. */
1184 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1185 log_error("MS_SLAVE|MS_REC failed: %m");
1186 goto child_fail;
1189 /* Turn directory into bind mount */
1190 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1191 log_error("Failed to make bind mount.");
1192 goto child_fail;
1195 if (arg_read_only)
1196 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1197 log_error("Failed to make read-only.");
1198 goto child_fail;
1201 if (mount_all(arg_directory) < 0)
1202 goto child_fail;
1204 if (copy_devnodes(arg_directory) < 0)
1205 goto child_fail;
1207 dev_setup(arg_directory);
1209 if (setup_dev_console(arg_directory, console) < 0)
1210 goto child_fail;
1212 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1213 goto child_fail;
1215 close_nointr_nofail(kmsg_socket_pair[1]);
1217 if (setup_timezone(arg_directory) < 0)
1218 goto child_fail;
1220 if (setup_resolv_conf(arg_directory) < 0)
1221 goto child_fail;
1223 if (setup_journal(arg_directory) < 0)
1224 goto child_fail;
1226 if (chdir(arg_directory) < 0) {
1227 log_error("chdir(%s) failed: %m", arg_directory);
1228 goto child_fail;
1231 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1232 log_error("mount(MS_MOVE) failed: %m");
1233 goto child_fail;
1236 if (chroot(".") < 0) {
1237 log_error("chroot() failed: %m");
1238 goto child_fail;
1241 if (chdir("/") < 0) {
1242 log_error("chdir() failed: %m");
1243 goto child_fail;
1246 umask(0022);
1248 loopback_setup();
1250 if (drop_capabilities() < 0) {
1251 log_error("drop_capabilities() failed: %m");
1252 goto child_fail;
1255 if (arg_user) {
1257 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1258 log_error("get_user_creds() failed: %m");
1259 goto child_fail;
1262 if (mkdir_parents_label(home, 0775) < 0) {
1263 log_error("mkdir_parents_label() failed: %m");
1264 goto child_fail;
1267 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1268 log_error("mkdir_safe_label() failed: %m");
1269 goto child_fail;
1272 if (initgroups((const char*)arg_user, gid) < 0) {
1273 log_error("initgroups() failed: %m");
1274 goto child_fail;
1277 if (setresgid(gid, gid, gid) < 0) {
1278 log_error("setregid() failed: %m");
1279 goto child_fail;
1282 if (setresuid(uid, uid, uid) < 0) {
1283 log_error("setreuid() failed: %m");
1284 goto child_fail;
1288 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1289 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1290 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1291 log_oom();
1292 goto child_fail;
1295 if (arg_uuid) {
1296 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1297 log_oom();
1298 goto child_fail;
1302 setup_hostname();
1304 if (arg_boot) {
1305 char **a;
1306 size_t l;
1308 /* Automatically search for the init system */
1310 l = 1 + argc - optind;
1311 a = newa(char*, l + 1);
1312 memcpy(a + 1, argv + optind, l * sizeof(char*));
1314 a[0] = (char*) "/usr/lib/systemd/systemd";
1315 execve(a[0], a, (char**) envp);
1317 a[0] = (char*) "/lib/systemd/systemd";
1318 execve(a[0], a, (char**) envp);
1320 a[0] = (char*) "/sbin/init";
1321 execve(a[0], a, (char**) envp);
1322 } else if (argc > optind)
1323 execvpe(argv[optind], argv + optind, (char**) envp);
1324 else {
1325 chdir(home ? home : "/root");
1326 execle("/bin/bash", "-bash", NULL, (char**) envp);
1329 log_error("execv() failed: %m");
1331 child_fail:
1332 _exit(EXIT_FAILURE);
1335 if (process_pty(master, &mask) < 0)
1336 goto finish;
1338 if (saved_attr_valid) {
1339 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1340 saved_attr_valid = false;
1343 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1345 if (r < 0)
1346 r = EXIT_FAILURE;
1348 finish:
1349 if (saved_attr_valid)
1350 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1352 if (master >= 0)
1353 close_nointr_nofail(master);
1355 close_pipe(kmsg_socket_pair);
1357 if (oldcg)
1358 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1360 if (newcg)
1361 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1363 free(arg_directory);
1364 strv_free(arg_controllers);
1365 free(oldcg);
1366 free(newcg);
1368 return r;