1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
36 #include <sys/epoll.h>
38 #include <sys/signalfd.h>
42 #include <sys/socket.h>
44 #include <systemd/sd-daemon.h>
51 #include "cgroup-util.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
56 #include "dev-setup.h"
58 typedef enum LinkJournal
{
65 static char *arg_directory
= NULL
;
66 static char *arg_user
= NULL
;
67 static char **arg_controllers
= NULL
;
68 static char *arg_uuid
= NULL
;
69 static bool arg_private_network
= false;
70 static bool arg_read_only
= false;
71 static bool arg_boot
= false;
72 static LinkJournal arg_link_journal
= LINK_AUTO
;
73 static uint64_t arg_retain
=
75 (1ULL << CAP_DAC_OVERRIDE
) |
76 (1ULL << CAP_DAC_READ_SEARCH
) |
77 (1ULL << CAP_FOWNER
) |
78 (1ULL << CAP_FSETID
) |
79 (1ULL << CAP_IPC_OWNER
) |
82 (1ULL << CAP_LINUX_IMMUTABLE
) |
83 (1ULL << CAP_NET_BIND_SERVICE
) |
84 (1ULL << CAP_NET_BROADCAST
) |
85 (1ULL << CAP_NET_RAW
) |
86 (1ULL << CAP_SETGID
) |
87 (1ULL << CAP_SETFCAP
) |
88 (1ULL << CAP_SETPCAP
) |
89 (1ULL << CAP_SETUID
) |
90 (1ULL << CAP_SYS_ADMIN
) |
91 (1ULL << CAP_SYS_CHROOT
) |
92 (1ULL << CAP_SYS_NICE
) |
93 (1ULL << CAP_SYS_PTRACE
) |
94 (1ULL << CAP_SYS_TTY_CONFIG
) |
95 (1ULL << CAP_SYS_RESOURCE
);
97 static int help(void) {
99 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
100 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
101 " -h --help Show this help\n"
102 " -D --directory=NAME Root directory for the container\n"
103 " -b --boot Boot up full system (i.e. invoke init)\n"
104 " -u --user=USER Run the command under specified user or uid\n"
105 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
106 " --uuid=UUID Set a specific machine UUID for the container\n"
107 " --private-network Disable network in container\n"
108 " --read-only Mount the root directory read-only\n"
109 " --capability=CAP In addition to the default, retain specified capability\n"
110 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
111 " -j Equivalent to --link-journal=host\n",
112 program_invocation_short_name
);
117 static int parse_argv(int argc
, char *argv
[]) {
120 ARG_PRIVATE_NETWORK
= 0x100,
127 static const struct option options
[] = {
128 { "help", no_argument
, NULL
, 'h' },
129 { "directory", required_argument
, NULL
, 'D' },
130 { "user", required_argument
, NULL
, 'u' },
131 { "controllers", required_argument
, NULL
, 'C' },
132 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
133 { "boot", no_argument
, NULL
, 'b' },
134 { "uuid", required_argument
, NULL
, ARG_UUID
},
135 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
136 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
137 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
146 while ((c
= getopt_long(argc
, argv
, "+hD:u:C:bj", options
, NULL
)) >= 0) {
156 arg_directory
= canonicalize_file_name(optarg
);
157 if (!arg_directory
) {
158 log_error("Failed to canonicalize root directory.");
166 if (!(arg_user
= strdup(optarg
))) {
167 log_error("Failed to duplicate user name.");
174 strv_free(arg_controllers
);
175 arg_controllers
= strv_split(optarg
, ",");
176 if (!arg_controllers
) {
177 log_error("Failed to split controllers list.");
180 strv_uniq(arg_controllers
);
184 case ARG_PRIVATE_NETWORK
:
185 arg_private_network
= true;
197 arg_read_only
= true;
200 case ARG_CAPABILITY
: {
204 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
208 t
= strndup(word
, length
);
212 if (cap_from_name(t
, &cap
) < 0) {
213 log_error("Failed to parse capability %s.", t
);
219 arg_retain
|= 1ULL << (uint64_t) cap
;
226 arg_link_journal
= LINK_GUEST
;
229 case ARG_LINK_JOURNAL
:
230 if (streq(optarg
, "auto"))
231 arg_link_journal
= LINK_AUTO
;
232 else if (streq(optarg
, "no"))
233 arg_link_journal
= LINK_NO
;
234 else if (streq(optarg
, "guest"))
235 arg_link_journal
= LINK_GUEST
;
236 else if (streq(optarg
, "host"))
237 arg_link_journal
= LINK_HOST
;
239 log_error("Failed to parse link journal mode %s", optarg
);
249 log_error("Unknown option code %c", c
);
257 static int mount_all(const char *dest
) {
259 typedef struct MountPoint
{
268 static const MountPoint mount_table
[] = {
269 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true },
270 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true }, /* Bind mount first */
271 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, true }, /* Then, make it r/o */
272 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true },
273 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true },
274 { "/dev/pts", "/dev/pts", NULL
, NULL
, MS_BIND
, true },
275 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true },
277 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false }, /* Bind mount first */
278 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, false }, /* Then, make it r/o */
286 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
289 if (asprintf(&where
, "%s/%s", dest
, mount_table
[k
].where
) < 0) {
298 t
= path_is_mount_point(where
, true);
300 log_error("Failed to detect whether %s is a mount point: %s", where
, strerror(-t
));
309 mkdir_p_label(where
, 0755);
311 if (mount(mount_table
[k
].what
,
314 mount_table
[k
].flags
,
315 mount_table
[k
].options
) < 0 &&
316 mount_table
[k
].fatal
) {
318 log_error("mount(%s) failed: %m", where
);
330 static int setup_timezone(const char *dest
) {
335 /* Fix the timezone, if possible */
336 if (asprintf(&where
, "%s/etc/localtime", dest
) < 0)
339 if (mount("/etc/localtime", where
, "bind", MS_BIND
, NULL
) >= 0)
340 mount("/etc/localtime", where
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
344 if (asprintf(&where
, "%s/etc/timezone", dest
) < 0)
347 if (mount("/etc/timezone", where
, "bind", MS_BIND
, NULL
) >= 0)
348 mount("/etc/timezone", where
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
355 static int setup_resolv_conf(const char *dest
) {
360 if (arg_private_network
)
363 /* Fix resolv.conf, if possible */
364 if (asprintf(&where
, "%s/etc/resolv.conf", dest
) < 0) {
368 if (mount("/etc/resolv.conf", where
, "bind", MS_BIND
, NULL
) >= 0)
369 mount("/etc/resolv.conf", where
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
376 static int copy_devnodes(const char *dest
) {
378 static const char devnodes
[] =
396 NULSTR_FOREACH(d
, devnodes
) {
398 char *from
= NULL
, *to
= NULL
;
400 asprintf(&from
, "/dev/%s", d
);
401 asprintf(&to
, "%s/dev/%s", dest
, d
);
404 log_error("Failed to allocate devnode path");
417 if (stat(from
, &st
) < 0) {
419 if (errno
!= ENOENT
) {
420 log_error("Failed to stat %s: %m", from
);
425 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
427 log_error("%s is not a char or block device, cannot copy.", from
);
431 } else if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
433 log_error("mknod(%s) failed: %m", dest
);
447 static int setup_dev_console(const char *dest
, const char *console
) {
458 if (stat(console
, &st
) < 0) {
459 log_error("Failed to stat %s: %m", console
);
463 } else if (!S_ISCHR(st
.st_mode
)) {
464 log_error("/dev/console is not a char device.");
469 r
= chmod_and_chown(console
, 0600, 0, 0);
471 log_error("Failed to correct access mode for TTY: %s", strerror(-r
));
475 if (asprintf(&to
, "%s/dev/console", dest
) < 0) {
480 /* We need to bind mount the right tty to /dev/console since
481 * ptys can only exist on pts file systems. To have something
482 * to bind mount things on we create a device node first, that
483 * has the right major/minor (note that the major/minor
484 * doesn't actually matter here, since we mount it over anyway). */
487 if (mknod(to
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
488 log_error("mknod() for /dev/console failed: %m");
493 if (mount(console
, to
, "bind", MS_BIND
, NULL
) < 0) {
494 log_error("Bind mount for /dev/console failed: %m");
506 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
507 char *from
= NULL
, *to
= NULL
;
511 struct cmsghdr cmsghdr
;
512 uint8_t buf
[CMSG_SPACE(sizeof(int))];
515 struct cmsghdr
*cmsg
;
518 assert(kmsg_socket
>= 0);
522 /* We create the kmsg FIFO as /dev/kmsg, but immediately
523 * delete it after bind mounting it to /proc/kmsg. While FIFOs
524 * on the reading side behave very similar to /proc/kmsg,
525 * their writing side behaves differently from /dev/kmsg in
526 * that writing blocks when nothing is reading. In order to
527 * avoid any problems with containers deadlocking due to this
528 * we simply make /dev/kmsg unavailable to the container. */
529 if (asprintf(&from
, "%s/dev/kmsg", dest
) < 0) {
534 if (asprintf(&to
, "%s/proc/kmsg", dest
) < 0) {
539 if (mkfifo(from
, 0600) < 0) {
540 log_error("mkfifo() for /dev/kmsg failed: %m");
545 r
= chmod_and_chown(from
, 0600, 0, 0);
547 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r
));
551 if (mount(from
, to
, "bind", MS_BIND
, NULL
) < 0) {
552 log_error("Bind mount for /proc/kmsg failed: %m");
557 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
559 log_error("Failed to open fifo: %m");
567 mh
.msg_control
= &control
;
568 mh
.msg_controllen
= sizeof(control
);
570 cmsg
= CMSG_FIRSTHDR(&mh
);
571 cmsg
->cmsg_level
= SOL_SOCKET
;
572 cmsg
->cmsg_type
= SCM_RIGHTS
;
573 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
574 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
576 mh
.msg_controllen
= cmsg
->cmsg_len
;
578 /* Store away the fd in the socket, so that it stays open as
579 * long as we run the child */
580 k
= sendmsg(kmsg_socket
, &mh
, MSG_DONTWAIT
|MSG_NOSIGNAL
);
581 close_nointr_nofail(fd
);
584 log_error("Failed to send FIFO fd: %m");
589 /* And now make the FIFO unavailable as /dev/kmsg... */
600 static int setup_hostname(void) {
604 hn
= path_get_file_name(arg_directory
);
610 hostname_cleanup(hn
);
613 if (sethostname(hn
, strlen(hn
)) < 0)
622 static int setup_journal(const char *directory
) {
623 sd_id128_t machine_id
;
624 char *p
= NULL
, *b
= NULL
, *l
, *q
= NULL
, *d
= NULL
;
627 if (arg_link_journal
== LINK_NO
)
630 p
= strappend(directory
, "/etc/machine-id");
636 r
= read_one_line_file(p
, &b
);
637 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
) {
641 log_error("Failed to read machine ID: %s", strerror(-r
));
646 if (isempty(l
) && arg_link_journal
== LINK_AUTO
) {
651 /* Verify validity */
652 r
= sd_id128_from_string(l
, &machine_id
);
654 log_error("Failed to parse machine ID: %s", strerror(-r
));
659 p
= strappend("/var/log/journal/", l
);
660 q
= strjoin(directory
, "/var/log/journal/", l
, NULL
);
666 if (path_is_mount_point(p
, false) > 0 ||
667 path_is_mount_point(q
, false) > 0) {
668 if (arg_link_journal
!= LINK_AUTO
) {
669 log_error("Journal already a mount point, refusing.");
678 r
= readlink_and_make_absolute(p
, &d
);
680 if ((arg_link_journal
== LINK_GUEST
||
681 arg_link_journal
== LINK_AUTO
) &&
691 log_error("Failed to remove symlink %s: %m", p
);
695 } else if (r
== -EINVAL
) {
697 if (arg_link_journal
== LINK_GUEST
&&
700 if (errno
== ENOTDIR
)
701 log_error("%s already exists and is neither symlink nor directory.", p
);
703 log_error("Failed to remove %s: %m", p
);
709 } else if (r
!= -ENOENT
) {
710 log_error("readlink(%s) failed: %m", p
);
714 if (arg_link_journal
== LINK_GUEST
) {
716 if (symlink(q
, p
) < 0) {
717 log_error("Failed to symlink %s to %s: %m", q
, p
);
728 if (arg_link_journal
== LINK_HOST
) {
729 r
= mkdir_p(p
, 0755);
731 log_error("Failed to create %s: %m", p
);
735 } else if (access(p
, F_OK
) < 0) {
740 if (dir_is_empty(q
) == 0) {
741 log_error("%s not empty.", q
);
746 r
= mkdir_p(q
, 0755);
748 log_error("Failed to create %s: %m", q
);
752 if (mount(p
, q
, "bind", MS_BIND
, NULL
) < 0) {
753 log_error("Failed to bind mount journal from host into guest: %m");
769 static int drop_capabilities(void) {
770 return capability_bounding_set_drop(~arg_retain
, false);
773 static int is_os_tree(const char *path
) {
776 /* We use /bin/sh as flag file if something is an OS */
778 if (asprintf(&p
, "%s/bin/sh", path
) < 0)
784 return r
< 0 ? 0 : 1;
787 static int process_pty(int master
, sigset_t
*mask
) {
789 char in_buffer
[LINE_MAX
], out_buffer
[LINE_MAX
];
790 size_t in_buffer_full
= 0, out_buffer_full
= 0;
791 struct epoll_event stdin_ev
, stdout_ev
, master_ev
, signal_ev
;
792 bool stdin_readable
= false, stdout_writable
= false, master_readable
= false, master_writable
= false;
793 int ep
= -1, signal_fd
= -1, r
;
795 fd_nonblock(STDIN_FILENO
, 1);
796 fd_nonblock(STDOUT_FILENO
, 1);
797 fd_nonblock(master
, 1);
799 signal_fd
= signalfd(-1, mask
, SFD_NONBLOCK
|SFD_CLOEXEC
);
801 log_error("signalfd(): %m");
806 ep
= epoll_create1(EPOLL_CLOEXEC
);
808 log_error("Failed to create epoll: %m");
814 stdin_ev
.events
= EPOLLIN
|EPOLLET
;
815 stdin_ev
.data
.fd
= STDIN_FILENO
;
818 stdout_ev
.events
= EPOLLOUT
|EPOLLET
;
819 stdout_ev
.data
.fd
= STDOUT_FILENO
;
822 master_ev
.events
= EPOLLIN
|EPOLLOUT
|EPOLLET
;
823 master_ev
.data
.fd
= master
;
826 signal_ev
.events
= EPOLLIN
;
827 signal_ev
.data
.fd
= signal_fd
;
829 if (epoll_ctl(ep
, EPOLL_CTL_ADD
, STDIN_FILENO
, &stdin_ev
) < 0 ||
830 epoll_ctl(ep
, EPOLL_CTL_ADD
, STDOUT_FILENO
, &stdout_ev
) < 0 ||
831 epoll_ctl(ep
, EPOLL_CTL_ADD
, master
, &master_ev
) < 0 ||
832 epoll_ctl(ep
, EPOLL_CTL_ADD
, signal_fd
, &signal_ev
) < 0) {
833 log_error("Failed to regiser fds in epoll: %m");
839 struct epoll_event ev
[16];
843 nfds
= epoll_wait(ep
, ev
, ELEMENTSOF(ev
), -1);
846 if (errno
== EINTR
|| errno
== EAGAIN
)
849 log_error("epoll_wait(): %m");
856 for (i
= 0; i
< nfds
; i
++) {
857 if (ev
[i
].data
.fd
== STDIN_FILENO
) {
859 if (ev
[i
].events
& (EPOLLIN
|EPOLLHUP
))
860 stdin_readable
= true;
862 } else if (ev
[i
].data
.fd
== STDOUT_FILENO
) {
864 if (ev
[i
].events
& (EPOLLOUT
|EPOLLHUP
))
865 stdout_writable
= true;
867 } else if (ev
[i
].data
.fd
== master
) {
869 if (ev
[i
].events
& (EPOLLIN
|EPOLLHUP
))
870 master_readable
= true;
872 if (ev
[i
].events
& (EPOLLOUT
|EPOLLHUP
))
873 master_writable
= true;
875 } else if (ev
[i
].data
.fd
== signal_fd
) {
876 struct signalfd_siginfo sfsi
;
879 n
= read(signal_fd
, &sfsi
, sizeof(sfsi
));
880 if (n
!= sizeof(sfsi
)) {
883 log_error("Failed to read from signalfd: invalid block size");
888 if (errno
!= EINTR
&& errno
!= EAGAIN
) {
889 log_error("Failed to read from signalfd: %m");
895 if (sfsi
.ssi_signo
== SIGWINCH
) {
898 /* The window size changed, let's forward that. */
899 if (ioctl(STDIN_FILENO
, TIOCGWINSZ
, &ws
) >= 0)
900 ioctl(master
, TIOCSWINSZ
, &ws
);
909 while ((stdin_readable
&& in_buffer_full
<= 0) ||
910 (master_writable
&& in_buffer_full
> 0) ||
911 (master_readable
&& out_buffer_full
<= 0) ||
912 (stdout_writable
&& out_buffer_full
> 0)) {
914 if (stdin_readable
&& in_buffer_full
< LINE_MAX
) {
916 k
= read(STDIN_FILENO
, in_buffer
+ in_buffer_full
, LINE_MAX
- in_buffer_full
);
919 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
920 stdin_readable
= false;
922 log_error("read(): %m");
927 in_buffer_full
+= (size_t) k
;
930 if (master_writable
&& in_buffer_full
> 0) {
932 k
= write(master
, in_buffer
, in_buffer_full
);
935 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
936 master_writable
= false;
938 log_error("write(): %m");
944 assert(in_buffer_full
>= (size_t) k
);
945 memmove(in_buffer
, in_buffer
+ k
, in_buffer_full
- k
);
950 if (master_readable
&& out_buffer_full
< LINE_MAX
) {
952 k
= read(master
, out_buffer
+ out_buffer_full
, LINE_MAX
- out_buffer_full
);
955 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
956 master_readable
= false;
958 log_error("read(): %m");
963 out_buffer_full
+= (size_t) k
;
966 if (stdout_writable
&& out_buffer_full
> 0) {
968 k
= write(STDOUT_FILENO
, out_buffer
, out_buffer_full
);
971 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
972 stdout_writable
= false;
974 log_error("write(): %m");
980 assert(out_buffer_full
>= (size_t) k
);
981 memmove(out_buffer
, out_buffer
+ k
, out_buffer_full
- k
);
982 out_buffer_full
-= k
;
990 close_nointr_nofail(ep
);
993 close_nointr_nofail(signal_fd
);
998 int main(int argc
, char *argv
[]) {
1000 int r
= EXIT_FAILURE
, k
;
1001 char *oldcg
= NULL
, *newcg
= NULL
;
1002 char **controller
= NULL
;
1004 const char *console
= NULL
;
1005 struct termios saved_attr
, raw_attr
;
1007 bool saved_attr_valid
= false;
1009 int kmsg_socket_pair
[2] = { -1, -1 };
1011 log_parse_environment();
1014 r
= parse_argv(argc
, argv
);
1018 if (arg_directory
) {
1021 p
= path_make_absolute_cwd(arg_directory
);
1022 free(arg_directory
);
1025 arg_directory
= get_current_dir_name();
1027 if (!arg_directory
) {
1028 log_error("Failed to determine path");
1032 path_kill_slashes(arg_directory
);
1034 if (geteuid() != 0) {
1035 log_error("Need to be root.");
1039 if (sd_booted() <= 0) {
1040 log_error("Not running on a systemd system.");
1044 if (path_equal(arg_directory
, "/")) {
1045 log_error("Spawning container on root directory not supported.");
1049 if (is_os_tree(arg_directory
) <= 0) {
1050 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory
);
1054 k
= cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER
, 0, &oldcg
);
1056 log_error("Failed to determine current cgroup: %s", strerror(-k
));
1060 if (asprintf(&newcg
, "%s/nspawn-%lu", oldcg
, (unsigned long) getpid()) < 0) {
1061 log_error("Failed to allocate cgroup path.");
1065 k
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, newcg
, 0);
1067 log_error("Failed to create cgroup: %s", strerror(-k
));
1071 STRV_FOREACH(controller
, arg_controllers
) {
1072 k
= cg_create_and_attach(*controller
, newcg
, 0);
1074 log_warning("Failed to create cgroup in controller %s: %s", *controller
, strerror(-k
));
1077 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
1079 log_error("Failed to acquire pseudo tty: %m");
1083 console
= ptsname(master
);
1085 log_error("Failed to determine tty name: %m");
1089 log_info("Spawning namespace container on %s (console is %s).", arg_directory
, console
);
1091 if (ioctl(STDIN_FILENO
, TIOCGWINSZ
, &ws
) >= 0)
1092 ioctl(master
, TIOCSWINSZ
, &ws
);
1094 if (unlockpt(master
) < 0) {
1095 log_error("Failed to unlock tty: %m");
1099 if (tcgetattr(STDIN_FILENO
, &saved_attr
) < 0) {
1100 log_error("Failed to get terminal attributes: %m");
1104 saved_attr_valid
= true;
1106 raw_attr
= saved_attr
;
1107 cfmakeraw(&raw_attr
);
1108 raw_attr
.c_lflag
&= ~ECHO
;
1110 if (tcsetattr(STDIN_FILENO
, TCSANOW
, &raw_attr
) < 0) {
1111 log_error("Failed to set terminal attributes: %m");
1115 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_NONBLOCK
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
1116 log_error("Failed to create kmsg socket pair");
1120 assert_se(sigemptyset(&mask
) == 0);
1121 sigset_add_many(&mask
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1);
1122 assert_se(sigprocmask(SIG_BLOCK
, &mask
, NULL
) == 0);
1124 pid
= syscall(__NR_clone
, SIGCHLD
|CLONE_NEWIPC
|CLONE_NEWNS
|CLONE_NEWPID
|CLONE_NEWUTS
|(arg_private_network
? CLONE_NEWNET
: 0), NULL
);
1126 if (errno
== EINVAL
)
1127 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1129 log_error("clone() failed: %m");
1137 const char *home
= NULL
;
1138 uid_t uid
= (uid_t
) -1;
1139 gid_t gid
= (gid_t
) -1;
1140 const char *envp
[] = {
1141 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1142 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1147 NULL
, /* container_uuid */
1151 envp
[2] = strv_find_prefix(environ
, "TERM=");
1153 close_nointr_nofail(master
);
1155 close_nointr(STDIN_FILENO
);
1156 close_nointr(STDOUT_FILENO
);
1157 close_nointr(STDERR_FILENO
);
1159 close_all_fds(&kmsg_socket_pair
[1], 1);
1161 reset_all_signal_handlers();
1163 assert_se(sigemptyset(&mask
) == 0);
1164 assert_se(sigprocmask(SIG_SETMASK
, &mask
, NULL
) == 0);
1166 if (open_terminal(console
, O_RDWR
) != STDIN_FILENO
||
1167 dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
1168 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
1172 log_error("setsid() failed: %m");
1176 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0) {
1177 log_error("PR_SET_PDEATHSIG failed: %m");
1181 /* Mark everything as slave, so that we still
1182 * receive mounts from the real root, but don't
1183 * propagate mounts to the real root. */
1184 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0) {
1185 log_error("MS_SLAVE|MS_REC failed: %m");
1189 /* Turn directory into bind mount */
1190 if (mount(arg_directory
, arg_directory
, "bind", MS_BIND
|MS_REC
, NULL
) < 0) {
1191 log_error("Failed to make bind mount.");
1196 if (mount(arg_directory
, arg_directory
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_REC
, NULL
) < 0) {
1197 log_error("Failed to make read-only.");
1201 if (mount_all(arg_directory
) < 0)
1204 if (copy_devnodes(arg_directory
) < 0)
1207 dev_setup(arg_directory
);
1209 if (setup_dev_console(arg_directory
, console
) < 0)
1212 if (setup_kmsg(arg_directory
, kmsg_socket_pair
[1]) < 0)
1215 close_nointr_nofail(kmsg_socket_pair
[1]);
1217 if (setup_timezone(arg_directory
) < 0)
1220 if (setup_resolv_conf(arg_directory
) < 0)
1223 if (setup_journal(arg_directory
) < 0)
1226 if (chdir(arg_directory
) < 0) {
1227 log_error("chdir(%s) failed: %m", arg_directory
);
1231 if (mount(arg_directory
, "/", NULL
, MS_MOVE
, NULL
) < 0) {
1232 log_error("mount(MS_MOVE) failed: %m");
1236 if (chroot(".") < 0) {
1237 log_error("chroot() failed: %m");
1241 if (chdir("/") < 0) {
1242 log_error("chdir() failed: %m");
1250 if (drop_capabilities() < 0) {
1251 log_error("drop_capabilities() failed: %m");
1257 if (get_user_creds((const char**)&arg_user
, &uid
, &gid
, &home
, NULL
) < 0) {
1258 log_error("get_user_creds() failed: %m");
1262 if (mkdir_parents_label(home
, 0775) < 0) {
1263 log_error("mkdir_parents_label() failed: %m");
1267 if (mkdir_safe_label(home
, 0775, uid
, gid
) < 0) {
1268 log_error("mkdir_safe_label() failed: %m");
1272 if (initgroups((const char*)arg_user
, gid
) < 0) {
1273 log_error("initgroups() failed: %m");
1277 if (setresgid(gid
, gid
, gid
) < 0) {
1278 log_error("setregid() failed: %m");
1282 if (setresuid(uid
, uid
, uid
) < 0) {
1283 log_error("setreuid() failed: %m");
1288 if ((asprintf((char**)(envp
+ 3), "HOME=%s", home
? home
: "/root") < 0) ||
1289 (asprintf((char**)(envp
+ 4), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
1290 (asprintf((char**)(envp
+ 5), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0)) {
1296 if (asprintf((char**)(envp
+ 6), "container_uuid=%s", arg_uuid
) < 0) {
1308 /* Automatically search for the init system */
1310 l
= 1 + argc
- optind
;
1311 a
= newa(char*, l
+ 1);
1312 memcpy(a
+ 1, argv
+ optind
, l
* sizeof(char*));
1314 a
[0] = (char*) "/usr/lib/systemd/systemd";
1315 execve(a
[0], a
, (char**) envp
);
1317 a
[0] = (char*) "/lib/systemd/systemd";
1318 execve(a
[0], a
, (char**) envp
);
1320 a
[0] = (char*) "/sbin/init";
1321 execve(a
[0], a
, (char**) envp
);
1322 } else if (argc
> optind
)
1323 execvpe(argv
[optind
], argv
+ optind
, (char**) envp
);
1325 chdir(home
? home
: "/root");
1326 execle("/bin/bash", "-bash", NULL
, (char**) envp
);
1329 log_error("execv() failed: %m");
1332 _exit(EXIT_FAILURE
);
1335 if (process_pty(master
, &mask
) < 0)
1338 if (saved_attr_valid
) {
1339 tcsetattr(STDIN_FILENO
, TCSANOW
, &saved_attr
);
1340 saved_attr_valid
= false;
1343 r
= wait_for_terminate_and_warn(argc
> optind
? argv
[optind
] : "bash", pid
);
1349 if (saved_attr_valid
)
1350 tcsetattr(STDIN_FILENO
, TCSANOW
, &saved_attr
);
1353 close_nointr_nofail(master
);
1355 close_pipe(kmsg_socket_pair
);
1358 cg_attach(SYSTEMD_CGROUP_CONTROLLER
, oldcg
, 0);
1361 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER
, newcg
, true);
1363 free(arg_directory
);
1364 strv_free(arg_controllers
);