1 ;;; GNU Guix --- Functional package management for GNU
2 ;;; Copyright © 2015 David Thompson <davet@gnu.org>
4 ;;; This file is part of GNU Guix.
6 ;;; GNU Guix is free software; you can redistribute it and/or modify it
7 ;;; under the terms of the GNU General Public License as published by
8 ;;; the Free Software Foundation; either version 3 of the License, or (at
9 ;;; your option) any later version.
11 ;;; GNU Guix is distributed in the hope that it will be useful, but
12 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;;; GNU General Public License for more details.
16 ;;; You should have received a copy of the GNU General Public License
17 ;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
19 (define-module (gnu build linux-container)
20 #:use-module (ice-9 format)
21 #:use-module (ice-9 match)
22 #:use-module (ice-9 rdelim)
23 #:use-module (srfi srfi-98)
24 #:use-module (guix utils)
25 #:use-module (guix build utils)
26 #:use-module (guix build syscalls)
27 #:use-module (gnu system file-systems) ;<file-system>
28 #:use-module ((gnu build file-systems) #:select (mount-file-system))
29 #:export (user-namespace-supported?
30 unprivileged-user-namespace-supported?
37 (define (user-namespace-supported?)
38 "Return #t if user namespaces are supported on this system."
39 (file-exists? "/proc/self/ns/user"))
41 (define (unprivileged-user-namespace-supported?)
42 "Return #t if user namespaces can be created by unprivileged users."
43 (let ((userns-file "/proc/sys/kernel/unprivileged_userns_clone"))
44 (if (file-exists? userns-file)
45 (eqv? #\1 (call-with-input-file userns-file read-char))
48 (define (setgroups-supported?)
49 "Return #t if the setgroups proc file, introduced in Linux-libre 3.19,
51 (file-exists? "/proc/self/setgroups"))
54 '(mnt pid ipc uts user net))
56 (define (call-with-clean-exit thunk)
57 "Apply THUNK, but exit with a status code of 1 if it fails."
66 (define (purify-environment)
67 "Unset all environment variables."
69 (match (get-environment-variables)
70 (((names . _) ...) names))))
72 ;; The container setup procedure closely resembles that of the Docker
74 ;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md
75 (define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?)
76 "Mount the essential file systems and the those in MOUNTS, a list of
77 <file-system> objects, relative to ROOT; then make ROOT the new root directory
80 (string-append root dir))
82 (define (touch file-name)
83 (call-with-output-file file-name (const #t)))
85 (define (bind-mount src dest)
86 (mount src dest "none" MS_BIND))
88 ;; Like mount, but creates the mount point if it doesn't exist.
89 (define* (mount* source target type #:optional (flags 0) options
90 #:key (update-mtab? #f))
92 (mount source target type flags options #:update-mtab? update-mtab?))
94 ;; The container's file system is completely ephemeral, sans directories
95 ;; bind-mounted from the host.
96 (mount "none" root "tmpfs")
98 ;; A proc mount requires a new pid namespace.
100 (mount* "none" (scope "/proc") "proc"
101 (logior MS_NOEXEC MS_NOSUID MS_NODEV)))
103 ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in
104 ;; the current network namespace.
106 (mount* "none" (scope "/sys") "sysfs"
107 (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY)))
109 (mount* "none" (scope "/dev") "tmpfs"
110 (logior MS_NOEXEC MS_STRICTATIME)
113 ;; Create essential device nodes via bind-mounting them from the
114 ;; host, because a process within a user namespace cannot create
116 (for-each (lambda (device)
117 (when (file-exists? device)
118 ;; Create the mount point file.
119 (touch (scope device))
120 (bind-mount device (scope device))))
130 ;; Setup the container's /dev/console by bind mounting the pseudo-terminal
131 ;; associated with standard input.
132 (let ((in (current-input-port))
133 (console (scope "/dev/console")))
136 (chmod console #o600)
137 (bind-mount (ttyname in) console)))
139 ;; Setup standard input/output/error.
140 (symlink "/proc/self/fd" (scope "/dev/fd"))
141 (symlink "/proc/self/fd/0" (scope "/dev/stdin"))
142 (symlink "/proc/self/fd/1" (scope "/dev/stdout"))
143 (symlink "/proc/self/fd/2" (scope "/dev/stderr"))
145 ;; Mount user-specified file systems.
146 (for-each (lambda (file-system)
147 (mount-file-system (file-system->spec file-system)
151 ;; Jail the process inside the container's root file system.
152 (let ((put-old (string-append root "/real-root")))
154 (pivot-root root put-old)
156 (umount "real-root" MNT_DETACH)
157 (rmdir "real-root")))
159 (define (initialize-user-namespace pid host-uids)
160 "Configure the user namespace for PID. HOST-UIDS specifies the number of
161 host user identifiers to map into the user namespace."
163 (string-append "/proc/" (number->string pid)))
166 (string-append proc-dir file))
171 ;; Only root can write to the gid map without first disabling the
172 ;; setgroups syscall.
173 (unless (and (zero? uid) (zero? gid))
174 (call-with-output-file (scope "/setgroups")
176 (display "deny" port))))
178 ;; Map the user/group that created the container to the root user
179 ;; within the container.
180 (call-with-output-file (scope "/uid_map")
182 (format port "0 ~d ~d" uid host-uids)))
183 (call-with-output-file (scope "/gid_map")
185 (format port "0 ~d ~d" gid host-uids)))))
187 (define (namespaces->bit-mask namespaces)
188 "Return the number suitable for the 'flags' argument of 'clone' that
189 corresponds to the symbols in NAMESPACES."
190 ;; Use the same flags as fork(3) in addition to the namespace flags.
191 (apply logior SIGCHLD
196 ('user CLONE_NEWUSER)
201 (define (run-container root mounts namespaces host-uids thunk)
202 "Run THUNK in a new container process and return its PID. ROOT specifies
203 the root directory for the container. MOUNTS is a list of <file-system>
204 objects that specify file systems to mount inside the container. NAMESPACES
205 is a list of symbols that correspond to the possible Linux namespaces: mnt,
206 ipc, uts, user, and net. HOST-UIDS specifies the number of
207 host user identifiers to map into the user namespace."
208 ;; The parent process must initialize the user namespace for the child
209 ;; before it can boot. To negotiate this, a pipe is used such that the
210 ;; child process blocks until the parent writes to it.
211 (match (socketpair PF_UNIX SOCK_STREAM 0)
213 (let ((flags (namespaces->bit-mask namespaces)))
216 (call-with-clean-exit
219 ;; Wait for parent to set things up.
223 (when (memq 'mnt namespaces)
226 (mount-file-systems root mounts
227 #:mount-/proc? (memq 'pid namespaces)
228 #:mount-/sys? (memq 'net
231 ;; Forward the exception to the parent process.
233 (primitive-exit 3))))
234 ;; TODO: Manage capabilities.
238 (_ ;parent died or something
239 (primitive-exit 2))))))
242 (when (memq 'user namespaces)
243 (initialize-user-namespace pid host-uids))
244 ;; TODO: Initialize cgroups.
245 (write 'ready parent)
248 ;; Check whether the child process' setup phase succeeded.
249 (let ((message (read parent)))
254 (((? symbol? key) args ...) ;exception
255 (apply throw key args))
256 (_ ;unexpected termination
259 (define* (call-with-container mounts thunk #:key (namespaces %namespaces)
261 "Run THUNK in a new container process and return its exit status.
262 MOUNTS is a list of <file-system> objects that specify file systems to mount
263 inside the container. NAMESPACES is a list of symbols corresponding to
264 the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By
265 default, all namespaces are used. HOST-UIDS is the number of host user
266 identifiers to map into the container's user namespace, if there is one. By
267 default, only a single uid/gid, that of the current user, is mapped into the
268 container. The host user that creates the container is the root user (uid/gid
269 0) within the container. Only root can map more than a single uid/gid.
271 Note that if THUNK needs to load any additional Guile modules, the relevant
272 module files must be present in one of the mappings in MOUNTS and the Guile
273 load path must be adjusted as needed."
274 (call-with-temporary-directory
276 (let ((pid (run-container root mounts namespaces host-uids thunk)))
277 ;; Catch SIGINT and kill the container process.
281 (kill pid SIGKILL))))
284 ((_ . status) status))))))
286 (define (container-excursion pid thunk)
287 "Run THUNK as a child process within the namespaces of process PID and
288 return the exit status."
289 (define (namespace-file pid namespace)
290 (string-append "/proc/" (number->string pid) "/ns/" namespace))
292 (match (primitive-fork)
294 (call-with-clean-exit
296 (for-each (lambda (ns)
297 (let ((source (namespace-file (getpid) ns))
298 (target (namespace-file pid ns)))
299 ;; Joining the namespace that the process already
300 ;; belongs to would throw an error so avoid that.
301 ;; XXX: This /proc interface leads to TOCTTOU.
302 (unless (string=? (readlink source) (readlink target))
303 (call-with-input-file source
304 (lambda (current-ns-port)
305 (call-with-input-file target
306 (lambda (new-ns-port)
307 (setns (fileno new-ns-port) 0))))))))
308 ;; It's important that the user namespace is joined first,
309 ;; so that the user will have the privileges to join the
310 ;; other namespaces. Furthermore, it's important that the
311 ;; mount namespace is joined last, otherwise the /proc mount
312 ;; point would no longer be accessible.
313 '("user" "ipc" "uts" "net" "pid" "mnt"))
320 (status:exit-val status))))))