1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 //sysnb raw_prctl(option int, arg2 int, arg3 int, arg4 int, arg5 int) (ret int, err Errno)
15 //prctl(option _C_int, arg2 _C_long, arg3 _C_long, arg4 _C_long, arg5 _C_long) _C_int
17 //sysnb rawUnshare(flags int) (err Errno)
18 //unshare(flags _C_int) _C_int
20 //sysnb rawMount(source *byte, target *byte, fstype *byte, flags uintptr, data *byte) (err Errno)
21 //mount(source *byte, target *byte, fstype *byte, flags _C_long, data *byte) _C_int
23 //sysnb rawOpenat(dirfd int, pathname *byte, flags int, perm uint32) (fd int, err Errno)
24 //__go_openat(dirfd _C_int, pathname *byte, flags _C_int, perm Mode_t) _C_int
26 // SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux.
27 // See user_namespaces(7).
28 type SysProcIDMap
struct {
29 ContainerID
int // Container ID.
30 HostID
int // Host ID.
34 type SysProcAttr
struct {
35 Chroot
string // Chroot.
36 Credential
*Credential
// Credential.
37 // Ptrace tells the child to call ptrace(PTRACE_TRACEME).
38 // Call runtime.LockOSThread before starting a process with this set,
39 // and don't call UnlockOSThread until done with PtraceSyscall calls.
41 Setsid
bool // Create session.
42 // Setpgid sets the process group ID of the child to Pgid,
43 // or, if Pgid == 0, to the new child's process ID.
45 // Setctty sets the controlling terminal of the child to
46 // file descriptor Ctty. Ctty must be a descriptor number
47 // in the child process: an index into ProcAttr.Files.
48 // This is only meaningful if Setsid is true.
50 Noctty
bool // Detach fd 0 from controlling terminal
51 Ctty
int // Controlling TTY fd
52 // Foreground places the child process group in the foreground.
53 // This implies Setpgid. The Ctty field must be set to
54 // the descriptor of the controlling TTY.
55 // Unlike Setctty, in this case Ctty must be a descriptor
56 // number in the parent process.
58 Pgid
int // Child's process group ID if Setpgid.
59 Pdeathsig Signal
// Signal that the process will get when its parent dies (Linux and FreeBSD only)
60 Cloneflags
uintptr // Flags for clone calls (Linux only)
61 Unshareflags
uintptr // Flags for unshare calls (Linux only)
62 UidMappings
[]SysProcIDMap
// User ID mappings for user namespaces.
63 GidMappings
[]SysProcIDMap
// Group ID mappings for user namespaces.
64 // GidMappingsEnableSetgroups enables the setgroups syscall.
65 // If false, the setgroups syscall will be disabled for the child process.
66 // This parameter is a no-op if GidMappings == nil. Otherwise, for unprivileged
67 // users this should be set to false for the mappings to work.
68 GidMappingsEnableSetgroups
bool
69 AmbientCaps
[]uintptr // Ambient capabilities (Linux only)
73 none
= [...]byte{'n', 'o', 'n', 'e', 0}
74 slash
= [...]byte{'/', 0}
77 // Implemented in runtime package.
78 func runtime_BeforeFork()
79 func runtime_AfterFork()
80 func runtime_AfterForkInChild()
82 // Implemented in clone_linux.c
85 func rawClone(flags _C_ulong
, child_stack
*byte, ptid
*Pid_t
, ctid
*Pid_t
, regs unsafe
.Pointer
) _C_long
87 // Fork, dup fd onto 0..len(fd), and exec(argv0, argv, envv) in child.
88 // If a dup or exec fails, write the errno error to pipe.
89 // (Pipe is close-on-exec so if exec succeeds, it will be closed.)
90 // In the child, this function must not acquire any locks, because
91 // they might have been locked at the time of the fork. This means
92 // no rescheduling, no malloc calls, and no new stack segments.
93 // For the same reason the compiler does not race-instrument it.
94 // The calls to RawSyscall are okay because they are assembly
95 // functions that do not grow the stack.
98 func forkAndExecInChild(argv0
*byte, argv
, envv
[]*byte, chroot
, dir
*byte, attr
*ProcAttr
, sys
*SysProcAttr
, pipe
int) (pid
int, err Errno
) {
99 // Set up and fork. This returns immediately in the parent or
100 // if there's an error.
101 r1
, err1
, p
, locked
:= forkAndExecInChild1(argv0
, argv
, envv
, chroot
, dir
, attr
, sys
, pipe
)
109 // parent; return PID
112 if sys
.UidMappings
!= nil || sys
.GidMappings
!= nil {
115 // uid/gid mappings will be written after fork and unshare(2) for user
117 if sys
.Unshareflags
&CLONE_NEWUSER
== 0 {
118 if err
:= writeUidGidMappings(pid
, sys
); err
!= nil {
122 raw_write(p
[1], (*byte)(unsafe
.Pointer(&err2
)), int(unsafe
.Sizeof(err2
)))
129 const _LINUX_CAPABILITY_VERSION_3
= 0x20080522
131 type capHeader
struct {
136 type capData
struct {
// capToIndex returns the index of the 32-bit capability word that
// contains capability number cap (that is, cap divided by 32).
// It mirrors CAP_TO_INDEX in linux/capability.h.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
// capToMask returns the single-bit mask selecting capability number cap
// inside its 32-bit capability word: bit (cap mod 32) is set.
// It mirrors CAP_TO_MASK in linux/capability.h.
func capToMask(cap uintptr) uint32 {
	bit := uint(cap % 32)
	return uint32(1) << bit
}
152 // forkAndExecInChild1 implements the body of forkAndExecInChild up to
153 // the parent's post-fork path. This is a separate function so we can
154 // separate the child's and parent's stack frames if we're using
157 // This is go:noinline because the point is to keep the stack frames
158 // of this and forkAndExecInChild separate.
162 func forkAndExecInChild1(argv0
*byte, argv
, envv
[]*byte, chroot
, dir
*byte, attr
*ProcAttr
, sys
*SysProcAttr
, pipe
int) (r1
uintptr, err1 Errno
, p
[2]int, locked
bool) {
163 // Defined in linux/prctl.h starting with Linux 4.3.
165 PR_CAP_AMBIENT
= 0x2f
166 PR_CAP_AMBIENT_RAISE
= 0x2
169 // vfork requires that the child not touch any of the parent's
170 // active stack frames. Hence, the child does all post-fork
171 // processing in this stack frame and never returns, while the
172 // parent returns immediately from this frame and does all
173 // post-fork processing in the outer frame.
174 // Declare all variables at top in case any
175 // declarations require heap allocation (e.g., err1).
183 puid
, psetgroups
, pgid
[]byte
184 uidmap
, setgroups
, gidmap
[]byte
187 if sys
.UidMappings
!= nil {
188 puid
= []byte("/proc/self/uid_map\000")
189 uidmap
= formatIDMappings(sys
.UidMappings
)
192 if sys
.GidMappings
!= nil {
193 psetgroups
= []byte("/proc/self/setgroups\000")
194 pgid
= []byte("/proc/self/gid_map\000")
196 if sys
.GidMappingsEnableSetgroups
{
197 setgroups
= []byte("allow\000")
199 setgroups
= []byte("deny\000")
201 gidmap
= formatIDMappings(sys
.GidMappings
)
204 // Record parent PID so child can test if it has died.
207 // Guard against side effects of shuffling fds below.
208 // Make sure that nextfd is beyond any currently open files so
209 // that we can't run the risk of overwriting any of them.
210 fd
:= make([]int, len(attr
.Files
))
211 nextfd
= len(attr
.Files
)
212 for i
, ufd
:= range attr
.Files
{
213 if nextfd
< int(ufd
) {
220 // Allocate another pipe for parent to child communication for
221 // synchronizing writing of User ID/Group ID mappings.
222 if sys
.UidMappings
!= nil || sys
.GidMappings
!= nil {
223 if err
:= forkExecPipe(p
[:]); err
!= nil {
229 // About to call fork.
230 // No more allocation or calls of non-assembly functions.
233 r2
= int(rawClone(_C_ulong(uintptr(SIGCHLD
)|sys
.Cloneflags
), nil, nil, nil, unsafe
.Pointer(nil)))
238 // If we're in the parent, we must return immediately
239 // so we're not in the same stack frame as the child.
240 // This can at most use the return PC, which the child
241 // will not modify, and the results of
242 // rawVforkSyscall, which must have been written after
243 // the child was replaced.
248 // Fork succeeded, now in child.
250 // Enable the "keep capabilities" flag to set ambient capabilities later.
251 if len(sys
.AmbientCaps
) > 0 {
252 _
, err1
= raw_prctl(PR_SET_KEEPCAPS
, 1, 0, 0, 0)
258 // Wait for User ID/Group ID mappings to be written.
259 if sys
.UidMappings
!= nil || sys
.GidMappings
!= nil {
260 if err1
= raw_close(p
[1]); err1
!= 0 {
263 r2
, err1
= raw_read(p
[0], (*byte)(unsafe
.Pointer(&err2
)), int(unsafe
.Sizeof(err2
)))
267 if r2
!= int(unsafe
.Sizeof(err2
)) {
286 if sys
.Setpgid || sys
.Foreground
{
287 // Place child in process group.
288 err1
= raw_setpgid(0, sys
.Pgid
)
295 pgrp
:= Pid_t(sys
.Pgid
)
300 // Place process group in foreground.
301 _
, err1
= raw_ioctl_ptr(sys
.Ctty
, TIOCSPGRP
, unsafe
.Pointer(&pgrp
))
307 // Restore the signal mask. We do this after TIOCSPGRP to avoid
308 // having the kernel send a SIGTTOU signal to the process group.
309 runtime_AfterForkInChild()
312 if sys
.Unshareflags
!= 0 {
313 err1
= rawUnshare(int(sys
.Unshareflags
))
318 if sys
.Unshareflags
&CLONE_NEWUSER
!= 0 && sys
.GidMappings
!= nil {
319 dirfd
:= int(_AT_FDCWD
)
320 if fd1
, err1
= rawOpenat(dirfd
, &psetgroups
[0], O_WRONLY
, 0); err1
!= 0 {
323 _
, err1
= raw_write(fd1
, &setgroups
[0], len(setgroups
))
327 if err1
= raw_close(fd1
); err1
!= 0 {
331 if fd1
, err1
= rawOpenat(dirfd
, &pgid
[0], O_WRONLY
, 0); err1
!= 0 {
334 _
, err1
= raw_write(fd1
, &gidmap
[0], len(gidmap
))
338 if err1
= raw_close(fd1
); err1
!= 0 {
343 if sys
.Unshareflags
&CLONE_NEWUSER
!= 0 && sys
.UidMappings
!= nil {
344 dirfd
:= int(_AT_FDCWD
)
345 if fd1
, err1
= rawOpenat(dirfd
, &puid
[0], O_WRONLY
, 0); err1
!= 0 {
348 _
, err1
= raw_write(fd1
, &uidmap
[0], len(uidmap
))
352 if err1
= raw_close(fd1
); err1
!= 0 {
357 // The unshare system call in Linux doesn't unshare mount points
358 // mounted with --shared. Systemd mounts / with --shared. For a
359 // long discussion of the pros and cons of this see debian bug 739593.
360 // The Go model of unsharing is more like Plan 9, where you ask
361 // to unshare and the namespaces are unconditionally unshared.
362 // To make this model work we must further mark / as MS_PRIVATE.
363 // This is what the standard unshare command does.
364 if sys
.Unshareflags
&CLONE_NEWNS
== CLONE_NEWNS
{
365 err1
= rawMount(&none
[0], &slash
[0], nil, MS_REC|MS_PRIVATE
, nil)
374 err1
= raw_chroot(chroot
)
381 if cred
:= sys
.Credential
; cred
!= nil {
382 ngroups
:= len(cred
.Groups
)
383 var groups unsafe
.Pointer
385 groups
= unsafe
.Pointer(&cred
.Groups
[0])
387 if !(sys
.GidMappings
!= nil && !sys
.GidMappingsEnableSetgroups
&& ngroups
== 0) && !cred
.NoSetGroups
{
388 err1
= raw_setgroups(ngroups
, groups
)
393 _
, _
, err1
= RawSyscall(sys_SETGID
, uintptr(cred
.Gid
), 0, 0)
397 _
, _
, err1
= RawSyscall(sys_SETUID
, uintptr(cred
.Uid
), 0, 0)
403 if len(sys
.AmbientCaps
) != 0 {
404 // Ambient capabilities were added in the 4.3 kernel,
405 // so it is safe to always use _LINUX_CAPABILITY_VERSION_3.
406 caps
.hdr
.version
= _LINUX_CAPABILITY_VERSION_3
408 if _
, _
, err1
= RawSyscall(SYS_CAPGET
, uintptr(unsafe
.Pointer(&caps
.hdr
)), uintptr(unsafe
.Pointer(&caps
.data
[0])), 0); err1
!= 0 {
412 for _
, c
:= range sys
.AmbientCaps
{
413 // Add the c capability to the permitted and inheritable capability mask,
414 // otherwise we will not be able to add it to the ambient capability mask.
415 caps
.data
[capToIndex(c
)].permitted |
= capToMask(c
)
416 caps
.data
[capToIndex(c
)].inheritable |
= capToMask(c
)
419 if _
, _
, err1
= RawSyscall(SYS_CAPSET
, uintptr(unsafe
.Pointer(&caps
.hdr
)), uintptr(unsafe
.Pointer(&caps
.data
[0])), 0); err1
!= 0 {
423 for _
, c
:= range sys
.AmbientCaps
{
424 _
, _
, err1
= RawSyscall6(SYS_PRCTL
, PR_CAP_AMBIENT
, uintptr(PR_CAP_AMBIENT_RAISE
), c
, 0, 0, 0)
433 err1
= raw_chdir(dir
)
439 // Parent death signal
440 if sys
.Pdeathsig
!= 0 {
441 _
, err1
= raw_prctl(PR_SET_PDEATHSIG
, int(sys
.Pdeathsig
), 0, 0, 0)
446 // Signal self if parent is already dead. This might cause a
447 // duplicate signal in rare cases, but it won't matter when
452 err1
= raw_kill(pid
, sys
.Pdeathsig
)
459 // Pass 1: look for fd[i] < i and move those up above len(fd)
460 // so that pass 2 won't stomp on an fd it needs later.
462 err1
= raw_dup3(pipe
, nextfd
, O_CLOEXEC
)
469 for i
= 0; i
< len(fd
); i
++ {
470 if fd
[i
] >= 0 && fd
[i
] < int(i
) {
471 if nextfd
== pipe
{ // don't stomp on pipe
474 err1
= raw_dup3(fd
[i
], nextfd
, O_CLOEXEC
)
483 // Pass 2: dup fd[i] down onto i.
484 for i
= 0; i
< len(fd
); i
++ {
490 // dup2(i, i) won't clear close-on-exec flag on Linux,
491 // probably not elsewhere either.
492 _
, err1
= raw_fcntl(fd
[i
], F_SETFD
, 0)
498 // The new fd is created NOT close-on-exec,
499 // which is exactly what we want.
500 err1
= raw_dup3(fd
[i
], i
, 0)
506 // By convention, we don't close-on-exec the fds we are
507 // started with, so if len(fd) < 3, close 0, 1, 2 as needed.
508 // Programs that know they inherit fds >= 3 will need
509 // to set them close-on-exec.
510 for i
= len(fd
); i
< 3; i
++ {
514 // Detach fd 0 from tty
516 _
, err1
= raw_ioctl(0, TIOCNOTTY
, 0)
522 // Set the controlling TTY to Ctty
524 _
, err1
= raw_ioctl(sys
.Ctty
, TIOCSCTTY
, 1)
530 // Enable tracing if requested.
531 // Do this right before exec so that we don't unnecessarily trace the runtime
532 // setting up after the fork. See issue #21428.
534 err1
= raw_ptrace(_PTRACE_TRACEME
, 0, 0, 0)
541 err1
= raw_execve(argv0
, &argv
[0], &envv
[0])
544 // send error code on pipe
545 raw_write(pipe
, (*byte)(unsafe
.Pointer(&err1
)), int(unsafe
.Sizeof(err1
)))
551 // Try to open a pipe with O_CLOEXEC set on both file descriptors.
552 func forkExecPipe(p
[]int) (err error
) {
553 return Pipe2(p
, O_CLOEXEC
)
556 func formatIDMappings(idMap
[]SysProcIDMap
) []byte {
558 for _
, im
:= range idMap
{
559 data
= append(data
, []byte(itoa
.Itoa(im
.ContainerID
)+" "+itoa
.Itoa(im
.HostID
)+" "+itoa
.Itoa(im
.Size
)+"\n")...)
564 // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
565 func writeIDMappings(path
string, idMap
[]SysProcIDMap
) error
{
566 fd
, err
:= Open(path
, O_RDWR
, 0)
571 if _
, err
:= Write(fd
, formatIDMappings(idMap
)); err
!= nil {
576 if err
:= Close(fd
); err
!= nil {
583 // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
584 // and "allow" if enable is true.
585 // This is needed since kernel 3.19, because you can't write gid_map without
586 // disabling the setgroups() system call.
587 func writeSetgroups(pid
int, enable
bool) error
{
588 sgf
:= "/proc/" + itoa
.Itoa(pid
) + "/setgroups"
589 fd
, err
:= Open(sgf
, O_RDWR
, 0)
596 data
= []byte("allow")
598 data
= []byte("deny")
601 if _
, err
:= Write(fd
, data
); err
!= nil {
609 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces
610 // for a process and it is called from the parent process.
611 func writeUidGidMappings(pid
int, sys
*SysProcAttr
) error
{
612 if sys
.UidMappings
!= nil {
613 uidf
:= "/proc/" + itoa
.Itoa(pid
) + "/uid_map"
614 if err
:= writeIDMappings(uidf
, sys
.UidMappings
); err
!= nil {
619 if sys
.GidMappings
!= nil {
620 // If the kernel is too old to support /proc/PID/setgroups, writeSetgroups will return ENOENT; this is OK.
621 if err
:= writeSetgroups(pid
, sys
.GidMappingsEnableSetgroups
); err
!= nil && err
!= ENOENT
{
624 gidf
:= "/proc/" + itoa
.Itoa(pid
) + "/gid_map"
625 if err
:= writeIDMappings(gidf
, sys
.GidMappings
); err
!= nil {