hppa: Fix ICE caused by mismatched predicate and constraint in xmpyu patterns
[official-gcc.git] / libgo / go / syscall / exec_linux.go
blob77846af89e4634a750bd969fd964bf3bc7303f31
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //go:build linux
7 package syscall
9 import (
10 "internal/itoa"
11 "unsafe"
14 //sysnb raw_prctl(option int, arg2 int, arg3 int, arg4 int, arg5 int) (ret int, err Errno)
15 //prctl(option _C_int, arg2 _C_long, arg3 _C_long, arg4 _C_long, arg5 _C_long) _C_int
17 //sysnb rawUnshare(flags int) (err Errno)
18 //unshare(flags _C_int) _C_int
20 //sysnb rawMount(source *byte, target *byte, fstype *byte, flags uintptr, data *byte) (err Errno)
21 //mount(source *byte, target *byte, fstype *byte, flags _C_long, data *byte) _C_int
23 //sysnb rawOpenat(dirfd int, pathname *byte, flags int, perm uint32) (fd int, err Errno)
24 //__go_openat(dirfd _C_int, pathname *byte, flags _C_int, perm Mode_t) _C_int
// SysProcIDMap describes one contiguous range of IDs mapped between a
// user namespace (the "container") and its parent (the "host").
// See user_namespaces(7).
type SysProcIDMap struct {
	// ContainerID is the first ID of the range as seen inside the namespace.
	ContainerID int
	// HostID is the first ID of the range as seen outside the namespace.
	HostID int
	// Size is the number of consecutive IDs covered by the range.
	Size int
}
34 type SysProcAttr struct {
35 Chroot string // Chroot.
36 Credential *Credential // Credential.
37 // Ptrace tells the child to call ptrace(PTRACE_TRACEME).
38 // Call runtime.LockOSThread before starting a process with this set,
39 // and don't call UnlockOSThread until done with PtraceSyscall calls.
40 Ptrace bool
41 Setsid bool // Create session.
42 // Setpgid sets the process group ID of the child to Pgid,
43 // or, if Pgid == 0, to the new child's process ID.
44 Setpgid bool
45 // Setctty sets the controlling terminal of the child to
46 // file descriptor Ctty. Ctty must be a descriptor number
47 // in the child process: an index into ProcAttr.Files.
48 // This is only meaningful if Setsid is true.
49 Setctty bool
50 Noctty bool // Detach fd 0 from controlling terminal
51 Ctty int // Controlling TTY fd
52 // Foreground places the child process group in the foreground.
53 // This implies Setpgid. The Ctty field must be set to
54 // the descriptor of the controlling TTY.
55 // Unlike Setctty, in this case Ctty must be a descriptor
56 // number in the parent process.
57 Foreground bool
58 Pgid int // Child's process group ID if Setpgid.
59 Pdeathsig Signal // Signal that the process will get when its parent dies (Linux and FreeBSD only)
60 Cloneflags uintptr // Flags for clone calls (Linux only)
61 Unshareflags uintptr // Flags for unshare calls (Linux only)
62 UidMappings []SysProcIDMap // User ID mappings for user namespaces.
63 GidMappings []SysProcIDMap // Group ID mappings for user namespaces.
64 // GidMappingsEnableSetgroups enabling setgroups syscall.
65 // If false, then setgroups syscall will be disabled for the child process.
66 // This parameter is no-op if GidMappings == nil. Otherwise for unprivileged
67 // users this should be set to false for mappings work.
68 GidMappingsEnableSetgroups bool
69 AmbientCaps []uintptr // Ambient capabilities (Linux only)
// NUL-terminated byte strings "none" and "/". forkAndExecInChild1 passes
// them to rawMount as source and target when it remounts the root as
// private; they are preallocated so the child does not have to build
// them after the fork.
var (
	none  = [5]byte{'n', 'o', 'n', 'e', 0}
	slash = [2]byte{'/', 0}
)
77 // Implemented in runtime package.
// These hooks bracket the fork: forkAndExecInChild1 calls
// runtime_BeforeFork immediately before rawClone and
// runtime_AfterForkInChild in the child once the process group has been
// set up, while forkAndExecInChild calls runtime_AfterFork in the parent.
78 func runtime_BeforeFork()
79 func runtime_AfterFork()
80 func runtime_AfterForkInChild()
82 // Implemented in clone_linux.c
// rawClone creates the child process for forkAndExecInChild1.
// NOTE(review): the caller treats a negative result as failure (and then
// reads the error with GetErrno), zero as "running in the child" and any
// other value as the child's PID in the parent; confirm against
// clone_linux.c.
84 //go:noescape
85 func rawClone(flags _C_ulong, child_stack *byte, ptid *Pid_t, ctid *Pid_t, regs unsafe.Pointer) _C_long
87 // Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
88 // If a dup or exec fails, write the errno error to pipe.
89 // (Pipe is close-on-exec so if exec succeeds, it will be closed.)
90 // In the child, this function must not acquire any locks, because
91 // they might have been locked at the time of the fork. This means
92 // no rescheduling, no malloc calls, and no new stack segments.
93 // For the same reason compiler does not race instrument it.
94 // The calls to RawSyscall are okay because they are assembly
95 // functions that do not grow the stack.
97 //go:norace
98 func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
99 // Set up and fork. This returns immediately in the parent or
100 // if there's an error.
101 r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
102 if locked {
103 runtime_AfterFork()
105 if err1 != 0 {
106 return 0, err1
109 // parent; return PID
110 pid = int(r1)
112 if sys.UidMappings != nil || sys.GidMappings != nil {
113 Close(p[0])
114 var err2 Errno
115 // uid/gid mappings will be written after fork and unshare(2) for user
116 // namespaces.
117 if sys.Unshareflags&CLONE_NEWUSER == 0 {
118 if err := writeUidGidMappings(pid, sys); err != nil {
119 err2 = err.(Errno)
122 raw_write(p[1], (*byte)(unsafe.Pointer(&err2)), int(unsafe.Sizeof(err2)))
123 Close(p[1])
126 return pid, 0
// _LINUX_CAPABILITY_VERSION_3 is the value stored in capHeader.version
// before the capget and capset calls made when raising ambient
// capabilities.
const _LINUX_CAPABILITY_VERSION_3 = 0x20080522

type (
	// capHeader is the header argument of the capget/capset calls.
	capHeader struct {
		version uint32
		pid     int32
	}

	// capData holds one 32-bit word of each capability set.
	capData struct {
		effective   uint32
		permitted   uint32
		inheritable uint32
	}

	// caps bundles the header with the two capData words that are
	// passed to capget/capset.
	caps struct {
		hdr  capHeader
		data [2]capData
	}
)
// capToIndex returns the index of the 32-bit capability word that holds
// capability number cap.
// See CAP_TO_INDEX in linux/capability.h.
func capToIndex(cap uintptr) uintptr {
	const bitsPerWord = 32
	return cap / bitsPerWord
}
// capToMask returns the bit that represents capability number cap
// within its 32-bit capability word.
// See CAP_TO_MASK in linux/capability.h.
func capToMask(cap uintptr) uint32 {
	const bitsPerWord = 32
	bit := uint(cap % bitsPerWord)
	return uint32(1) << bit
}
152 // forkAndExecInChild1 implements the body of forkAndExecInChild up to
153 // the parent's post-fork path. This is a separate function so we can
154 // separate the child's and parent's stack frames if we're using
155 // vfork.
//
// In the parent, r1 holds the child's PID when the clone succeeded and
// err1 holds the errno when setup or the clone failed. p is the pipe
// used to synchronize the writing of uid/gid mappings; it is created
// only when sys.UidMappings or sys.GidMappings is set. locked reports
// whether runtime_BeforeFork was called, so that the caller knows to
// call runtime_AfterFork. The child never returns from this function:
// it either execs or writes err1 to pipe and exits.
//
157 // This is go:noinline because the point is to keep the stack frames
158 // of this and forkAndExecInChild separate.
160 //go:noinline
161 //go:norace
162 func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
163 // Defined in linux/prctl.h starting with Linux 4.3.
164 const (
165 PR_CAP_AMBIENT = 0x2f
166 PR_CAP_AMBIENT_RAISE = 0x2
169 // vfork requires that the child not touch any of the parent's
170 // active stack frames. Hence, the child does all post-fork
171 // processing in this stack frame and never returns, while the
172 // parent returns immediately from this frame and does all
173 // post-fork processing in the outer frame.
174 // Declare all variables at top in case any
175 // declarations require heap allocation (e.g., err1).
176 var (
177 err2 Errno
178 nextfd int
179 i int
180 r2 int
181 caps caps
182 fd1 int
183 puid, psetgroups, pgid []byte
184 uidmap, setgroups, gidmap []byte
// Build the /proc/self paths and the mapping data before forking, since
// the child must not allocate. They are used only when the child writes
// the mappings itself (CLONE_NEWUSER in sys.Unshareflags).
187 if sys.UidMappings != nil {
188 puid = []byte("/proc/self/uid_map\000")
189 uidmap = formatIDMappings(sys.UidMappings)
192 if sys.GidMappings != nil {
193 psetgroups = []byte("/proc/self/setgroups\000")
194 pgid = []byte("/proc/self/gid_map\000")
196 if sys.GidMappingsEnableSetgroups {
197 setgroups = []byte("allow\000")
198 } else {
199 setgroups = []byte("deny\000")
201 gidmap = formatIDMappings(sys.GidMappings)
204 // Record parent PID so child can test if it has died.
205 ppid := raw_getpid()
207 // Guard against side effects of shuffling fds below.
208 // Make sure that nextfd is beyond any currently open files so
209 // that we can't run the risk of overwriting any of them.
210 fd := make([]int, len(attr.Files))
211 nextfd = len(attr.Files)
212 for i, ufd := range attr.Files {
213 if nextfd < int(ufd) {
214 nextfd = int(ufd)
216 fd[i] = int(ufd)
218 nextfd++
220 // Allocate another pipe for parent to child communication for
221 // synchronizing writing of User ID/Group ID mappings.
222 if sys.UidMappings != nil || sys.GidMappings != nil {
223 if err := forkExecPipe(p[:]); err != nil {
224 err1 = err.(Errno)
225 return
229 // About to call fork.
230 // No more allocation or calls of non-assembly functions.
231 runtime_BeforeFork()
232 locked = true
233 r2 = int(rawClone(_C_ulong(uintptr(SIGCHLD)|sys.Cloneflags), nil, nil, nil, unsafe.Pointer(nil)))
234 if r2 < 0 {
235 err1 = GetErrno()
237 if r2 != 0 {
238 // If we're in the parent, we must return immediately
239 // so we're not in the same stack frame as the child.
240 // This can at most use the return PC, which the child
241 // will not modify, and the results of
242 // rawClone, which must have been written after
243 // the child was replaced.
244 r1 = uintptr(r2)
245 return
248 // Fork succeeded, now in child.
250 // Enable the "keep capabilities" flag to set ambient capabilities later.
251 if len(sys.AmbientCaps) > 0 {
252 _, err1 = raw_prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0)
253 if err1 != 0 {
254 goto childerror
258 // Wait for User ID/Group ID mappings to be written.
// The parent writes an Errno-sized status to p[1]; a short read or a
// nonzero status aborts the child.
259 if sys.UidMappings != nil || sys.GidMappings != nil {
260 if err1 = raw_close(p[1]); err1 != 0 {
261 goto childerror
263 r2, err1 = raw_read(p[0], (*byte)(unsafe.Pointer(&err2)), int(unsafe.Sizeof(err2)))
264 if err1 != 0 {
265 goto childerror
267 if r2 != int(unsafe.Sizeof(err2)) {
268 err1 = EINVAL
269 goto childerror
271 if err2 != 0 {
272 err1 = err2
273 goto childerror
277 // Session ID
278 if sys.Setsid {
279 err1 = raw_setsid()
280 if err1 != 0 {
281 goto childerror
285 // Set process group
286 if sys.Setpgid || sys.Foreground {
287 // Place child in process group.
288 err1 = raw_setpgid(0, sys.Pgid)
289 if err1 != 0 {
290 goto childerror
294 if sys.Foreground {
295 pgrp := Pid_t(sys.Pgid)
296 if pgrp == 0 {
297 pgrp = raw_getpid()
300 // Place process group in foreground.
301 _, err1 = raw_ioctl_ptr(sys.Ctty, TIOCSPGRP, unsafe.Pointer(&pgrp))
302 if err1 != 0 {
303 goto childerror
307 // Restore the signal mask. We do this after TIOCSPGRP to avoid
308 // having the kernel send a SIGTTOU signal to the process group.
309 runtime_AfterForkInChild()
311 // Unshare
312 if sys.Unshareflags != 0 {
313 err1 = rawUnshare(int(sys.Unshareflags))
314 if err1 != 0 {
315 goto childerror
// After unsharing into a new user namespace the child writes its own
// setgroups policy and gid/uid mappings through /proc/self.
318 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil {
319 dirfd := int(_AT_FDCWD)
320 if fd1, err1 = rawOpenat(dirfd, &psetgroups[0], O_WRONLY, 0); err1 != 0 {
321 goto childerror
323 _, err1 = raw_write(fd1, &setgroups[0], len(setgroups))
324 if err1 != 0 {
325 goto childerror
327 if err1 = raw_close(fd1); err1 != 0 {
328 goto childerror
331 if fd1, err1 = rawOpenat(dirfd, &pgid[0], O_WRONLY, 0); err1 != 0 {
332 goto childerror
334 _, err1 = raw_write(fd1, &gidmap[0], len(gidmap))
335 if err1 != 0 {
336 goto childerror
338 if err1 = raw_close(fd1); err1 != 0 {
339 goto childerror
343 if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil {
344 dirfd := int(_AT_FDCWD)
345 if fd1, err1 = rawOpenat(dirfd, &puid[0], O_WRONLY, 0); err1 != 0 {
346 goto childerror
348 _, err1 = raw_write(fd1, &uidmap[0], len(uidmap))
349 if err1 != 0 {
350 goto childerror
352 if err1 = raw_close(fd1); err1 != 0 {
353 goto childerror
357 // The unshare system call in Linux doesn't unshare mount points
358 // mounted with --shared. Systemd mounts / with --shared. For a
359 // long discussion of the pros and cons of this see debian bug 739593.
360 // The Go model of unsharing is more like Plan 9, where you ask
361 // to unshare and the namespaces are unconditionally unshared.
362 // To make this model work we must further mark / as MS_PRIVATE.
363 // This is what the standard unshare command does.
364 if sys.Unshareflags&CLONE_NEWNS == CLONE_NEWNS {
365 err1 = rawMount(&none[0], &slash[0], nil, MS_REC|MS_PRIVATE, nil)
366 if err1 != 0 {
367 goto childerror
372 // Chroot
373 if chroot != nil {
374 err1 = raw_chroot(chroot)
375 if err1 != 0 {
376 goto childerror
380 // User and groups
381 if cred := sys.Credential; cred != nil {
382 ngroups := len(cred.Groups)
383 var groups unsafe.Pointer
384 if ngroups > 0 {
385 groups = unsafe.Pointer(&cred.Groups[0])
// Skip setgroups when the caller opted out (NoSetGroups), or when gid
// mappings are in use with setgroups disabled and no groups were given.
387 if !(sys.GidMappings != nil && !sys.GidMappingsEnableSetgroups && ngroups == 0) && !cred.NoSetGroups {
388 err1 = raw_setgroups(ngroups, groups)
389 if err1 != 0 {
390 goto childerror
393 _, _, err1 = RawSyscall(sys_SETGID, uintptr(cred.Gid), 0, 0)
394 if err1 != 0 {
395 goto childerror
397 _, _, err1 = RawSyscall(sys_SETUID, uintptr(cred.Uid), 0, 0)
398 if err1 != 0 {
399 goto childerror
403 if len(sys.AmbientCaps) != 0 {
404 // Ambient capabilities were added in the 4.3 kernel,
405 // so it is safe to always use _LINUX_CAPABILITY_VERSION_3.
406 caps.hdr.version = _LINUX_CAPABILITY_VERSION_3
408 if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
409 goto childerror
412 for _, c := range sys.AmbientCaps {
413 // Add the c capability to the permitted and inheritable capability mask,
414 // otherwise we will not be able to add it to the ambient capability mask.
// NOTE(review): caps.data has two elements, so capToIndex(c) must be
// 0 or 1 (c < 64); a larger value in sys.AmbientCaps would index out of
// range here in the child. Confirm that callers validate AmbientCaps.
415 caps.data[capToIndex(c)].permitted |= capToMask(c)
416 caps.data[capToIndex(c)].inheritable |= capToMask(c)
419 if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 {
420 goto childerror
423 for _, c := range sys.AmbientCaps {
424 _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0)
425 if err1 != 0 {
426 goto childerror
431 // Chdir
432 if dir != nil {
433 err1 = raw_chdir(dir)
434 if err1 != 0 {
435 goto childerror
439 // Parent death signal
440 if sys.Pdeathsig != 0 {
441 _, err1 = raw_prctl(PR_SET_PDEATHSIG, int(sys.Pdeathsig), 0, 0, 0)
442 if err1 != 0 {
443 goto childerror
446 // Signal self if parent is already dead. This might cause a
447 // duplicate signal in rare cases, but it won't matter when
448 // using SIGKILL.
449 r1 := raw_getppid()
450 if r1 != ppid {
451 pid := raw_getpid()
452 err1 = raw_kill(pid, sys.Pdeathsig)
453 if err1 != 0 {
454 goto childerror
459 // Pass 1: look for fd[i] < i and move those up above len(fd)
460 // so that pass 2 won't stomp on an fd it needs later.
// First move the error pipe itself above the fds being shuffled,
// keeping it close-on-exec.
461 if pipe < nextfd {
462 err1 = raw_dup3(pipe, nextfd, O_CLOEXEC)
463 if err1 != 0 {
464 goto childerror
466 pipe = nextfd
467 nextfd++
469 for i = 0; i < len(fd); i++ {
470 if fd[i] >= 0 && fd[i] < int(i) {
471 if nextfd == pipe { // don't stomp on pipe
472 nextfd++
474 err1 = raw_dup3(fd[i], nextfd, O_CLOEXEC)
475 if err1 != 0 {
476 goto childerror
478 fd[i] = nextfd
479 nextfd++
483 // Pass 2: dup fd[i] down onto i.
484 for i = 0; i < len(fd); i++ {
485 if fd[i] == -1 {
486 raw_close(i)
487 continue
489 if fd[i] == int(i) {
490 // dup2(i, i) won't clear close-on-exec flag on Linux,
491 // probably not elsewhere either.
492 _, err1 = raw_fcntl(fd[i], F_SETFD, 0)
493 if err1 != 0 {
494 goto childerror
496 continue
498 // The new fd is created NOT close-on-exec,
499 // which is exactly what we want.
500 err1 = raw_dup3(fd[i], i, 0)
501 if err1 != 0 {
502 goto childerror
506 // By convention, we don't close-on-exec the fds we are
507 // started with, so if len(fd) < 3, close 0, 1, 2 as needed.
508 // Programs that know they inherit fds >= 3 will need
509 // to set them close-on-exec.
510 for i = len(fd); i < 3; i++ {
511 raw_close(i)
514 // Detach fd 0 from tty
515 if sys.Noctty {
516 _, err1 = raw_ioctl(0, TIOCNOTTY, 0)
517 if err1 != 0 {
518 goto childerror
522 // Set the controlling TTY to Ctty
523 if sys.Setctty {
524 _, err1 = raw_ioctl(sys.Ctty, TIOCSCTTY, 1)
525 if err1 != 0 {
526 goto childerror
530 // Enable tracing if requested.
531 // Do this right before exec so that we don't unnecessarily trace the runtime
532 // setting up after the fork. See issue #21428.
533 if sys.Ptrace {
534 err1 = raw_ptrace(_PTRACE_TRACEME, 0, 0, 0)
535 if err1 != 0 {
536 goto childerror
540 // Time to exec.
// If raw_execve returns, the exec failed and control falls through to
// childerror with err1 set.
541 err1 = raw_execve(argv0, &argv[0], &envv[0])
543 childerror:
544 // send error code on pipe
545 raw_write(pipe, (*byte)(unsafe.Pointer(&err1)), int(unsafe.Sizeof(err1)))
// The child must never return into the parent's frames; the loop
// presumably guards against raw_exit returning.
546 for {
547 raw_exit(253)
551 // Try to open a pipe with O_CLOEXEC set on both file descriptors.
552 func forkExecPipe(p []int) (err error) {
553 return Pipe2(p, O_CLOEXEC)
556 func formatIDMappings(idMap []SysProcIDMap) []byte {
557 var data []byte
558 for _, im := range idMap {
559 data = append(data, []byte(itoa.Itoa(im.ContainerID)+" "+itoa.Itoa(im.HostID)+" "+itoa.Itoa(im.Size)+"\n")...)
561 return data
564 // writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path.
565 func writeIDMappings(path string, idMap []SysProcIDMap) error {
566 fd, err := Open(path, O_RDWR, 0)
567 if err != nil {
568 return err
571 if _, err := Write(fd, formatIDMappings(idMap)); err != nil {
572 Close(fd)
573 return err
576 if err := Close(fd); err != nil {
577 return err
580 return nil
583 // writeSetgroups writes to /proc/PID/setgroups "deny" if enable is false
584 // and "allow" if enable is true.
585 // This is needed since kernel 3.19, because you can't write gid_map without
586 // disabling setgroups() system call.
587 func writeSetgroups(pid int, enable bool) error {
588 sgf := "/proc/" + itoa.Itoa(pid) + "/setgroups"
589 fd, err := Open(sgf, O_RDWR, 0)
590 if err != nil {
591 return err
594 var data []byte
595 if enable {
596 data = []byte("allow")
597 } else {
598 data = []byte("deny")
601 if _, err := Write(fd, data); err != nil {
602 Close(fd)
603 return err
606 return Close(fd)
609 // writeUidGidMappings writes User ID and Group ID mappings for user namespaces
610 // for a process and it is called from the parent process.
611 func writeUidGidMappings(pid int, sys *SysProcAttr) error {
612 if sys.UidMappings != nil {
613 uidf := "/proc/" + itoa.Itoa(pid) + "/uid_map"
614 if err := writeIDMappings(uidf, sys.UidMappings); err != nil {
615 return err
619 if sys.GidMappings != nil {
620 // If the kernel is too old to support /proc/PID/setgroups, writeSetGroups will return ENOENT; this is OK.
621 if err := writeSetgroups(pid, sys.GidMappingsEnableSetgroups); err != nil && err != ENOENT {
622 return err
624 gidf := "/proc/" + itoa.Itoa(pid) + "/gid_map"
625 if err := writeIDMappings(gidf, sys.GidMappings); err != nil {
626 return err
630 return nil