man2/seccomp.2

   1 .\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
   2 .\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
   3 .\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com>
   4 .\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com>
   5 .\" and Copyright (C) 2020 Tycho Andersen <tycho@tycho.ws>
   6 .\"
   7 .\" %%%LICENSE_START(VERBATIM)
   8 .\" Permission is granted to make and distribute verbatim copies of this
   9 .\" manual provided the copyright notice and this permission notice are
  10 .\" preserved on all copies.
  11 .\"
  12 .\" Permission is granted to copy and distribute modified versions of this
  13 .\" manual under the conditions for verbatim copying, provided that the
  14 .\" entire resulting derived work is distributed under the terms of a
  15 .\" permission notice identical to this one.
  16 .\"
  17 .\" Since the Linux kernel and libraries are constantly changing, this
  18 .\" manual page may be incorrect or out-of-date.  The author(s) assume no
  19 .\" responsibility for errors or omissions, or for damages resulting from
  20 .\" the use of the information contained herein.  The author(s) may not
  21 .\" have taken the same level of care in the production of this manual,
  22 .\" which is licensed free of charge, as they might when working
  23 .\" professionally.
  24 .\"
  25 .\" Formatted or processed versions of this manual, if unaccompanied by
  26 .\" the source, must acknowledge the copyright and authors of this work.
  27 .\" %%%LICENSE_END
  28 .\"
  29 .TH SECCOMP 2 2021-03-22 "Linux" "Linux Programmer's Manual"
  30 .SH NAME
  31 seccomp \- operate on Secure Computing state of the process
  32 .SH SYNOPSIS
  33 .nf
  34 .BR "#include <linux/seccomp.h>" "  /* Definition of " SECCOMP_* " constants */"
  35 .BR "#include <linux/filter.h>" "   /* Definition of " "struct sock_fprog" " */"
  36 .BR "#include <linux/audit.h>" "    /* Definition of " AUDIT_* " constants */"
  37 .BR "#include <linux/signal.h>" "   /* Definition of " SIG* " constants */"
  38 .BR "#include <sys/ptrace.h>" "     /* Definition of " PTRACE_* " constants */"
  39 .\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
  40 .\"                  need <sys/ptrace.h>
  41 .BR "#include <sys/syscall.h>" "    /* Definition of " SYS_* " constants */"
  42 .B #include <unistd.h>
  43 .PP
  44 .BI "int syscall(SYS_seccomp, unsigned int " operation ", unsigned int " flags ,
  45 .BI "            void *" args );
  46 .fi
  47 .PP
  48 .IR Note :
  49 glibc provides no wrapper for
  50 .BR seccomp (),
  51 necessitating the use of
  52 .BR syscall (2).
  53 .SH DESCRIPTION
  54 The
  55 .BR seccomp ()
  56 system call operates on the Secure Computing (seccomp) state of the
  57 calling process.
  58 .PP
  59 Currently, Linux supports the following
  60 .IR operation
  61 values:
  62 .TP
  63 .BR SECCOMP_SET_MODE_STRICT
  64 The only system calls that the calling thread is permitted to make are
  65 .BR read (2),
  66 .BR write (2),
  67 .BR _exit (2)
  68 (but not
  69 .BR exit_group (2)),
  70 and
  71 .BR sigreturn (2).
  72 Other system calls result in the delivery of a
  73 .BR SIGKILL
  74 signal.
  75 Strict secure computing mode is useful for number-crunching
  76 applications that may need to execute untrusted byte code, perhaps
  77 obtained by reading from a pipe or socket.
  78 .IP
  79 Note that although the calling thread can no longer call
  80 .BR sigprocmask (2),
  81 it can use
  82 .BR sigreturn (2)
  83 to block all signals apart from
  84 .BR SIGKILL
  85 and
  86 .BR SIGSTOP .
  87 This means that
  88 .BR alarm (2)
  89 (for example) is not sufficient for restricting the process's execution time.
  90 Instead, to reliably terminate the process,
  91 .BR SIGKILL
  92 must be used.
  93 This can be done by using
  94 .BR timer_create (2)
  95 with
  96 .BR SIGEV_SIGNAL
  97 and
  98 .IR sigev_signo
  99 set to
 100 .BR SIGKILL ,
 101 or by using
 102 .BR setrlimit (2)
 103 to set the hard limit for
 104 .BR RLIMIT_CPU .
 105 .IP
 106 This operation is available only if the kernel is configured with
 107 .BR CONFIG_SECCOMP
 108 enabled.
 109 .IP
 110 The value of
 111 .IR flags
 112 must be 0, and
 113 .IR args
 114 must be NULL.
 115 .IP
 116 This operation is functionally identical to the call:
 117 .IP
 118 .in +4n
 119 .EX
 120 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 121 .EE
 122 .in
 123 .TP
 124 .BR SECCOMP_SET_MODE_FILTER
 125 The system calls allowed are defined by a pointer to a Berkeley Packet
 126 Filter (BPF) passed via
 127 .IR args .
 128 This argument is a pointer to a
 129 .IR "struct\ sock_fprog" ;
 130 it can be designed to filter arbitrary system calls and system call
 131 arguments.
 132 If the filter is invalid,
 133 .BR seccomp ()
 134 fails, returning
 135 .BR EINVAL
 136 in
 137 .IR errno .
 138 .IP
 139 If
 140 .BR fork (2)
 141 or
 142 .BR clone (2)
 143 is allowed by the filter, any child processes will be constrained to
 144 the same system call filters as the parent.
 145 If
 146 .BR execve (2)
 147 is allowed,
 148 the existing filters will be preserved across a call to
 149 .BR execve (2).
 150 .IP
 151 In order to use the
 152 .BR SECCOMP_SET_MODE_FILTER
 153 operation, either the calling thread must have the
 154 .BR CAP_SYS_ADMIN
 155 capability in its user namespace, or the thread must already have the
 156 .I no_new_privs
 157 bit set.
 158 If that bit was not already set by an ancestor of this thread,
 159 the thread must make the following call:
 160 .IP
 161 .in +4n
 162 .EX
 163 prctl(PR_SET_NO_NEW_PRIVS, 1);
 164 .EE
 165 .in
 166 .IP
 167 Otherwise, the
 168 .BR SECCOMP_SET_MODE_FILTER
 169 operation fails and returns
 170 .BR EACCES
 171 in
 172 .IR errno .
 173 This requirement ensures that an unprivileged process cannot apply
 174 a malicious filter and then invoke a set-user-ID or
 175 other privileged program using
 176 .BR execve (2),
 177 thus potentially compromising that program.
 178 (Such a malicious filter might, for example, cause an attempt to use
 179 .BR setuid (2)
 180 to set the caller's user IDs to nonzero values to instead
 181 return 0 without actually making the system call.
 182 Thus, the program might be tricked into retaining superuser privileges
 183 in circumstances where it is possible to influence it to do
 184 dangerous things because it did not actually drop privileges.)
 185 .IP
 186 If
 187 .BR prctl (2)
 188 or
 189 .BR seccomp ()
 190 is allowed by the attached filter, further filters may be added.
 191 This will increase evaluation time, but allows for further reduction of
 192 the attack surface during execution of a thread.
 193 .IP
 194 The
 195 .BR SECCOMP_SET_MODE_FILTER
 196 operation is available only if the kernel is configured with
 197 .BR CONFIG_SECCOMP_FILTER
 198 enabled.
 199 .IP
 200 When
 201 .IR flags
 202 is 0, this operation is functionally identical to the call:
 203 .IP
 204 .in +4n
 205 .EX
 206 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
 207 .EE
 208 .in
 209 .IP
 210 The recognized
 211 .IR flags
 212 are:
 213 .RS
 214 .TP
 215 .BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)"
 216 .\" commit e66a39977985b1e69e17c4042cb290768eca9b02
 217 All filter return actions except
 218 .BR SECCOMP_RET_ALLOW
 219 should be logged.
 220 An administrator may override this filter flag by preventing specific
 221 actions from being logged via the
 222 .IR /proc/sys/kernel/seccomp/actions_logged
 223 file.
 224 .TP
 225 .BR SECCOMP_FILTER_FLAG_NEW_LISTENER " (since Linux 5.0)"
 226 .\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
 227 After successfully installing the filter program,
 228 return a new user-space notification file descriptor.
 229 (The close-on-exec flag is set for the file descriptor.)
 230 When the filter returns
  231 .BR SECCOMP_RET_USER_NOTIF ,
 232 a notification will be sent to this file descriptor.
 233 .IP
 234 At most one seccomp filter using the
 235 .BR SECCOMP_FILTER_FLAG_NEW_LISTENER
 236 flag can be installed for a thread.
 237 .IP
 238 See
 239 .BR seccomp_unotify (2)
 240 for further details.
 241 .TP
 242 .BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)"
 243 .\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675
 244 Disable Speculative Store Bypass mitigation.
 245 .TP
 246 .BR SECCOMP_FILTER_FLAG_TSYNC
 247 When adding a new filter, synchronize all other threads of the calling
 248 process to the same seccomp filter tree.
 249 A "filter tree" is the ordered list of filters attached to a thread.
 250 (Attaching identical filters in separate
 251 .BR seccomp ()
 252 calls results in different filters from this perspective.)
 253 .IP
 254 If any thread cannot synchronize to the same filter tree,
 255 the call will not attach the new seccomp filter,
 256 and will fail, returning the first thread ID found that cannot synchronize.
 257 Synchronization will fail if another thread in the same process is in
 258 .BR SECCOMP_MODE_STRICT
 259 or if it has attached new seccomp filters to itself,
 260 diverging from the calling thread's filter tree.
 261 .RE
 262 .TP
 263 .BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)"
 264 .\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655
 265 Test to see if an action is supported by the kernel.
 266 This operation is helpful to confirm that the kernel knows
 267 of a more recently added filter return action
 268 since the kernel treats all unknown actions as
 269 .BR SECCOMP_RET_KILL_PROCESS .
 270 .IP
 271 The value of
 272 .IR flags
 273 must be 0, and
 274 .IR args
 275 must be a pointer to an unsigned 32-bit filter return action.
 276 .TP
 277 .BR SECCOMP_GET_NOTIF_SIZES " (since Linux 5.0)"
 278 .\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
 279 Get the sizes of the seccomp user-space notification structures.
 280 Since these structures may evolve and grow over time,
 281 this command can be used to determine how
 282 much memory to allocate for sending and receiving notifications.
 283 .IP
 284 The value of
 285 .IR flags
 286 must be 0, and
 287 .IR args
 288 must be a pointer to a
 289 .IR "struct seccomp_notif_sizes" ,
 290 which has the following form:
 291 .IP
 292 .EX
  293 struct seccomp_notif_sizes {
 294     __u16 seccomp_notif;      /* Size of notification structure */
 295     __u16 seccomp_notif_resp; /* Size of response structure */
 296     __u16 seccomp_data;       /* Size of \(aqstruct seccomp_data\(aq */
 297 };
 298 .EE
 299 .IP
 300 See
 301 .BR seccomp_unotify (2)
 302 for further details.
 303 .\"
 304 .SS Filters
 305 When adding filters via
 306 .BR SECCOMP_SET_MODE_FILTER ,
 307 .IR args
 308 points to a filter program:
 309 .PP
 310 .in +4n
 311 .EX
 312 struct sock_fprog {
 313     unsigned short      len;    /* Number of BPF instructions */
 314     struct sock_filter *filter; /* Pointer to array of
 315                                    BPF instructions */
 316 };
 317 .EE
 318 .in
 319 .PP
 320 Each program must contain one or more BPF instructions:
 321 .PP
 322 .in +4n
 323 .EX
 324 struct sock_filter {            /* Filter block */
 325     __u16 code;                 /* Actual filter code */
 326     __u8  jt;                   /* Jump true */
 327     __u8  jf;                   /* Jump false */
 328     __u32 k;                    /* Generic multiuse field */
 329 };
 330 .EE
 331 .in
 332 .PP
 333 When executing the instructions, the BPF program operates on the
 334 system call information made available (i.e., use the
 335 .BR BPF_ABS
 336 addressing mode) as a (read-only)
 337 .\" Quoting Kees Cook:
 338 .\"     If BPF even allows changing the data, it's not copied back to
 339 .\"     the syscall when it runs. Anything wanting to do things like
 340 .\"     that would need to use ptrace to catch the call and directly
 341 .\"     modify the registers before continuing with the call.
 342 buffer of the following form:
 343 .PP
 344 .in +4n
 345 .EX
 346 struct seccomp_data {
 347     int   nr;                   /* System call number */
 348     __u32 arch;                 /* AUDIT_ARCH_* value
 349                                    (see <linux/audit.h>) */
 350     __u64 instruction_pointer;  /* CPU instruction pointer */
 351     __u64 args[6];              /* Up to 6 system call arguments */
 352 };
 353 .EE
 354 .in
 355 .PP
 356 Because numbering of system calls varies between architectures and
 357 some architectures (e.g., x86-64) allow user-space code to use
 358 the calling conventions of multiple architectures
 359 (and the convention being used may vary over the life of a process that uses
 360 .BR execve (2)
 361 to execute binaries that employ the different conventions),
 362 it is usually necessary to verify the value of the
 363 .IR arch
 364 field.
 365 .PP
 366 It is strongly recommended to use an allow-list approach whenever
 367 possible because such an approach is more robust and simple.
 368 A deny-list will have to be updated whenever a potentially
 369 dangerous system call is added (or a dangerous flag or option if those
 370 are deny-listed), and it is often possible to alter the
 371 representation of a value without altering its meaning, leading to
 372 a deny-list bypass.
 373 See also
 374 .IR Caveats
 375 below.
 376 .PP
 377 The
 378 .IR arch
 379 field is not unique for all calling conventions.
 380 The x86-64 ABI and the x32 ABI both use
 381 .BR AUDIT_ARCH_X86_64
 382 as
 383 .IR arch ,
 384 and they run on the same processors.
 385 Instead, the mask
 386 .BR __X32_SYSCALL_BIT
 387 is used on the system call number to tell the two ABIs apart.
 388 .\" As noted by Dave Drysdale in a note at the end of
 389 .\" https://lwn.net/Articles/604515/
 390 .\"     One additional detail to point out for the x32 ABI case:
 391 .\"     the syscall number gets a high bit set (__X32_SYSCALL_BIT),
 392 .\"     to mark it as an x32 call.
 393 .\"
 394 .\"     If x32 support is included in the kernel, then __SYSCALL_MASK
 395 .\"     will have a value that is not all-ones, and this will trigger
 396 .\"     an extra instruction in system_call to mask off the extra bit,
 397 .\"     so that the syscall table indexing still works.
 398 .PP
 399 This means that a policy must either deny all syscalls with
 400 .BR __X32_SYSCALL_BIT
 401 or it must recognize syscalls with and without
 402 .BR __X32_SYSCALL_BIT
 403 set.
 404 A list of system calls to be denied based on
 405 .IR nr
 406 that does not also contain
 407 .IR nr
 408 values with
 409 .BR __X32_SYSCALL_BIT
 410 set can be bypassed by a malicious program that sets
 411 .BR __X32_SYSCALL_BIT .
 412 .PP
 413 Additionally, kernels prior to Linux 5.4 incorrectly permitted
 414 .IR nr
 415 in the ranges 512-547 as well as the corresponding non-x32 syscalls ORed
 416 with
 417 .BR __X32_SYSCALL_BIT .
 418 For example,
 419 .IR nr
 420 == 521 and
 421 .IR nr
 422 == (101 |
 423 .BR __X32_SYSCALL_BIT )
 424 would result in invocations of
 425 .BR ptrace (2)
 426 with potentially confused x32-vs-x86_64 semantics in the kernel.
 427 Policies intended to work on kernels before Linux 5.4 must ensure that they
 428 deny or otherwise correctly handle these system calls.
 429 On Linux 5.4 and newer,
 430 .\" commit 6365b842aae4490ebfafadfc6bb27a6d3cc54757
 431 such system calls will fail with the error
 432 .BR ENOSYS ,
 433 without doing anything.
 434 .PP
 435 The
 436 .I instruction_pointer
 437 field provides the address of the machine-language instruction that
 438 performed the system call.
 439 This might be useful in conjunction with the use of
 440 .I /proc/[pid]/maps
 441 to perform checks based on which region (mapping) of the program
 442 made the system call.
 443 (Probably, it is wise to lock down the
 444 .BR mmap (2)
 445 and
 446 .BR mprotect (2)
 447 system calls to prevent the program from subverting such checks.)
 448 .PP
 449 When checking values from
 450 .IR args ,
 451 keep in mind that arguments are often
 452 silently truncated before being processed, but after the seccomp check.
 453 For example, this happens if the i386 ABI is used on an
 454 x86-64 kernel: although the kernel will normally not look beyond
 455 the 32 lowest bits of the arguments, the values of the full
 456 64-bit registers will be present in the seccomp data.
 457 A less surprising example is that if the x86-64 ABI is used to perform
 458 a system call that takes an argument of type
 459 .IR int ,
 460 the more-significant half of the argument register is ignored by
 461 the system call, but visible in the seccomp data.
 462 .PP
 463 A seccomp filter returns a 32-bit value consisting of two parts:
 464 the most significant 16 bits
 465 (corresponding to the mask defined by the constant
 466 .BR SECCOMP_RET_ACTION_FULL )
 467 contain one of the "action" values listed below;
  468 the least significant 16 bits (defined by the constant
 469 .BR SECCOMP_RET_DATA )
 470 are "data" to be associated with this return value.
 471 .PP
 472 If multiple filters exist, they are \fIall\fP executed,
 473 in reverse order of their addition to the filter tree\(emthat is,
 474 the most recently installed filter is executed first.
 475 (Note that all filters will be called
 476 even if one of the earlier filters returns
 477 .BR SECCOMP_RET_KILL .
 478 This is done to simplify the kernel code and to provide a
 479 tiny speed-up in the execution of sets of filters by
 480 avoiding a check for this uncommon case.)
 481 .\" From an Aug 2015 conversation with Kees Cook where I asked why *all*
 482 .\" filters are applied even if one of the early filters returns
 483 .\" SECCOMP_RET_KILL:
 484 .\"
 485 .\"     It's just because it would be an optimization that would only speed up
 486 .\"     the RET_KILL case, but it's the uncommon one and the one that doesn't
 487 .\"     benefit meaningfully from such a change (you need to kill the process
 488 .\"     really quickly?). We would speed up killing a program at the (albeit
 489 .\"     tiny) expense to all other filtered programs. Best to keep the filter
 490 .\"     execution logic clear, simple, and as fast as possible for all
 491 .\"     filters.
 492 The return value for the evaluation of a given system call is the first-seen
 493 action value of highest precedence (along with its accompanying data)
 494 returned by execution of all of the filters.
 495 .PP
 496 In decreasing order of precedence,
 497 the action values that may be returned by a seccomp filter are:
 498 .TP
 499 .BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)"
 500 .\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
 501 .\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421
 502 This value results in immediate termination of the process,
 503 with a core dump.
 504 The system call is not executed.
 505 By contrast with
 506 .BR SECCOMP_RET_KILL_THREAD
 507 below, all threads in the thread group are terminated.
 508 (For a discussion of thread groups, see the description of the
 509 .BR CLONE_THREAD
 510 flag in
 511 .BR clone (2).)
 512 .IP
 513 The process terminates
 514 .I "as though"
 515 killed by a
 516 .B SIGSYS
 517 signal.
 518 Even if a signal handler has been registered for
 519 .BR SIGSYS ,
 520 the handler will be ignored in this case and the process always terminates.
 521 To a parent process that is waiting on this process (using
 522 .BR waitpid (2)
 523 or similar), the returned
 524 .I wstatus
 525 will indicate that its child was terminated as though by a
 526 .BR SIGSYS
 527 signal.
 528 .TP
 529 .BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL )
 530 This value results in immediate termination of the thread
 531 that made the system call.
 532 The system call is not executed.
 533 Other threads in the same thread group will continue to execute.
 534 .IP
 535 The thread terminates
 536 .I "as though"
 537 killed by a
 538 .B SIGSYS
 539 signal.
 540 See
 541 .BR SECCOMP_RET_KILL_PROCESS
 542 above.
 543 .IP
 544 .\" See these commits:
 545 .\" seccomp: dump core when using SECCOMP_RET_KILL
 546 .\"    (b25e67161c295c98acda92123b2dd1e7d8642901)
 547 .\" seccomp: Only dump core when single-threaded
 548 .\"    (d7276e321ff8a53106a59c85ca46d03e34288893)
 549 Before Linux 4.11,
 550 any process terminated in this way would not trigger a coredump
 551 (even though
 552 .B SIGSYS
 553 is documented in
 554 .BR signal (7)
 555 as having a default action of termination with a core dump).
 556 Since Linux 4.11,
 557 a single-threaded process will dump core if terminated in this way.
 558 .IP
 559 With the addition of
 560 .BR SECCOMP_RET_KILL_PROCESS
 561 in Linux 4.14,
 562 .BR SECCOMP_RET_KILL_THREAD
 563 was added as a synonym for
 564 .BR SECCOMP_RET_KILL ,
 565 in order to more clearly distinguish the two actions.
 566 .IP
 567 .BR Note :
 568 the use of
 569 .BR SECCOMP_RET_KILL_THREAD
 570 to kill a single thread in a multithreaded process is likely to leave the
 571 process in a permanently inconsistent and possibly corrupt state.
 572 .TP
 573 .BR SECCOMP_RET_TRAP
 574 This value results in the kernel sending a thread-directed
 575 .BR SIGSYS
 576 signal to the triggering thread.
 577 (The system call is not executed.)
 578 Various fields will be set in the
 579 .I siginfo_t
 580 structure (see
 581 .BR sigaction (2))
  582 associated with the signal:
 583 .RS
 584 .IP * 3
 585 .I si_signo
 586 will contain
 587 .BR SIGSYS .
 588 .IP *
 589 .IR si_call_addr
 590 will show the address of the system call instruction.
 591 .IP *
 592 .IR si_syscall
 593 and
 594 .IR si_arch
 595 will indicate which system call was attempted.
 596 .IP *
 597 .I si_code
 598 will contain
 599 .BR SYS_SECCOMP .
 600 .IP *
 601 .I si_errno
 602 will contain the
 603 .BR SECCOMP_RET_DATA
 604 portion of the filter return value.
 605 .RE
 606 .IP
 607 The program counter will be as though the system call happened
 608 (i.e., the program counter will not point to the system call instruction).
 609 The return value register will contain an architecture\-dependent value;
 610 if resuming execution, set it to something appropriate for the system call.
 611 (The architecture dependency is because replacing it with
 612 .BR ENOSYS
 613 could overwrite some useful information.)
 614 .TP
 615 .BR SECCOMP_RET_ERRNO
 616 This value results in the
 617 .B SECCOMP_RET_DATA
 618 portion of the filter's return value being passed to user space as the
 619 .IR errno
 620 value without executing the system call.
 621 .TP
 622 .BR SECCOMP_RET_USER_NOTIF " (since Linux 5.0)"
 623 .\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
 624 Forward the system call to an attached user-space supervisor
 625 process to allow that process to decide what to do with the system call.
 626 If there is no attached supervisor (either
 627 because the filter was not installed with the
 628 .BR SECCOMP_FILTER_FLAG_NEW_LISTENER
 629 flag or because the file descriptor was closed), the filter returns
 630 .BR ENOSYS
 631 (similar to what happens when a filter returns
 632 .BR SECCOMP_RET_TRACE
 633 and there is no tracer).
 634 See
 635 .BR seccomp_unotify (2)
 636 for further details.
 637 .IP
 638 Note that the supervisor process will not be notified
 639 if another filter returns an action value with a precedence greater than
 640 .BR SECCOMP_RET_USER_NOTIF .
 641 .TP
 642 .BR SECCOMP_RET_TRACE
 643 When returned, this value will cause the kernel to attempt to notify a
 644 .BR ptrace (2)-based
 645 tracer prior to executing the system call.
 646 If there is no tracer present,
 647 the system call is not executed and returns a failure status with
 648 .I errno
 649 set to
 650 .BR ENOSYS .
 651 .IP
 652 A tracer will be notified if it requests
 653 .BR PTRACE_O_TRACESECCOMP
 654 using
 655 .IR ptrace(PTRACE_SETOPTIONS) .
 656 The tracer will be notified of a
 657 .BR PTRACE_EVENT_SECCOMP
 658 and the
 659 .BR SECCOMP_RET_DATA
 660 portion of the filter's return value will be available to the tracer via
 661 .BR PTRACE_GETEVENTMSG .
 662 .IP
 663 The tracer can skip the system call by changing the system call number
 664 to \-1.
 665 Alternatively, the tracer can change the system call
 666 requested by changing the system call to a valid system call number.
 667 If the tracer asks to skip the system call, then the system call will
 668 appear to return the value that the tracer puts in the return value register.
 669 .IP
 670 .\" This was changed in ce6526e8afa4.
 671 .\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was
 672 .\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and
 673 .\" 0f3912fd934c for ARM.
 674 Before kernel 4.8, the seccomp check will not be run again after the tracer is
 675 notified.
 676 (This means that, on older kernels, seccomp-based sandboxes
 677 .B "must not"
 678 allow use of
 679 .BR ptrace (2)\(emeven
 680 of other
 681 sandboxed processes\(emwithout extreme care;
 682 ptracers can use this mechanism to escape from the seccomp sandbox.)
 683 .IP
 684 Note that a tracer process will not be notified
 685 if another filter returns an action value with a precedence greater than
 686 .BR SECCOMP_RET_TRACE .
 687 .TP
 688 .BR SECCOMP_RET_LOG " (since Linux 4.14)"
 689 .\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
 690 This value results in the system call being executed after
 691 the filter return action is logged.
 692 An administrator may override the logging of this action via
 693 the
 694 .IR /proc/sys/kernel/seccomp/actions_logged
 695 file.
 696 .TP
 697 .BR SECCOMP_RET_ALLOW
 698 This value results in the system call being executed.
 699 .PP
 700 If an action value other than one of the above is specified,
 701 then the filter action is treated as either
 702 .BR SECCOMP_RET_KILL_PROCESS
 703 (since Linux 4.14)
 704 .\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
 705 or
 706 .BR SECCOMP_RET_KILL_THREAD
 707 (in Linux 4.13 and earlier).
 708 .\"
 709 .SS /proc interfaces
 710 The files in the directory
 711 .IR /proc/sys/kernel/seccomp
 712 provide additional seccomp information and configuration:
 713 .TP
 714 .IR actions_avail " (since Linux 4.14)"
 715 .\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af
 716 A read-only ordered list of seccomp filter return actions in string form.
 717 The ordering, from left-to-right, is in decreasing order of precedence.
 718 The list represents the set of seccomp filter return actions
 719 supported by the kernel.
 720 .TP
 721 .IR actions_logged " (since Linux 4.14)"
 722 .\" commit 0ddec0fc8900201c0897b87b762b7c420436662f
 723 A read-write ordered list of seccomp filter return actions that
 724 are allowed to be logged.
 725 Writes to the file do not need to be in ordered form but reads from
 726 the file will be ordered in the same way as the
 727 .IR actions_avail
 728 file.
 729 .IP
 730 It is important to note that the value of
 731 .IR actions_logged
 732 does not prevent certain filter return actions from being logged when
 733 the audit subsystem is configured to audit a task.
 734 If the action is not found in the
 735 .IR actions_logged
 736 file, the final decision on whether to audit the action for that task is
 737 ultimately left up to the audit subsystem to decide for all filter return
 738 actions other than
 739 .BR SECCOMP_RET_ALLOW .
 740 .IP
 741 The "allow" string is not accepted in the
 742 .IR actions_logged
 743 file as it is not possible to log
 744 .BR SECCOMP_RET_ALLOW
 745 actions.
 746 Attempting to write "allow" to the file will fail with the error
 747 .BR EINVAL .
 748 .\"
 749 .SS Audit logging of seccomp actions
 750 .\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
 751 Since Linux 4.14, the kernel provides the facility to log the
 752 actions returned by seccomp filters in the audit log.
 753 The kernel makes the decision to log an action based on
  754 the action type, whether or not the action is present in the
 755 .I actions_logged
 756 file, and whether kernel auditing is enabled
 757 (e.g., via the kernel boot option
 758 .IR audit=1 ).
 759 .\" or auditing could be enabled via the netlink API (AUDIT_SET)
 760 The rules are as follows:
 761 .IP * 3
 762 If the action is
 763 .BR SECCOMP_RET_ALLOW ,
 764 the action is not logged.
 765 .IP *
 766 Otherwise, if the action is either
 767 .BR SECCOMP_RET_KILL_PROCESS
 768 or
 769 .BR SECCOMP_RET_KILL_THREAD ,
 770 and that action appears in the
 771 .IR actions_logged
 772 file, the action is logged.
 773 .IP *
 774 Otherwise, if the filter has requested logging (the
 775 .BR SECCOMP_FILTER_FLAG_LOG
 776 flag)
 777 and the action appears in the
 778 .IR actions_logged
 779 file, the action is logged.
 780 .IP *
 781 Otherwise, if kernel auditing is enabled and the process is being audited
 782 .RB ( autrace (8)),
 783 the action is logged.
 784 .IP *
 785 Otherwise, the action is not logged.
 786 .SH RETURN VALUE
 787 On success,
 788 .BR seccomp ()
 789 returns 0.
 790 On error, if
 791 .BR SECCOMP_FILTER_FLAG_TSYNC
 792 was used,
 793 the return value is the ID of the thread
 794 that caused the synchronization failure.
 795 (This ID is a kernel thread ID of the type returned by
 796 .BR clone (2)
 797 and
 798 .BR gettid (2).)
 799 On other errors, \-1 is returned, and
 800 .IR errno
 801 is set to indicate the error.
 802 .SH ERRORS
 803 .BR seccomp ()
 804 can fail for the following reasons:
 805 .TP
 806 .BR EACCES
 807 The caller did not have the
 808 .BR CAP_SYS_ADMIN
 809 capability in its user namespace, or had not set
 810 .IR no_new_privs
 811 before using
 812 .BR SECCOMP_SET_MODE_FILTER .
 813 .TP
 814 .BR EBUSY
 815 While installing a new filter, the
 816 .BR SECCOMP_FILTER_FLAG_NEW_LISTENER
 817 flag was specified,
 818 but a previous filter had already been installed with that flag.
 819 .TP
 820 .BR EFAULT
 821 .IR args
 822 was not a valid address.
 823 .TP
 824 .BR EINVAL
 825 .IR operation
 826 is unknown or is not supported by this kernel version or configuration.
 827 .TP
 828 .B EINVAL
 829 The specified
 830 .IR flags
 831 are invalid for the given
 832 .IR operation .
 833 .TP
 834 .BR EINVAL
 835 .I operation
 836 included
 837 .BR BPF_ABS ,
 838 but the specified offset was not aligned to a 32-bit boundary or exceeded
 839 .IR "sizeof(struct\ seccomp_data)" .
 840 .TP
 841 .BR EINVAL
 842 .\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources
 843 A secure computing mode has already been set, and
 844 .I operation
 845 differs from the existing setting.
 846 .TP
 847 .BR EINVAL
 848 .I operation
 849 specified
 850 .BR SECCOMP_SET_MODE_FILTER ,
 851 but the filter program pointed to by
 852 .I args
 853 was not valid or the length of the filter program was zero or exceeded
 854 .B BPF_MAXINSNS
 855 (4096) instructions.
 856 .TP
 857 .BR ENOMEM
 858 Out of memory.
 859 .TP
 860 .BR ENOMEM
 861 .\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources
 862 The total length of all filter programs attached
 863 to the calling thread would exceed
 864 .B MAX_INSNS_PER_PATH
 865 (32768) instructions.
 866 Note that for the purposes of calculating this limit,
 867 each already existing filter program incurs an
 868 overhead penalty of 4 instructions.
 869 .TP
 870 .BR EOPNOTSUPP
 871 .I operation
 872 specified
 873 .BR SECCOMP_GET_ACTION_AVAIL ,
 874 but the kernel does not support the filter return action specified by
 875 .IR args .
 876 .TP
 877 .BR ESRCH
 878 Another thread caused a failure during thread sync, but its ID could not
 879 be determined.
 880 .SH VERSIONS
 881 The
 882 .BR seccomp ()
 883 system call first appeared in Linux 3.17.
 884 .\" FIXME . Add glibc version
 885 .SH CONFORMING TO
 886 The
 887 .BR seccomp ()
 888 system call is a nonstandard Linux extension.
 889 .SH NOTES
 890 Rather than hand-coding seccomp filters as shown in the example below,
 891 you may prefer to employ the
 892 .I libseccomp
 893 library, which provides a front-end for generating seccomp filters.
 894 .PP
 895 The
 896 .IR Seccomp
 897 field of the
 898 .IR /proc/[pid]/status
 899 file provides a method of viewing the seccomp mode of a process; see
 900 .BR proc (5).
 901 .PP
 902 .BR seccomp ()
 903 provides a superset of the functionality provided by the
 904 .BR prctl (2)
 905 .BR PR_SET_SECCOMP
 906 operation (which does not support
 907 .IR flags ).
 908 .PP
 909 Since Linux 4.4, the
 910 .BR ptrace (2)
 911 .B PTRACE_SECCOMP_GET_FILTER
 912 operation can be used to dump a process's seccomp filters.
 913 .\"
 914 .SS Architecture support for seccomp BPF
 915 Architecture support for seccomp BPF filtering
 916 .\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in
 917 .\" kernel source. Last checked in Linux 4.16-rc source.
 918 is available on the following architectures:
 919 .IP * 3
 920 x86-64, i386, x32 (since Linux 3.5)
 921 .PD 0
 922 .IP *
 923 ARM (since Linux 3.8)
 924 .IP *
 925 s390 (since Linux 3.8)
 926 .IP *
 927 MIPS (since Linux 3.16)
 928 .IP *
 929 ARM-64 (since Linux 3.19)
 930 .IP *
 931 PowerPC (since Linux 4.3)
 932 .IP *
 933 Tile (since Linux 4.3)
 934 .IP *
 935 PA-RISC (since Linux 4.6)
 936 .\" User mode Linux since Linux 4.6
 937 .PD
 938 .\"
 939 .SS Caveats
 940 There are various subtleties to consider when applying seccomp filters
 941 to a program, including the following:
 942 .IP * 3
 943 Some traditional system calls have user-space implementations in the
 944 .BR vdso (7)
 945 on many architectures.
 946 Notable examples include
 947 .BR clock_gettime (2),
 948 .BR gettimeofday (2),
 949 and
 950 .BR time (2).
 951 On such architectures,
 952 seccomp filtering for these system calls will have no effect.
 953 (However, there are cases where the
 954 .BR vdso (7)
 955 implementations may fall back to invoking the true system call,
 956 in which case seccomp filters would see the system call.)
 957 .IP *
 958 Seccomp filtering is based on system call numbers.
 959 However, applications typically do not directly invoke system calls,
 960 but instead call wrapper functions in the C library which
 961 in turn invoke the system calls.
 962 Consequently, one must be aware of the following:
 963 .RS
 964 .IP \(bu 3
 965 The glibc wrappers for some traditional system calls may actually
 966 employ system calls with different names in the kernel.
 967 For example, the
 968 .BR exit (2)
 969 wrapper function actually employs the
 970 .BR exit_group (2)
 971 system call, and the
 972 .BR fork (2)
 973 wrapper function actually calls
 974 .BR clone (2).
 975 .IP \(bu
 976 The behavior of wrapper functions may vary across architectures,
 977 according to the range of system calls provided on those architectures.
 978 In other words, the same wrapper function may invoke
 979 different system calls on different architectures.
 980 .IP \(bu
 981 Finally, the behavior of wrapper functions can change across glibc versions.
 982 For example, in older versions, the glibc wrapper function for
 983 .BR open (2)
 984 invoked the system call of the same name,
 985 but starting in glibc 2.26, the implementation switched to calling
 986 .BR openat (2)
 987 on all architectures.
 988 .RE
 989 .PP
 990 The consequence of the above points is that it may be necessary
 991 to filter for a system call other than the one that might be expected.
 992 Various manual pages in Section 2 provide helpful details
 993 about the differences between wrapper functions and
 994 the underlying system calls in subsections entitled
 995 .IR "C library/kernel differences" .
 996 .PP
 997 Furthermore, note that applying seccomp filters
 998 can itself introduce bugs into an application,
 999 when the filters cause unexpected failures for legitimate operations
1000 that the application might need to perform.
1001 Such bugs may not easily be discovered when testing the seccomp
1002 filters if the bugs occur in rarely used application code paths.
1003 .\"
1004 .SS Seccomp-specific BPF details
1005 Note the following BPF details specific to seccomp filters:
1006 .IP * 3
1007 The
1008 .B BPF_H
1009 and
1010 .B BPF_B
1011 size modifiers are not supported: all operations must load and store
1012 (4-byte) words
1013 .RB ( BPF_W ).
1014 .IP *
1015 To access the contents of the
1016 .I seccomp_data
1017 buffer, use the
1018 .B BPF_ABS
1019 addressing mode modifier.
1020 .IP *
1021 The
1022 .B BPF_LEN
1023 addressing mode modifier yields an immediate mode operand
1024 whose value is the size of the
1025 .IR seccomp_data
1026 buffer.
1027 .SH EXAMPLES
1028 The program below accepts four or more arguments.
1029 The first three arguments are a system call number,
1030 a numeric architecture identifier, and an error number.
1031 The program uses these values to construct a BPF filter
1032 that is used at run time to perform the following checks:
1033 .IP [1] 4
1034 If the program is not running on the specified architecture,
1035 the BPF filter causes the process to be killed
1036 .RB ( SECCOMP_RET_KILL_PROCESS ).
1037 .IP [2]
1038 If the program attempts to execute the system call with the specified number,
1039 the BPF filter causes the system call to fail, with
1040 .I errno
1041 being set to the specified error number.
1042 .PP
1043 The remaining command-line arguments specify
1044 the pathname and additional arguments of a program
1045 that the example program should attempt to execute using
1046 .BR execv (3)
1047 (a library function that employs the
1048 .BR execve (2)
1049 system call).
1050 Some example runs of the program are shown below.
1051 .PP
1052 First, we display the architecture that we are running on (x86-64)
1053 and then construct a shell function that looks up system call
1054 numbers on this architecture:
1055 .PP
1056 .in +4n
1057 .EX
1058 $ \fBuname \-m\fP
1059 x86_64
1060 $ \fBsyscall_nr() {
1061     cat /usr/src/linux/arch/x86/entry/syscalls/syscall_64.tbl | \e
1062     awk \(aq$2 != "x32" && $3 == "\(aq$1\(aq" { print $1 }\(aq
1063 }\fP
1064 .EE
1065 .in
1066 .PP
1067 When the BPF filter rejects a system call (case [2] above),
1068 it causes the system call to fail with the error number
1069 specified on the command line.
1070 In the experiments shown here, we'll use error number 99:
1071 .PP
1072 .in +4n
1073 .EX
1074 $ \fBerrno 99\fP
1075 EADDRNOTAVAIL 99 Cannot assign requested address
1076 .EE
1077 .in
1078 .PP
1079 In the following example, we attempt to run the command
1080 .BR whoami (1),
1081 but the BPF filter rejects the
1082 .BR execve (2)
1083 system call, so that the command is not even executed:
1084 .PP
1085 .in +4n
1086 .EX
1087 $ \fBsyscall_nr execve\fP
1088 59
1089 $ \fB./a.out\fP
1090 Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
1091 Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
1092                  AUDIT_ARCH_X86_64: 0xC000003E
1093 $ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
1094 execv: Cannot assign requested address
1095 .EE
1096 .in
1097 .PP
1098 In the next example, the BPF filter rejects the
1099 .BR write (2)
1100 system call, so that, although it is successfully started, the
1101 .BR whoami (1)
1102 command is not able to write output:
1103 .PP
1104 .in +4n
1105 .EX
1106 $ \fBsyscall_nr write\fP
1107 1
1108 $ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
1109 .EE
1110 .in
1111 .PP
1112 In the final example,
1113 the BPF filter rejects a system call that is not used by the
1114 .BR whoami (1)
1115 command, so it is able to successfully execute and produce output:
1116 .PP
1117 .in +4n
1118 .EX
1119 $ \fBsyscall_nr preadv\fP
1120 295
1121 $ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
1122 cecilia
1123 .EE
1124 .in
1125 .SS Program source
1126 .EX
1127 #include <errno.h>
1128 #include <stddef.h>
1129 #include <stdio.h>
1130 #include <stdlib.h>
1131 #include <unistd.h>
1132 #include <linux/audit.h>
1133 #include <linux/filter.h>
1134 #include <linux/seccomp.h>
1135 #include <sys/prctl.h>
     #include <sys/syscall.h>
1136
1137 #define X32_SYSCALL_BIT 0x40000000  /* Bit 30, set in x32 syscall numbers */
     /* Number of elements in the array \(aqarr\(aq (not valid for pointers) */
1138 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
1139
1140 static int
1141 install_filter(int syscall_nr, unsigned int t_arch, int f_errno)
1142 {
         /* Install a seccomp filter that makes system call \(aqsyscall_nr\(aq
            fail with the error \(aqf_errno\(aq, allows all other system
            calls, and kills the process if a system call is made under
            an architecture other than \(aqt_arch\(aq.  \(aqt_arch\(aq is
            unsigned because AUDIT_ARCH_* values can have the top bit set.
            Returns 0 on success, or 1 if the filter was not installed. */
1143     unsigned int upper_nr_limit = 0xffffffff;
1144
1145     /* Assume that AUDIT_ARCH_X86_64 means the normal x86\-64 ABI
1146        (in the x32 ABI, all system calls have bit 30 set in the
1147        \(aqnr\(aq field, meaning the numbers are >= X32_SYSCALL_BIT). */
1148     if (t_arch == AUDIT_ARCH_X86_64)
1149         upper_nr_limit = X32_SYSCALL_BIT \- 1;
1150
1151     struct sock_filter filter[] = {
1152         /* [0] Load architecture from \(aqseccomp_data\(aq buffer into
1153                accumulator. */
1154         BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
1155                  (offsetof(struct seccomp_data, arch))),
1156
1157         /* [1] Jump forward 5 instructions if architecture does not
1158                match \(aqt_arch\(aq. */
1159         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5),
1160
1161         /* [2] Load system call number from \(aqseccomp_data\(aq buffer into
1162                accumulator. */
1163         BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
1164                  (offsetof(struct seccomp_data, nr))),
1165
1166         /* [3] Check ABI \- only needed for x86\-64 in deny\-list use
1167                cases.  Use BPF_JGT instead of checking against the bit
1168                mask to avoid having to reload the syscall number. */
1169         BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),
1170
1171         /* [4] Jump forward 1 instruction if system call number
1172                does not match \(aqsyscall_nr\(aq. */
1173         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
1174
1175         /* [5] Matching architecture and system call: don\(aqt execute
1176                the system call, and return \(aqf_errno\(aq in \(aqerrno\(aq. */
1177         BPF_STMT(BPF_RET | BPF_K,
1178                  SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
1179
1180         /* [6] Destination of system call number mismatch: allow other
1181                system calls. */
1182         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
1183
1184         /* [7] Destination of architecture mismatch: kill process. */
1185         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
1186     };
1187
1188     struct sock_fprog prog = {
1189         .len = ARRAY_SIZE(filter),
1190         .filter = filter,
1191     };
1192
         /* glibc provides no wrapper for seccomp(); use syscall(2). */
1193     if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) {
1194         perror("seccomp");
1195         return 1;
1196     }
1197
1198     return 0;
1199 }
1200
1201 int
1202 main(int argc, char *argv[])
1203 {
         /* Install the filter described by argv[1] to argv[3], then
            execute the program argv[4] with the remaining arguments. */
1204     if (argc < 5) {
1205         fprintf(stderr, "Usage: "
1206                 "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en"
1207                 "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en"
1208                 "                 AUDIT_ARCH_X86_64: 0x%X\en"
1209                 "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
1210         exit(EXIT_FAILURE);
1211     }
1212
         /* An unprivileged caller must set no_new_privs before it is
            permitted to install a seccomp filter. */
1213     if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
1214         perror("prctl");
1215         exit(EXIT_FAILURE);
1216     }
1217
         /* Parse <arch> with strtoul(): values such as 0xC000003E
            exceed LONG_MAX where long is 32 bits wide. */
1218     if (install_filter(strtol(argv[1], NULL, 0),
1219                        strtoul(argv[2], NULL, 0),
1220                        strtol(argv[3], NULL, 0)))
1221         exit(EXIT_FAILURE);
1222
         /* execv() returns only if it failed. */
1223     execv(argv[4], &argv[4]);
1224     perror("execv");
1225     exit(EXIT_FAILURE);
1226 }
1227 .EE
1228 .SH SEE ALSO
1229 .BR bpfc (1),
1230 .BR strace (1),
1231 .BR bpf (2),
1232 .BR prctl (2),
1233 .BR ptrace (2),
1234 .BR seccomp_unotify (2),
1235 .BR sigaction (2),
1236 .BR proc (5),
1237 .BR signal (7),
1238 .BR socket (7)
1239 .PP
1240 Various pages from the
1241 .I libseccomp
1242 library, including:
1243 .BR scmp_sys_resolver (1),
1244 .BR seccomp_export_bpf (3),
1245 .BR seccomp_init (3),
1246 .BR seccomp_load (3),
1247 and
1248 .BR seccomp_rule_add (3).
1249 .PP
1250 The kernel source files
1251 .IR Documentation/networking/filter.txt
1252 and
1253 .IR Documentation/userspace\-api/seccomp_filter.rst
1254 .\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3
1255 (or
1256 .IR Documentation/prctl/seccomp_filter.txt
1257 before Linux 4.13).
1258 .PP
1259 McCanne, S.\& and Jacobson, V.\& (1992)
1260 .IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
1261 Proceedings of the USENIX Winter 1993 Conference
1262 .UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf
1263 .UE