man2/seccomp_unotify.2

   1 .\" Copyright (C) 2020 Michael Kerrisk <mtk.manpages@gmail.com>
   2 .\"
   3 .\" %%%LICENSE_START(VERBATIM)
   4 .\" Permission is granted to make and distribute verbatim copies of this
   5 .\" manual provided the copyright notice and this permission notice are
   6 .\" preserved on all copies.
   7 .\"
   8 .\" Permission is granted to copy and distribute modified versions of this
   9 .\" manual under the conditions for verbatim copying, provided that the
  10 .\" entire resulting derived work is distributed under the terms of a
  11 .\" permission notice identical to this one.
  12 .\"
  13 .\" Since the Linux kernel and libraries are constantly changing, this
  14 .\" manual page may be incorrect or out-of-date.  The author(s) assume no
  15 .\" responsibility for errors or omissions, or for damages resulting from
  16 .\" the use of the information contained herein.  The author(s) may not
  17 .\" have taken the same level of care in the production of this manual,
  18 .\" which is licensed free of charge, as they might when working
  19 .\" professionally.
  20 .\"
  21 .\" Formatted or processed versions of this manual, if unaccompanied by
  22 .\" the source, must acknowledge the copyright and authors of this work.
  23 .\" %%%LICENSE_END
  24 .\"
  25 .TH SECCOMP_UNOTIFY 2 2021-06-20 "Linux" "Linux Programmer's Manual"
  26 .SH NAME
  27 seccomp_unotify \- Seccomp user-space notification mechanism
  28 .SH SYNOPSIS
  29 .nf
  30 .B #include <linux/seccomp.h>
  31 .B #include <linux/filter.h>
  32 .B #include <linux/audit.h>
  33 .PP
  34 .BI "int seccomp(unsigned int " operation ", unsigned int " flags \
  35 ", void *" args );
  36 .PP
  37 .B #include <sys/ioctl.h>
  38 .PP
  39 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_RECV,"
  40 .BI "          struct seccomp_notif *" req );
  41 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_SEND,"
  42 .BI "          struct seccomp_notif_resp *" resp );
  43 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ID_VALID, __u64 *" id );
  44 .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ADDFD,"
  45 .BI "          struct seccomp_notif_addfd *" addfd );
  46 .fi
  47 .SH DESCRIPTION
  48 This page describes the user-space notification mechanism provided by the
  49 Secure Computing (seccomp) facility.
  50 As well as the use of the
  51 .B SECCOMP_FILTER_FLAG_NEW_LISTENER
  52 flag, the
  53 .BR SECCOMP_RET_USER_NOTIF
  54 action value, and the
  55 .B SECCOMP_GET_NOTIF_SIZES
  56 operation described in
  57 .BR seccomp (2),
  58 this mechanism involves the use of a number of related
  59 .BR ioctl (2)
  60 operations (described below).
  61 .\"
  62 .SS Overview
  63 In conventional usage of a seccomp filter,
  64 the decision about how to treat a system call is made by the filter itself.
  65 By contrast, the user-space notification mechanism allows
  66 the seccomp filter to delegate
  67 the handling of the system call to another user-space process.
  68 Note that this mechanism is explicitly
  69 .B not
   70 intended as a method of implementing security policy; see NOTES.
  71 .PP
  72 In the discussion that follows,
  73 the thread(s) on which the seccomp filter is installed is (are)
  74 referred to as the
  75 .IR target ,
  76 and the process that is notified by the user-space notification
  77 mechanism is referred to as the
  78 .IR supervisor .
  79 .PP
  80 A suitably privileged supervisor can use the user-space notification
  81 mechanism to perform actions on behalf of the target.
  82 The advantage of the user-space notification mechanism is that
  83 the supervisor will
  84 usually be able to retrieve information about the target and the
  85 performed system call that the seccomp filter itself cannot.
  86 (A seccomp filter is limited in the information it can obtain and
  87 the actions that it can perform because it
  88 is running on a virtual machine inside the kernel.)
  89 .PP
  90 An overview of the steps performed by the target and the supervisor
  91 is as follows:
  92 .\"-------------------------------------
  93 .IP 1. 3
  94 The target establishes a seccomp filter in the usual manner,
  95 but with two differences:
  96 .RS
  97 .IP \(bu 2
  98 The
  99 .BR seccomp (2)
 100 .I flags
 101 argument includes the flag
 102 .BR SECCOMP_FILTER_FLAG_NEW_LISTENER .
 103 Consequently, the return value of the (successful)
 104 .BR seccomp (2)
 105 call is a new "listening"
 106 file descriptor that can be used to receive notifications.
 107 Only one "listening" seccomp filter can be installed for a thread.
 108 .\" FIXME
 109 .\" Is the last sentence above correct?
 110 .\"
 111 .\" Kees Cook (25 Oct 2020) notes:
 112 .\"
 113 .\" I like this limitation, but I expect that it'll need to change in the
 114 .\" future. Even with LSMs, we see the need for arbitrary stacking, and the
 115 .\" idea of there being only 1 supervisor will eventually break down. Right
 116 .\" now there is only 1 because only container managers are using this
 117 .\" feature. But if some daemon starts using it to isolate some thread,
 118 .\" suddenly it might break if a container manager is trying to listen to it
 119 .\" too, etc. I expect it won't be needed soon, but I do think it'll change.
 120 .\"
 121 .IP \(bu
 122 In cases where it is appropriate, the seccomp filter returns the action value
 123 .BR SECCOMP_RET_USER_NOTIF .
 124 This return value will trigger a notification event.
 125 .RE
 126 .\"-------------------------------------
 127 .IP 2.
 128 In order that the supervisor can obtain notifications
 129 using the listening file descriptor,
 130 (a duplicate of) that file descriptor must be passed from
 131 the target to the supervisor.
 132 One way in which this could be done is by passing the file descriptor
 133 over a UNIX domain socket connection between the target and the supervisor
 134 (using the
 135 .BR SCM_RIGHTS
 136 ancillary message type described in
 137 .BR unix (7)).
 138 Another way to do this is through the use of
 139 .BR pidfd_getfd (2).
 140 .\" Jann Horn:
 141 .\"     Instead of using unix domain sockets to send the fd to the
 142 .\"     parent, I think you could also use clone3() with
 143 .\"     flags==CLONE_FILES|SIGCHLD, dup2() the seccomp fd to an fd
 144 .\"     that was reserved in the parent, call unshare(CLONE_FILES)
 145 .\"     in the child after setting up the seccomp fd, and wake
 146 .\"     up the parent with something like pthread_cond_signal()?
 147 .\"     I'm not sure whether that'd look better or worse in the
 148 .\"     end though, so maybe just ignore this comment.
 149 .\"-------------------------------------
 150 .IP 3.
 151 The supervisor will receive notification events
 152 on the listening file descriptor.
 153 These events are returned as structures of type
 154 .IR seccomp_notif .
 155 Because this structure and its size may evolve over kernel versions,
 156 the supervisor must first determine the size of this structure
 157 using the
 158 .BR seccomp (2)
 159 .B SECCOMP_GET_NOTIF_SIZES
 160 operation, which returns a structure of type
 161 .IR seccomp_notif_sizes .
 162 The supervisor allocates a buffer of size
 163 .I seccomp_notif_sizes.seccomp_notif
 164 bytes to receive notification events.
  165 In addition, the supervisor allocates another buffer of size
 166 .I seccomp_notif_sizes.seccomp_notif_resp
 167 bytes for the response (a
 168 .I struct seccomp_notif_resp
 169 structure)
 170 that it will provide to the kernel (and thus the target).
 171 .\"-------------------------------------
 172 .IP 4.
 173 The target then performs its workload,
 174 which includes system calls that will be controlled by the seccomp filter.
 175 Whenever one of these system calls causes the filter to return the
 176 .B SECCOMP_RET_USER_NOTIF
 177 action value, the kernel does
 178 .I not
 179 (yet) execute the system call;
 180 instead, execution of the target is temporarily blocked inside
 181 the kernel (in a sleep state that is interruptible by signals)
 182 and a notification event is generated on the listening file descriptor.
 183 .\"-------------------------------------
 184 .IP 5.
 185 The supervisor can now repeatedly monitor the
 186 listening file descriptor for
 187 .BR SECCOMP_RET_USER_NOTIF -triggered
 188 events.
 189 To do this, the supervisor uses the
 190 .B SECCOMP_IOCTL_NOTIF_RECV
 191 .BR ioctl (2)
 192 operation to read information about a notification event;
 193 this operation blocks until an event is available.
 194 The operation returns a
 195 .I seccomp_notif
 196 structure containing information about the system call
 197 that is being attempted by the target.
 198 (As described in NOTES,
 199 the file descriptor can also be monitored with
 200 .BR select (2),
 201 .BR poll (2),
 202 or
 203 .BR epoll (7).)
 204 .\" FIXME
 205 .\" Christian Brauner:
 206 .\"
 207 .\" Do we support O_NONBLOCK with SECCOMP_IOCTL_NOTIF_RECV and if
 208 .\" not should we?
 209 .\"
 210 .\" Michael Kerrisk:
 211 .\"
 212 .\" A quick test suggests that O_NONBLOCK has no effect on the blocking
 213 .\" behavior of SECCOMP_IOCTL_NOTIF_RECV.
 214 .
 215 .\"-------------------------------------
 216 .IP 6.
 217 The
 218 .I seccomp_notif
 219 structure returned by the
 220 .B SECCOMP_IOCTL_NOTIF_RECV
 221 operation includes the same information (a
 222 .I seccomp_data
 223 structure) that was passed to the seccomp filter.
 224 This information allows the supervisor to discover the system call number and
 225 the arguments for the target's system call.
 226 In addition, the notification event contains the ID of the thread
 227 that triggered the notification and a unique cookie value that
 228 is used in subsequent
 229 .B SECCOMP_IOCTL_NOTIF_ID_VALID
 230 and
 231 .B SECCOMP_IOCTL_NOTIF_SEND
 232 operations.
 233 .IP
 234 The information in the notification can be used to discover the
 235 values of pointer arguments for the target's system call.
 236 (This is something that can't be done from within a seccomp filter.)
 237 One way in which the supervisor can do this is to open the corresponding
 238 .I /proc/[tid]/mem
 239 file (see
 240 .BR proc (5))
 241 and read bytes from the location that corresponds to one of
 242 the pointer arguments whose value is supplied in the notification event.
 243 .\" Tycho Andersen mentioned that there are alternatives to /proc/PID/mem,
 244 .\" such as ptrace() and /proc/PID/map_files
 245 (The supervisor must be careful to avoid
 246 a race condition that can occur when doing this;
 247 see the description of the
 248 .BR SECCOMP_IOCTL_NOTIF_ID_VALID
 249 .BR ioctl (2)
 250 operation below.)
 251 In addition,
 252 the supervisor can access other system information that is visible
 253 in user space but which is not accessible from a seccomp filter.
 254 .\"-------------------------------------
 255 .IP 7.
 256 Having obtained information as per the previous step,
 257 the supervisor may then choose to perform an action in response
 258 to the target's system call
 259 (which, as noted above, is not executed when the seccomp filter returns the
 260 .B SECCOMP_RET_USER_NOTIF
 261 action value).
 262 .IP
 263 One example use case here relates to containers.
 264 The target may be located inside a container where
 265 it does not have sufficient capabilities to mount a filesystem
 266 in the container's mount namespace.
 267 However, the supervisor may be a more privileged process that
 268 does have sufficient capabilities to perform the mount operation.
 269 .\"-------------------------------------
 270 .IP 8.
 271 The supervisor then sends a response to the notification.
 272 The information in this response is used by the kernel to construct
 273 a return value for the target's system call and provide
 274 a value that will be assigned to the
 275 .I errno
 276 variable of the target.
 277 .IP
 278 The response is sent using the
 279 .B SECCOMP_IOCTL_NOTIF_SEND
 280 .BR ioctl (2)
 281 operation, which is used to transmit a
 282 .I seccomp_notif_resp
 283 structure to the kernel.
  291 This structure must include the cookie value that the supervisor
  292 obtained in the
  293 .I seccomp_notif
  294 structure returned by the
  295 .B SECCOMP_IOCTL_NOTIF_RECV
  296 operation;
  297 the cookie allows the kernel to associate the response with the target.
 298 .\"-------------------------------------
 299 .IP 9.
 300 Once the notification has been sent,
 301 the system call in the target thread unblocks,
 302 returning the information that was provided by the supervisor
 303 in the notification response.
 304 .\"-------------------------------------
 305 .PP
 306 As a variation on the last two steps,
 307 the supervisor can send a response that tells the kernel that it
 308 should execute the target thread's system call; see the discussion of
 309 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
 310 below.
 311 .\"
 312 .SH IOCTL OPERATIONS
 313 The following
 314 .BR ioctl (2)
 315 operations are supported by the seccomp user-space
 316 notification file descriptor.
 317 For each of these operations, the first (file descriptor) argument of
 318 .BR ioctl (2)
 319 is the listening file descriptor returned by a call to
 320 .BR seccomp (2)
 321 with the
 322 .BR SECCOMP_FILTER_FLAG_NEW_LISTENER
 323 flag.
 324 .\"
 325 .SS SECCOMP_IOCTL_NOTIF_RECV
 326 The
 327 .B SECCOMP_IOCTL_NOTIF_RECV
 328 operation (available since Linux 5.0) is used to obtain a user-space
 329 notification event.
 330 If no such event is currently pending,
 331 the operation blocks until an event occurs.
 332 The third
 333 .BR ioctl (2)
 334 argument is a pointer to a structure of the following form
 335 which contains information about the event.
 336 This structure must be zeroed out before the call.
 337 .PP
 338 .in +4n
 339 .EX
 340 struct seccomp_notif {
 341     __u64  id;              /* Cookie */
 342     __u32  pid;             /* TID of target thread */
 343     __u32  flags;           /* Currently unused (0) */
 344     struct seccomp_data data;   /* See seccomp(2) */
 345 };
 346 .EE
 347 .in
 348 .PP
 349 The fields in this structure are as follows:
 350 .TP
 351 .I id
 352 This is a cookie for the notification.
 353 Each such cookie is guaranteed to be unique for the corresponding
 354 seccomp filter.
 355 .RS
 356 .IP \(bu 2
 357 The cookie can be used with the
 358 .B SECCOMP_IOCTL_NOTIF_ID_VALID
 359 .BR ioctl (2)
 360 operation described below.
 361 .IP \(bu
 362 When returning a notification response to the kernel,
 363 the supervisor must include the cookie value in the
 364 .IR seccomp_notif_resp
 365 structure that is specified as the argument of the
 366 .BR SECCOMP_IOCTL_NOTIF_SEND
 367 operation.
 368 .RE
 369 .TP
 370 .I pid
 371 This is the thread ID of the target thread that triggered
 372 the notification event.
 373 .TP
 374 .I flags
 375 This is a bit mask of flags providing further information on the event.
 376 In the current implementation, this field is always zero.
 377 .TP
 378 .I data
 379 This is a
 380 .I seccomp_data
 381 structure containing information about the system call that
 382 triggered the notification.
 383 This is the same structure that is passed to the seccomp filter.
 384 See
 385 .BR seccomp (2)
 386 for details of this structure.
 387 .PP
 388 On success, this operation returns 0; on failure, \-1 is returned, and
 389 .I errno
 390 is set to indicate the cause of the error.
 391 This operation can fail with the following errors:
 392 .TP
 393 .BR EINVAL " (since Linux 5.5)"
 394 .\" commit 2882d53c9c6f3b8311d225062522f03772cf0179
 395 The
 396 .I seccomp_notif
 397 structure that was passed to the call contained nonzero fields.
 398 .TP
 399 .B ENOENT
 400 The target thread was killed by a signal as the notification information
 401 was being generated,
 402 or the target's (blocked) system call was interrupted by a signal handler.
 403 .\" FIXME
 404 .\" From my experiments,
 405 .\" it appears that if a SECCOMP_IOCTL_NOTIF_RECV is done after
 406 .\" the target thread terminates, then the ioctl() simply
 407 .\" blocks (rather than returning an error to indicate that the
 408 .\" target no longer exists).
 409 .\"
 410 .\" I found that surprising, and it required some contortions in
 411 .\" the example program.  It was not possible to code my SIGCHLD
 412 .\" handler (which reaps the zombie when the worker/target
 413 .\" terminates) to simply set a flag checked in the main
 414 .\" handleNotifications() loop, since this created an
 415 .\" unavoidable race where the child might terminate just after
 416 .\" I had checked the flag, but before I blocked (forever!) in the
 417 .\" SECCOMP_IOCTL_NOTIF_RECV operation. Instead, I had to code
 418 .\" the signal handler to simply call _exit(2) in order to
 419 .\" terminate the parent process (the supervisor).
 420 .\"
 421 .\" Is this expected behavior? It seems to me rather
 422 .\" desirable that SECCOMP_IOCTL_NOTIF_RECV should give an error
 423 .\" if the target has terminated.
 424 .\"
 425 .\" Jann posted a patch to rectify this, but there was no response
 426 .\" (Lore link: https://bit.ly/3jvUBxk) to his question about fixing
 427 .\" this issue. (I've tried building with the patch, but encountered
 428 .\" an issue with the target process entering D state after a signal.)
 429 .\"
 430 .\" For now, this behavior is documented in BUGS.
 431 .\"
 432 .\" Kees Cook commented: Let's change [this] ASAP!
 433 .\"
 434 .SS SECCOMP_IOCTL_NOTIF_ID_VALID
 435 The
 436 .B SECCOMP_IOCTL_NOTIF_ID_VALID
 437 operation (available since Linux 5.0) is used to check that a notification ID
 438 returned by an earlier
 439 .B SECCOMP_IOCTL_NOTIF_RECV
 440 operation is still valid
 441 (i.e., that the target still exists and its system call
 442 is still blocked waiting for a response).
 443 .PP
 444 The third
 445 .BR ioctl (2)
 446 argument is a pointer to the cookie
 447 .RI ( id )
 448 returned by the
 449 .B SECCOMP_IOCTL_NOTIF_RECV
 450 operation.
 451 .PP
 452 This operation is necessary to avoid race conditions that can occur when the
 453 .I pid
 454 returned by the
 455 .B SECCOMP_IOCTL_NOTIF_RECV
 456 operation terminates, and that process ID is reused by another process.
  457 An example of this kind of race is the following:
 458 .IP 1. 3
 459 A notification is generated on the listening file descriptor.
 460 The returned
 461 .I seccomp_notif
 462 contains the TID of the target thread (in the
 463 .I pid
 464 field of the structure).
 465 .IP 2.
 466 The target terminates.
 467 .IP 3.
 468 Another thread or process is created on the system that by chance reuses the
 469 TID that was freed when the target terminated.
 470 .IP 4.
 471 The supervisor
 472 .BR open (2)s
 473 the
 474 .IR /proc/[tid]/mem
 475 file for the TID obtained in step 1, with the intention of (say)
  476 inspecting the memory location(s) that contain the argument(s) of
 477 the system call that triggered the notification in step 1.
 478 .PP
 479 In the above scenario, the risk is that the supervisor may try
 480 to access the memory of a process other than the target.
 481 This race can be avoided by following the call to
 482 .BR open (2)
 483 with a
 484 .B SECCOMP_IOCTL_NOTIF_ID_VALID
 485 operation to verify that the process that generated the notification
 486 is still alive.
 487 (Note that if the target terminates after the latter step,
 488 a subsequent
 489 .BR read (2)
 490 from the file descriptor may return 0, indicating end of file.)
 491 .\" Jann Horn:
 492 .\"     the PID can be reused, but the /proc/$pid directory is
 493 .\"     internally not associated with the numeric PID, but,
 494 .\"     conceptually speaking, with a specific incarnation of the
 495 .\"     PID, or something like that.  (Actually, it is associated
 496 .\"     with the "struct pid", which is not reused, instead of the
 497 .\"     numeric PID.
 498 .PP
 499 See NOTES for a discussion of other cases where
 500 .B SECCOMP_IOCTL_NOTIF_ID_VALID
 501 checks must be performed.
 502 .PP
 503 On success (i.e., the notification ID is still valid),
 504 this operation returns 0.
 505 On failure (i.e., the notification ID is no longer valid),
 506 \-1 is returned, and
 507 .I errno
 508 is set to
 509 .BR ENOENT .
 510 .\"
 511 .SS SECCOMP_IOCTL_NOTIF_SEND
 512 The
 513 .B SECCOMP_IOCTL_NOTIF_SEND
 514 operation (available since Linux 5.0)
 515 is used to send a notification response back to the kernel.
 516 The third
 517 .BR ioctl (2)
  518 argument is a pointer to a structure of the following form:
 519 .PP
 520 .in +4n
 521 .EX
 522 struct seccomp_notif_resp {
 523     __u64 id;           /* Cookie value */
 524     __s64 val;          /* Success return value */
 525     __s32 error;        /* 0 (success) or negative error number */
 526     __u32 flags;        /* See below */
 527 };
 528 .EE
 529 .in
 530 .PP
 531 The fields of this structure are as follows:
 532 .TP
 533 .I id
 534 This is the cookie value that was obtained using the
 535 .B SECCOMP_IOCTL_NOTIF_RECV
 536 operation.
 537 This cookie value allows the kernel to correctly associate this response
 538 with the system call that triggered the user-space notification.
 539 .TP
 540 .I val
 541 This is the value that will be used for a spoofed
 542 success return for the target's system call; see below.
 543 .TP
 544 .I error
 545 This is the value that will be used as the error number
 546 .RI ( errno )
 547 for a spoofed error return for the target's system call; see below.
 548 .TP
 549 .I flags
 550 This is a bit mask that includes zero or more of the following flags:
 551 .RS
 552 .TP
 553 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE " (since Linux 5.5)"
 554 Tell the kernel to execute the target's system call.
 555 .\" commit fb3c5386b382d4097476ce9647260fc89b34afdb
 556 .RE
 557 .PP
 558 Two kinds of response are possible:
 559 .IP \(bu 2
 560 A response to the kernel telling it to execute the
 561 target's system call.
 562 In this case, the
 563 .I flags
 564 field includes
 565 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
 566 and the
 567 .I error
 568 and
 569 .I val
 570 fields must be zero.
 571 .IP
 572 This kind of response can be useful in cases where the supervisor needs
 573 to do deeper analysis of the target's system call than is possible
 574 from a seccomp filter (e.g., examining the values of pointer arguments),
 575 and, having decided that the system call does not require emulation
 576 by the supervisor, the supervisor wants the system call to
 577 be executed normally in the target.
 578 .IP
 579 The
 580 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
 581 flag should be used with caution; see NOTES.
 582 .IP \(bu
 583 A spoofed return value for the target's system call.
 584 In this case, the kernel does not execute the target's system call,
 585 instead causing the system call to return a spoofed value as specified by
 586 fields of the
 587 .I seccomp_notif_resp
 588 structure.
 589 The supervisor should set the fields of this structure as follows:
 590 .RS
 591 .IP + 3
 592 .I flags
 593 does not contain
 594 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE .
 595 .IP +
 596 .I error
 597 is set either to 0 for a spoofed "success" return or to a negative
 598 error number for a spoofed "failure" return.
 599 In the former case, the kernel causes the target's system call
 600 to return the value specified in the
 601 .I val
 602 field.
 603 In the latter case, the kernel causes the target's system call
 604 to return \-1, and
 605 .I errno
 606 is assigned the negated
 607 .I error
 608 value.
 609 .IP +
 610 .I val
 611 is set to a value that will be used as the return value for a spoofed
 612 "success" return for the target's system call.
 613 The value in this field is ignored if the
 614 .I error
 615 field contains a nonzero value.
 616 .\" FIXME
 617 .\" Kees Cook suggested:
 618 .\"
 619 .\" Strictly speaking, this is architecture specific, but
 620 .\" all architectures do it this way. Should seccomp enforce
 621 .\" val == 0 when err != 0 ?
 622 .\"
 623 .\" Christian Brauner
 624 .\"
 625 .\" Feels like it should, at least for the SEND ioctl where we already
 626 .\" verify that val and err are both 0 when CONTINUE is specified (as you
 627 .\" pointed out correctly above).
 628 .RE
 629 .PP
 630 On success, this operation returns 0; on failure, \-1 is returned, and
 631 .I errno
 632 is set to indicate the cause of the error.
 633 This operation can fail with the following errors:
 634 .TP
 635 .B EINPROGRESS
 636 A response to this notification has already been sent.
 637 .TP
 638 .B EINVAL
 639 An invalid value was specified in the
  640 .IR flags " field."
 641 .TP
  643 .B EINVAL
 644 The
 645 .I flags
 646 field contained
 647 .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
 648 and the
 649 .I error
 650 or
 651 .I val
 652 field was not zero.
 653 .TP
 654 .B ENOENT
 655 The blocked system call in the target
 656 has been interrupted by a signal handler
 657 or the target has terminated.
 658 .\" Jann Horn notes:
 659 .\"     you could also get this [ENOENT] if a response has already
 660 .\"     been sent, instead of EINPROGRESS - the only difference is
 661 .\"     whether the target thread has picked up the response yet
 662 .\"
 663 .SS SECCOMP_IOCTL_NOTIF_ADDFD
 664 The
 665 .B SECCOMP_IOCTL_NOTIF_ADDFD
 666 operation (available since Linux 5.9)
 667 allows the supervisor to install a file descriptor
 668 into the target's file descriptor table.
 669 Much like the use of
 670 .BR SCM_RIGHTS
 671 messages described in
 672 .BR unix (7),
 673 this operation is semantically equivalent to duplicating
 674 a file descriptor from the supervisor's file descriptor table
 675 into the target's file descriptor table.
 676 .PP
 677 The
 678 .BR SECCOMP_IOCTL_NOTIF_ADDFD
 679 operation permits the supervisor to emulate a target system call (such as
 680 .BR socket (2)
 681 or
 682 .BR openat (2))
 683 that generates a file descriptor.
 684 The supervisor can perform the system call that generates
 685 the file descriptor (and associated open file description)
 686 and then use this operation to allocate
 687 a file descriptor that refers to the same open file description in the target.
 688 (For an explanation of open file descriptions, see
 689 .BR open (2).)
 690 .PP
 691 Once this operation has been performed,
 692 the supervisor can close its copy of the file descriptor.
 693 .PP
 694 In the target,
 695 the received file descriptor is subject to the same
 696 Linux Security Module (LSM) checks as are applied to a file descriptor
 697 that is received in an
 698 .BR SCM_RIGHTS
 699 ancillary message.
 700 If the file descriptor refers to a socket,
 701 it inherits the cgroup version 1 network controller settings
 702 .RI ( classid
 703 and
 704 .IR netprioidx )
 705 of the target.
 706 .PP
 707 The third
 708 .BR ioctl (2)
 709 argument is a pointer to a structure of the following form:
 710 .PP
 711 .in +4n
 712 .EX
 713 struct seccomp_notif_addfd {
 714     __u64 id;           /* Cookie value */
 715     __u32 flags;        /* Flags */
 716     __u32 srcfd;        /* Local file descriptor number */
 717     __u32 newfd;        /* 0 or desired file descriptor
 718                            number in target */
 719     __u32 newfd_flags;  /* Flags to set on target file
 720                            descriptor */
 721 };
 722 .EE
 723 .in
 724 .PP
 725 The fields in this structure are as follows:
 726 .TP
 727 .I id
 728 This field should be set to the notification ID
 729 (cookie value) that was obtained via
 730 .BR SECCOMP_IOCTL_NOTIF_RECV .
 731 .TP
 732 .I flags
 733 This field is a bit mask of flags that modify the behavior of the operation.
  734 The following flags are supported:
 735 .RS
 736 .TP
 737 .BR SECCOMP_ADDFD_FLAG_SETFD
 738 When allocating the file descriptor in the target,
 739 use the file descriptor number specified in the
 740 .I newfd
 741 field.
 742 .TP
 743 .BR SECCOMP_ADDFD_FLAG_SEND
 744 Available since Linux 5.14, combines the
 745 .B SECCOMP_IOCTL_NOTIF_ADDFD
 746 ioctl with
 747 .B SECCOMP_IOCTL_NOTIF_SEND
 748 into an atomic operation. On successful invocation, the target process's
 749 errno will be 0 and the return value will be the file descriptor number that was
  750 installed in the target. If allocating the file descriptor in the target fails,
 751 the target's syscall continues to be blocked until a successful response is
 752 sent.
 753 .RE
 754 .TP
 755 .I srcfd
 756 This field should be set to the number of the file descriptor
 757 in the supervisor that is to be duplicated.
 758 .TP
 759 .I newfd
 760 This field determines which file descriptor number is allocated in the target.
 761 If the
 762 .BR SECCOMP_ADDFD_FLAG_SETFD
 763 flag is set,
 764 then this field specifies which file descriptor number should be allocated.
 765 If this file descriptor number is already open in the target,
 766 it is atomically closed and reused.
 767 If the descriptor duplication fails due to an LSM check, or if
 768 .I srcfd
 769 is not a valid file descriptor,
 770 the file descriptor
 771 .I newfd
 772 will not be closed in the target process.
 773 .IP
 774 If the
 775 .BR SECCOMP_ADDFD_FLAG_SETFD
  776 flag is not set, then this field must be 0,
 777 and the kernel allocates the lowest unused file descriptor number
 778 in the target.
 779 .TP
 780 .I newfd_flags
 781 This field is a bit mask specifying flags that should be set on
 782 the file descriptor that is received in the target process.
 783 Currently, only the following flag is implemented:
 784 .RS
 785 .TP
 786 .B O_CLOEXEC
 787 Set the close-on-exec flag on the received file descriptor.
 788 .RE
 789 .PP
 790 On success, this
 791 .BR ioctl (2)
 792 call returns the number of the file descriptor that was allocated
 793 in the target.
 794 Assuming that the emulated system call is one that returns
 795 a file descriptor as its function result (e.g.,
 796 .BR socket (2)),
 797 this value can be used as the return value
 798 .RI ( resp.val )
 799 that is supplied in the response that is subsequently sent with the
 800 .BR SECCOMP_IOCTL_NOTIF_SEND
 801 operation.
 802 .PP
 803 On error, \-1 is returned and
 804 .I errno
 805 is set to indicate the cause of the error.
 806 .PP
 807 This operation can fail with the following errors:
 808 .TP
 809 .B EBADF
 810 Allocating the file descriptor in the target would cause the target's
 811 .BR RLIMIT_NOFILE
 812 limit to be exceeded (see
 813 .BR getrlimit (2)).
 814 .TP
 815 .B EBUSY
 816 If the flag
  817 .B SECCOMP_ADDFD_FLAG_SEND
 818 is used, this means the operation can't proceed until other
 819 .B SECCOMP_IOCTL_NOTIF_ADDFD
 820 requests are processed.
 821 .TP
 822 .B EINPROGRESS
 823 The user-space notification specified in the
 824 .I id
 825 field exists but has not yet been fetched (by a
 826 .BR SECCOMP_IOCTL_NOTIF_RECV )
 827 or has already been responded to (by a
 828 .BR SECCOMP_IOCTL_NOTIF_SEND ).
 829 .TP
 830 .B EINVAL
 831 An invalid flag was specified in the
 832 .I flags
 833 or
 834 .I newfd_flags
 835 field, or the
 836 .I newfd
 837 field is nonzero and the
 838 .B SECCOMP_ADDFD_FLAG_SETFD
 839 flag was not specified in the
 840 .I flags
 841 field.
 842 .TP
 843 .B EMFILE
 844 The file descriptor number specified in
 845 .I newfd
 846 exceeds the limit specified in
 847 .IR /proc/sys/fs/nr_open .
 848 .TP
 849 .B ENOENT
 850 The blocked system call in the target
 851 has been interrupted by a signal handler
 852 or the target has terminated.
 853 .PP
 854 Here is some sample code (with error handling omitted) that uses the
  855 .B SECCOMP_IOCTL_NOTIF_ADDFD
 856 operation (here, to emulate a call to
 857 .BR openat (2)):
 858 .PP
 859 .EX
 860 .in +4n
 861 int fd, removeFd;
 862
 863 fd = openat(req->data.args[0], path, req->data.args[2],
 864                 req->data.args[3]);
 865
 866 struct seccomp_notif_addfd addfd;
 867 addfd.id = req->id; /* Cookie from SECCOMP_IOCTL_NOTIF_RECV */
 868 addfd.srcfd = fd;
 869 addfd.newfd = 0;
 870 addfd.flags = 0;
 871 addfd.newfd_flags = O_CLOEXEC;
 872
 873 targetFd = ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
 874
 875 close(fd);          /* No longer needed in supervisor */
 876
 877 struct seccomp_notif_resp *resp;
 878     /* Code to allocate 'resp' omitted */
 879 resp->id = req->id;
 880 resp->error = 0;        /* "Success" */
 881 resp->val = targetFd;
 882 resp->flags = 0;
 883 ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp);
 884 .in
 885 .EE
 886 .SH NOTES
 887 One example use case for the user-space notification
 888 mechanism is to allow a container manager
 889 (a process which is typically running with more privilege than
 890 the processes inside the container)
 891 to mount block devices or create device nodes for the container.
 892 The mount use case provides an example of where the
 893 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
 894 response flag
 895 is useful.
 896 Upon receiving a notification for the
 897 .BR mount (2)
 898 system call, the container manager (the "supervisor") can distinguish
 899 a request to mount a block filesystem
 900 (which would not be possible for a "target" process inside the container)
 901 and mount that filesystem.
 902 If, on the other hand, the container manager detects that the operation
 903 could be performed by the process inside the container
 904 (e.g., a mount of a
 905 .BR tmpfs (5)
 906 filesystem), it can notify the kernel that the target process's
 907 .BR mount (2)
 908 system call can continue.
 909 .\"
 910 .SS select()/poll()/epoll semantics
 911 The file descriptor returned when
 912 .BR seccomp (2)
 913 is employed with the
 914 .B SECCOMP_FILTER_FLAG_NEW_LISTENER
 915 flag can be monitored using
 916 .BR poll (2),
 917 .BR epoll (7),
 918 and
 919 .BR select (2).
 920 These interfaces indicate that the file descriptor is ready as follows:
 921 .IP \(bu 2
 922 When a notification is pending,
 923 these interfaces indicate that the file descriptor is readable.
 924 Following such an indication, a subsequent
 925 .B SECCOMP_IOCTL_NOTIF_RECV
 926 .BR ioctl (2)
 927 will not block, returning either information about a notification
 928 or else failing with the error
 929 .B EINTR
 930 if the target has been killed by a signal or its system call
 931 has been interrupted by a signal handler.
 932 .IP \(bu
 933 After the notification has been received (i.e., by the
 934 .B SECCOMP_IOCTL_NOTIF_RECV
 935 .BR ioctl (2)
 936 operation), these interfaces indicate that the file descriptor is writable,
 937 meaning that a notification response can be sent using the
 938 .B SECCOMP_IOCTL_NOTIF_SEND
 939 .BR ioctl (2)
 940 operation.
 941 .IP \(bu
 942 After the last thread using the filter has terminated and been reaped using
 943 .BR waitpid (2)
 944 (or similar),
 945 the file descriptor indicates an end-of-file condition (readable in
 946 .BR select (2);
 947 .BR POLLHUP / EPOLLHUP
 948 in
 949 .BR poll (2)/
 950 .BR epoll_wait (2)).
 951 .SS Design goals; use of SECCOMP_USER_NOTIF_FLAG_CONTINUE
 952 The intent of the user-space notification feature is
 953 to allow system calls to be performed on behalf of the target.
 954 The target's system call should either be handled by the supervisor or
 955 allowed to continue normally in the kernel (where standard security
 956 policies will be applied).
 957 .PP
 958 .BR "Note well" :
 959 this mechanism must not be used to make security policy decisions
 960 about the system call,
 961 which would be inherently race-prone for reasons described next.
 962 .PP
 963 The
 964 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
 965 flag must be used with caution.
 966 If set by the supervisor, the target's system call will continue.
 967 However, there is a time-of-check, time-of-use race here,
 968 since an attacker could exploit the interval of time where the target is
 969 blocked waiting on the "continue" response to do things such as
 970 rewriting the system call arguments.
 971 .PP
 972 Note furthermore that a user-space notifier can be bypassed if
 973 the existing filters allow the use of
 974 .BR seccomp (2)
 975 or
 976 .BR prctl (2)
 977 to install a filter that returns an action value with a higher precedence than
 978 .B SECCOMP_RET_USER_NOTIF
 979 (see
 980 .BR seccomp (2)).
 981 .PP
 982 It should thus be absolutely clear that the
 983 seccomp user-space notification mechanism
 984 .B cannot
 985 be used to implement a security policy!
 986 It should only ever be used in scenarios where a more privileged process
 987 supervises the system calls of a less privileged target to
 988 get around kernel-enforced security restrictions when
 989 the supervisor deems this safe.
 990 In other words,
 991 in order to continue a system call, the supervisor should be sure that
 992 another security mechanism or the kernel itself will sufficiently block
 993 the system call if its arguments are rewritten to something unsafe.
 994 .\"
 995 .SS Caveats regarding the use of /proc/[tid]/mem
 996 The discussion above noted the need to use the
 997 .BR SECCOMP_IOCTL_NOTIF_ID_VALID
 998 .BR ioctl (2)
 999 when opening the
1000 .IR /proc/[tid]/mem
1001 file of the target
1002 to avoid the possibility of accessing the memory of the wrong process
1003 in the event that the target terminates and its ID
1004 is recycled by another (unrelated) thread.
1005 However, the use of this
1006 .BR ioctl (2)
1007 operation is also necessary in other situations,
1008 as explained in the following paragraphs.
1009 .PP
1010 Consider the following scenario, where the supervisor
1011 tries to read the pathname argument of a target's blocked
1012 .BR mount (2)
1013 system call:
1014 .IP \(bu 2
1015 From one of its functions
1016 .RI ( func() ),
1017 the target calls
1018 .BR mount (2),
1019 which triggers a user-space notification and causes the target to block.
1020 .IP \(bu
1021 The supervisor receives the notification, opens
1022 .IR /proc/[tid]/mem ,
1023 and (successfully) performs the
1024 .BR SECCOMP_IOCTL_NOTIF_ID_VALID
1025 check.
1026 .IP \(bu
1027 The target receives a signal, which causes the
1028 .BR mount (2)
1029 to abort.
1030 .IP \(bu
1031 The signal handler executes in the target, and returns.
1032 .IP \(bu
1033 Upon return from the handler, the execution of
1034 .I func()
1035 resumes, and it returns (and perhaps other functions are called,
1036 overwriting the memory that had been used for the stack frame of
1037 .IR func() ).
1038 .IP \(bu
1039 Using the address provided in the notification information,
1040 the supervisor reads from the target's memory location that used to
1041 contain the pathname.
1042 .IP \(bu
1043 The supervisor now calls
1044 .BR mount (2)
1045 with some arbitrary bytes obtained in the previous step.
1046 .PP
1047 The conclusion from the above scenario is this:
1048 since the target's blocked system call may be interrupted by a signal handler,
1049 the supervisor must be written to expect that the
1050 target may abandon its system call at
1051 .B any
1052 time;
1053 in such an event, any information that the supervisor obtained from
1054 the target's memory must be considered invalid.
1055 .PP
1056 To prevent such scenarios,
1057 every read from the target's memory must be separated from use of
1058 the bytes so obtained by a
1059 .BR SECCOMP_IOCTL_NOTIF_ID_VALID
1060 check.
1061 In the above example, the check would be placed between the two final steps.
1062 An example of such a check is shown in EXAMPLES.
1063 .PP
1064 Following on from the above, it should be clear that
1065 a write by the supervisor into the target's memory can
1066 .B never
1067 be considered safe.
1068 .\"
1069 .SS Caveats regarding blocking system calls
1070 Suppose that the target performs a blocking system call (e.g.,
1071 .BR accept (2))
1072 that the supervisor should handle.
1073 The supervisor might then in turn execute the same blocking system call.
1074 .PP
1075 In this scenario,
1076 it is important to note that if the target's system call is now
1077 interrupted by a signal, the supervisor is
1078 .I not
1079 informed of this.
1080 If the supervisor does not take suitable steps to
1081 actively discover that the target's system call has been canceled,
1082 various difficulties can occur.
1083 Taking the example of
1084 .BR accept (2),
1085 the supervisor might remain blocked in its
1086 .BR accept (2)
1087 holding a port number that the target
1088 (which, after the interruption by the signal handler,
1089 perhaps closed its listening socket) might expect to be able to reuse in a
1090 .BR bind (2)
1091 call.
1092 .PP
1093 Therefore, when the supervisor wishes to emulate a blocking system call,
1094 it must do so in such a way that it gets informed if the target's
1095 system call is interrupted by a signal handler.
1096 For example, if the supervisor itself executes the same
1097 blocking system call, then it could employ a separate thread
1098 that uses the
1099 .B SECCOMP_IOCTL_NOTIF_ID_VALID
1100 operation to check if the target is still blocked in its system call.
1101 Alternatively, in the
1102 .BR accept (2)
1103 example, the supervisor might use
1104 .BR poll (2)
1105 to monitor both the notification file descriptor
1106 (so as to discover when the target's
1107 .BR accept (2)
1108 call has been interrupted) and the listening file descriptor
1109 (so as to know when a connection is available).
1110 .PP
1111 If the target's system call is interrupted,
1112 the supervisor must take care to release resources (e.g., file descriptors)
1113 that it acquired on behalf of the target.
1114 .\"
1115 .SS Interaction with SA_RESTART signal handlers
1116 Consider the following scenario:
1117 .IP \(bu 2
1118 The target process has used
1119 .BR sigaction (2)
1120 to install a signal handler with the
1121 .B SA_RESTART
1122 flag.
1123 .IP \(bu
1124 The target has made a system call that triggered a seccomp
1125 user-space notification and the target is currently blocked
1126 until the supervisor sends a notification response.
1127 .IP \(bu
1128 A signal is delivered to the target and the signal handler is executed.
1129 .IP \(bu
1130 When (if) the supervisor attempts to send a notification response, the
1131 .B SECCOMP_IOCTL_NOTIF_SEND
1132 .BR ioctl (2)
1133 operation will fail with the
1134 .BR ENOENT
1135 error.
1136 .PP
1137 In this scenario, the kernel will restart the target's system call.
1138 Consequently, the supervisor will receive another user-space notification.
1139 Thus, depending on how many times the blocked system call
1140 is interrupted by a signal handler,
1141 the supervisor may receive multiple notifications for
1142 the same instance of a system call in the target.
1143 .PP
1144 One oddity is that system call restarting as described in this scenario
1145 will occur even for the blocking system calls listed in
1146 .BR signal (7)
1147 that would
1148 .B never
1149 normally be restarted by the
1150 .BR SA_RESTART
1151 flag.
1152 .PP
1153 Furthermore, if the supervisor response is a file descriptor
1154 added with
1155 .BR SECCOMP_IOCTL_NOTIF_ADDFD ,
1156 then the flag
1157 .B SECCOMP_ADDFD_FLAG_SEND
1158 can be used to atomically add the file descriptor and return that value,
1159 making sure no file descriptors are inadvertently leaked into the target.
1160 .\" FIXME
1161 .\" About the above, Kees Cook commented:
1162 .\"
1163 .\" Does this need fixing? I imagine the correct behavior for this case
1164 .\" would be a response to _SEND of EINPROGRESS and the target would see
1165 .\" EINTR normally?
1166 .\"
1167 .\" I mean, it's not like seccomp doesn't already expose weirdness with
1168 .\" syscall restarts. Not even arm64 compat agrees[3] with arm32 in this
1169 .\" regard. :(
1170 .
1171 .\" FIXME
1172 .\" Michael Kerrisk:
1173 .\" I wonder about the effect of this oddity for system calls that
1174 .\" are normally nonrestartable because they have timeouts. My
1175 .\" understanding is that the kernel doesn't restart those system
1176 .\" calls because it's impossible for the kernel to restart the call
1177 .\" with the right timeout value. I wonder what happens when those
1178 .\" system calls are restarted in the scenario we're discussing.)
1179 .SH BUGS
1180 If a
1181 .BR SECCOMP_IOCTL_NOTIF_RECV
1182 .BR ioctl (2)
1183 operation
1184 .\" or a poll/epoll/select
1185 is performed after the target terminates, then the
1186 .BR ioctl (2)
1187 call simply blocks (rather than returning an error to indicate that the
1188 target no longer exists).
1189 .\" FIXME
1190 .\" Comment from Kees Cook:
1191 .\"
1192 .\" I want this fixed. It caused me no end of pain when building the
1193 .\" selftests, and ended up spawning my implementing a global test timeout
1194 .\" in kselftest. :P Before the usage counter refactor, there was no sane
1195 .\" way to deal with this, but now I think we're close.
1196 .\"
1197 .SH EXAMPLES
1198 The (somewhat contrived) program shown below demonstrates the use of
1199 the interfaces described in this page.
1200 The program creates a child process that serves as the "target" process.
1201 The child process installs a seccomp filter that returns the
1202 .B SECCOMP_RET_USER_NOTIF
1203 action value if a call is made to
1204 .BR mkdir (2).
1205 The child process then calls
1206 .BR mkdir (2)
1207 once for each of the supplied command-line arguments,
1208 and reports the result returned by the call.
1209 After processing all arguments, the child process terminates.
1210 .PP
1211 The parent process acts as the supervisor, listening for the notifications
1212 that are generated when the target process calls
1213 .BR mkdir (2).
1214 When such a notification occurs,
1215 the supervisor examines the memory of the target process (using
1216 .IR /proc/[pid]/mem )
1217 to discover the pathname argument that was supplied to the
1218 .BR mkdir (2)
1219 call, and performs one of the following actions:
1220 .IP \(bu 2
1221 If the pathname begins with the prefix "/tmp/",
1222 then the supervisor attempts to create the specified directory,
1223 and then spoofs a return for the target process based on the return
1224 value of the supervisor's
1225 .BR mkdir (2)
1226 call.
1227 In the event that that call succeeds,
1228 the spoofed success return value is the length of the pathname.
1229 .IP \(bu
1230 If the pathname begins with "./" (i.e., it is a relative pathname),
1231 the supervisor sends a
1232 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
1233 response to the kernel to say that the kernel should execute
1234 the target process's
1235 .BR mkdir (2)
1236 call.
1237 .IP \(bu
1238 If the pathname begins with some other prefix,
1239 the supervisor spoofs an error return for the target process,
1240 so that the target process's
1241 .BR mkdir (2)
1242 call appears to fail with the error
1243 .BR EOPNOTSUPP
1244 ("Operation not supported").
1245 Additionally, if the specified pathname is exactly "/bye",
1246 then the supervisor terminates.
1247 .PP
1248 This program can be used to demonstrate various aspects of the
1249 behavior of the seccomp user-space notification mechanism.
1250 To aid such demonstrations,
1251 the program logs various messages to show the operation
1252 of the target process (lines prefixed "T:") and the supervisor
1253 (indented lines prefixed "S:").
1254 .PP
1255 In the following example, the target attempts to create the directory
1256 .IR /tmp/x .
1257 Upon receiving the notification, the supervisor creates the directory on the
1258 target's behalf,
1259 and spoofs a success return to be received by the target process's
1260 .BR mkdir (2)
1261 call.
1262 .PP
1263 .in +4n
1264 .EX
1265 $ \fB./seccomp_unotify /tmp/x\fP
1266 T: PID = 23168
1267
1268 T: about to mkdir("/tmp/x")
1269         S: got notification (ID 0x17445c4a0f4e0e3c) for PID 23168
1270         S: executing: mkdir("/tmp/x", 0700)
1271         S: success! spoofed return = 6
1272         S: sending response (flags = 0; val = 6; error = 0)
1273 T: SUCCESS: mkdir(2) returned 6
1274
1275 T: terminating
1276         S: target has terminated; bye
1277 .EE
1278 .in
1279 .PP
1280 In the above output, note that the spoofed return value seen by the target
1281 process is 6 (the length of the pathname
1282 .IR /tmp/x ),
1283 whereas a normal
1284 .BR mkdir (2)
1285 call returns 0 on success.
1286 .PP
1287 In the next example, the target attempts to create a directory using the
1288 relative pathname
1289 .IR ./sub .
1290 Since this pathname starts with "./",
1291 the supervisor sends a
1292 .B SECCOMP_USER_NOTIF_FLAG_CONTINUE
1293 response to the kernel,
1294 and the kernel then (successfully) executes the target process's
1295 .BR mkdir (2)
1296 call.
1297 .PP
1298 .in +4n
1299 .EX
1300 $ \fB./seccomp_unotify ./sub\fP
1301 T: PID = 23204
1302
1303 T: about to mkdir("./sub")
1304         S: got notification (ID 0xddb16abe25b4c12) for PID 23204
1305         S: target can execute system call
1306         S: sending response (flags = 0x1; val = 0; error = 0)
1307 T: SUCCESS: mkdir(2) returned 0
1308
1309 T: terminating
1310         S: target has terminated; bye
1311 .EE
1312 .in
1313 .PP
1314 If the target process attempts to create a directory with
1315 a pathname that doesn't start with "./" and doesn't begin with the prefix
1316 "/tmp/", then the supervisor spoofs an error return
1317 .RB ( EOPNOTSUPP ,
1318 "Operation not supported")
1319 for the target's
1320 .BR mkdir (2)
1321 call (which is not executed):
1322 .PP
1323 .in +4n
1324 .EX
1325 $ \fB./seccomp_unotify /xxx\fP
1326 T: PID = 23178
1327
1328 T: about to mkdir("/xxx")
1329         S: got notification (ID 0xe7dc095d1c524e80) for PID 23178
1330         S: spoofing error response (Operation not supported)
1331         S: sending response (flags = 0; val = 0; error = \-95)
1332 T: ERROR: mkdir(2): Operation not supported
1333
1334 T: terminating
1335         S: target has terminated; bye
1336 .EE
1337 .in
1338 .PP
1339 In the next example,
1340 the target process attempts to create a directory with the pathname
1341 .IR /tmp/nosuchdir/b .
1342 Upon receiving the notification,
1343 the supervisor attempts to create that directory, but the
1344 .BR mkdir (2)
1345 call fails because the directory
1346 .I /tmp/nosuchdir
1347 does not exist.
1348 Consequently, the supervisor spoofs an error return that passes the error
1349 that it received back to the target process's
1350 .BR mkdir (2)
1351 call.
1352 .PP
1353 .in +4n
1354 .EX
1355 $ \fB./seccomp_unotify /tmp/nosuchdir/b\fP
1356 T: PID = 23199
1357
1358 T: about to mkdir("/tmp/nosuchdir/b")
1359         S: got notification (ID 0x8744454293506046) for PID 23199
1360         S: executing: mkdir("/tmp/nosuchdir/b", 0700)
1361         S: failure! (errno = 2; No such file or directory)
1362         S: sending response (flags = 0; val = 0; error = \-2)
1363 T: ERROR: mkdir(2): No such file or directory
1364
1365 T: terminating
1366         S: target has terminated; bye
1367 .EE
1368 .in
1369 .PP
1370 If the supervisor receives a notification and sees that the
1371 argument of the target's
1372 .BR mkdir (2)
1373 is the string "/bye", then (as well as spoofing an
1374 .B EOPNOTSUPP
1375 error), the supervisor terminates.
1376 If the target process subsequently executes another
1377 .BR mkdir (2)
1378 that triggers its seccomp filter to return the
1379 .B SECCOMP_RET_USER_NOTIF
1380 action value, then the kernel causes the target process's system call to
1381 fail with the error
1382 .B ENOSYS
1383 ("Function not implemented").
1384 This is demonstrated by the following example:
1385 .PP
1386 .in +4n
1387 .EX
1388 $ \fB./seccomp_unotify /bye /tmp/y\fP
1389 T: PID = 23185
1390
1391 T: about to mkdir("/bye")
1392         S: got notification (ID 0xa81236b1d2f7b0f4) for PID 23185
1393         S: spoofing error response (Operation not supported)
1394         S: sending response (flags = 0; val = 0; error = \-95)
1395         S: terminating **********
1396 T: ERROR: mkdir(2): Operation not supported
1397
1398 T: about to mkdir("/tmp/y")
1399 T: ERROR: mkdir(2): Function not implemented
1400
1401 T: terminating
1402 .EE
1403 .in
1404 .\"
1405 .SS Program source
1406 .EX
1407 #define _GNU_SOURCE
1408 #include <errno.h>
1409 #include <fcntl.h>
1410 #include <limits.h>
1411 #include <linux/audit.h>
1412 #include <linux/filter.h>
1413 #include <linux/seccomp.h>
1414 #include <signal.h>
1415 #include <stdbool.h>
1416 #include <stddef.h>
1417 #include <stdint.h>
1418 #include <stdio.h>
1419 #include <stdlib.h>
1420 #include <sys/socket.h>
1421 #include <sys/ioctl.h>
1422 #include <sys/prctl.h>
1423 #include <sys/stat.h>
1424 #include <sys/types.h>
1425 #include <sys/un.h>
1426 #include <sys/syscall.h>
1427 #include <unistd.h>
1428
1429 #define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \e
1430                         } while (0)    /* Print errno message, then exit */
1431
1432 /* Send the file descriptor \(aqfd\(aq over the connected UNIX domain socket
1433    \(aqsockfd\(aq (as SCM_RIGHTS data). Returns 0 on success, or \-1 on error. */
1434
1435 static int
1436 sendfd(int sockfd, int fd)
1437 {
1438     struct msghdr msgh;
1439     struct iovec iov;
1440     int data;                   /* Dummy payload (see below) */
1441     struct cmsghdr *cmsgp;
1442
1443     /* Allocate a char array of suitable size to hold the ancillary data.
1444        However, since this buffer is in reality a \(aqstruct cmsghdr\(aq, use a
1445        union to ensure that it is suitably aligned. */
1446     union {
1447         char   buf[CMSG_SPACE(sizeof(int))];
1448                         /* Space large enough to hold an \(aqint\(aq */
1449         struct cmsghdr align;
1450     } controlMsg;
1451
1452     /* The \(aqmsg_name\(aq field can be used to specify the address of the
1453        destination socket when sending a datagram. However, we do not
1454        need to use this field because \(aqsockfd\(aq is a connected socket. */
1455
1456     msgh.msg_name = NULL;
1457     msgh.msg_namelen = 0;
1458
1459     /* On Linux, we must transmit at least one byte of real data in
1460        order to send ancillary data. We transmit an arbitrary integer
1461        whose value is ignored by recvfd(). */
1462
1463     msgh.msg_iov = &iov;
1464     msgh.msg_iovlen = 1;
1465     iov.iov_base = &data;
1466     iov.iov_len = sizeof(int);
1467     data = 12345;               /* Arbitrary value */
1468
1469     /* Set \(aqmsghdr\(aq fields that describe ancillary data */
1470
1471     msgh.msg_control = controlMsg.buf;
1472     msgh.msg_controllen = sizeof(controlMsg.buf);
1473
1474     /* Set up one SCM_RIGHTS control message carrying \(aqfd\(aq */
1475
1476     cmsgp = CMSG_FIRSTHDR(&msgh);
1477     cmsgp\->cmsg_level = SOL_SOCKET;
1478     cmsgp\->cmsg_type = SCM_RIGHTS;
1479     cmsgp\->cmsg_len = CMSG_LEN(sizeof(int));
1480     memcpy(CMSG_DATA(cmsgp), &fd, sizeof(int));
1481
1482     /* Send real plus ancillary data */
1483
1484     if (sendmsg(sockfd, &msgh, 0) == \-1)
1485         return \-1;              /* errno was set by sendmsg() */
1486
1487     return 0;
1488 }
1489
1490 /* Receive a file descriptor on the connected UNIX domain socket \(aqsockfd\(aq.
1491    Returns the received descriptor on success, or \-1 (errno set) on error. */
1492
1493 static int
1494 recvfd(int sockfd)
1495 {
1496     struct msghdr msgh;
1497     struct iovec iov;
1498     int data, fd;
1499     ssize_t nr;
1500
1501     /* Allocate a char buffer for the ancillary data. See the comments
1502        in sendfd() */
1503     union {
1504         char   buf[CMSG_SPACE(sizeof(int))];
1505         struct cmsghdr align;
1506     } controlMsg;
1507     struct cmsghdr *cmsgp;
1508
1509     /* The \(aqmsg_name\(aq field can be used to obtain the address of the
1510        sending socket. However, we do not need this information. */
1511
1512     msgh.msg_name = NULL;
1513     msgh.msg_namelen = 0;
1514
1515     /* Specify buffer for receiving real data */
1516
1517     msgh.msg_iov = &iov;
1518     msgh.msg_iovlen = 1;
1519     iov.iov_base = &data;       /* Real data is an \(aqint\(aq */
1520     iov.iov_len = sizeof(int);
1521
1522     /* Set \(aqmsghdr\(aq fields that describe ancillary data */
1523
1524     msgh.msg_control = controlMsg.buf;
1525     msgh.msg_controllen = sizeof(controlMsg.buf);
1526
1527     /* Receive real plus ancillary data; real data is ignored */
1528
1529     nr = recvmsg(sockfd, &msgh, 0);
1530     if (nr == \-1)
1531         return \-1;              /* errno was set by recvmsg() */
1532
1533     cmsgp = CMSG_FIRSTHDR(&msgh);
1534
1535     /* Require a SOL_SOCKET/SCM_RIGHTS header carrying exactly one \(aqint\(aq */
1536
1537     if (cmsgp == NULL ||
1538             cmsgp\->cmsg_len != CMSG_LEN(sizeof(int)) ||
1539             cmsgp\->cmsg_level != SOL_SOCKET ||
1540             cmsgp\->cmsg_type != SCM_RIGHTS) {
1541         errno = EINVAL;
1542         return \-1;
1543     }
1544
1545     /* Return the received file descriptor to our caller */
1546
1547     memcpy(&fd, CMSG_DATA(cmsgp), sizeof(int));
1548     return fd;
1549 }
1550
1551 static void
1552 sigchldHandler(int sig)
1553 {
1554     char msg[] = "\etS: target has terminated; bye\en";
1555     /* Use only async\-signal\-safe calls here: write(2) and _exit(2) */
1556     write(STDOUT_FILENO, msg, sizeof(msg) \- 1);    /* Omit null byte */
1557     _exit(EXIT_SUCCESS);
1558 }
1559
1560 static int
1561 seccomp(unsigned int operation, unsigned int flags, void *args)
1562 {       /* Raw seccomp(2) call made via syscall(2) */
1563     return (int) syscall(__NR_seccomp, operation, flags, args);
1564 }
1565
1566 /* The following is the x86\-64\-specific BPF boilerplate code for checking
1567    that the BPF program is running on the right architecture + ABI; the
1568    process is killed if either check fails. Otherwise, at completion of
1569    these instructions, the accumulator contains the system call number. */
1570
1571 /* For the x32 ABI, all system call numbers have bit 30 set */
1572
1573 #define X32_SYSCALL_BIT         0x40000000
1574
1575 #define X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR \e
1576         BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
1577                 (offsetof(struct seccomp_data, arch))), \e
1578         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), \e
1579         BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
1580                  (offsetof(struct seccomp_data, nr))), \e
1581         BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), \e
1582         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)
1583
1584 /* installNotifyFilter() installs a seccomp filter that generates
1585    user\-space notifications (SECCOMP_RET_USER_NOTIF) when the process
1586    calls mkdir(2); the filter allows all other x86\-64 system calls.
1587
1588    The function return value is a file descriptor from which the
1589    user\-space notifications can be fetched. */
1590
1591 static int
1592 installNotifyFilter(void)
1593 {
1594     struct sock_filter filter[] = {
1595         X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR,
1596
1597         /* mkdir() triggers notification to user\-space supervisor;
1598            any other system call skips ahead to the ALLOW return */
1599         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mkdir, 0, 1),
1600         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
1601
1602         /* Every other system call is allowed */
1603
1604         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
1605     };
1606
1607     struct sock_fprog prog = {
1608         .len = sizeof(filter) / sizeof(filter[0]),
1609         .filter = filter,
1610     };
1611
1612     /* Install the filter with the SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
1613        as a result, seccomp() returns a notification file descriptor. */
1614
1615     int notifyFd = seccomp(SECCOMP_SET_MODE_FILTER,
1616                            SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
1617     if (notifyFd == \-1)
1618         errExit("seccomp\-install\-notify\-filter");
1619
1620     return notifyFd;
1621 }
1622
1623 /* Close both ends of a socket pair created by socketpair() */
1624 static void
1625 closeSocketPair(int sockPair[2])
1626 {
1627     const char *errMsg[] = { "closeSocketPair\-close\-0",
1628                              "closeSocketPair\-close\-1" };
1629     for (int j = 0; j < 2; j++)
1630         if (close(sockPair[j]) == \-1)
1631             errExit(errMsg[j]);
1632 }
1633
1634 /* Implementation of the target process; create a child process that:
1635
1636    (1) installs a seccomp filter with the
1637        SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
1638    (2) writes the seccomp notification file descriptor returned from
1639        the previous step onto the UNIX domain socket, \(aqsockPair[0]\(aq;
1640    (3) calls mkdir(2) for each element of \(aqargv\(aq.
1641
1642    The function return value in the parent is the PID of the child
1643    process; the child does not return from this function. */
1644
1645 static pid_t
1646 targetProcess(int sockPair[2], char *argv[])
1647 {
1648     pid_t targetPid = fork();
1649     if (targetPid == \-1)
1650         errExit("fork");
1651
1652     if (targetPid > 0)          /* In parent, return PID of child */
1653         return targetPid;
1654
1655     /* Child falls through to here */
1656
1657     printf("T: PID = %ld\en", (long) getpid());
1658
1659     /* Set no_new_privs, then install the seccomp notification filter */
1660
1661     if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1662         errExit("prctl");
1663
1664     int notifyFd = installNotifyFilter();
1665
1666     /* Pass the notification file descriptor to the supervisor process
1667        over a UNIX domain socket */
1668
1669     if (sendfd(sockPair[0], notifyFd) == \-1)
1670         errExit("sendfd");
1671
1672     /* Notification and socket FDs are no longer needed in target */
1673
1674     if (close(notifyFd) == \-1)
1675         errExit("close\-target\-notify\-fd");
1676
1677     closeSocketPair(sockPair);
1678
1679     /* Call mkdir() on each entry of the NULL\-terminated \(aqargv\(aq list */
1680
1681     for (char **ap = argv; *ap != NULL; ap++) {
1682         printf("\enT: about to mkdir(\e"%s\e")\en", *ap);
1683
1684         int s = mkdir(*ap, 0700);
1685         if (s == \-1)
1686             perror("T: ERROR: mkdir(2)");
1687         else
1688             printf("T: SUCCESS: mkdir(2) returned %d\en", s);
1689     }
1690
1691     printf("\enT: terminating\en");
1692     exit(EXIT_SUCCESS);
1693 }
1694
1695 /* Return true if notification ID \(aqid\(aq (obtained from a
1696    SECCOMP_IOCTL_NOTIF_RECV operation on \(aqnotifyFd\(aq) is still valid.
1697    The ID ceases to be valid when the target terminates or stops being
1698    blocked in the notifying system call (interrupted by a signal).
1699
1700    Performing this check while accessing the target\(aqs /proc/PID files
1701    guards against the TOCTOU race in which the PID reported by
1702    SECCOMP_IOCTL_NOTIF_RECV has been recycled by another process. */
1703
1704 static bool
1705 cookieIsValid(int notifyFd, uint64_t id)
1706 {
1707     int status = ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id);
1708     return status == 0;
1709 }
1710
1711 /* Fetch, from the memory of the target process, the pathname referred
1712    to by the system call argument \(aqargNum\(aq in \(aqreq\->data.args[]\(aq.
1713    The pathname is returned in \(aqpath\(aq, a buffer of \(aqlen\(aq bytes
1714    allocated by the caller; \(aqnotifyFd\(aq is used to validate \(aqreq\->id\(aq.
1715
1716    Returns true if the pathname is successfully fetched, and false
1717    otherwise. For possible causes of failure, see the comments below. */
1718
1719 static bool
1720 getTargetPathname(struct seccomp_notif *req, int notifyFd,
1721                   int argNum, char *path, size_t len)
1722 {
1723     char procMemPath[PATH_MAX];
1724
1725     snprintf(procMemPath, sizeof(procMemPath), "/proc/%d/mem", req\->pid);
1726
1727     int procMemFd = open(procMemPath, O_RDONLY | O_CLOEXEC);
1728     if (procMemFd == \-1)
1729         return false;
1730
1731     /* Check that the process whose info we are accessing is still alive
1732        and blocked in the system call that caused the notification.
1733        If the SECCOMP_IOCTL_NOTIF_ID_VALID operation (performed in
1734        cookieIsValid()) succeeded, we know that the /proc/PID/mem file
1735        descriptor that we opened corresponded to the process for which we
1736        received a notification. If that process subsequently terminates,
1737        then read() on that file descriptor will return 0 (EOF). */
1738
1739     if (!cookieIsValid(notifyFd, req\->id)) {
1740         close(procMemFd);
1741         return false;
1742     }
1743
1744     /* Read up to \(aqlen\(aq bytes from the address of the pathname argument */
1745
1746     ssize_t nread = pread(procMemFd, path, len, req\->data.args[argNum]);
1747
1748     close(procMemFd);
1749
1750     if (nread <= 0)
1751         return false;
1752
1753     /* Once again check that the notification ID is still valid. The
1754        case we are particularly concerned about here is that just
1755        before we fetched the pathname, the target\(aqs blocked system
1756        call was interrupted by a signal handler, and after the handler
1757        returned, the target carried on execution (past the interrupted
1758        system call). In that case, we have no guarantees about what we
1759        are reading, since the target\(aqs memory may have been arbitrarily
1760        changed by subsequent operations. */
1761
1762     if (!cookieIsValid(notifyFd, req\->id)) {
1763         perror("\etS: notification ID check failed!!!");
1764         return false;
1765     }
1766
1767     /* Even if the target\(aqs system call was not interrupted by a signal,
1768        we have no guarantees about what was in the memory of the target
1769        process. (The memory may have been modified by another thread, or
1770        even by an external attacking process.) We therefore treat the
1771        buffer returned by pread() as untrusted input. The bytes that were
1772        read should include a terminating null byte; if not, we return
1773        false, so that the caller triggers an error for the target. */
1774
1775     if (strnlen(path, nread) < nread)
1776         return true;
1777
1778     return false;
1779 }
1780
1781 /* Allocate the seccomp user\-space notification request and response
1782    buffers, returned via \(aqreq\(aq and \(aqresp\(aq (the caller must free them).
1783    The structure sizes reported by the kernel are stored in \(aq*sizes\(aq. */
1784
1785 static void
1786 allocSeccompNotifBuffers(struct seccomp_notif **req,
1787         struct seccomp_notif_resp **resp,
1788         struct seccomp_notif_sizes *sizes)
1789 {
1790     /* Discover the sizes of the structures that are used to receive
1791        notifications and send notification responses, and allocate
1792        buffers of those sizes. */
1793
1794     if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, sizes) == \-1)
1795         errExit("seccomp\-SECCOMP_GET_NOTIF_SIZES");
1796
1797     *req = malloc(sizes\->seccomp_notif);
1798     if (*req == NULL)
1799         errExit("malloc\-seccomp_notif");
1800
1801     /* When allocating the response buffer, we must allow for the fact
1802        that the user\-space binary may have been built with user\-space
1803        headers where \(aqstruct seccomp_notif_resp\(aq is bigger than the
1804        response buffer expected by the (older) kernel. Therefore, we
1805        allocate a buffer that is the maximum of the two sizes. This
1806        ensures that if the supervisor places bytes into the response
1807        structure that are past the response size that the kernel expects,
1808        then the supervisor is not touching an invalid memory location. */
1809
1810     size_t resp_size = sizes\->seccomp_notif_resp;
1811     if (sizeof(struct seccomp_notif_resp) > resp_size)
1812         resp_size = sizeof(struct seccomp_notif_resp);
1813
1814     *resp = malloc(resp_size);
1815     if (*resp == NULL)      /* Test the allocation, not the out\-pointer */
1816         errExit("malloc\-seccomp_notif_resp");
1817
1818 }
1819
1820 /* Handle notifications that arrive via the SECCOMP_RET_USER_NOTIF file
1821    descriptor, \(aqnotifyFd\(aq. */
1822
1823 static void
1824 handleNotifications(int notifyFd)
1825 {
1826     struct seccomp_notif_sizes sizes;
1827     struct seccomp_notif *req;
1828     struct seccomp_notif_resp *resp;
1829     char path[PATH_MAX];
1830
1831     allocSeccompNotifBuffers(&req, &resp, &sizes);
1832
1833     /* Loop handling notifications */
1834
1835     for (;;) {
1836
1837         /* Wait for next notification, returning info in \(aq*req\(aq */
1838
1839         memset(req, 0, sizes.seccomp_notif);
1840         if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_RECV, req) == \-1) {
1841             if (errno == EINTR)
1842                 continue;
1843             errExit("\etS: ioctl\-SECCOMP_IOCTL_NOTIF_RECV");
1844         }
1845
1846         printf("\etS: got notification (ID %#llx) for PID %d\en",
1847                 req\->id, req\->pid);
1848
1849         /* The only system call that can generate a notification event
1850            is mkdir(2). Nevertheless, we check that the notified system
1851            call is indeed mkdir() as kind of future\-proofing of this
1852            code in case the seccomp filter is later modified to
1853            generate notifications for other system calls. */
1854
1855         if (req\->data.nr != __NR_mkdir) {
1856             printf("\etS: notification contained unexpected "
1857                     "system call number; bye!!!\en");
1858             exit(EXIT_FAILURE);
1859         }
1860
1861         bool pathOK = getTargetPathname(req, notifyFd, 0, path,
1862                                         sizeof(path));
1863
1864         /* Prepopulate some fields of the response */
1865
1866         resp\->id = req\->id;     /* Response includes notification ID */
1867         resp\->flags = 0;
1868         resp\->val = 0;
1869
1870         /* If getTargetPathname() failed, trigger an EINVAL error
1871            response (sending this response may yield an error if the
1872            failure occurred because the notification ID was no longer
1873            valid); if the directory is in /tmp, then create it on behalf
1874            of the supervisor; if the pathname starts with \(aq.\(aq, tell the
1875            kernel to let the target process execute the mkdir();
1876            otherwise, give an error for a directory pathname in any other
1877            location. */
1878
1879         if (!pathOK) {
1880             resp->error = -EINVAL;
1881             printf("\etS: spoofing error for invalid pathname (%s)\en",
1882                     strerror(-resp->error));
1883         } else if (strncmp(path, "/tmp/", strlen("/tmp/")) == 0) {
1884             printf("\etS: executing: mkdir(\e"%s\e", %#llo)\en",
1885                     path, req\->data.args[1]);
1886
1887             if (mkdir(path, req\->data.args[1]) == 0) {
1888                 resp\->error = 0;            /* "Success" */
1889                 resp\->val = strlen(path);   /* Used as return value of
1890                                                mkdir() in target */
1891                 printf("\etS: success! spoofed return = %lld\en",
1892                         resp\->val);
1893             } else {
1894
1895                 /* If mkdir() failed in the supervisor, pass the error
1896                    back to the target */
1897
1898                 resp\->error = \-errno;
1899                 printf("\etS: failure! (errno = %d; %s)\en", errno,
1900                         strerror(errno));
1901             }
1902         } else if (strncmp(path, "./", strlen("./")) == 0) {
1903             resp\->error = resp\->val = 0;
1904             resp\->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
1905             printf("\etS: target can execute system call\en");
1906         } else {
1907             resp\->error = \-EOPNOTSUPP;
1908             printf("\etS: spoofing error response (%s)\en",
1909                     strerror(\-resp\->error));
1910         }
1911
1912         /* Send a response to the notification */
1913
1914         printf("\etS: sending response "
1915                 "(flags = %#x; val = %lld; error = %d)\en",
1916                 resp\->flags, resp\->val, resp\->error);
1917
1918         if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp) == \-1) {
1919             if (errno == ENOENT)
1920                 printf("\etS: response failed with ENOENT; "
1921                         "perhaps target process\(aqs syscall was "
1922                         "interrupted by a signal?\en");
1923             else
1924                 perror("ioctl\-SECCOMP_IOCTL_NOTIF_SEND");
1925         }
1926
1927         /* If the pathname is just "/bye", then the supervisor breaks out
1928            of the loop and terminates. This allows us to see what happens
1929            if the target process makes further calls to mkdir(2). */
1930
1931         if (strcmp(path, "/bye") == 0)
1932             break;
1933     }
1934
1935     free(req);
1936     free(resp);
1937     printf("\etS: terminating **********\en");
1938     exit(EXIT_FAILURE);
1939 }
1940
1941 /* Implementation of the supervisor process (this function does not
1942    return, since handleNotifications() always terminates the process):
1943    (1) obtains the notification file descriptor from \(aqsockPair[1]\(aq
1944    (2) handles notifications that arrive on that file descriptor. */
1945
1946 static void
1947 supervisor(int sockPair[2])
1948 {
1949     int notifyFd = recvfd(sockPair[1]);
1950     if (notifyFd == \-1)
1951         errExit("recvfd");
1952
1953     closeSocketPair(sockPair);  /* We no longer need the socket pair */
1954
1955     handleNotifications(notifyFd);
1956 }
1957
1958 int
1959 main(int argc, char *argv[])
1960 {
1961     int sockPair[2];
1962
1963     setbuf(stdout, NULL);
1964
1965     if (argc < 2) {
1966         fprintf(stderr, "At least one pathname argument is required\en");
1967         exit(EXIT_FAILURE);
1968     }
1969
1970     /* Create a pair of connected UNIX domain sockets that is used to
1971        pass the seccomp notification file descriptor from the target
1972        process to the supervisor process. */
1973
1974     if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockPair) == \-1)
1975         errExit("socketpair");
1976
1977     /* Create a child process\-\-the "target"\-\-that installs seccomp
1978        filtering. The target process writes the seccomp notification
1979        file descriptor onto \(aqsockPair[0]\(aq and then calls mkdir(2) for
1980        each directory in the command\-line arguments. */
1981
1982     (void) targetProcess(sockPair, &argv[optind]);
1983
1984     /* Catch SIGCHLD when the target terminates, so that the
1985        supervisor can also terminate. */
1986
1987     struct sigaction sa;
1988     sa.sa_handler = sigchldHandler;
1989     sa.sa_flags = 0;
1990     sigemptyset(&sa.sa_mask);
1991     if (sigaction(SIGCHLD, &sa, NULL) == \-1)
1992         errExit("sigaction");
1993
1994     supervisor(sockPair);
1995
1996     exit(EXIT_SUCCESS);
1997 }
1998 .EE
1999 .SH SEE ALSO
2000 .BR ioctl (2),
2001 .BR pidfd_open (2),
2002 .BR pidfd_getfd (2),
2003 .BR seccomp (2)
2004 .PP
2005 A further example program can be found in the kernel source file
2006 .IR samples/seccomp/user\-trap.c .