man2/mount_setattr.2

   1 .\" Copyright (c) 2021 by Christian Brauner <christian.brauner@ubuntu.com>
   2 .\"
   3 .\" %%%LICENSE_START(VERBATIM)
   4 .\" Permission is granted to make and distribute verbatim copies of this
   5 .\" manual provided the copyright notice and this permission notice are
   6 .\" preserved on all copies.
   7 .\"
   8 .\" Permission is granted to copy and distribute modified versions of this
   9 .\" manual under the conditions for verbatim copying, provided that the
  10 .\" entire resulting derived work is distributed under the terms of a
  11 .\" permission notice identical to this one.
  12 .\"
  13 .\" Since the Linux kernel and libraries are constantly changing, this
  14 .\" manual page may be incorrect or out-of-date.  The author(s) assume no
  15 .\" responsibility for errors or omissions, or for damages resulting from
  16 .\" the use of the information contained herein.  The author(s) may not
  17 .\" have taken the same level of care in the production of this manual,
  18 .\" which is licensed free of charge, as they might when working
  19 .\" professionally.
  20 .\"
  21 .\" Formatted or processed versions of this manual, if unaccompanied by
  22 .\" the source, must acknowledge the copyright and authors of this work.
  23 .\" %%%LICENSE_END
  24 .\"
  25 .TH MOUNT_SETATTR 2 2021-03-22 "Linux" "Linux Programmer's Manual"
  26 .SH NAME
  27 mount_setattr \- change properties of a mount or mount tree
  28 .SH SYNOPSIS
  29 .nf
  30
  31 .PP
  32 .BR "#include <linux/fcntl.h>" " /* Definition of " AT_* " constants */"
  33 .BR "#include <linux/mount.h>" " /* Definition of " MOUNT_ATTR_* " constants */"
  34 .BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
  35 .B #include <unistd.h>
  36 .PP
  37 .BI "int syscall(SYS_mount_setattr, int " dirfd ", const char *" pathname ,
  38 .BI "            unsigned int " flags ", struct mount_attr *" attr \
  39 ", size_t " size );
  40 .fi
  41 .PP
  42 .IR Note :
  43 glibc provides no wrapper for
  44 .BR mount_setattr (),
  45 necessitating the use of
  46 .BR syscall (2).
  47 .SH DESCRIPTION
  48 The
  49 .BR mount_setattr ()
  50 system call changes the mount properties of a mount or an entire mount tree.
  51 If
  52 .I pathname
  53 is a relative pathname,
  54 then it is interpreted relative to
  55 the directory referred to by the file descriptor
  56 .IR dirfd .
  57 If
  58 .I dirfd
  59 is the special value
  60 .BR AT_FDCWD ,
  61 then
  62 .I pathname
  63 is interpreted relative to
  64 the current working directory of the calling process.
  65 If
  66 .I pathname
  67 is the empty string and
  68 .B AT_EMPTY_PATH
  69 is specified in
  70 .IR flags ,
  71 then the mount properties of the mount identified by
  72 .I dirfd
  73 are changed.
  74 (See
  75 .BR openat (2)
  76 for an explanation of why the
  77 .I dirfd
  78 argument is useful.)
  79 .PP
  80 The
  81 .BR mount_setattr ()
  82 system call uses an extensible structure
  83 .RI ( "struct mount_attr" )
  84 to allow for future extensions.
  85 Any non-flag extensions to
  86 .BR mount_setattr ()
   87 will be implemented as new fields appended to this structure,
  88 with a zero value in a new field resulting in the kernel behaving
  89 as though that extension field was not present.
  90 Therefore,
  91 the caller
  92 .I must
  93 zero-fill this structure on initialization.
  94 See the "Extensibility" subsection under
  95 .B NOTES
  96 for more details.
  97 .PP
  98 The
  99 .I size
 100 argument should usually be specified as
 101 .IR "sizeof(struct mount_attr)" .
 102 However, if the caller is using a kernel that supports an extended
 103 .IR "struct mount_attr" ,
 104 but the caller does not intend to make use of these features,
 105 it is possible to pass the size of an earlier
 106 version of the structure together with the extended structure.
 107 This allows the kernel to not copy later parts of the structure
 108 that aren't used anyway.
 109 With each extension that changes the size of
 110 .IR "struct mount_attr" ,
 111 the kernel will expose a definition of the form
 112 .BI MOUNT_ATTR_SIZE_VER number\c
 113 \&.
 114 For example, the macro for the size of the initial version of
 115 .I struct mount_attr
 116 is
 117 .BR MOUNT_ATTR_SIZE_VER0 .
 118 .PP
 119 The
 120 .I flags
 121 argument can be used to alter the pathname resolution behavior.
 122 The supported values are:
 123 .TP
 124 .B AT_EMPTY_PATH
 125 If
 126 .I pathname
 127 is the empty string,
 128 change the mount properties on
 129 .I dirfd
 130 itself.
 131 .TP
 132 .B AT_RECURSIVE
 133 Change the mount properties of the entire mount tree.
 134 .TP
 135 .B AT_SYMLINK_NOFOLLOW
 136 Don't follow trailing symbolic links.
 137 .TP
 138 .B AT_NO_AUTOMOUNT
 139 Don't trigger automounts.
 140 .PP
 141 The
 142 .I attr
 143 argument of
 144 .BR mount_setattr ()
 145 is a structure of the following form:
 146 .PP
 147 .in +4n
 148 .EX
 149 struct mount_attr {
 150     __u64 attr_set;     /* Mount properties to set */
 151     __u64 attr_clr;     /* Mount properties to clear */
 152     __u64 propagation;  /* Mount propagation type */
 153     __u64 userns_fd;    /* User namespace file descriptor */
 154 };
 155 .EE
 156 .in
 157 .PP
 158 The
 159 .I attr_set
 160 and
 161 .I attr_clr
 162 members are used to specify the mount properties that
 163 are supposed to be set or cleared for a mount or mount tree.
 164 Flags set in
 165 .I attr_set
 166 enable a property on a mount or mount tree,
 167 and flags set in
 168 .I attr_clr
 169 remove a property from a mount or mount tree.
 170 .PP
 171 When changing mount properties,
 172 the kernel will first clear the flags specified
 173 in the
 174 .I attr_clr
 175 field,
 176 and then set the flags specified in the
 177 .I attr_set
 178 field.
 179 For example, these settings:
 180 .PP
 181 .in +4n
 182 .EX
 183 struct mount_attr attr = {
 184     .attr_clr = MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NODEV,
 185     .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID,
 186 };
 187 .EE
 188 .in
 189 .PP
 190 are equivalent to the following steps:
 191 .PP
 192 .in +4n
 193 .EX
 194 unsigned int current_mnt_flags = mnt->mnt_flags;
 195
 196 /*
 197  * Clear all flags set in .attr_clr,
 198  * clearing MOUNT_ATTR_NOEXEC and MOUNT_ATTR_NODEV.
 199  */
 200 current_mnt_flags &= ~attr->attr_clr;
 201
 202 /*
 203  * Now set all flags set in .attr_set,
 204  * applying MOUNT_ATTR_RDONLY and MOUNT_ATTR_NOSUID.
 205  */
 206 current_mnt_flags |= attr->attr_set;
 207
 208 mnt->mnt_flags = current_mnt_flags;
 209 .EE
 210 .in
 211 .PP
 212 As a result of this change, the mount or mount tree (a) is read-only;
 213 (b) blocks the execution of set-user-ID and set-group-ID programs;
 214 (c) allows execution of programs; and (d) allows access to devices.
 215 .PP
 216 Multiple changes with the same set of flags requested
 217 in
 218 .I attr_clr
 219 and
 220 .I attr_set
 221 are guaranteed to be idempotent after the changes have been applied.
 222 .PP
 223 The following mount attributes can be specified in the
 224 .I attr_set
 225 or
 226 .I attr_clr
 227 fields:
 228 .TP
 229 .B MOUNT_ATTR_RDONLY
 230 If set in
 231 .IR attr_set ,
 232 makes the mount read-only.
 233 If set in
 234 .IR attr_clr ,
 235 removes the read-only setting if set on the mount.
 236 .TP
 237 .B MOUNT_ATTR_NOSUID
 238 If set in
 239 .IR attr_set ,
 240 causes the mount not to honor the set-user-ID and set-group-ID mode bits and
 241 file capabilities when executing programs.
 242 If set in
 243 .IR attr_clr ,
 244 clears the set-user-ID, set-group-ID,
 245 and file capability restriction if set on this mount.
 246 .TP
 247 .B MOUNT_ATTR_NODEV
 248 If set in
 249 .IR attr_set ,
 250 prevents access to devices on this mount.
 251 If set in
 252 .IR attr_clr ,
 253 removes the restriction that prevented accessing devices on this mount.
 254 .TP
 255 .B MOUNT_ATTR_NOEXEC
 256 If set in
 257 .IR attr_set ,
 258 prevents executing programs on this mount.
 259 If set in
 260 .IR attr_clr ,
 261 removes the restriction that prevented executing programs on this mount.
 262 .TP
 263 .B MOUNT_ATTR_NOSYMFOLLOW
 264 If set in
 265 .IR attr_set ,
 266 prevents following symbolic links on this mount.
 267 If set in
 268 .IR attr_clr ,
 269 removes the restriction that prevented following symbolic links on this mount.
 270 .TP
 271 .B MOUNT_ATTR_NODIRATIME
 272 If set in
 273 .IR attr_set ,
 274 prevents updating access time for directories on this mount.
 275 If set in
 276 .IR attr_clr ,
 277 removes the restriction that prevented updating access time for directories.
 278 Note that
 279 .B MOUNT_ATTR_NODIRATIME
 280 can be combined with other access-time settings
 281 and is implied by the noatime setting.
 282 All other access-time settings are mutually exclusive.
 283 .TP
 284 .BR MOUNT_ATTR__ATIME " - changing access-time settings"
 285 The access-time values listed below are an enumeration that
 286 includes the value zero, expressed in the bits defined by the mask
 287 .BR MOUNT_ATTR__ATIME .
 288 Even though these bits are an enumeration
 289 (in contrast to the other mount flags such as
 290 .BR MOUNT_ATTR_NOEXEC ),
 291 they are nonetheless passed in
 292 .I attr_set
 293 and
 294 .I attr_clr
 295 for consistency with
 296 .BR fsmount (2),
 297 which introduced this behavior.
 298 .IP
 299 Note that,
 300 since the access-time values are an enumeration rather than bit values,
 301 a caller wanting to transition to a different access-time setting
 302 cannot simply specify the access-time setting in
 303 .IR attr_set ,
 304 but must also include
 305 .B MOUNT_ATTR__ATIME
 306 in the
 307 .I attr_clr
 308 field.
 309 The kernel will verify that
 310 .B MOUNT_ATTR__ATIME
 311 isn't partially set in
 312 .IR attr_clr
  313 (i.e., all bits in the
 314 .B MOUNT_ATTR__ATIME
 315 bit field are either set or clear), and that
 316 .I attr_set
 317 doesn't have any access-time bits set if
 318 .B MOUNT_ATTR__ATIME
 319 isn't set in
 320 .IR attr_clr .
 321 .RS
 322 .TP
 323 .B MOUNT_ATTR_RELATIME
 324 When a file is accessed via this mount,
 325 update the file's last access time (atime)
 326 only if the current value of atime is less than or equal to
 327 the file's last modification time (mtime) or last status change time (ctime).
 328 .IP
 329 To enable this access-time setting on a mount or mount tree,
 330 .B MOUNT_ATTR_RELATIME
 331 must be set in
 332 .I attr_set
 333 and
 334 .B MOUNT_ATTR__ATIME
 335 must be set in the
 336 .I attr_clr
 337 field.
 338 .TP
 339 .B MOUNT_ATTR_NOATIME
 340 Do not update access times for (all types of) files on this mount.
 341 .IP
 342 To enable this access-time setting on a mount or mount tree,
 343 .B MOUNT_ATTR_NOATIME
 344 must be set in
 345 .I attr_set
 346 and
 347 .B MOUNT_ATTR__ATIME
 348 must be set in the
 349 .I attr_clr
 350 field.
 351 .TP
 352 .B MOUNT_ATTR_STRICTATIME
 353 Always update the last access time (atime)
 354 when files are accessed on this mount.
 355 .IP
 356 To enable this access-time setting on a mount or mount tree,
 357 .B MOUNT_ATTR_STRICTATIME
 358 must be set in
 359 .I attr_set
 360 and
 361 .B MOUNT_ATTR__ATIME
 362 must be set in the
 363 .I attr_clr
 364 field.
 365 .RE
 366 .TP
 367 .B MOUNT_ATTR_IDMAP
 368 If set in
 369 .IR attr_set ,
 370 creates an ID-mapped mount.
 371 The ID mapping is taken from the user namespace specified in
 372 .I userns_fd
 373 and attached to the mount.
 374 .IP
 375 Since it is not supported to
 376 change the ID mapping of a mount after it has been ID mapped,
 377 it is invalid to specify
 378 .B MOUNT_ATTR_IDMAP
 379 in
 380 .IR attr_clr .
 381 .IP
 382 For further details, see the subsection "ID-mapped mounts" under NOTES.
 383 .PP
 384 The
 385 .I propagation
 386 field is used to specify the propagation type of the mount or mount tree.
 387 This field either has the value zero,
 388 meaning leave the propagation type unchanged, or it has one of
 389 the following values:
 390 .TP
 391 .B MS_PRIVATE
 392 Turn all mounts into private mounts.
 393 .TP
 394 .B MS_SHARED
 395 Turn all mounts into shared mounts.
 396 .TP
 397 .B MS_SLAVE
 398 Turn all mounts into dependent mounts.
 399 .TP
 400 .B MS_UNBINDABLE
 401 Turn all mounts into unbindable mounts.
 402 .PP
 403 For further details on the above propagation types, see
 404 .BR mount_namespaces (7).
 405 .SH RETURN VALUE
 406 On success,
 407 .BR mount_setattr ()
 408 returns zero.
 409 On error,
 410 \-1 is returned and
 411 .I errno
 412 is set to indicate the cause of the error.
 413 .SH ERRORS
 414 .TP
 415 .B EBADF
 416 .I pathname
 417 is relative but
 418 .I dirfd
 419 is neither
 420 .B AT_FDCWD
 421 nor a valid file descriptor.
 422 .TP
 423 .B EBADF
 424 .I userns_fd
 425 is not a valid file descriptor.
 426 .TP
 427 .B EBUSY
 428 The caller tried to change the mount to
 429 .BR MOUNT_ATTR_RDONLY ,
 430 but the mount still holds files open for writing.
 431 .TP
 432 .B EINVAL
 433 The pathname specified via the
 434 .I dirfd
 435 and
 436 .I pathname
 437 arguments to
 438 .BR mount_setattr ()
 439 isn't a mount point.
 440 .TP
 441 .B EINVAL
 442 An unsupported value was set in
 443 .IR flags .
 444 .TP
 445 .B EINVAL
 446 An unsupported value was specified in the
 447 .I attr_set
 448 field of
 449 .IR mount_attr .
 450 .TP
 451 .B EINVAL
 452 An unsupported value was specified in the
 453 .I attr_clr
 454 field of
 455 .IR mount_attr .
 456 .TP
 457 .B EINVAL
 458 An unsupported value was specified in the
 459 .I propagation
 460 field of
 461 .IR mount_attr .
 462 .TP
 463 .B EINVAL
 464 More than one of
 465 .BR MS_SHARED ,
 466 .BR MS_SLAVE ,
 467 .BR MS_PRIVATE ,
 468 or
 469 .B MS_UNBINDABLE
 470 was set in the
 471 .I propagation
 472 field of
 473 .IR mount_attr .
 474 .TP
 475 .B EINVAL
 476 An access-time setting was specified in the
 477 .I attr_set
 478 field without
 479 .B MOUNT_ATTR__ATIME
 480 being set in the
 481 .I attr_clr
 482 field.
 483 .TP
 484 .B EINVAL
 485 .B MOUNT_ATTR_IDMAP
 486 was specified in
 487 .IR attr_clr .
 488 .TP
 489 .B EINVAL
 490 A file descriptor value was specified in
 491 .I userns_fd
 492 which exceeds
 493 .BR INT_MAX .
 494 .TP
 495 .B EINVAL
 496 A valid file descriptor value was specified in
 497 .IR userns_fd ,
 498 but the file descriptor did not refer to a user namespace.
 499 .TP
 500 .B EINVAL
 501 The underlying filesystem does not support ID-mapped mounts.
 502 .TP
 503 .B EINVAL
 504 The mount that is to be ID mapped is not a detached mount;
  505 that is, the mount has already been visible in a mount namespace.
 506 .TP
 507 .B EINVAL
 508 A partial access-time setting was specified in
 509 .I attr_clr
 510 instead of
 511 .B MOUNT_ATTR__ATIME
 512 being set.
 513 .TP
 514 .B EINVAL
 515 The mount is located outside the caller's mount namespace.
 516 .TP
 517 .B EINVAL
 518 The underlying filesystem has been mounted in a mount namespace that is
  519 owned by a noninitial user namespace.
 520 .TP
 521 .B ENOENT
 522 A pathname was empty or had a nonexistent component.
 523 .TP
 524 .B ENOMEM
 525 When changing mount propagation to
 526 .BR MS_SHARED ,
 527 a new peer group ID needs to be allocated for all mounts without a peer group
 528 ID set.
 529 This allocation failed because there was not
 530 enough memory to allocate the relevant internal structures.
 531 .TP
 532 .B ENOSPC
 533 When changing mount propagation to
 534 .BR MS_SHARED ,
 535 a new peer group ID needs to be allocated for all mounts without a peer group
 536 ID set.
 537 This allocation failed because
 538 the kernel has run out of IDs.
  539 .\" Christian Brauner: i.e. someone has somehow managed to
  540 .\" allocate so many peer groups and managed to keep the kernel running
  541 .\" (???) that the ida has run out of ids
 542 .\" Note that technically further error codes are possible that are
 543 .\" specific to the ID allocation implementation used.
 544 .TP
 545 .B EPERM
 546 One of the mounts had at least one of
 547 .BR MOUNT_ATTR_NOATIME ,
 548 .BR MOUNT_ATTR_NODEV ,
 549 .BR MOUNT_ATTR_NODIRATIME ,
 550 .BR MOUNT_ATTR_NOEXEC ,
 551 .BR MOUNT_ATTR_NOSUID ,
 552 or
 553 .B MOUNT_ATTR_RDONLY
 554 set and the flag is locked.
 555 Mount attributes become locked on a mount if:
 556 .RS
 557 .IP \(bu 3
 558 A new mount or mount tree is created causing mount propagation across user
 559 namespaces
 560 (i.e., propagation to a mount namespace owned by a different user namespace).
 561 The kernel will lock the aforementioned flags to prevent these sensitive
 562 properties from being altered.
 563 .IP \(bu
 564 A new mount and user namespace pair is created.
 565 This happens for example when specifying
 566 .B CLONE_NEWUSER | CLONE_NEWNS
 567 in
 568 .BR unshare (2),
 569 .BR clone (2),
 570 or
 571 .BR clone3 (2).
 572 The aforementioned flags become locked in the new mount namespace
 573 to prevent sensitive mount properties from being altered.
 574 Since the newly created mount namespace will be owned by the
 575 newly created user namespace,
 576 a calling process that is privileged in the new
 577 user namespace would\(emin the absence of such locking\(embe
 578 able to alter sensitive mount properties (e.g., to remount a mount
 579 that was marked read-only as read-write in the new mount namespace).
 580 .RE
 581 .TP
 582 .B EPERM
 583 A valid file descriptor value was specified in
 584 .IR userns_fd ,
 585 but the file descriptor refers to the initial user namespace.
 586 .TP
 587 .B EPERM
 588 An attempt was made to add an ID mapping to a mount that is already ID mapped.
 589 .TP
 590 .B EPERM
 591 The caller does not have
 592 .B CAP_SYS_ADMIN
 593 in the initial user namespace.
 594 .SH VERSIONS
 595 .BR mount_setattr ()
 596 first appeared in Linux 5.12.
 597 .\" commit 7d6beb71da3cc033649d641e1e608713b8220290
 598 .\" commit 2a1867219c7b27f928e2545782b86daaf9ad50bd
 599 .\" commit 9caccd41541a6f7d6279928d9f971f6642c361af
 600 .SH CONFORMING TO
 601 .BR mount_setattr ()
 602 is Linux-specific.
 603 .SH NOTES
 604 .SS ID-mapped mounts
 605 Creating an ID-mapped mount makes it possible to
 606 change the ownership of all files located under a mount.
 607 Thus, ID-mapped mounts make it possible to
 608 change ownership in a temporary and localized way.
 609 It is a localized change because the ownership changes are
 610 visible only via a specific mount.
 611 All other users and locations where the filesystem is exposed are unaffected.
 612 It is a temporary change because
 613 the ownership changes are tied to the lifetime of the mount.
 614 .PP
 615 Whenever callers interact with the filesystem through an ID-mapped mount,
 616 the ID mapping of the mount will be applied to
 617 user and group IDs associated with filesystem objects.
 618 This encompasses the user and group IDs associated with inodes
 619 and also the following
 620 .BR xattr (7)
 621 keys:
 622 .IP \(bu 3
 623 .IR security.capability ,
 624 whenever filesystem capabilities
 625 are stored or returned in the
 626 .B VFS_CAP_REVISION_3
 627 format,
 628 which stores a root user ID alongside the capabilities
 629 (see
 630 .BR capabilities (7)).
 631 .IP \(bu
 632 .I system.posix_acl_access
 633 and
 634 .IR system.posix_acl_default ,
 635 whenever user IDs or group IDs are stored in
 636 .B ACL_USER
 637 or
 638 .B ACL_GROUP
 639 entries.
 640 .PP
 641 The following conditions must be met in order to create an ID-mapped mount:
 642 .IP \(bu 3
 643 The caller must have the
 644 .B CAP_SYS_ADMIN
 645 capability in the initial user namespace.
 646 .IP \(bu
 647 The filesystem must be mounted in a mount namespace
 648 that is owned by the initial user namespace.
 649 .IP \(bu
 650 The underlying filesystem must support ID-mapped mounts.
 651 Currently, the
 652 .BR xfs (5),
 653 .BR ext4 (5),
 654 and
 655 .B FAT
 656 filesystems support ID-mapped mounts
 657 with more filesystems being actively worked on.
 658 .IP \(bu
 659 The mount must not already be ID-mapped.
 660 This also implies that the ID mapping of a mount cannot be altered.
 661 .IP \(bu
 662 The mount must be a detached mount;
 663 that is,
 664 it must have been created by calling
 665 .BR open_tree (2)
 666 with the
 667 .B OPEN_TREE_CLONE
 668 flag and it must not already have been visible in a mount namespace.
 669 (To put things another way:
 670 the mount must not have been attached to the filesystem hierarchy
 671 with a system call such as
 672 .BR move_mount (2).)
 673 .PP
 674 ID mappings can be created for user IDs, group IDs, and project IDs.
 675 An ID mapping is essentially a mapping of a range of user or group IDs into
 676 another or the same range of user or group IDs.
 677 ID mappings are written to map files as three numbers
 678 separated by white space.
 679 The first two numbers specify the starting user or group ID
 680 in each of the two user namespaces.
 681 The third number specifies the range of the ID mapping.
 682 For example,
 683 a mapping for user IDs such as "1000\ 1001\ 1" would indicate that
 684 user ID 1000 in the caller's user namespace is mapped to
 685 user ID 1001 in its ancestor user namespace.
 686 Since the map range is 1,
 687 only user ID 1000 is mapped.
 688 .PP
 689 It is possible to specify up to 340 ID mappings for each ID mapping type.
 690 If any user IDs or group IDs are not mapped,
 691 all files owned by that unmapped user or group ID will appear as
 692 being owned by the overflow user ID or overflow group ID respectively.
 693 .PP
 694 Further details on setting up ID mappings can be found in
 695 .BR user_namespaces (7).
 696 .PP
 697 In the common case, the user namespace passed in
 698 .I userns_fd
 699 (together with
 700 .B MOUNT_ATTR_IDMAP
 701 in
 702 .IR attr_set )
 703 to create an ID-mapped mount will be the user namespace of a container.
 704 In other scenarios it will be a dedicated user namespace associated with
 705 a user's login session as is the case for portable home directories in
  706 .BR systemd-homed.service (8).
 707 It is also perfectly fine to create a dedicated user namespace
 708 for the sake of ID mapping a mount.
 709 .PP
 710 ID-mapped mounts can be useful in the following
 711 and a variety of other scenarios:
 712 .IP \(bu 3
 713 Sharing files or filesystems
 714 between multiple users or multiple machines,
 715 especially in complex scenarios.
 716 For example,
 717 ID-mapped mounts are used to implement portable home directories in
 718 .BR systemd-homed.service (8),
 719 where they allow users to move their home directory
 720 to an external storage device
 721 and use it on multiple computers
 722 where they are assigned different user IDs and group IDs.
 723 This effectively makes it possible to
 724 assign random user IDs and group IDs at login time.
 725 .IP \(bu
 726 Sharing files or filesystems
 727 from the host with unprivileged containers.
 728 This allows a user to avoid having to change ownership permanently through
 729 .BR chown (2).
 730 .IP \(bu
 731 ID mapping a container's root filesystem.
 732 Users don't need to change ownership permanently through
 733 .BR chown (2).
 734 Especially for large root filesystems, using
 735 .BR chown (2)
 736 can be prohibitively expensive.
 737 .IP \(bu
 738 Sharing files or filesystems
 739 between containers with non-overlapping ID mappings.
 740 .IP \(bu
  741 Implementing discretionary access control (DAC) permission checking
 742 for filesystems lacking a concept of ownership.
 743 .IP \(bu
 744 Efficiently changing ownership on a per-mount basis.
 745 In contrast to
 746 .BR chown (2),
 747 changing ownership of large sets of files is instantaneous with
 748 ID-mapped mounts.
 749 This is especially useful when ownership of
 750 an entire root filesystem of a virtual machine or container
 751 is to be changed as mentioned above.
 752 With ID-mapped mounts,
 753 a single
 754 .BR mount_setattr ()
 755 system call will be sufficient to change the ownership of all files.
 756 .IP \(bu
 757 Taking the current ownership into account.
 758 ID mappings specify precisely
 759 what a user or group ID is supposed to be mapped to.
 760 This contrasts with the
 761 .BR chown (2)
 762 system call which cannot by itself
 763 take the current ownership of the files it changes into account.
 764 It simply changes the ownership to the specified user ID and group ID.
 765 .IP \(bu
 766 Locally and temporarily restricted ownership changes.
 767 ID-mapped mounts make it possible to change ownership locally,
 768 restricting the ownership changes to specific mounts,
 769 and temporarily as the ownership changes only apply as long as the mount exists.
 770 By contrast,
 771 changing ownership via the
 772 .BR chown (2)
 773 system call changes the ownership globally and permanently.
 774 .\"
 775 .SS Extensibility
 776 In order to allow for future extensibility,
 777 .BR mount_setattr ()
 778 requires the user-space application to specify the size of the
 779 .I mount_attr
 780 structure that it is passing.
 781 By providing this information, it is possible for
 782 .BR mount_setattr ()
 783 to provide both forwards- and backwards-compatibility, with
 784 .I size
 785 acting as an implicit version number.
 786 (Because new extension fields will always
 787 be appended, the structure size will always increase.)
 788 This extensibility design is very similar to other system calls such as
  789 .BR sched_setattr (2),
 790 .BR perf_event_open (2),
 791 .BR clone3 (2)
 792 and
 793 .BR openat2 (2).
 794 .PP
 795 Let
 796 .I usize
 797 be the size of the structure as specified by the user-space application,
 798 and let
 799 .I ksize
 800 be the size of the structure which the kernel supports,
 801 then there are three cases to consider:
 802 .IP \(bu 3
 803 If
 804 .I ksize
 805 equals
 806 .IR usize ,
 807 then there is no version mismatch and
 808 .I attr
 809 can be used verbatim.
 810 .IP \(bu
 811 If
 812 .I ksize
 813 is larger than
 814 .IR usize ,
 815 then there are some extension fields that the kernel supports
 816 which the user-space application is unaware of.
 817 Because a zero value in any added extension field signifies a no-op,
 818 the kernel treats all of the extension fields
 819 not provided by the user-space application
 820 as having zero values.
 821 This provides backwards-compatibility.
 822 .IP \(bu
 823 If
 824 .I ksize
 825 is smaller than
 826 .IR usize ,
 827 then there are some extension fields which the user-space application is aware
 828 of but which the kernel does not support.
 829 Because any extension field must have its zero values signify a no-op,
 830 the kernel can safely ignore the unsupported extension fields
 831 if they are all zero.
  832 If any unsupported extension fields are nonzero,
 833 then \-1 is returned and
 834 .I errno
 835 is set to
 836 .BR E2BIG .
 837 This provides forwards-compatibility.
 838 .PP
 839 Because the definition of
 840 .I struct mount_attr
 841 may change in the future
 842 (with new fields being added when system headers are updated),
 843 user-space applications should zero-fill
 844 .I struct mount_attr
 845 to ensure that recompiling the program with new headers will not result in
 846 spurious errors at runtime.
 847 The simplest way is to use a designated initializer:
 848 .PP
 849 .in +4n
 850 .EX
 851 struct mount_attr attr = {
 852     .attr_set = MOUNT_ATTR_RDONLY,
 853     .attr_clr = MOUNT_ATTR_NODEV
 854 };
 855 .EE
 856 .in
 857 .PP
 858 Alternatively, the structure can be zero-filled using
 859 .BR memset (3)
 860 or similar functions:
 861 .PP
 862 .in +4n
 863 .EX
 864 struct mount_attr attr;
 865 memset(&attr, 0, sizeof(attr));
 866 attr.attr_set = MOUNT_ATTR_RDONLY;
 867 attr.attr_clr = MOUNT_ATTR_NODEV;
 868 .EE
 869 .in
 870 .PP
 871 A user-space application that wishes to determine which extensions the running
 872 kernel supports can do so by conducting a binary search on
 873 .I size
 874 with a structure which has every byte nonzero
 875 (to find the largest value which doesn't produce an error of
 876 .BR E2BIG ).
 877 .SH EXAMPLES
 878 .EX
 879 /*
 880  * This program allows the caller to create a new detached mount
 881  * and set various properties on it.
 882  */
 883 #define _GNU_SOURCE
 884 #include <errno.h>
 885 #include <fcntl.h>
 886 #include <getopt.h>
 887 #include <linux/mount.h>
 888 #include <linux/types.h>
 889 #include <stdbool.h>
 890 #include <stdio.h>
 891 #include <stdlib.h>
 892 #include <string.h>
 893 #include <sys/syscall.h>
 894 #include <unistd.h>
 895
 896 static inline int
 897 mount_setattr(int dirfd, const char *pathname, unsigned int flags,
 898               struct mount_attr *attr, size_t size)
 899 {
 900     return syscall(SYS_mount_setattr, dirfd, pathname, flags,
 901                    attr, size);
 902 }
 903
 904 static inline int
 905 open_tree(int dirfd, const char *filename, unsigned int flags)
 906 {
 907     return syscall(SYS_open_tree, dirfd, filename, flags);
 908 }
 909
 910 static inline int
 911 move_mount(int from_dirfd, const char *from_pathname,
 912            int to_dirfd, const char *to_pathname, unsigned int flags)
 913 {
 914     return syscall(SYS_move_mount, from_dirfd, from_pathname,
 915                    to_dirfd, to_pathname, flags);
 916 }
 917
 918 static const struct option longopts[] = {
 919     {"map\-mount",       required_argument,  NULL,  'a'},
 920     {"recursive",       no_argument,        NULL,  'b'},
 921     {"read\-only",       no_argument,        NULL,  'c'},
 922     {"block\-setid",     no_argument,        NULL,  'd'},
 923     {"block\-devices",   no_argument,        NULL,  'e'},
 924     {"block\-exec",      no_argument,        NULL,  'f'},
 925     {"no\-access\-time",  no_argument,        NULL,  'g'},
 926     { NULL,             0,                  NULL,   0 },
 927 };
 928
 929 #define exit_log(format, ...)  do           \e
 930 {                                           \e
 931     fprintf(stderr, format, ##__VA_ARGS__); \e
 932     exit(EXIT_FAILURE);                     \e
 933 } while (0)
 934
 935 int
 936 main(int argc, char *argv[])
 937 {
 938     struct mount_attr *attr = &(struct mount_attr){};
 939     int fd_userns = \-1;
 940     bool recursive = false;
 941     int index = 0;
 942     int ret;
 943
 944     while ((ret = getopt_long_only(argc, argv, "",
 945                                    longopts, &index)) != \-1) {
 946         switch (ret) {
 947         case 'a':
 948             fd_userns = open(optarg, O_RDONLY | O_CLOEXEC);
 949             if (fd_userns == \-1)
  950                 exit_log("%m \- Failed to open %s\en", optarg);
 951             break;
 952         case 'b':
 953             recursive = true;
 954             break;
 955         case 'c':
 956             attr\->attr_set |= MOUNT_ATTR_RDONLY;
 957             break;
 958         case 'd':
 959             attr\->attr_set |= MOUNT_ATTR_NOSUID;
 960             break;
 961         case 'e':
 962             attr\->attr_set |= MOUNT_ATTR_NODEV;
 963             break;
 964         case 'f':
 965             attr\->attr_set |= MOUNT_ATTR_NOEXEC;
 966             break;
 967         case 'g':
 968             attr\->attr_set |= MOUNT_ATTR_NOATIME;
 969             attr\->attr_clr |= MOUNT_ATTR__ATIME;
 970             break;
 971         default:
  972             exit_log("Invalid argument specified\en");
 973         }
 974     }
 975
 976     if ((argc \- optind) < 2)
 977         exit_log("Missing source or target mount point\en");
 978
 979     const char *source = argv[optind];
 980     const char *target = argv[optind + 1];
 981
 982     /* In the following, \-1 as the \(aqdirfd\(aq argument ensures that
 983        open_tree() fails if \(aqsource\(aq is not an absolute pathname. */
 984 .\" Christian Brauner
 985 .\"     When writing programs I like to never use relative paths with AT_FDCWD
 986 .\"     because. Because making assumptions about the current working directory
 987 .\"     of the calling process is just too easy to get wrong; especially when
 988 .\"     pivot_root() or chroot() are in play.
 989 .\"     My absolut preference (joke intended) is to open a well-known starting
 990 .\"     point with an absolute path to get a dirfd and then scope all future
 991 .\"     operations beneath that dirfd. This already works with old-style
 992 .\"     openat() and _very_ cautious programming but openat2() and its
 993 .\"     resolve-flag space have made this **chef's kiss**.
 994 .\"     If I can't operate based on a well-known dirfd I use absolute paths
 995 .\"     with a -EBADF dirfd passed to *at() functions.
 996
 997     int fd_tree = open_tree(\-1, source,
 998                        OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
 999                        AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0));
1000     if (fd_tree == \-1)
1001         exit_log("%m \- Failed to open %s\en", source);
1002
1003     if (fd_userns >= 0) {
1004         attr\->attr_set  |= MOUNT_ATTR_IDMAP;
1005         attr\->userns_fd = fd_userns;
1006     }
1007
1008     ret = mount_setattr(fd_tree, "",
1009                         AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0),
1010                         attr, sizeof(struct mount_attr));
1011     if (ret == \-1)
1012         exit_log("%m \- Failed to change mount attributes\en");
1013
1014     close(fd_userns);
1015
1016     /* In the following, \-1 as the \(aqto_dirfd\(aq argument ensures that
 1017        move_mount() fails if \(aqtarget\(aq is not an absolute pathname. */
1018
1019     ret = move_mount(fd_tree, "", \-1, target,
1020                      MOVE_MOUNT_F_EMPTY_PATH);
1021     if (ret == \-1)
1022         exit_log("%m \- Failed to attach mount to %s\en", target);
1023
1024     close(fd_tree);
1025
1026     exit(EXIT_SUCCESS);
1027 }
1028 .EE
1029 .SH SEE ALSO
1030 .BR newuidmap (1),
1031 .BR newgidmap (1),
1032 .BR clone (2),
1033 .BR mount (2),
1034 .BR unshare (2),
1035 .BR proc (5),
1036 .BR mount_namespaces (7),
1037 .BR capabilities (7),
1038 .BR user_namespaces (7),
1039 .BR xattr (7)