1 .\" Copyright (c) 2016, IBM Corporation.
2 .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
3 .\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
5 .\" SPDX-License-Identifier: Linux-man-pages-copyleft
7 .TH USERFAULTFD 2 2021-03-22 "Linux man-pages (unreleased)"
9 userfaultfd \- create a file descriptor for handling page faults in user space
12 .RI ( libc ", " \-lc )
15 .BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
16 .BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
17 .BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD_* " constants */"
18 .B #include <unistd.h>
20 .BI "int syscall(SYS_userfaultfd, int " flags );
24 glibc provides no wrapper for
26 necessitating the use of
30 creates a new userfaultfd object that can be used for delegation of page-fault
31 handling to a user-space application,
32 and returns a file descriptor that refers to the new object.
33 The new userfaultfd object is configured using
36 Once the userfaultfd object is configured, the application can use
38 to receive userfaultfd notifications.
39 The reads from userfaultfd may be blocking or non-blocking,
40 depending on the value of
42 used for the creation of the userfaultfd or subsequent calls to
45 The following values may be bitwise ORed in
47 to change the behavior of
51 Enable the close-on-exec flag for the new userfaultfd file descriptor.
52 See the description of the
58 Enables non-blocking operation for the userfaultfd object.
59 See the description of the
64 .B UFFD_USER_MODE_ONLY
65 This is a userfaultfd-specific flag that was introduced in Linux 5.11.
66 When set, the userfaultfd object will only be able to handle
67 page faults originating from user space on the registered regions.
68 When a kernel-originated fault is triggered
69 on the registered range with this userfaultfd, a
71 signal will be delivered.
73 When the last file descriptor referring to a userfaultfd object is closed,
74 all memory ranges that were registered with the object are unregistered
75 and unread events are flushed.
78 Userfaultfd supports three modes of registration:
80 .BR UFFDIO_REGISTER_MODE_MISSING " (since 4.10)"
82 .B UFFDIO_REGISTER_MODE_MISSING
83 mode, user-space will receive a page-fault notification
84 when a missing page is accessed.
85 The faulted thread will be stopped from execution until the page fault is
86 resolved from user-space by either an
92 .BR UFFDIO_REGISTER_MODE_MINOR " (since 5.13)"
94 .B UFFDIO_REGISTER_MODE_MINOR
95 mode, user-space will receive a page-fault notification
96 when a minor page fault occurs.
98 when a backing page is in the page cache,
99 but page table entries don't yet exist.
100 The faulted thread will be stopped from execution
101 until the page fault is resolved from user-space by an
105 .BR UFFDIO_REGISTER_MODE_WP " (since 5.7)"
107 .B UFFDIO_REGISTER_MODE_WP
108 mode, user-space will receive a page-fault notification
109 when a write-protected page is written.
110 The faulted thread will be stopped from execution
111 until user-space write-unprotects the page using an
112 .B UFFDIO_WRITEPROTECT
115 Multiple modes can be enabled at the same time for the same memory range.
117 Since Linux 4.14, a userfaultfd page-fault notification can selectively embed
118 faulting thread ID information into the notification.
119 One needs to enable this feature explicitly using the
120 .B UFFD_FEATURE_THREAD_ID
121 feature bit when initializing the userfaultfd context.
122 By default, thread ID reporting is disabled.
124 The userfaultfd mechanism is designed to allow a thread in a multithreaded
125 program to perform user-space paging for the other threads in the process.
126 When a page fault occurs for one of the regions registered
127 to the userfaultfd object,
128 the faulting thread is put to sleep and
129 an event is generated that can be read via the userfaultfd file descriptor.
130 The fault-handling thread reads events from this file descriptor and services
131 them using the operations described in
132 .BR ioctl_userfaultfd (2).
133 When servicing the page fault events,
134 the fault-handling thread can trigger a wake-up for the sleeping thread.
136 It is possible for the faulting threads and the fault-handling threads
137 to run in the context of different processes.
138 In this case, these threads may belong to different programs,
139 and the program that executes the faulting threads
140 will not necessarily cooperate with the program that handles the page faults.
141 In such non-cooperative mode,
142 the process that monitors userfaultfd and handles page faults
143 needs to be aware of the changes in the virtual memory layout
144 of the faulting process to avoid memory corruption.
147 userfaultfd can also notify the fault-handling threads about changes
148 in the virtual memory layout of the faulting process.
149 In addition, if the faulting process invokes
151 the userfaultfd objects associated with the parent may be duplicated
152 into the child process and the userfaultfd monitor will be notified
156 about the file descriptor associated with the userfault objects
157 created for the child process,
158 which allows the userfaultfd monitor to perform user-space paging
159 for the child process.
160 Unlike page faults which have to be synchronous and require an
161 explicit or implicit wakeup,
162 all other events are delivered asynchronously and
163 the non-cooperative process resumes execution as
164 soon as the userfaultfd manager executes
166 The userfaultfd manager should carefully synchronize calls to
168 with the processing of events.
170 The current asynchronous model of the event delivery is optimal for
171 single-threaded non-cooperative userfaultfd manager implementations.
172 .\" Regarding the preceding sentence, Mike Rapoport says:
173 .\" The major point here is that current events delivery model could be
174 .\" problematic for multi-threaded monitor. I even suspect that it would be
175 .\" impossible to ensure synchronization between page faults and non-page
176 .\" fault events in multi-threaded monitor.
178 .\" FIXME elaborate about non-cooperating mode, describe its limitations
179 .\" for kernels before 4.11, features added in 4.11
180 .\" and limitations remaining in 4.11
181 .\" Maybe it's worth adding a dedicated sub-section...
184 Since Linux 5.7, userfaultfd is able to do
185 synchronous page dirty tracking using the new write-protect register mode.
186 One should check against the feature bit
187 .B UFFD_FEATURE_PAGEFAULT_FLAG_WP
188 before using this feature.
189 Similar to the original userfaultfd missing mode, the write-protect mode will
190 generate a userfaultfd notification when the protected page is written.
191 The user needs to resolve the page fault by unprotecting the faulted page and
192 kicking the faulted thread to continue.
193 For more information,
194 please refer to the "Userfaultfd write-protect mode" section.
196 .SS Userfaultfd operation
197 After the userfaultfd object is created with
199 the application must enable it using the
203 This operation allows a handshake between the kernel and user space
204 to determine the API version and supported features.
205 This operation must be performed before any of the other
207 operations described below (or those operations fail with the
214 the application then registers memory address ranges using the
218 After successful completion of a
221 a page fault occurring in the requested memory range, and satisfying
222 the mode defined at the registration time, will be forwarded by the kernel to
223 the user-space application.
224 The application can then use the
230 operations to resolve the page fault.
232 Since Linux 4.14, if the application sets the
233 .B UFFD_FEATURE_SIGBUS
234 feature bit using the
237 no page-fault notification will be forwarded to user space.
240 signal is delivered to the faulting process.
242 userfaultfd can be used for robustness purposes to simply catch
243 any access to areas within the registered address range that do not
244 have pages allocated, without having to listen to userfaultfd events.
245 No userfaultfd monitor will be required for dealing with such memory
247 For example, this feature can be useful for applications that
248 want to prevent the kernel from automatically allocating pages and filling
249 holes in sparse files when the hole is accessed through a memory mapping.
252 .B UFFD_FEATURE_SIGBUS
253 feature is implicitly inherited through
255 if used in combination with
256 .BR UFFD_FEATURE_FORK .
258 Details of the various
260 operations can be found in
261 .BR ioctl_userfaultfd (2).
263 Since Linux 4.11, events other than page-fault may be enabled during
268 userfaultfd can be used only with anonymous private memory mappings.
270 userfaultfd can also be used with hugetlbfs and shared memory mappings.
272 .SS Userfaultfd write-protect mode (since 5.7)
273 Since Linux 5.7, userfaultfd supports write-protect mode for anonymous memory.
274 The user needs to first check availability of this feature using
276 ioctl against the feature bit
277 .B UFFD_FEATURE_PAGEFAULT_FLAG_WP
278 before using this feature.
281 the write-protection mode is also supported on
282 shmem and hugetlbfs memory types.
283 It can be detected with the feature bit
284 .BR UFFD_FEATURE_WP_HUGETLBFS_SHMEM .
286 To register with userfaultfd write-protect mode, the user needs to initiate the
289 .B UFFDIO_REGISTER_MODE_WP
291 Note that it is legal to monitor the same memory range with multiple modes.
292 For example, the user can do
295 .BR "UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP" .
297 .B UFFDIO_REGISTER_MODE_WP
298 registered, user-space will
300 receive any notification when a missing page is written.
301 Instead, user-space will receive a write-protect page-fault notification
302 only when an existing but write-protected page is written.
307 .B UFFDIO_REGISTER_MODE_WP
309 the user can write-protect any existing memory within the range using the ioctl
310 .B UFFDIO_WRITEPROTECT
312 .I uffdio_writeprotect.mode
314 .BR UFFDIO_WRITEPROTECT_MODE_WP .
316 When a write-protect event happens,
317 user-space will receive a page-fault notification whose
318 .I uffd_msg.pagefault.flags
320 .B UFFD_PAGEFAULT_FLAG_WP
322 Note: since only writes can trigger this kind of fault,
323 write-protect notifications will always have the
324 .B UFFD_PAGEFAULT_FLAG_WRITE
325 bit set along with the
326 .B UFFD_PAGEFAULT_FLAG_WP
329 To resolve a write-protection page fault, the user should initiate another
330 .B UFFDIO_WRITEPROTECT
332 .I uffdio_writeprotect.mode
334 .B UFFDIO_WRITEPROTECT_MODE_WP
335 cleared upon the faulted page or range.
337 .SS Userfaultfd minor fault mode (since 5.13)
339 userfaultfd supports minor fault mode.
341 fault messages are produced not for major faults
342 (where the page was missing),
343 but rather for minor faults,
344 where a page exists in the page cache,
345 but the page table entries are not yet present.
346 The user needs to first check availability of this feature using the
348 ioctl with the appropriate feature bits set before using this feature:
349 .B UFFD_FEATURE_MINOR_HUGETLBFS
352 .B UFFD_FEATURE_MINOR_SHMEM
355 To register with userfaultfd minor fault mode,
356 the user needs to initiate the
359 .B UFFDIO_REGISTER_MODE_MINOR
362 When a minor fault occurs,
363 user-space will receive a page-fault notification
365 .I uffd_msg.pagefault.flags
367 .B UFFD_PAGEFAULT_FLAG_MINOR
370 To resolve a minor page fault,
371 the handler should decide whether or not
372 the existing page contents need to be modified first.
374 this should be done in-place via a second,
375 non-userfaultfd-registered mapping
376 to the same backing page
377 (e.g., by mapping the shmem or hugetlbfs file twice).
378 Once the page is considered "up to date",
379 the fault can be resolved by initiating an
382 which installs the page table entries and
384 wakes up the faulting thread(s).
386 Minor fault mode supports only hugetlbfs-backed (since Linux 5.13)
387 and shmem-backed (since Linux 5.14) memory.
389 .SS Reading from the userfaultfd structure
392 from the userfaultfd file descriptor returns one or more
394 structures, each of which describes a page-fault event
395 or an event required for the non-cooperative userfaultfd usage:
400 uint8_t event; /* Type of event */
404 uint64_t flags; /* Flags describing fault */
405 uint64_t address; /* Faulting address */
407 uint32_t ptid; /* Thread ID of the fault */
411 struct { /* Since Linux 4.11 */
412 uint32_t ufd; /* Userfault file descriptor
413 of the child process */
416 struct { /* Since Linux 4.11 */
417 uint64_t from; /* Old address of remapped area */
418 uint64_t to; /* New address of remapped area */
419 uint64_t len; /* Original mapping length */
422 struct { /* Since Linux 4.11 */
423 uint64_t start; /* Start address of removed area */
424 uint64_t end; /* End address of removed area */
429 /* Padding fields omitted */
434 If multiple events are available and the supplied buffer is large enough,
436 returns as many events as will fit in the supplied buffer.
437 If the buffer supplied to
439 is smaller than the size of the
446 The fields set in the
448 structure are as follows:
452 Depending on the event type,
453 different fields of the
455 union represent details required for the event processing.
456 The non-page-fault events are generated only when the appropriate feature
457 is enabled during the API handshake with
461 The following values can appear in the
466 .BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)"
468 The page-fault details are available in the
472 .BR UFFD_EVENT_FORK " (since Linux 4.11)"
473 Generated when the faulting process invokes
480 The event details are available in the
483 .\" FIXME describe duplication of userfault file descriptor during fork
485 .BR UFFD_EVENT_REMAP " (since Linux 4.11)"
486 Generated when the faulting process invokes
488 The event details are available in the
492 .BR UFFD_EVENT_REMOVE " (since Linux 4.11)"
493 Generated when the faulting process invokes
500 The event details are available in the
504 .BR UFFD_EVENT_UNMAP " (since Linux 4.11)"
505 Generated when the faulting process unmaps a memory range,
506 either explicitly using
512 The event details are available in the
518 The address that triggered the page fault.
521 A bit mask of flags that describe the event.
523 .BR UFFD_EVENT_PAGEFAULT ,
524 the following flags may appear:
527 .B UFFD_PAGEFAULT_FLAG_WP
528 If this flag is set, then the fault was a write-protect fault.
530 .B UFFD_PAGEFAULT_FLAG_MINOR
531 If this flag is set, then the fault was a minor fault.
533 .B UFFD_PAGEFAULT_FLAG_WRITE
534 If this flag is set, then the fault was a write fault.
537 .B UFFD_PAGEFAULT_FLAG_WP
539 .B UFFD_PAGEFAULT_FLAG_MINOR
540 is set, then the fault was a missing fault.
543 .I pagefault.feat.ptid
544 The thread ID that triggered the page fault.
547 The file descriptor associated with the userfault object
548 created for the child created by
552 The original address of the memory range that was remapped using
556 The new address of the memory range that was remapped using
560 The original length of the memory range that was remapped using
564 The start address of the memory range that was freed using
569 The end address of the memory range that was freed using
575 on a userfaultfd file descriptor can fail with the following errors:
578 The userfaultfd object has not yet been enabled using the
585 flag is enabled in the associated open file description,
586 the userfaultfd file descriptor can be monitored with
591 When events are available, the file descriptor is indicated as readable.
594 flag is not enabled, then
596 (always) indicates the file as having a
600 indicates the file descriptor as both readable and writable.
601 .\" FIXME What is the reason for this seemingly odd behavior with respect
602 .\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
603 .\" Something needs to be said about this.
607 returns a new file descriptor that refers to the userfaultfd object.
608 On error, \-1 is returned, and
610 is set to indicate the error.
614 An unsupported value was specified in
618 The per-process limit on the number of open file descriptors has been
622 The system-wide limit on the total number of open files has been
626 Insufficient kernel memory was available.
628 .BR EPERM " (since Linux 5.2)"
629 .\" cefdca0a86be517bc390fc4541e3674b8e7803b0
630 The caller is not privileged (does not have the
632 capability in the initial user namespace), and
633 .I /proc/sys/vm/unprivileged_userfaultfd
638 system call first appeared in Linux 4.3.
640 Support for hugetlbfs and shared memory areas and
641 non-page-fault events was added in Linux 4.11.
644 is Linux-specific and should not be used in programs intended to be
647 The userfaultfd mechanism can be used as an alternative to
648 traditional user-space paging techniques based on the use of the
652 It can also be used to implement lazy restore
653 for checkpoint/restore mechanisms,
654 as well as post-copy migration to allow (nearly) uninterrupted execution
655 when transferring virtual machines and Linux containers
656 from one host to another.
659 .B UFFD_FEATURE_EVENT_FORK
660 is enabled and a system call from the
662 family is interrupted by a signal or fails, a stale userfaultfd descriptor
664 In this case, a spurious
666 will be delivered to the userfaultfd monitor.
668 The program below demonstrates the use of the userfaultfd mechanism.
669 The program creates two threads, one of which acts as the
670 page-fault handler for the process, for the pages in a demand-page zero
674 The program takes one command-line argument,
675 which is the number of pages that will be created in a mapping
676 whose page faults will be handled via userfaultfd.
677 After creating a userfaultfd object,
678 the program then creates an anonymous private mapping of the specified size
679 and registers the address range of that mapping using the
683 The program then creates a second thread that will perform the
684 task of handling page faults.
686 The main thread then walks through the pages of the mapping fetching
687 bytes from successive pages.
688 Because the pages have not yet been accessed,
689 the first access of a byte in each page will trigger a page-fault event
690 on the userfaultfd file descriptor.
692 Each of the page-fault events is handled by the second thread,
693 which sits in a loop processing input from the userfaultfd file descriptor.
694 In each loop iteration, the second thread first calls
696 to check the state of the file descriptor,
697 and then reads an event from the file descriptor.
698 All such events should be
699 .B UFFD_EVENT_PAGEFAULT
701 which the thread handles by copying a page of data into
702 the faulting region using the
707 The following is an example of what we see when running the program:
711 $ \fB./userfaultfd_demo 3\fP
712 Address returned by mmap() = 0x7fd30106c000
714 fault_handler_thread():
715 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
716 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
717 (uffdio_copy.copy returned 4096)
718 Read address 0x7fd30106c00f in main(): A
719 Read address 0x7fd30106c40f in main(): A
720 Read address 0x7fd30106c80f in main(): A
721 Read address 0x7fd30106cc0f in main(): A
723 fault_handler_thread():
724 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
725 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
726 (uffdio_copy.copy returned 4096)
727 Read address 0x7fd30106d00f in main(): B
728 Read address 0x7fd30106d40f in main(): B
729 Read address 0x7fd30106d80f in main(): B
730 Read address 0x7fd30106dc0f in main(): B
732 fault_handler_thread():
733 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
734 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
735 (uffdio_copy.copy returned 4096)
736 Read address 0x7fd30106e00f in main(): C
737 Read address 0x7fd30106e40f in main(): C
738 Read address 0x7fd30106e80f in main(): C
739 Read address 0x7fd30106ec0f in main(): C
744 .\" SRC BEGIN (userfaultfd.c)
746 /* userfaultfd_demo.c
748 Licensed under the GNU General Public License version 2 or later.
753 #include <inttypes.h>
754 #include <linux/userfaultfd.h>
760 #include <sys/ioctl.h>
761 #include <sys/mman.h>
762 #include <sys/syscall.h>
765 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
768 static int page_size;
771 fault_handler_thread(void *arg)
774 struct pollfd pollfd;
775 static struct uffd_msg msg; /* Data read from userfaultfd */
776 static int fault_cnt = 0; /* Number of faults so far handled */
777 long uffd; /* userfaultfd file descriptor */
778 static char *page = NULL;
779 struct uffdio_copy uffdio_copy;
784 /* Create a page that will be copied into the faulting region. */
787 page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
788 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
789 if (page == MAP_FAILED)
793 /* Loop, handling incoming events on the userfaultfd
798 /* See what poll() tells us about the userfaultfd. */
801 pollfd.events = POLLIN;
802 nready = poll(&pollfd, 1, \-1);
806 printf("\enfault_handler_thread():\en");
807 printf(" poll() returns: nready = %d; "
808 "POLLIN = %d; POLLERR = %d\en", nready,
809 (pollfd.revents & POLLIN) != 0,
810 (pollfd.revents & POLLERR) != 0);
812 /* Read an event from the userfaultfd. */
814 nread = read(uffd, &msg, sizeof(msg));
816 printf("EOF on userfaultfd!\en");
823 /* We expect only one kind of event; verify that assumption. */
825 if (msg.event != UFFD_EVENT_PAGEFAULT) {
826 fprintf(stderr, "Unexpected event on userfaultfd\en");
830 /* Display info about the page\-fault event. */
832 printf(" UFFD_EVENT_PAGEFAULT event: ");
833 printf("flags = %"PRIx64"; ", msg.arg.pagefault.flags);
834 printf("address = %"PRIx64"\en", msg.arg.pagefault.address);
836 /* Copy the page pointed to by \(aqpage\(aq into the faulting
837 region. Vary the contents that are copied in, so that it
838 is more obvious that each fault is handled separately. */
840 memset(page, \(aqA\(aq + fault_cnt % 20, page_size);
843 uffdio_copy.src = (unsigned long) page;
845 /* We need to handle page faults in units of pages(!).
846 So, round faulting address down to page boundary. */
848 uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
849 \(ti(page_size \- 1);
850 uffdio_copy.len = page_size;
851 uffdio_copy.mode = 0;
852 uffdio_copy.copy = 0;
853 if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
854 errExit("ioctl\-UFFDIO_COPY");
856 printf(" (uffdio_copy.copy returned %"PRId64")\en",
862 main(int argc, char *argv[])
866 long uffd; /* userfaultfd file descriptor */
867 char *addr; /* Start of region handled by userfaultfd */
868 uint64_t len; /* Length of region handled by userfaultfd */
869 pthread_t thr; /* ID of thread that handles page faults */
870 struct uffdio_api uffdio_api;
871 struct uffdio_register uffdio_register;
875 fprintf(stderr, "Usage: %s num\-pages\en", argv[0]);
879 page_size = sysconf(_SC_PAGE_SIZE);
880 len = strtoull(argv[1], NULL, 0) * page_size;
882 /* Create and enable userfaultfd object. */
884 uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
886 errExit("userfaultfd");
888 uffdio_api.api = UFFD_API;
889 uffdio_api.features = 0;
890 if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
891 errExit("ioctl\-UFFDIO_API");
893 /* Create a private anonymous mapping. The memory will be
894 demand\-zero paged\-\-that is, not yet allocated. When we
895 actually touch the memory, it will be allocated via
898 addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
899 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
900 if (addr == MAP_FAILED)
903 printf("Address returned by mmap() = %p\en", addr);
905 /* Register the memory range of the mapping we just created for
906 handling by the userfaultfd object. In mode, we request to track
907 missing pages (i.e., pages that have not yet been faulted in). */
909 uffdio_register.range.start = (unsigned long) addr;
910 uffdio_register.range.len = len;
911 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
912 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
913 errExit("ioctl\-UFFDIO_REGISTER");
915 /* Create a thread that will process the userfaultfd events. */
917 s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
920 errExit("pthread_create");
923 /* Main thread now touches memory in the mapping, touching
924 locations 1024 bytes apart. This will trigger userfaultfd
925 events for all pages in the region. */
927 l = 0xf; /* Ensure that faulting address is not on a page
928 boundary, in order to test that we correctly
929 handle that case in fault_handler_thread(). */
932 printf("Read address %p in %s(): ", addr + l, __func__);
935 usleep(100000); /* Slow things down a little */
945 .BR ioctl_userfaultfd (2),
949 .I Documentation/admin\-guide/mm/userfaultfd.rst
950 in the Linux kernel source tree