1 .\" Copyright (c) 2016, IBM Corporation.
2 .\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
3 .\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
5 .\" %%%LICENSE_START(VERBATIM)
6 .\" Permission is granted to make and distribute verbatim copies of this
7 .\" manual provided the copyright notice and this permission notice are
8 .\" preserved on all copies.
10 .\" Permission is granted to copy and distribute modified versions of this
11 .\" manual under the conditions for verbatim copying, provided that the
12 .\" entire resulting derived work is distributed under the terms of a
13 .\" permission notice identical to this one.
15 .\" Since the Linux kernel and libraries are constantly changing, this
16 .\" manual page may be incorrect or out-of-date. The author(s) assume no
17 .\" responsibility for errors or omissions, or for damages resulting from
18 .\" the use of the information contained herein. The author(s) may not
19 .\" have taken the same level of care in the production of this manual,
20 .\" which is licensed free of charge, as they might when working
23 .\" Formatted or processed versions of this manual, if unaccompanied by
24 .\" the source, must acknowledge the copyright and authors of this work.
27 .TH USERFAULTFD 2 2021-03-22 "Linux" "Linux Programmer's Manual"
29 userfaultfd \- create a file descriptor for handling page faults in user space
32 .BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
33 .BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
34 .B #include <unistd.h>
36 .BI "int syscall(SYS_userfaultfd, int " flags );
40 glibc provides no wrapper for
42 necessitating the use of
46 creates a new userfaultfd object that can be used for delegation of page-fault
47 handling to a user-space application,
48 and returns a file descriptor that refers to the new object.
49 The new userfaultfd object is configured using
52 Once the userfaultfd object is configured, the application can use
54 to receive userfaultfd notifications.
55 The reads from userfaultfd may be blocking or non-blocking,
56 depending on the value of
58 used for the creation of the userfaultfd or subsequent calls to
61 The following values may be bitwise ORed in
63 to change the behavior of
67 Enable the close-on-exec flag for the new userfaultfd file descriptor.
68 See the description of the
74 Enables non-blocking operation for the userfaultfd object.
75 See the description of the
80 When the last file descriptor referring to a userfaultfd object is closed,
81 all memory ranges that were registered with the object are unregistered
82 and unread events are flushed.
85 Userfaultfd supports two modes of registration:
87 .BR UFFDIO_REGISTER_MODE_MISSING " (since 4.10)"
89 .B UFFDIO_REGISTER_MODE_MISSING
90 mode, user-space will receive a page-fault notification
91 when a missing page is accessed.
92 The faulted thread will be stopped from execution until the page fault is
93 resolved from user-space by either an
99 .BR UFFDIO_REGISTER_MODE_WP " (since 5.7)"
101 .B UFFDIO_REGISTER_MODE_WP
102 mode, user-space will receive a page-fault notification
103 when a write-protected page is written.
104 The faulted thread will be stopped from execution
105 until user-space write-unprotects the page using an
106 .B UFFDIO_WRITEPROTECT
109 Multiple modes can be enabled at the same time for the same memory range.
111 Since Linux 4.14, a userfaultfd page-fault notification can selectively embed
112 faulting thread ID information into the notification.
113 One needs to enable this feature explicitly using the
114 .B UFFD_FEATURE_THREAD_ID
115 feature bit when initializing the userfaultfd context.
116 By default, thread ID reporting is disabled.
118 The userfaultfd mechanism is designed to allow a thread in a multithreaded
119 program to perform user-space paging for the other threads in the process.
120 When a page fault occurs for one of the regions registered
121 to the userfaultfd object,
122 the faulting thread is put to sleep and
123 an event is generated that can be read via the userfaultfd file descriptor.
124 The fault-handling thread reads events from this file descriptor and services
125 them using the operations described in
126 .BR ioctl_userfaultfd (2).
127 When servicing the page fault events,
128 the fault-handling thread can trigger a wake-up for the sleeping thread.
130 It is possible for the faulting threads and the fault-handling threads
131 to run in the context of different processes.
132 In this case, these threads may belong to different programs,
133 and the program that executes the faulting threads
134 will not necessarily cooperate with the program that handles the page faults.
135 In such non-cooperative mode,
136 the process that monitors userfaultfd and handles page faults
137 needs to be aware of the changes in the virtual memory layout
138 of the faulting process to avoid memory corruption.
141 userfaultfd can also notify the fault-handling threads about changes
142 in the virtual memory layout of the faulting process.
143 In addition, if the faulting process invokes
145 the userfaultfd objects associated with the parent may be duplicated
146 into the child process and the userfaultfd monitor will be notified
150 about the file descriptor associated with the userfault objects
151 created for the child process,
152 which allows the userfaultfd monitor to perform user-space paging
153 for the child process.
154 Unlike page faults which have to be synchronous and require an
155 explicit or implicit wakeup,
156 all other events are delivered asynchronously and
157 the non-cooperative process resumes execution as
158 soon as the userfaultfd manager executes
160 The userfaultfd manager should carefully synchronize calls to
162 with the processing of events.
164 The current asynchronous model of the event delivery is optimal for
165 single-threaded non-cooperative userfaultfd manager implementations.
166 .\" Regarding the preceding sentence, Mike Rapoport says:
167 .\" The major point here is that current events delivery model could be
168 .\" problematic for multi-threaded monitor. I even suspect that it would be
169 .\" impossible to ensure synchronization between page faults and non-page
170 .\" fault events in multi-threaded monitor.
172 .\" FIXME elaborate about non-cooperating mode, describe its limitations
173 .\" for kernels before 4.11, features added in 4.11
174 .\" and limitations remaining in 4.11
175 .\" Maybe it's worth adding a dedicated sub-section...
178 Since Linux 5.7, userfaultfd is able to do
179 synchronous page dirty tracking using the new write-protect register mode.
180 One should check against the feature bit
181 .B UFFD_FEATURE_PAGEFAULT_FLAG_WP
182 before using this feature.
183 Similar to the original userfaultfd missing mode, the write-protect mode will
184 generate a userfaultfd notification when the protected page is written.
185 The user needs to resolve the page fault by unprotecting the faulted page and
186 kicking the faulted thread to continue.
187 For more information,
188 please refer to the "Userfaultfd write-protect mode" section.
190 .SS Userfaultfd operation
191 After the userfaultfd object is created with
193 the application must enable it using the
197 This operation allows a handshake between the kernel and user space
198 to determine the API version and supported features.
199 This operation must be performed before any of the other
201 operations described below (or those operations fail with the
208 the application then registers memory address ranges using the
212 After successful completion of a
215 a page fault occurring in the requested memory range, and satisfying
216 the mode defined at the registration time, will be forwarded by the kernel to
217 the user-space application.
218 The application can then use the
223 operations to resolve the page fault.
225 Since Linux 4.14, if the application sets the
226 .B UFFD_FEATURE_SIGBUS
227 feature bit using the
230 no page-fault notification will be forwarded to user space.
233 signal is delivered to the faulting process.
235 userfaultfd can be used for robustness purposes to simply catch
236 any access to areas within the registered address range that do not
237 have pages allocated, without having to listen to userfaultfd events.
238 No userfaultfd monitor will be required for dealing with such memory
240 For example, this feature can be useful for applications that
241 want to prevent the kernel from automatically allocating pages and filling
242 holes in sparse files when the hole is accessed through a memory mapping.
245 .B UFFD_FEATURE_SIGBUS
246 feature is implicitly inherited through
248 if used in combination with
249 .BR UFFD_FEATURE_FORK .
251 Details of the various
253 operations can be found in
254 .BR ioctl_userfaultfd (2).
256 Since Linux 4.11, events other than page-fault may be enabled during
261 userfaultfd can be used only with anonymous private memory mappings.
263 userfaultfd can be also used with hugetlbfs and shared memory mappings.
265 .SS Userfaultfd write-protect mode (since 5.7)
266 Since Linux 5.7, userfaultfd supports write-protect mode.
267 The user needs to first check availability of this feature using
269 ioctl against the feature bit
270 .B UFFD_FEATURE_PAGEFAULT_FLAG_WP
271 before using this feature.
273 To register with userfaultfd write-protect mode, the user needs to initiate the
276 .B UFFDIO_REGISTER_MODE_WP
278 Note that it is legal to monitor the same memory range with multiple modes.
279 For example, the user can do
282 .BR "UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP" .
284 .B UFFDIO_REGISTER_MODE_WP
285 registered, user-space will
287 receive any notification when a missing page is written.
288 Instead, user-space will receive a write-protect page-fault notification
289 only when an existing but write-protected page got written.
294 .B UFFDIO_REGISTER_MODE_WP
296 the user can write-protect any existing memory within the range using the ioctl
297 .B UFFDIO_WRITEPROTECT
299 .I uffdio_writeprotect.mode
301 .BR UFFDIO_WRITEPROTECT_MODE_WP .
303 When a write-protect event happens,
304 user-space will receive a page-fault notification whose
305 .I uffd_msg.pagefault.flags
307 .B UFFD_PAGEFAULT_FLAG_WP
309 Note: since only writes can trigger this kind of fault,
310 write-protect notifications will always have the
311 .B UFFD_PAGEFAULT_FLAG_WRITE
312 bit set along with the
313 .BR UFFD_PAGEFAULT_FLAG_WP
316 To resolve a write-protection page fault, the user should initiate another
317 .B UFFDIO_WRITEPROTECT
319 .I uffdio_writeprotect.mode
321 .B UFFDIO_WRITEPROTECT_MODE_WP
322 cleared upon the faulted page or range.
324 Write-protect mode supports only private anonymous memory.
325 .SS Reading from the userfaultfd structure
328 from the userfaultfd file descriptor returns one or more
330 structures, each of which describes a page-fault event
331 or an event required for the non-cooperative userfaultfd usage:
336 __u8 event; /* Type of event */
340 __u64 flags; /* Flags describing fault */
341 __u64 address; /* Faulting address */
343 __u32 ptid; /* Thread ID of the fault */
347 struct { /* Since Linux 4.11 */
348 __u32 ufd; /* Userfault file descriptor
349 of the child process */
352 struct { /* Since Linux 4.11 */
353 __u64 from; /* Old address of remapped area */
354 __u64 to; /* New address of remapped area */
355 __u64 len; /* Original mapping length */
358 struct { /* Since Linux 4.11 */
359 __u64 start; /* Start address of removed area */
360 __u64 end; /* End address of removed area */
365 /* Padding fields omitted */
370 If multiple events are available and the supplied buffer is large enough,
372 returns as many events as will fit in the supplied buffer.
373 If the buffer supplied to
375 is smaller than the size of the
382 The fields set in the
384 structure are as follows:
388 Depending on the event type,
389 different fields of the
391 union represent details required for the event processing.
392 The non-page-fault events are generated only when appropriate feature
393 is enabled during API handshake with
397 The following values can appear in the
402 .BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)"
404 The page-fault details are available in the
408 .BR UFFD_EVENT_FORK " (since Linux 4.11)"
409 Generated when the faulting process invokes
416 The event details are available in the
419 .\" FIXME describe duplication of userfault file descriptor during fork
421 .BR UFFD_EVENT_REMAP " (since Linux 4.11)"
422 Generated when the faulting process invokes
424 The event details are available in the
428 .BR UFFD_EVENT_REMOVE " (since Linux 4.11)"
429 Generated when the faulting process invokes
436 The event details are available in the
440 .BR UFFD_EVENT_UNMAP " (since Linux 4.11)"
441 Generated when the faulting process unmaps a memory range,
442 either explicitly using
448 The event details are available in the
454 The address that triggered the page fault.
457 A bit mask of flags that describe the event.
459 .BR UFFD_EVENT_PAGEFAULT ,
460 the following flags may appear:
463 .B UFFD_PAGEFAULT_FLAG_WRITE
464 If the address is in a range that was registered with the
465 .B UFFDIO_REGISTER_MODE_MISSING
467 .BR ioctl_userfaultfd (2))
468 and this flag is set, this is a write fault;
469 otherwise it is a read fault.
471 .B UFFD_PAGEFAULT_FLAG_WP
472 If the address is in a range that was registered with the
473 .B UFFDIO_REGISTER_MODE_WP
474 flag, when this bit is set, it means it is a write-protect fault.
475 Otherwise it is a page-missing fault.
478 .I pagefault.feat.ptid
479 The thread ID that triggered the page fault.
482 The file descriptor associated with the userfault object
483 created for the child created by
487 The original address of the memory range that was remapped using
491 The new address of the memory range that was remapped using
495 The original length of the memory range that was remapped using
499 The start address of the memory range that was freed using
504 The end address of the memory range that was freed using
510 on a userfaultfd file descriptor can fail with the following errors:
513 The userfaultfd object has not yet been enabled using the
520 flag is enabled in the associated open file description,
521 the userfaultfd file descriptor can be monitored with
526 When events are available, the file descriptor is indicated as readable.
529 flag is not enabled, then
531 (always) indicates the file as having a
535 indicates the file descriptor as both readable and writable.
536 .\" FIXME What is the reason for this seemingly odd behavior with respect
537 .\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
538 .\" Something needs to be said about this.
542 returns a new file descriptor that refers to the userfaultfd object.
543 On error, \-1 is returned, and
545 is set to indicate the error.
549 An unsupported value was specified in
553 The per-process limit on the number of open file descriptors has been
557 The system-wide limit on the total number of open files has been
561 Insufficient kernel memory was available.
563 .BR EPERM " (since Linux 5.2)"
564 .\" cefdca0a86be517bc390fc4541e3674b8e7803b0
565 The caller is not privileged (does not have the
567 capability in the initial user namespace), and
568 .I /proc/sys/vm/unprivileged_userfaultfd
573 system call first appeared in Linux 4.3.
575 The support for hugetlbfs and shared memory areas and
576 non-page-fault events was added in Linux 4.11.
579 is Linux-specific and should not be used in programs intended to be
582 The userfaultfd mechanism can be used as an alternative to
583 traditional user-space paging techniques based on the use of the
587 It can also be used to implement lazy restore
588 for checkpoint/restore mechanisms,
589 as well as post-copy migration to allow (nearly) uninterrupted execution
590 when transferring virtual machines and Linux containers
591 from one host to another.
594 .B UFFD_FEATURE_EVENT_FORK
595 is enabled and a system call from the
597 family is interrupted by a signal or failed, a stale userfaultfd descriptor
599 In this case, a spurious
601 will be delivered to the userfaultfd monitor.
603 The program below demonstrates the use of the userfaultfd mechanism.
604 The program creates two threads, one of which acts as the
605 page-fault handler for the process, for the pages in a demand-page zero
609 The program takes one command-line argument,
610 which is the number of pages that will be created in a mapping
611 whose page faults will be handled via userfaultfd.
612 After creating a userfaultfd object,
613 the program then creates an anonymous private mapping of the specified size
614 and registers the address range of that mapping using the
618 The program then creates a second thread that will perform the
619 task of handling page faults.
621 The main thread then walks through the pages of the mapping fetching
622 bytes from successive pages.
623 Because the pages have not yet been accessed,
624 the first access of a byte in each page will trigger a page-fault event
625 on the userfaultfd file descriptor.
627 Each of the page-fault events is handled by the second thread,
628 which sits in a loop processing input from the userfaultfd file descriptor.
629 In each loop iteration, the second thread first calls
631 to check the state of the file descriptor,
632 and then reads an event from the file descriptor.
633 All such events should be
634 .B UFFD_EVENT_PAGEFAULT
636 which the thread handles by copying a page of data into
637 the faulting region using the
642 The following is an example of what we see when running the program:
646 $ \fB./userfaultfd_demo 3\fP
647 Address returned by mmap() = 0x7fd30106c000
649 fault_handler_thread():
650 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
651 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
652 (uffdio_copy.copy returned 4096)
653 Read address 0x7fd30106c00f in main(): A
654 Read address 0x7fd30106c40f in main(): A
655 Read address 0x7fd30106c80f in main(): A
656 Read address 0x7fd30106cc0f in main(): A
658 fault_handler_thread():
659 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
660 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
661 (uffdio_copy.copy returned 4096)
662 Read address 0x7fd30106d00f in main(): B
663 Read address 0x7fd30106d40f in main(): B
664 Read address 0x7fd30106d80f in main(): B
665 Read address 0x7fd30106dc0f in main(): B
667 fault_handler_thread():
668 poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
669 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
670 (uffdio_copy.copy returned 4096)
671 Read address 0x7fd30106e00f in main(): C
672 Read address 0x7fd30106e40f in main(): C
673 Read address 0x7fd30106e80f in main(): C
674 Read address 0x7fd30106ec0f in main(): C
680 /* userfaultfd_demo.c
682 Licensed under the GNU General Public License version 2 or later.
685 #include <inttypes.h>
686 #include <sys/types.h>
688 #include <linux/userfaultfd.h>
697 #include <sys/mman.h>
698 #include <sys/syscall.h>
699 #include <sys/ioctl.h>
702 #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
705 static int page_size;
708 fault_handler_thread(void *arg)
710 static struct uffd_msg msg; /* Data read from userfaultfd */
711 static int fault_cnt = 0; /* Number of faults so far handled */
712 long uffd; /* userfaultfd file descriptor */
713 static char *page = NULL;
714 struct uffdio_copy uffdio_copy;
719 /* Create a page that will be copied into the faulting region. */
722 page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
723 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
724 if (page == MAP_FAILED)
728 /* Loop, handling incoming events on the userfaultfd
733 /* See what poll() tells us about the userfaultfd. */
735 struct pollfd pollfd;
738 pollfd.events = POLLIN;
739 nready = poll(&pollfd, 1, \-1);
743 printf("\enfault_handler_thread():\en");
744 printf(" poll() returns: nready = %d; "
745 "POLLIN = %d; POLLERR = %d\en", nready,
746 (pollfd.revents & POLLIN) != 0,
747 (pollfd.revents & POLLERR) != 0);
749 /* Read an event from the userfaultfd. */
751 nread = read(uffd, &msg, sizeof(msg));
753 printf("EOF on userfaultfd!\en");
760 /* We expect only one kind of event; verify that assumption. */
762 if (msg.event != UFFD_EVENT_PAGEFAULT) {
763 fprintf(stderr, "Unexpected event on userfaultfd\en");
767 /* Display info about the page\-fault event. */
769 printf(" UFFD_EVENT_PAGEFAULT event: ");
770 printf("flags = %"PRIx64"; ", msg.arg.pagefault.flags);
771 printf("address = %"PRIx64"\en", msg.arg.pagefault.address);
773 /* Copy the page pointed to by \(aqpage\(aq into the faulting
774 region. Vary the contents that are copied in, so that it
775 is more obvious that each fault is handled separately. */
777 memset(page, \(aqA\(aq + fault_cnt % 20, page_size);
780 uffdio_copy.src = (unsigned long) page;
782 /* We need to handle page faults in units of pages(!).
783 So, round faulting address down to page boundary. */
785 uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
786 \(ti(page_size \- 1);
787 uffdio_copy.len = page_size;
788 uffdio_copy.mode = 0;
789 uffdio_copy.copy = 0;
790 if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
791 errExit("ioctl\-UFFDIO_COPY");
793 printf(" (uffdio_copy.copy returned %"PRId64")\en",
799 main(int argc, char *argv[])
801 long uffd; /* userfaultfd file descriptor */
802 char *addr; /* Start of region handled by userfaultfd */
803 uint64_t len; /* Length of region handled by userfaultfd */
804 pthread_t thr; /* ID of thread that handles page faults */
805 struct uffdio_api uffdio_api;
806 struct uffdio_register uffdio_register;
810 fprintf(stderr, "Usage: %s num\-pages\en", argv[0]);
814 page_size = sysconf(_SC_PAGE_SIZE);
815 len = strtoull(argv[1], NULL, 0) * page_size;
817 /* Create and enable userfaultfd object. */
819 uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
821 errExit("userfaultfd");
823 uffdio_api.api = UFFD_API;
824 uffdio_api.features = 0;
825 if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
826 errExit("ioctl\-UFFDIO_API");
828 /* Create a private anonymous mapping. The memory will be
829 demand\-zero paged\-\-that is, not yet allocated. When we
830 actually touch the memory, it will be allocated via
833 addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
834 MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
835 if (addr == MAP_FAILED)
838 printf("Address returned by mmap() = %p\en", addr);
840 /* Register the memory range of the mapping we just created for
841 handling by the userfaultfd object. In mode, we request to track
842 missing pages (i.e., pages that have not yet been faulted in). */
844 uffdio_register.range.start = (unsigned long) addr;
845 uffdio_register.range.len = len;
846 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
847 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
848 errExit("ioctl\-UFFDIO_REGISTER");
850 /* Create a thread that will process the userfaultfd events. */
852 s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
855 errExit("pthread_create");
858 /* Main thread now touches memory in the mapping, touching
859 locations 1024 bytes apart. This will trigger userfaultfd
860 events for all pages in the region. */
863 l = 0xf; /* Ensure that faulting address is not on a page
864 boundary, in order to test that we correctly
865 handle that case in fault_handler_thread(). */
868 printf("Read address %p in main(): ", addr + l);
871 usleep(100000); /* Slow things down a little */
880 .BR ioctl_userfaultfd (2),
884 .IR Documentation/admin\-guide/mm/userfaultfd.rst
885 in the Linux kernel source tree