file: Add file_extents=yes to --dump-plugin output
[nbdkit.git] / plugins / file / file.c
blob647ea5d68c1358b8532ed487d330fa9f0518b7fe
1 /* nbdkit
2 * Copyright (C) 2013-2021 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
33 #ifdef WIN32
34 #error "build error: winfile.c should be used on Windows"
35 #endif
37 #include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <inttypes.h>
#include <fcntl.h>
#include <unistd.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <dirent.h>
52 #ifdef HAVE_SYS_IOCTL_H
53 #include <sys/ioctl.h>
54 #endif
56 #include <pthread.h>
58 #if defined (__linux__) && !defined (FALLOC_FL_PUNCH_HOLE)
59 #include <linux/falloc.h> /* For FALLOC_FL_*, glibc < 2.18 */
60 #endif
62 #if defined (__linux__) && HAVE_LINUX_FS_H
63 #include <linux/fs.h> /* For BLKZEROOUT */
64 #endif
66 #define NBDKIT_API_VERSION 2
67 #include <nbdkit-plugin.h>
69 #include "cleanup.h"
70 #include "isaligned.h"
71 #include "fdatasync.h"
/* Which of the mutually exclusive parameters (file, dir, fd, dirfd)
 * selected what we serve.  mode_none means that none of them has
 * been seen yet.
 */
static enum {
  mode_none,
  mode_filename,
  mode_directory,
  mode_fd,
  mode_dirfd,
} mode = mode_none;

/* Values of those parameters.  filename and directory hold the
 * result of nbdkit_realpath; filedesc is shared by fd= and dirfd=.
 */
static char *filename = NULL;
static char *directory = NULL;
static int filedesc = -1;
/* Hint given to posix_fadvise when a file is opened: one of the
 * POSIX_FADV_* constants, or -1 meaning that no hint is given.
 */
#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_NORMAL)
static int fadvise_mode = POSIX_FADV_NORMAL;
#else
static int fadvise_mode = -1;
#endif

/* Page cache policy selected with the cache= parameter. */
static enum { cache_default, cache_none } cache_mode = cache_default;

/* EVICT_WRITES is defined when written data is going to be evicted
 * from the page cache (cache=none) after writing.  This is only known
 * to work on Linux.
 */
#if defined (__linux__)
#define EVICT_WRITES 1
#endif
103 #ifdef EVICT_WRITES
/* Writes are queued in a small fixed-size array of windows so that
 * they can be evicted from the page cache later.  See libnbd.git
 * copy/file-ops.c for the rationale behind this.
 */
#define NR_WINDOWS 8

/* One queued write: the file descriptor and the byte range written.
 * A slot whose len is 0 is unused.
 */
struct write_window {
  int fd;
  uint64_t offset;
  size_t len;
};

/* window[0] is the oldest queued write, window[NR_WINDOWS-1] the
 * newest.  window_lock is held while the array is read or modified.
 */
static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER;
static struct write_window window[NR_WINDOWS];
118 static void
119 evict_writes (int fd, uint64_t offset, size_t len)
121 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
123 /* Evict the oldest window from the page cache. */
124 if (window[0].len > 0) {
125 sync_file_range (window[0].fd, window[0].offset, window[0].len,
126 SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|
127 SYNC_FILE_RANGE_WAIT_AFTER);
128 posix_fadvise (window[0].fd, window[0].offset, window[0].len,
129 POSIX_FADV_DONTNEED);
132 /* Move the Nth window to N-1. */
133 memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1));
135 /* Set up the current window and tell Linux to start writing it out
136 * to disk (asynchronously).
138 sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE);
139 window[NR_WINDOWS-1].fd = fd;
140 window[NR_WINDOWS-1].offset = offset;
141 window[NR_WINDOWS-1].len = len;
144 /* When we close the handle we must remove any windows which are still
145 * associated. They missed the boat, oh well :-(
147 static void
148 remove_fd_from_window (int fd)
150 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock);
151 size_t i;
153 for (i = 0; i < NR_WINDOWS; ++i)
154 if (window[i].len > 0 && window[i].fd == fd)
155 window[i].len = 0;
157 #endif /* EVICT_WRITES */
159 /* Any callbacks using lseek must be protected by this lock. */
160 static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER;
162 /* to enable: -D file.zero=1 */
163 NBDKIT_DLL_PUBLIC int file_debug_zero;
/* Return true if err is one of the two "operation not supported"
 * error codes (ENOTSUP or EOPNOTSUPP).
 */
static bool __attribute__((unused))
is_enotsup (int err)
{
  if (err == EOPNOTSUPP)
    return true;
  return err == ENOTSUP;
}
171 static void
172 file_unload (void)
174 free (filename);
175 free (directory);
178 /* Called for each key=value passed on the command line. This plugin
179 * only accepts file=<filename> and dir=<dirname>, where exactly
180 * one is required.
182 static int
183 file_config (const char *key, const char *value)
185 /* See FILENAMES AND PATHS in nbdkit-plugin(3).
186 * Our use of nbdkit_realpath requires the destination to exist at
187 * startup; use nbdkit_absolute_path instead if we wanted to defer
188 * existence checks to the last possible moment.
190 if (strcmp (key, "file") == 0) {
191 if (mode != mode_none) goto wrong_mode;
192 mode = mode_filename;
193 assert (filename == NULL);
194 filename = nbdkit_realpath (value);
195 if (!filename)
196 return -1;
198 else if (strcmp (key, "directory") == 0 ||
199 strcmp (key, "dir") == 0) {
200 if (mode != mode_none) goto wrong_mode;
201 mode = mode_directory;
202 assert (directory == NULL);
203 directory = nbdkit_realpath (value);
204 if (!directory)
205 return -1;
207 else if (strcmp (key, "fd") == 0) {
208 if (mode != mode_none) goto wrong_mode;
209 mode = mode_fd;
210 assert (filedesc == -1);
211 if (nbdkit_parse_int ("fd", value, &filedesc) == -1)
212 return -1;
213 if (filedesc <= STDERR_FILENO) {
214 nbdkit_error ("file descriptor must be > %d because "
215 "stdin, stdout and stderr are reserved for nbdkit",
216 STDERR_FILENO);
217 return -1;
220 else if (strcmp (key, "dirfd") == 0) {
221 if (mode != mode_none) goto wrong_mode;
222 mode = mode_dirfd;
223 assert (filedesc == -1);
224 if (nbdkit_parse_int ("dirfd", value, &filedesc) == -1)
225 return -1;
226 if (filedesc <= STDERR_FILENO) {
227 nbdkit_error ("file descriptor must be > %d because "
228 "stdin, stdout and stderr are reserved for nbdkit",
229 STDERR_FILENO);
230 return -1;
233 else if (strcmp (key, "fadvise") == 0) {
234 /* As this is a hint, if the kernel doesn't support the feature
235 * ignore the parameter.
237 if (strcmp (value, "normal") == 0) {
238 #if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_NORMAL)
239 fadvise_mode = POSIX_FADV_NORMAL;
240 #else
241 fadvise_mode = -1;
242 #endif
244 else if (strcmp (value, "random") == 0) {
245 #if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_RANDOM)
246 fadvise_mode = POSIX_FADV_RANDOM;
247 #else
248 fadvise_mode = -1;
249 #endif
251 else if (strcmp (value, "sequential") == 0) {
252 #if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_SEQUENTIAL)
253 fadvise_mode = POSIX_FADV_SEQUENTIAL;
254 #else
255 fadvise_mode = -1;
256 #endif
258 else {
259 nbdkit_error ("unknown fadvise mode: %s", value);
260 return -1;
263 else if (strcmp (key, "cache") == 0) {
264 if (strcmp (value, "default") == 0)
265 cache_mode = cache_default;
266 else if (strcmp (value, "none") == 0)
267 cache_mode = cache_none;
268 else {
269 nbdkit_error ("unknown cache mode: %s", value);
270 return -1;
273 else if (strcmp (key, "rdelay") == 0 ||
274 strcmp (key, "wdelay") == 0) {
275 nbdkit_error ("add --filter=delay on the command line");
276 return -1;
278 else {
279 nbdkit_error ("unknown parameter '%s'", key);
280 return -1;
282 return 0;
284 wrong_mode:
285 nbdkit_error ("%s parameter can only appear once on the command line",
286 "file|dir|fd|dirfd");
287 return -1;
290 /* Check that the user passed exactly one parameter. */
291 static int
292 file_config_complete (void)
294 int r;
295 struct stat sb;
297 switch (mode) {
298 case mode_none:
299 nbdkit_error ("you must supply [file=]<FILENAME>, "
300 "dir=<DIRNAME> or fd=<FD> "
301 "parameter after the plugin name "
302 "on the command line");
303 return -1;
305 case mode_filename:
306 assert (filename != NULL);
307 assert (directory == NULL);
308 assert (filedesc == -1);
310 /* Sanity check now, rather than waiting for first client open.
311 * See also comment in .config about use of nbdkit_realpath. Yes,
312 * this is a harmless TOCTTOU race.
314 r = stat (filename, &sb);
315 if (r == 0 && S_ISDIR (sb.st_mode)) {
316 nbdkit_error ("use dir= to serve files within %s", filename);
317 return -1;
319 if (r == -1 || !(S_ISBLK (sb.st_mode) || S_ISREG (sb.st_mode))) {
320 nbdkit_error ("file is not regular or block device: %s", filename);
321 return -1;
323 break;
325 case mode_directory:
326 assert (filename == NULL);
327 assert (directory != NULL);
328 assert (filedesc == -1);
330 if (stat (directory, &sb) == -1 || !S_ISDIR (sb.st_mode)) {
331 nbdkit_error ("expecting a directory: %s", directory);
332 return -1;
334 break;
336 case mode_fd:
337 assert (filename == NULL);
338 assert (directory == NULL);
339 assert (filedesc > STDERR_FILENO);
341 r = fstat (filedesc, &sb);
342 if (r == -1 || !(S_ISBLK (sb.st_mode) || S_ISREG (sb.st_mode))) {
343 nbdkit_error ("fd is not regular or block device: %d", filedesc);
344 return -1;
346 break;
348 case mode_dirfd:
349 assert (filename == NULL);
350 assert (directory == NULL);
351 assert (filedesc > STDERR_FILENO);
353 r = fstat (filedesc, &sb);
354 if (r == -1 || !(S_ISDIR (sb.st_mode))) {
355 nbdkit_error ("dirfd is not a directory: %d", filedesc);
356 return -1;
360 return 0;
/* Text printed by "nbdkit file --help".  One line per parameter
 * accepted by file_config.
 */
#define file_config_help \
  "[file=]<FILENAME>     The filename to serve.\n" \
  "dir=<DIRNAME>         A directory containing files to serve.\n" \
  "fd=<FD>               A file descriptor to serve.\n" \
  "dirfd=<FD>            A directory file descriptor containing files to serve.\n" \
  "cache=<MODE>          Set use of caching (default, none).\n" \
  "fadvise=<LEVEL>       Set fadvise hint (normal, random, sequential).\n"
/* Print extra key=value lines on stdout describing which optional
 * features were available when the plugin was compiled.
 */
static void
file_dump_plugin (void)
{
#if defined (BLKSSZGET)
  fputs ("file_blksszget=yes\n", stdout);
#endif
#if defined (BLKZEROOUT)
  fputs ("file_blkzeroout=yes\n", stdout);
#endif
#if defined (SEEK_HOLE)
  fputs ("file_extents=yes\n", stdout);
#endif
#if defined (FALLOC_FL_PUNCH_HOLE)
  fputs ("file_falloc_fl_punch_hole=yes\n", stdout);
#endif
#if defined (FALLOC_FL_ZERO_RANGE)
  fputs ("file_falloc_fl_zero_range=yes\n", stdout);
#endif
}
390 /* Common code for listing exports of a directory. */
391 static int
392 list_exports_of_directory (struct nbdkit_exports *exports, DIR *dir)
394 struct dirent *entry;
396 errno = 0;
397 while ((entry = readdir (dir)) != NULL) {
398 int r = -1;
399 struct stat sb;
401 #if HAVE_STRUCT_DIRENT_D_TYPE
402 if (entry->d_type == DT_BLK || entry->d_type == DT_REG)
403 r = 1;
404 else if (entry->d_type != DT_LNK && entry->d_type != DT_UNKNOWN)
405 r = 0;
406 #endif
407 /* TODO: when chasing symlinks, is statx any nicer than fstatat? */
408 if (r == -1 && fstatat (dirfd (dir), entry->d_name, &sb, 0) == 0 &&
409 (S_ISREG (sb.st_mode) || S_ISBLK (sb.st_mode)))
410 r = 1;
411 if (r == 1 && nbdkit_add_export (exports, entry->d_name, NULL) == -1)
412 return -1;
413 errno = 0;
416 if (errno) {
417 nbdkit_error ("readdir: %m");
418 return -1;
421 return 0;
424 static int
425 file_list_exports (int readonly, int default_only,
426 struct nbdkit_exports *exports)
428 /* We don't fork, so no need to worry about FD_CLOEXEC on the directory */
429 DIR *dir;
430 int dfd, r;
432 switch (mode) {
433 case mode_filename:
434 case mode_fd:
435 return nbdkit_add_export (exports, "", NULL);
437 case mode_directory:
438 dir = opendir (directory);
439 if (dir == NULL) {
440 nbdkit_error ("opendir: %m");
441 return -1;
443 r = list_exports_of_directory (exports, dir);
444 closedir (dir);
445 return r;
447 case mode_dirfd:
448 dfd = dup (filedesc);
449 if (dfd == -1) {
450 nbdkit_error ("dup: %m");
451 return -1;
453 dir = fdopendir (dfd);
454 if (dir == NULL) {
455 nbdkit_error ("fdopendir: %m");
456 return -1;
458 r = list_exports_of_directory (exports, dir);
459 closedir (dir); /* also closes dfd */
460 return r;
462 default: abort ();
/* The per-connection handle, allocated in file_open and freed in
 * file_close.
 */
struct handle {
  int fd;                 /* open file or block device being served */
  bool is_block_device;   /* true if fd is a block device, not a file */
  int sector_size;        /* 4096 unless BLKSSZGET reports another size */
  bool can_write;         /* false if read-only or opened read-only */

  /* Whether each method of zeroing/trimming is still worth trying.
   * A flag is cleared once the method is found to be unsupported.
   */
  bool can_punch_hole;
  bool can_zero_range;
  bool can_fallocate;
  bool can_zeroout;
};
478 /* Common code for opening a file by name, used by mode_filename and
479 * mode_directory only. If successful, sets h->fd and may adjust
480 * h->can_write.
482 static int
483 open_file_by_name (struct handle *h, int readonly, int dfd, const char *file)
485 int flags;
487 assert (h->fd == -1);
489 flags = O_CLOEXEC|O_NOCTTY;
490 if (readonly)
491 flags |= O_RDONLY;
492 else
493 flags |= O_RDWR;
495 h->fd = openat (dfd, file, flags);
496 if (h->fd == -1 && !readonly) {
497 nbdkit_debug ("open O_RDWR failed, falling back to read-only: %s: %m",
498 file);
499 flags = (flags & ~O_ACCMODE) | O_RDONLY;
500 h->fd = openat (dfd, file, flags);
501 h->can_write = false;
503 if (h->fd == -1) {
504 nbdkit_error ("open: %s: %m", file);
505 free (h);
506 return -1;
509 return 0;
512 /* Create the per-connection handle. */
513 static void *
514 file_open (int readonly)
516 struct handle *h;
517 struct stat statbuf;
518 const char *file;
520 h = malloc (sizeof *h);
521 if (h == NULL) {
522 nbdkit_error ("malloc: %m");
523 return NULL;
525 h->can_write = !readonly;
526 h->fd = -1;
528 switch (mode) {
529 case mode_filename:
530 file = filename;
531 if (open_file_by_name (h, readonly, -1, file) == -1) {
532 free (h);
533 return NULL;
535 break;
537 case mode_directory: {
538 int dfd;
540 file = nbdkit_export_name ();
541 if (strchr (file, '/')) {
542 nbdkit_error ("exportname cannot contain /");
543 free (h);
544 errno = EINVAL;
545 return NULL;
547 dfd = open (directory, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
548 if (dfd == -1) {
549 nbdkit_error ("open %s: %m", directory);
550 free (h);
551 return NULL;
553 if (open_file_by_name (h, readonly, dfd, file) == -1) {
554 free (h);
555 close (dfd);
556 return NULL;
558 close (dfd);
559 break;
562 case mode_fd: {
563 int r;
565 /* This is needed for error messages. */
566 file = "<file descriptor>";
568 h->fd = dup (filedesc);
569 if (h->fd == -1) {
570 nbdkit_error ("dup fd=%d: %m", filedesc);
571 free (h);
572 return NULL;
575 /* If the file descriptor is readonly then we should not advertise
576 * writes as they will fail later.
578 r = fcntl (h->fd, F_GETFL);
579 if (r == -1) {
580 nbdkit_error ("fcntl: F_GETFL: %m");
581 close (h->fd);
582 free (h);
583 return NULL;
585 r &= O_ACCMODE;
586 if (r == O_RDONLY)
587 h->can_write = false;
588 else if (r == O_WRONLY)
589 nbdkit_debug ("file descriptor is write-only (ie. not readable): "
590 "NBD protocol does not support this, but continuing "
591 "anyway!");
592 break;
595 case mode_dirfd: {
596 int dfd;
598 file = nbdkit_export_name ();
599 if (strchr (file, '/')) {
600 nbdkit_error ("exportname cannot contain /");
601 free (h);
602 errno = EINVAL;
603 return NULL;
605 /* We don't fork, so no need to worry about FD_CLOEXEC on the directory */
606 dfd = dup (filedesc);
607 if (dfd == -1) {
608 nbdkit_error ("dup dirfd=%d: %m", filedesc);
609 free (h);
610 return NULL;
612 if (open_file_by_name (h, readonly, dfd, file) == -1) {
613 free (h);
614 close (dfd);
615 return NULL;
617 close (dfd);
618 break;
621 default:
622 abort ();
625 assert (h->fd >= 0);
627 if (fstat (h->fd, &statbuf) == -1) {
628 nbdkit_error ("fstat: %s: %m", file);
629 close (h->fd);
630 free (h);
631 return NULL;
634 if (fadvise_mode != -1) {
635 /* This is a hint so we ignore failures. */
636 #ifdef HAVE_POSIX_FADVISE
637 int r = posix_fadvise (h->fd, 0, 0, fadvise_mode);
638 if (r == -1)
639 nbdkit_debug ("posix_fadvise: %s: %m (ignored)", file);
640 #else
641 nbdkit_debug ("fadvise is not supported");
642 #endif
645 if (S_ISBLK (statbuf.st_mode))
646 h->is_block_device = true;
647 else if (S_ISREG (statbuf.st_mode))
648 h->is_block_device = false;
649 else {
650 nbdkit_error ("file is not regular or block device: %s", file);
651 close (h->fd);
652 free (h);
653 return NULL;
655 h->sector_size = 4096; /* Start with safe guess */
657 #ifdef BLKSSZGET
658 if (h->is_block_device) {
659 if (ioctl (h->fd, BLKSSZGET, &h->sector_size))
660 nbdkit_debug ("cannot get sector size: %s: %m", file);
662 #endif
664 #ifdef FALLOC_FL_PUNCH_HOLE
665 h->can_punch_hole = true;
666 #else
667 h->can_punch_hole = false;
668 #endif
670 #ifdef FALLOC_FL_ZERO_RANGE
671 h->can_zero_range = true;
672 #else
673 h->can_zero_range = false;
674 #endif
676 h->can_fallocate = true;
677 h->can_zeroout = h->is_block_device;
679 return h;
682 /* Free up the per-connection handle. */
683 static void
684 file_close (void *handle)
686 struct handle *h = handle;
688 #ifdef EVICT_WRITES
689 remove_fd_from_window (h->fd);
690 #endif
691 close (h->fd);
692 free (h);
/* Thread model of this plugin (presumably picked up by
 * NBDKIT_REGISTER_PLUGIN -- see nbdkit-plugin(3)).
 */
#define THREAD_MODEL NBDKIT_THREAD_MODEL_PARALLEL
/* For block devices, stat->st_size is not the true size, so seek to
 * the end of the device instead.  The caller must hold lseek_lock.
 * Returns the size, or -1 after calling nbdkit_error.
 */
static int64_t
block_device_size (int fd)
{
  const off_t end = lseek (fd, 0, SEEK_END);

  if (end != -1)
    return end;

  nbdkit_error ("lseek (to find device size): %m");
  return -1;
}
714 /* Get the file size. */
715 static int64_t
716 file_get_size (void *handle)
718 struct handle *h = handle;
720 if (h->is_block_device) {
721 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock);
722 return block_device_size (h->fd);
723 } else {
724 /* Regular file. */
725 struct stat statbuf;
727 if (fstat (h->fd, &statbuf) == -1) {
728 nbdkit_error ("fstat: %m");
729 return -1;
732 return statbuf.st_size;
736 /* Check if file is read-only. */
737 static int
738 file_can_write (void *handle)
740 struct handle *h = handle;
742 return h->can_write;
/* Multiple parallel connections from a single client are always
 * allowed; the handle is not consulted.
 */
static int
file_can_multi_conn (void *handle)
{
  (void) handle;
  return 1;
}
/* Trim is advisory, but we prefer to advertise it only when we can
 * actually (attempt to) punch holes.  Since not all filesystems
 * support all fallocate modes, it would be nice if fpathconf() could
 * tell us definitively what will work on a given fd; oh well.
 *
 * Returns 1 if FALLOC_FL_PUNCH_HOLE was available at compile time,
 * otherwise 0.
 */
static int
file_can_trim (void *handle)
{
  (void) handle;
#if defined (FALLOC_FL_PUNCH_HOLE)
  return 1;
#else
  return 0;
#endif
}
767 static int
768 file_can_fua (void *handle)
770 return NBDKIT_FUA_NATIVE;
773 static int
774 file_can_cache (void *handle)
776 /* Prefer posix_fadvise(), but letting nbdkit call .pread on our
777 * behalf also tends to work well for the local file system
778 * cache.
780 #if HAVE_POSIX_FADVISE
781 return NBDKIT_FUA_NATIVE;
782 #else
783 return NBDKIT_FUA_EMULATE;
784 #endif
787 /* Flush the file to disk. */
788 static int
789 file_flush (void *handle, uint32_t flags)
791 struct handle *h = handle;
793 if (fdatasync (h->fd) == -1) {
794 nbdkit_error ("fdatasync: %m");
795 return -1;
798 return 0;
801 /* Read data from the file. */
802 static int
803 file_pread (void *handle, void *buf, uint32_t count, uint64_t offset,
804 uint32_t flags)
806 struct handle *h = handle;
807 #if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
808 uint32_t orig_count = count;
809 uint64_t orig_offset = offset;
810 #endif
812 while (count > 0) {
813 ssize_t r = pread (h->fd, buf, count, offset);
814 if (r == -1) {
815 nbdkit_error ("pread: %m");
816 return -1;
818 if (r == 0) {
819 nbdkit_error ("pread: unexpected end of file");
820 return -1;
822 buf += r;
823 count -= r;
824 offset += r;
827 #if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
828 /* On Linux this will evict the pages we just read from the page cache. */
829 if (cache_mode == cache_none)
830 posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED);
831 #endif
833 return 0;
836 /* Write data to the file. */
837 static int
838 file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
839 uint32_t flags)
841 struct handle *h = handle;
843 #if EVICT_WRITES
844 uint32_t orig_count = count;
845 uint64_t orig_offset = offset;
846 #endif
848 while (count > 0) {
849 ssize_t r = pwrite (h->fd, buf, count, offset);
850 if (r == -1) {
851 nbdkit_error ("pwrite: %m");
852 return -1;
854 buf += r;
855 count -= r;
856 offset += r;
859 if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1)
860 return -1;
862 #if EVICT_WRITES
863 if (cache_mode == cache_none)
864 evict_writes (h->fd, orig_offset, orig_count);
865 #endif
867 return 0;
#if defined (FALLOC_FL_PUNCH_HOLE) || defined (FALLOC_FL_ZERO_RANGE)
/* Wrapper around fallocate which normalizes errno.
 *
 * Kernel 3.10 fails with ENODEV for block devices while kernel >= 4.9
 * fails with EOPNOTSUPP in this case.  ENODEV is mapped to EOPNOTSUPP
 * to simplify callers.  Returns whatever fallocate returned.
 */
static int
do_fallocate (int fd, int mode_, off_t offset, off_t len)
{
  const int r = fallocate (fd, mode_, offset, len);

  if (r != -1)
    return r;

  if (errno == ENODEV)
    errno = EOPNOTSUPP;
  return r;
}
#endif
884 /* Write zeroes to the file. */
885 static int
886 file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags)
888 struct handle *h __attribute__((unused)) = handle;
890 #ifdef FALLOC_FL_PUNCH_HOLE
891 if (h->can_punch_hole && (flags & NBDKIT_FLAG_MAY_TRIM)) {
892 int r;
894 r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
895 offset, count);
896 if (r == 0) {
897 if (file_debug_zero)
898 nbdkit_debug ("h->can_punch_hole && may_trim: "
899 "zero succeeded using fallocate");
900 goto out;
903 if (!is_enotsup (errno)) {
904 nbdkit_error ("zero: %m");
905 return -1;
908 h->can_punch_hole = false;
910 #endif
912 #ifdef FALLOC_FL_ZERO_RANGE
913 if (h->can_zero_range) {
914 int r;
916 r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count);
917 if (r == 0) {
918 if (file_debug_zero)
919 nbdkit_debug ("h->can_zero-range: "
920 "zero succeeded using fallocate");
921 goto out;
924 if (!is_enotsup (errno)) {
925 nbdkit_error ("zero: %m");
926 return -1;
929 h->can_zero_range = false;
931 #endif
933 #ifdef FALLOC_FL_PUNCH_HOLE
934 /* If we can punch hole but may not trim, we can combine punching hole and
935 * fallocate to zero a range. This is expected to be more efficient than
936 * writing zeroes manually. */
937 if (h->can_punch_hole && h->can_fallocate) {
938 int r;
940 r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
941 offset, count);
942 if (r == 0) {
943 r = do_fallocate (h->fd, 0, offset, count);
944 if (r == 0) {
945 if (file_debug_zero)
946 nbdkit_debug ("h->can_punch_hole && h->can_fallocate: "
947 "zero succeeded using fallocate");
948 goto out;
951 if (!is_enotsup (errno)) {
952 nbdkit_error ("zero: %m");
953 return -1;
956 h->can_fallocate = false;
957 } else {
958 if (!is_enotsup (errno)) {
959 nbdkit_error ("zero: %m");
960 return -1;
963 h->can_punch_hole = false;
966 #endif
968 #ifdef BLKZEROOUT
969 /* For aligned range and block device, we can use BLKZEROOUT. */
970 if (h->can_zeroout && IS_ALIGNED (offset | count, h->sector_size)) {
971 int r;
972 uint64_t range[2] = {offset, count};
974 r = ioctl (h->fd, BLKZEROOUT, &range);
975 if (r == 0) {
976 if (file_debug_zero)
977 nbdkit_debug ("h->can_zeroout && IS_ALIGNED: "
978 "zero succeeded using BLKZEROOUT");
979 goto out;
982 if (errno != ENOTTY) {
983 nbdkit_error ("zero: %m");
984 return -1;
987 h->can_zeroout = false;
989 #endif
991 /* Trigger a fall back to writing */
992 if (file_debug_zero)
993 nbdkit_debug ("zero falling back to writing");
994 errno = EOPNOTSUPP;
995 return -1;
997 #ifdef __clang__
998 __attribute__((unused))
999 #endif
1000 out:
1001 if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1)
1002 return -1;
1003 return 0;
1006 /* Punch a hole in the file. */
1007 static int
1008 file_trim (void *handle, uint32_t count, uint64_t offset, uint32_t flags)
1010 #ifdef FALLOC_FL_PUNCH_HOLE
1011 struct handle *h = handle;
1012 int r;
1014 if (h->can_punch_hole) {
1015 r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1016 offset, count);
1017 if (r == -1) {
1018 /* Trim is advisory; we don't care if it fails for anything other
1019 * than EIO or EPERM. */
1020 if (errno == EPERM || errno == EIO) {
1021 nbdkit_error ("fallocate: %m");
1022 return -1;
1025 if (is_enotsup (EOPNOTSUPP))
1026 h->can_punch_hole = false;
1028 nbdkit_debug ("ignoring failed fallocate during trim: %m");
1031 #endif
1033 if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1)
1034 return -1;
1036 return 0;
1039 #ifdef SEEK_HOLE
1040 /* Extents. */
1042 static int
1043 file_can_extents (void *handle)
1045 struct handle *h = handle;
1046 off_t r;
1048 /* A simple test to see whether SEEK_HOLE etc is likely to work on
1049 * the current filesystem.
1051 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock);
1052 r = lseek (h->fd, 0, SEEK_HOLE);
1053 if (r == -1) {
1054 nbdkit_debug ("extents disabled: lseek: SEEK_HOLE: %m");
1055 return 0;
1057 return 1;
1060 static int
1061 do_extents (void *handle, uint32_t count, uint64_t offset,
1062 uint32_t flags, struct nbdkit_extents *extents)
1064 struct handle *h = handle;
1065 const bool req_one = flags & NBDKIT_FLAG_REQ_ONE;
1066 uint64_t end = offset + count;
1068 do {
1069 off_t pos;
1071 pos = lseek (h->fd, offset, SEEK_DATA);
1072 if (pos == -1) {
1073 if (errno == ENXIO) {
1074 /* The current man page does not describe this situation well,
1075 * but a proposed change to POSIX adds these words for ENXIO:
1076 * "or the whence argument is SEEK_DATA and the offset falls
1077 * within the final hole of the file."
1079 pos = end;
1081 else {
1082 nbdkit_error ("lseek: SEEK_DATA: %" PRIu64 ": %m", offset);
1083 return -1;
1087 /* We know there is a hole from offset to pos-1. */
1088 if (pos > offset) {
1089 if (nbdkit_add_extent (extents, offset, pos - offset,
1090 NBDKIT_EXTENT_HOLE | NBDKIT_EXTENT_ZERO) == -1)
1091 return -1;
1092 if (req_one)
1093 break;
1096 offset = pos;
1097 if (offset >= end)
1098 break;
1100 pos = lseek (h->fd, offset, SEEK_HOLE);
1101 if (pos == -1) {
1102 nbdkit_error ("lseek: SEEK_HOLE: %" PRIu64 ": %m", offset);
1103 return -1;
1106 /* We know there is data from offset to pos-1. */
1107 if (pos > offset) {
1108 if (nbdkit_add_extent (extents, offset, pos - offset,
1109 0 /* allocated data */) == -1)
1110 return -1;
1111 if (req_one)
1112 break;
1115 offset = pos;
1116 } while (offset < end);
1118 return 0;
1121 static int
1122 file_extents (void *handle, uint32_t count, uint64_t offset,
1123 uint32_t flags, struct nbdkit_extents *extents)
1125 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock);
1126 return do_extents (handle, count, offset, flags, extents);
1128 #endif /* SEEK_HOLE */
#if HAVE_POSIX_FADVISE
/* Caching. */

/* Ask the kernel to read the range into the page cache using
 * POSIX_FADV_WILLNEED.  posix_fadvise returns the error number rather
 * than setting errno, so it is copied to errno before being reported.
 * Returns 0 on success or -1 on error.
 */
static int
file_cache (void *handle, uint32_t count, uint64_t offset, uint32_t flags)
{
  struct handle *h = handle;
  const int err = posix_fadvise (h->fd, offset, count, POSIX_FADV_WILLNEED);

  if (err == 0)
    return 0;

  errno = err;
  nbdkit_error ("posix_fadvise: %m");
  return -1;
}
#endif /* HAVE_POSIX_FADVISE */
1149 static struct nbdkit_plugin plugin = {
1150 .name = "file",
1151 .longname = "nbdkit file plugin",
1152 .version = PACKAGE_VERSION,
1153 .unload = file_unload,
1154 .config = file_config,
1155 .config_complete = file_config_complete,
1156 .config_help = file_config_help,
1157 .magic_config_key = "file",
1158 .dump_plugin = file_dump_plugin,
1159 .list_exports = file_list_exports,
1160 .open = file_open,
1161 .close = file_close,
1162 .get_size = file_get_size,
1163 .can_write = file_can_write,
1164 .can_multi_conn = file_can_multi_conn,
1165 .can_trim = file_can_trim,
1166 .can_fua = file_can_fua,
1167 .can_cache = file_can_cache,
1168 .pread = file_pread,
1169 .pwrite = file_pwrite,
1170 .flush = file_flush,
1171 .trim = file_trim,
1172 .zero = file_zero,
1173 #ifdef SEEK_HOLE
1174 .can_extents = file_can_extents,
1175 .extents = file_extents,
1176 #endif
1177 #if HAVE_POSIX_FADVISE
1178 .cache = file_cache,
1179 #endif
1180 .errno_is_preserved = 1,
1183 NBDKIT_REGISTER_PLUGIN(plugin)