cow, cache: Better mkostemp fallback
[nbdkit/ericb.git] / filters / cow / blk.c
blob2cae1122e3c03ce461f4518a54cea01eb418634a
1 /* nbdkit
2 * Copyright (C) 2018-2019 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
33 /* Notes on the design and implementation of this filter:
35 * The filter works by creating a large, sparse temporary file, the
36 * same size as the underlying device. Being sparse, initially this
37 * takes up no space.
39 * We confine all pread/pwrite operations to the filesystem block
40 * size. The blk_* functions below only work on whole filesystem
41 * block boundaries. A smaller-than-block-size pwrite will turn into
42 * a read-modify-write of a whole block. We also assume that the
43 * plugin returns the same immutable data for each pread call we make,
44 * and optimize on this basis.
46 * A block bitmap is maintained in memory recording if each block in
47 * the temporary file is "allocated" (1) or "hole" (0).
49 * When reading a block we first check the bitmap to see if that file
50 * block is allocated or a hole. If allocated, we return it from the
51 * temporary file. If a hole, we issue a pread to the underlying
52 * plugin.
54 * When writing a block we unconditionally write the data to the
55 * temporary file, setting the bit in the bitmap.
57 * We allow the client to request FUA, and emulate it with a flush
58 * (arguably, since the write overlay is temporary, we could ignore
59 * FUA altogether).
62 #include <config.h>
64 #include <stdio.h>
65 #include <stdlib.h>
66 #include <stdint.h>
67 #include <stdbool.h>
68 #include <string.h>
69 #include <inttypes.h>
70 #include <unistd.h>
71 #include <fcntl.h>
72 #include <errno.h>
73 #include <sys/types.h>
74 #include <sys/ioctl.h>
76 #ifdef HAVE_ALLOCA_H
77 #include <alloca.h>
78 #endif
80 #include <nbdkit-filter.h>
82 #include "bitmap.h"
83 #include "blk.h"
85 #ifndef HAVE_FDATASYNC
86 #define fdatasync fsync
87 #endif
89 /* The temporary overlay: descriptor of the overlay file. Initially -1; assigned by blk_init. */
90 static int fd = -1;
92 /* Bitmap, initialised by blk_init with one bit per block. Bit = 1 => allocated, 0 => hole. */
93 static struct bitmap bm;
95 int
96 blk_init (void)
98 const char *tmpdir;
99 size_t len;
100 char *template;
102 bitmap_init (&bm, BLKSIZE, 1 /* bits per block */);
104 tmpdir = getenv ("TMPDIR");
105 if (!tmpdir)
106 tmpdir = LARGE_TMPDIR;
108 nbdkit_debug ("cow: temporary directory for overlay: %s", tmpdir);
110 len = strlen (tmpdir) + 8;
111 template = alloca (len);
112 snprintf (template, len, "%s/XXXXXX", tmpdir);
114 #ifdef HAVE_MKOSTEMP
115 fd = mkostemp (template, O_CLOEXEC);
116 #else
117 /* Not atomic, but this is only invoked during .load, so the race
118 * won't affect any plugin actions trying to fork
120 fd = mkstemp (template);
121 if (fd >= 0) {
122 fd = set_cloexec (fd);
123 if (fd < 0) {
124 int e = errno;
125 unlink (template);
126 errno = e;
129 #endif
130 if (fd == -1) {
131 nbdkit_error ("mkostemp: %s: %m", tmpdir);
132 return -1;
135 unlink (template);
136 return 0;
139 void
140 blk_free (void)
142 if (fd >= 0)
143 close (fd);
145 bitmap_free (&bm);
148 /* Allocate or resize the overlay file and bitmap. */
150 blk_set_size (uint64_t new_size)
152 if (bitmap_resize (&bm, new_size) == -1)
153 return -1;
155 if (ftruncate (fd, new_size) == -1) {
156 nbdkit_error ("ftruncate: %m");
157 return -1;
160 return 0;
163 /* Return true if the block is allocated. Consults the bitmap. */
164 static bool
165 blk_is_allocated (uint64_t blknum)
167 return bitmap_get_blk (&bm, blknum, false);
170 /* Mark a block as allocated. */
171 static void
172 blk_set_allocated (uint64_t blknum)
174 bitmap_set_blk (&bm, blknum, true);
177 /* These are the block operations. They always read or write a single
178 * whole block of size ‘blksize’.
181 blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
182 uint64_t blknum, uint8_t *block, int *err)
184 off_t offset = blknum * BLKSIZE;
185 bool allocated = blk_is_allocated (blknum);
187 nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
188 blknum, (uint64_t) offset,
189 !allocated ? "a hole" : "allocated");
191 if (!allocated) /* Read underlying plugin. */
192 return next_ops->pread (nxdata, block, BLKSIZE, offset, 0, err);
193 else { /* Read overlay. */
194 if (pread (fd, block, BLKSIZE, offset) == -1) {
195 *err = errno;
196 nbdkit_error ("pread: %m");
197 return -1;
199 return 0;
204 blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
205 uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
207 off_t offset = blknum * BLKSIZE;
208 bool allocated = blk_is_allocated (blknum);
210 nbdkit_debug ("cow: blk_cache block %" PRIu64 " (offset %" PRIu64 ") is %s",
211 blknum, (uint64_t) offset,
212 !allocated ? "a hole" : "allocated");
214 if (allocated) {
215 #if HAVE_POSIX_FADVISE
216 int r = posix_fadvise (fd, offset, BLKSIZE, POSIX_FADV_WILLNEED);
217 if (r) {
218 errno = r;
219 nbdkit_error ("posix_fadvise: %m");
220 return -1;
222 #endif
223 return 0;
225 if (mode == BLK_CACHE_IGNORE)
226 return 0;
227 if (mode == BLK_CACHE_PASSTHROUGH)
228 return next_ops->cache (nxdata, BLKSIZE, offset, 0, err);
229 if (next_ops->pread (nxdata, block, BLKSIZE, offset, 0, err) == -1)
230 return -1;
231 if (mode == BLK_CACHE_COW) {
232 if (pwrite (fd, block, BLKSIZE, offset) == -1) {
233 *err = errno;
234 nbdkit_error ("pwrite: %m");
235 return -1;
237 blk_set_allocated (blknum);
239 return 0;
243 blk_write (uint64_t blknum, const uint8_t *block, int *err)
245 off_t offset = blknum * BLKSIZE;
247 nbdkit_debug ("cow: blk_write block %" PRIu64 " (offset %" PRIu64 ")",
248 blknum, (uint64_t) offset);
250 if (pwrite (fd, block, BLKSIZE, offset) == -1) {
251 *err = errno;
252 nbdkit_error ("pwrite: %m");
253 return -1;
255 blk_set_allocated (blknum);
257 return 0;
261 blk_flush (void)
263 /* I think we don't care about file metadata for this temporary
264 * file, so only flush the data.
266 if (fdatasync (fd) == -1) {
267 nbdkit_error ("fdatasync: %m");
268 return -1;
271 return 0;