2 * Copyright (C) 2018-2019 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 /* Notes on the design and implementation of this filter:
35 * The filter works by creating a large, sparse temporary file, the
36 * same size as the underlying device. Being sparse, initially this
39 * We confine all pread/pwrite operations to the filesystem block
40 * size. The blk_* functions below only work on whole filesystem
41 * block boundaries. A smaller-than-block-size pwrite will turn into
42 * a read-modify-write of a whole block. We also assume that the
43 * plugin returns the same immutable data for each pread call we make,
44 * and optimize on this basis.
46 * A block bitmap is maintained in memory recording whether each block
47 * in the temporary file is "allocated" (1) or a "hole" (0).
49 * When reading a block we first check the bitmap to see if that file
50 * block is allocated or a hole. If allocated, we return it from the
51 * temporary file. If a hole, we issue a pread to the underlying
54 * When writing a block we unconditionally write the data to the
55 * temporary file, setting the bit in the bitmap.
57 * We allow the client to request FUA, and emulate it with a flush
58 * (arguably, since the write overlay is temporary, we could ignore
73 #include <sys/types.h>
74 #include <sys/ioctl.h>
80 #include <nbdkit-filter.h>
85 #ifndef HAVE_FDATASYNC
86 #define fdatasync fsync
89 /* The temporary overlay. */
92 /* Bitmap. Bit = 1 => allocated, 0 => hole. */
93 static struct bitmap bm
;
102 bitmap_init (&bm
, BLKSIZE
, 1 /* bits per block */);
104 tmpdir
= getenv ("TMPDIR");
106 tmpdir
= LARGE_TMPDIR
;
108 nbdkit_debug ("cow: temporary directory for overlay: %s", tmpdir
);
110 len
= strlen (tmpdir
) + 8;
111 template = alloca (len
);
112 snprintf (template, len
, "%s/XXXXXX", tmpdir
);
115 fd
= mkostemp (template, O_CLOEXEC
);
117 /* Not atomic, but this is only invoked during .load, so the race
118 * won't affect any plugin actions trying to fork
120 fd
= mkstemp (template);
122 fd
= set_cloexec (fd
);
131 nbdkit_error ("mkostemp: %s: %m", tmpdir
);
148 /* Allocate or resize the overlay file and bitmap. */
150 blk_set_size (uint64_t new_size
)
152 if (bitmap_resize (&bm
, new_size
) == -1)
155 if (ftruncate (fd
, new_size
) == -1) {
156 nbdkit_error ("ftruncate: %m");
163 /* Return true if the block is allocated. Consults the bitmap. */
165 blk_is_allocated (uint64_t blknum
)
167 return bitmap_get_blk (&bm
, blknum
, false);
170 /* Mark a block as allocated. */
172 blk_set_allocated (uint64_t blknum
)
174 bitmap_set_blk (&bm
, blknum
, true);
177 /* These are the block operations. They always read or write a single
178 * whole block of size BLKSIZE.
181 blk_read (struct nbdkit_next_ops
*next_ops
, void *nxdata
,
182 uint64_t blknum
, uint8_t *block
, int *err
)
184 off_t offset
= blknum
* BLKSIZE
;
185 bool allocated
= blk_is_allocated (blknum
);
187 nbdkit_debug ("cow: blk_read block %" PRIu64
" (offset %" PRIu64
") is %s",
188 blknum
, (uint64_t) offset
,
189 !allocated
? "a hole" : "allocated");
191 if (!allocated
) /* Read underlying plugin. */
192 return next_ops
->pread (nxdata
, block
, BLKSIZE
, offset
, 0, err
);
193 else { /* Read overlay. */
194 if (pread (fd
, block
, BLKSIZE
, offset
) == -1) {
196 nbdkit_error ("pread: %m");
204 blk_cache (struct nbdkit_next_ops
*next_ops
, void *nxdata
,
205 uint64_t blknum
, uint8_t *block
, enum cache_mode mode
, int *err
)
207 off_t offset
= blknum
* BLKSIZE
;
208 bool allocated
= blk_is_allocated (blknum
);
210 nbdkit_debug ("cow: blk_cache block %" PRIu64
" (offset %" PRIu64
") is %s",
211 blknum
, (uint64_t) offset
,
212 !allocated
? "a hole" : "allocated");
215 #if HAVE_POSIX_FADVISE
216 int r
= posix_fadvise (fd
, offset
, BLKSIZE
, POSIX_FADV_WILLNEED
);
219 nbdkit_error ("posix_fadvise: %m");
225 if (mode
== BLK_CACHE_IGNORE
)
227 if (mode
== BLK_CACHE_PASSTHROUGH
)
228 return next_ops
->cache (nxdata
, BLKSIZE
, offset
, 0, err
);
229 if (next_ops
->pread (nxdata
, block
, BLKSIZE
, offset
, 0, err
) == -1)
231 if (mode
== BLK_CACHE_COW
) {
232 if (pwrite (fd
, block
, BLKSIZE
, offset
) == -1) {
234 nbdkit_error ("pwrite: %m");
237 blk_set_allocated (blknum
);
243 blk_write (uint64_t blknum
, const uint8_t *block
, int *err
)
245 off_t offset
= blknum
* BLKSIZE
;
247 nbdkit_debug ("cow: blk_write block %" PRIu64
" (offset %" PRIu64
")",
248 blknum
, (uint64_t) offset
);
250 if (pwrite (fd
, block
, BLKSIZE
, offset
) == -1) {
252 nbdkit_error ("pwrite: %m");
255 blk_set_allocated (blknum
);
263 /* File metadata does not matter for this temporary file, so
264 * flushing only the data is sufficient.
266 if (fdatasync (fd
) == -1) {
267 nbdkit_error ("fdatasync: %m");