4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 /* Notes on the design and implementation of this filter:
35 * The filter works by creating a large, sparse temporary file, the
36 * same size as the underlying device. Being sparse, initially this
39 * We confine all pread/pwrite operations to the filesystem block
40 * size. The blk_* functions below only work on whole filesystem
41 * block boundaries. A smaller-than-block-size pwrite will turn into
42 * a read-modify-write of a whole block. We also assume that the
43 * plugin returns the same immutable data for each pread call we make,
44 * and optimize on this basis.
46 * A 2-bit per block bitmap is maintained in memory recording if each
47 * block in the temporary file is:
49 * 00 = not allocated in the overlay (read through to the plugin)
50 * 01 = allocated in the overlay
52 * 11 = trimmed in the overlay
54 * When reading a block we first check the bitmap to see if that file
55 * block is allocated, trimmed or not. If allocated, we return it
56 * from the temporary file. Trimmed returns zeroes. If not allocated
57 * we issue a pread to the underlying plugin.
59 * When writing a block we unconditionally write the data to the
60 * temporary file, marking the block as allocated in the bitmap.
61 * (Writing zeroes is handled the same way.)
63 * When trimming we set the trimmed flag in the bitmap for whole
64 * blocks, and handle the unaligned portions like writing zeroes
65 * above. We could punch holes in the overlay as an optimization, but
66 * for simplicity we do not do that yet.
68 * Since the overlay is a deleted temporary file, we can ignore FUA
84 #include <sys/types.h>
92 #include <nbdkit-filter.h>
96 #include "fdatasync.h"
105 /* The temporary overlay. */
108 /* This lock protects the bitmap from parallel access. */
109 static pthread_mutex_t lock
= PTHREAD_MUTEX_INITIALIZER
;
112 static struct bitmap bm
;
115 BLOCK_NOT_ALLOCATED
= 0,
121 state_to_string (enum bm_entry state
)
124 case BLOCK_NOT_ALLOCATED
: return "not allocated";
125 case BLOCK_ALLOCATED
: return "allocated";
126 case BLOCK_TRIMMED
: return "trimmed";
131 /* Extra debugging (-D cow.verbose=1). */
132 NBDKIT_DLL_PUBLIC
int cow_debug_verbose
= 0;
141 bitmap_init (&bm
, blksize
, 2 /* bits per block */);
143 tmpdir
= getenv ("TMPDIR");
145 tmpdir
= LARGE_TMPDIR
;
147 nbdkit_debug ("cow: temporary directory for overlay: %s", tmpdir
);
149 len
= strlen (tmpdir
) + 8;
150 template = alloca (len
);
151 snprintf (template, len
, "%s/XXXXXX", tmpdir
);
154 fd
= mkostemp (template, O_CLOEXEC
);
156 /* Not atomic, but this is only invoked during .load, so the race
157 * won't affect any plugin actions trying to fork
159 fd
= mkstemp (template);
161 fd
= set_cloexec (fd
);
170 nbdkit_error ("mkostemp: %s: %m", tmpdir
);
187 /* Because blk_set_size is called before the other blk_* functions
188 * this should be set to the true size before we need it.
190 static uint64_t size
= 0;
192 /* Allocate or resize the overlay file and bitmap. */
194 blk_set_size (uint64_t new_size
)
196 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
200 if (bitmap_resize (&bm
, size
) == -1)
203 if (ftruncate (fd
, ROUND_UP (size
, blksize
)) == -1) {
204 nbdkit_error ("ftruncate: %m");
211 /* This is a bit of a hack since usually this information is hidden in
212 * the blk module. However it is needed when calculating extents.
215 blk_status (uint64_t blknum
, bool *present
, bool *trimmed
)
217 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
218 enum bm_entry state
= bitmap_get_blk (&bm
, blknum
, BLOCK_NOT_ALLOCATED
);
220 *present
= state
!= BLOCK_NOT_ALLOCATED
;
221 *trimmed
= state
== BLOCK_TRIMMED
;
224 /* These are the block operations. They always read or write whole
225 * blocks of size ‘blksize’.
228 blk_read_multiple (nbdkit_next
*next
,
229 uint64_t blknum
, uint64_t nrblocks
,
230 uint8_t *block
, bool cow_on_read
, int *err
)
232 off_t offset
= blknum
* blksize
;
234 uint64_t b
, runblocks
;
236 /* Find out how many of the following blocks form a "run" with the
237 * same state. We can process that many blocks in one go.
239 * About the locking: The state might be modified from another
240 * thread - for example another thread might write
241 * (BLOCK_NOT_ALLOCATED -> BLOCK_ALLOCATED) while we are reading
242 * from the plugin, returning the old data. However a read issued
243 * after the write returns should always return the correct data.
246 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
247 state
= bitmap_get_blk (&bm
, blknum
, BLOCK_NOT_ALLOCATED
);
249 for (b
= 1, runblocks
= 1; b
< nrblocks
; ++b
, ++runblocks
) {
250 enum bm_entry s
= bitmap_get_blk (&bm
, blknum
+ b
, BLOCK_NOT_ALLOCATED
);
256 if (cow_debug_verbose
)
257 nbdkit_debug ("cow: blk_read_multiple block %" PRIu64
258 " (offset %" PRIu64
") run of length %" PRIu64
260 blknum
, (uint64_t) offset
, runblocks
,
261 state_to_string (state
));
263 if (state
== BLOCK_NOT_ALLOCATED
) { /* Read underlying plugin. */
264 unsigned n
, tail
= 0;
266 assert (blksize
* runblocks
<= UINT_MAX
);
267 n
= blksize
* runblocks
;
269 if (offset
+ n
> size
) {
270 tail
= offset
+ n
- size
;
274 if (next
->pread (next
, block
, n
, offset
, 0, err
) == -1)
277 /* Normally we're reading whole blocks, but at the very end of the
278 * file we might read a partial block. Deal with that case by zeroing the tail.
281 memset (block
+ n
, 0, tail
);
283 /* If cow-on-read is true then copy the blocks to the overlay and
284 * set them as allocated.
287 if (cow_debug_verbose
)
288 nbdkit_debug ("cow: cow-on-read saving %" PRIu64
" blocks "
289 "at offset %" PRIu64
" into the cache",
292 if (full_pwrite (fd
, block
, blksize
* runblocks
, offset
) == -1) {
294 nbdkit_error ("pwrite: %m");
297 for (b
= 0; b
< runblocks
; ++b
)
298 bitmap_set_blk (&bm
, blknum
+b
, BLOCK_ALLOCATED
);
301 else if (state
== BLOCK_ALLOCATED
) { /* Read overlay. */
302 if (full_pread (fd
, block
, blksize
* runblocks
, offset
) == -1) {
304 nbdkit_error ("pread: %m");
308 else /* state == BLOCK_TRIMMED */ {
309 memset (block
, 0, blksize
* runblocks
);
312 /* If all done, return. */
313 if (runblocks
== nrblocks
)
316 /* Recurse to read remaining blocks. */
317 return blk_read_multiple (next
,
319 nrblocks
- runblocks
,
320 block
+ blksize
* runblocks
,
325 blk_read (nbdkit_next
*next
,
326 uint64_t blknum
, uint8_t *block
, bool cow_on_read
, int *err
)
328 return blk_read_multiple (next
, blknum
, 1, block
, cow_on_read
, err
);
332 blk_cache (nbdkit_next
*next
,
333 uint64_t blknum
, uint8_t *block
, enum cache_mode mode
, int *err
)
335 /* XXX Could make this lock more fine-grained with some thought. */
336 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
337 off_t offset
= blknum
* blksize
;
338 enum bm_entry state
= bitmap_get_blk (&bm
, blknum
, BLOCK_NOT_ALLOCATED
);
339 unsigned n
= blksize
, tail
= 0;
341 if (offset
+ n
> size
) {
342 tail
= offset
+ n
- size
;
346 if (cow_debug_verbose
)
347 nbdkit_debug ("cow: blk_cache block %" PRIu64
" (offset %" PRIu64
") is %s",
348 blknum
, (uint64_t) offset
, state_to_string (state
));
350 if (state
== BLOCK_ALLOCATED
) {
351 #if HAVE_POSIX_FADVISE
352 int r
= posix_fadvise (fd
, offset
, blksize
, POSIX_FADV_WILLNEED
);
355 nbdkit_error ("posix_fadvise: %m");
361 if (state
== BLOCK_TRIMMED
)
363 if (mode
== BLK_CACHE_IGNORE
)
365 if (mode
== BLK_CACHE_PASSTHROUGH
)
366 return next
->cache (next
, n
, offset
, 0, err
);
368 if (next
->pread (next
, block
, n
, offset
, 0, err
) == -1)
370 /* Normally we're reading whole blocks, but at the very end of the
371 * file we might read a partial block. Deal with that case by zeroing the tail.
374 memset (block
+ n
, 0, tail
);
376 if (mode
== BLK_CACHE_COW
) {
377 if (full_pwrite (fd
, block
, blksize
, offset
) == -1) {
379 nbdkit_error ("pwrite: %m");
382 bitmap_set_blk (&bm
, blknum
, BLOCK_ALLOCATED
);
388 blk_write (uint64_t blknum
, const uint8_t *block
, int *err
)
390 off_t offset
= blknum
* blksize
;
392 if (cow_debug_verbose
)
393 nbdkit_debug ("cow: blk_write block %" PRIu64
" (offset %" PRIu64
")",
394 blknum
, (uint64_t) offset
);
396 if (full_pwrite (fd
, block
, blksize
, offset
) == -1) {
398 nbdkit_error ("pwrite: %m");
402 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
403 bitmap_set_blk (&bm
, blknum
, BLOCK_ALLOCATED
);
409 blk_trim (uint64_t blknum
, int *err
)
411 off_t offset
= blknum
* blksize
;
413 if (cow_debug_verbose
)
414 nbdkit_debug ("cow: blk_trim block %" PRIu64
" (offset %" PRIu64
")",
415 blknum
, (uint64_t) offset
);
417 /* XXX As an optimization we could punch a hole in the overlay
418 * here. However it's not trivial since blksize is unrelated to the
419 * overlay filesystem block size.
421 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock
);
422 bitmap_set_blk (&bm
, blknum
, BLOCK_TRIMMED
);