Update Red Hat Copyright Notices
[nbdkit.git] / filters / cow / blk.c
blobd41f62d242823ccc2c55bf2a882c3c0a470ea7a9
1 /* nbdkit
2 * Copyright Red Hat
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
/* Notes on the design and implementation of this filter:
 *
 * The filter works by creating a large, sparse temporary file, the
 * same size as the underlying device.  Being sparse, initially this
 * takes up no space.
 *
 * We confine all pread/pwrite operations to the filesystem block
 * size.  The blk_* functions below only work on whole filesystem
 * block boundaries.  A smaller-than-block-size pwrite will turn into
 * a read-modify-write of a whole block.  We also assume that the
 * plugin returns the same immutable data for each pread call we make,
 * and optimize on this basis.
 *
 * A 2-bit per block bitmap is maintained in memory recording if each
 * block in the temporary file is:
 *
 *   00 = not allocated in the overlay (read through to the plugin)
 *   01 = allocated in the overlay
 *   10 = <unused>
 *   11 = trimmed in the overlay
 *
 * When reading a block we first check the bitmap to see if that file
 * block is allocated, trimmed or not.  If allocated, we return it
 * from the temporary file.  Trimmed returns zeroes.  If not allocated
 * we issue a pread to the underlying plugin.
 *
 * When writing a block we unconditionally write the data to the
 * temporary file, setting the bit in the bitmap.  (Writing zeroes is
 * handled the same way.)
 *
 * When trimming we set the trimmed flag in the bitmap for whole
 * blocks, and handle the unaligned portions like writing zeroes
 * above.  We could punch holes in the overlay as an optimization, but
 * for simplicity we do not do that yet.
 *
 * Since the overlay is a deleted temporary file, we can ignore FUA
 * and flush commands.
 */
72 #include <config.h>
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <stdint.h>
77 #include <stdbool.h>
78 #include <string.h>
79 #include <inttypes.h>
80 #include <unistd.h>
81 #include <fcntl.h>
82 #include <limits.h>
83 #include <errno.h>
84 #include <sys/types.h>
86 #ifdef HAVE_ALLOCA_H
87 #include <alloca.h>
88 #endif
90 #include <pthread.h>
92 #include <nbdkit-filter.h>
94 #include "bitmap.h"
95 #include "cleanup.h"
96 #include "fdatasync.h"
97 #include "rounding.h"
98 #include "pread.h"
99 #include "pwrite.h"
100 #include "utils.h"
102 #include "cow.h"
103 #include "blk.h"
105 /* The temporary overlay. */
106 static int fd = -1;
108 /* This lock protects the bitmap from parallel access. */
109 static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
111 /* Bitmap. */
112 static struct bitmap bm;
/* Per-block state as stored in the bitmap.  The numeric values are
 * the bit patterns kept in the bitmap; value 2 is unused.
 */
enum bm_entry {
  BLOCK_NOT_ALLOCATED = 0,
  BLOCK_ALLOCATED = 1,
  BLOCK_TRIMMED = 3,
};

/* Return a constant, human-readable name for a block state, for use
 * in debug messages.  Any value which is not a known state is a
 * programming error and aborts the process.
 */
static const char *
state_to_string (enum bm_entry state)
{
  switch (state) {
  case BLOCK_NOT_ALLOCATED: return "not allocated";
  case BLOCK_ALLOCATED: return "allocated";
  case BLOCK_TRIMMED: return "trimmed";
  default: abort ();
  }
}
131 /* Extra debugging (-D cow.verbose=1). */
132 NBDKIT_DLL_PUBLIC int cow_debug_verbose = 0;
135 blk_init (void)
137 const char *tmpdir;
138 size_t len;
139 char *template;
141 bitmap_init (&bm, blksize, 2 /* bits per block */);
143 tmpdir = getenv ("TMPDIR");
144 if (!tmpdir)
145 tmpdir = LARGE_TMPDIR;
147 nbdkit_debug ("cow: temporary directory for overlay: %s", tmpdir);
149 len = strlen (tmpdir) + 8;
150 template = alloca (len);
151 snprintf (template, len, "%s/XXXXXX", tmpdir);
153 #ifdef HAVE_MKOSTEMP
154 fd = mkostemp (template, O_CLOEXEC);
155 #else
156 /* Not atomic, but this is only invoked during .load, so the race
157 * won't affect any plugin actions trying to fork
159 fd = mkstemp (template);
160 if (fd >= 0) {
161 fd = set_cloexec (fd);
162 if (fd < 0) {
163 int e = errno;
164 unlink (template);
165 errno = e;
168 #endif
169 if (fd == -1) {
170 nbdkit_error ("mkostemp: %s: %m", tmpdir);
171 return -1;
174 unlink (template);
175 return 0;
178 void
179 blk_free (void)
181 if (fd >= 0)
182 close (fd);
184 bitmap_free (&bm);
/* Because blk_set_size is called before the other blk_* functions
 * this should be set to the true size before we need it.
 */
static uint64_t size = 0;
192 /* Allocate or resize the overlay file and bitmap. */
194 blk_set_size (uint64_t new_size)
196 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
198 size = new_size;
200 if (bitmap_resize (&bm, size) == -1)
201 return -1;
203 if (ftruncate (fd, ROUND_UP (size, blksize)) == -1) {
204 nbdkit_error ("ftruncate: %m");
205 return -1;
208 return 0;
211 /* This is a bit of a hack since usually this information is hidden in
212 * the blk module. However it is needed when calculating extents.
214 void
215 blk_status (uint64_t blknum, bool *present, bool *trimmed)
217 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
218 enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
220 *present = state != BLOCK_NOT_ALLOCATED;
221 *trimmed = state == BLOCK_TRIMMED;
224 /* These are the block operations. They always read or write whole
225 * blocks of size ‘blksize’.
228 blk_read_multiple (nbdkit_next *next,
229 uint64_t blknum, uint64_t nrblocks,
230 uint8_t *block, bool cow_on_read, int *err)
232 off_t offset = blknum * blksize;
233 enum bm_entry state;
234 uint64_t b, runblocks;
236 /* Find out how many of the following blocks form a "run" with the
237 * same state. We can process that many blocks in one go.
239 * About the locking: The state might be modified from another
240 * thread - for example another thread might write
241 * (BLOCK_NOT_ALLOCATED -> BLOCK_ALLOCATED) while we are reading
242 * from the plugin, returning the old data. However a read issued
243 * after the write returns should always return the correct data.
246 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
247 state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
249 for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
250 enum bm_entry s = bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_ALLOCATED);
251 if (state != s)
252 break;
256 if (cow_debug_verbose)
257 nbdkit_debug ("cow: blk_read_multiple block %" PRIu64
258 " (offset %" PRIu64 ") run of length %" PRIu64
259 " is %s",
260 blknum, (uint64_t) offset, runblocks,
261 state_to_string (state));
263 if (state == BLOCK_NOT_ALLOCATED) { /* Read underlying plugin. */
264 unsigned n, tail = 0;
266 assert (blksize * runblocks <= UINT_MAX);
267 n = blksize * runblocks;
269 if (offset + n > size) {
270 tail = offset + n - size;
271 n -= tail;
274 if (next->pread (next, block, n, offset, 0, err) == -1)
275 return -1;
277 /* Normally we're reading whole blocks, but at the very end of the
278 * file we might read a partial block. Deal with that case by
279 * zeroing the tail.
281 memset (block + n, 0, tail);
283 /* If cow-on-read is true then copy the blocks to the cache and
284 * set them as allocated.
286 if (cow_on_read) {
287 if (cow_debug_verbose)
288 nbdkit_debug ("cow: cow-on-read saving %" PRIu64 " blocks "
289 "at offset %" PRIu64 " into the cache",
290 runblocks, offset);
292 if (full_pwrite (fd, block, blksize * runblocks, offset) == -1) {
293 *err = errno;
294 nbdkit_error ("pwrite: %m");
295 return -1;
297 for (b = 0; b < runblocks; ++b)
298 bitmap_set_blk (&bm, blknum+b, BLOCK_ALLOCATED);
301 else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
302 if (full_pread (fd, block, blksize * runblocks, offset) == -1) {
303 *err = errno;
304 nbdkit_error ("pread: %m");
305 return -1;
308 else /* state == BLOCK_TRIMMED */ {
309 memset (block, 0, blksize * runblocks);
312 /* If all done, return. */
313 if (runblocks == nrblocks)
314 return 0;
316 /* Recurse to read remaining blocks. */
317 return blk_read_multiple (next,
318 blknum + runblocks,
319 nrblocks - runblocks,
320 block + blksize * runblocks,
321 cow_on_read, err);
325 blk_read (nbdkit_next *next,
326 uint64_t blknum, uint8_t *block, bool cow_on_read, int *err)
328 return blk_read_multiple (next, blknum, 1, block, cow_on_read, err);
332 blk_cache (nbdkit_next *next,
333 uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
335 /* XXX Could make this lock more fine-grained with some thought. */
336 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
337 off_t offset = blknum * blksize;
338 enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
339 unsigned n = blksize, tail = 0;
341 if (offset + n > size) {
342 tail = offset + n - size;
343 n -= tail;
346 if (cow_debug_verbose)
347 nbdkit_debug ("cow: blk_cache block %" PRIu64 " (offset %" PRIu64 ") is %s",
348 blknum, (uint64_t) offset, state_to_string (state));
350 if (state == BLOCK_ALLOCATED) {
351 #if HAVE_POSIX_FADVISE
352 int r = posix_fadvise (fd, offset, blksize, POSIX_FADV_WILLNEED);
353 if (r) {
354 errno = r;
355 nbdkit_error ("posix_fadvise: %m");
356 return -1;
358 #endif
359 return 0;
361 if (state == BLOCK_TRIMMED)
362 return 0;
363 if (mode == BLK_CACHE_IGNORE)
364 return 0;
365 if (mode == BLK_CACHE_PASSTHROUGH)
366 return next->cache (next, n, offset, 0, err);
368 if (next->pread (next, block, n, offset, 0, err) == -1)
369 return -1;
370 /* Normally we're reading whole blocks, but at the very end of the
371 * file we might read a partial block. Deal with that case by
372 * zeroing the tail.
374 memset (block + n, 0, tail);
376 if (mode == BLK_CACHE_COW) {
377 if (full_pwrite (fd, block, blksize, offset) == -1) {
378 *err = errno;
379 nbdkit_error ("pwrite: %m");
380 return -1;
382 bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);
384 return 0;
388 blk_write (uint64_t blknum, const uint8_t *block, int *err)
390 off_t offset = blknum * blksize;
392 if (cow_debug_verbose)
393 nbdkit_debug ("cow: blk_write block %" PRIu64 " (offset %" PRIu64 ")",
394 blknum, (uint64_t) offset);
396 if (full_pwrite (fd, block, blksize, offset) == -1) {
397 *err = errno;
398 nbdkit_error ("pwrite: %m");
399 return -1;
402 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
403 bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);
405 return 0;
409 blk_trim (uint64_t blknum, int *err)
411 off_t offset = blknum * blksize;
413 if (cow_debug_verbose)
414 nbdkit_debug ("cow: blk_trim block %" PRIu64 " (offset %" PRIu64 ")",
415 blknum, (uint64_t) offset);
417 /* XXX As an optimization we could punch a whole in the overlay
418 * here. However it's not trivial since blksize is unrelated to the
419 * overlay filesystem block size.
421 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
422 bitmap_set_blk (&bm, blknum, BLOCK_TRIMMED);
423 return 0;