Update Red Hat Copyright Notices
[nbdkit.git] / filters / cache / blk.c
blobba2c41b46eecbed35b50574e122c4086a1ccfdaf
1 /* nbdkit
2 * Copyright Red Hat
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/* These are the block operations.  They always read or write a single
 * whole block of size ‘blksize’.
 */
37 #include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <limits.h>
#include <errno.h>
#include <assert.h>
50 #ifdef HAVE_SYS_STATVFS_H
51 #include <sys/statvfs.h>
52 #endif
54 #include <nbdkit-filter.h>
56 #include "bitmap.h"
57 #include "minmax.h"
58 #include "rounding.h"
59 #include "utils.h"
61 #include "cache.h"
62 #include "blk.h"
63 #include "lru.h"
64 #include "reclaim.h"
/* The cache.  File descriptor of the temporary cache file created by
 * blk_init, or -1 before blk_init runs (and if creation failed).
 */
static int fd = -1;
/* Bitmap.  There are two bits per block which are updated as we read,
 * write back or write through blocks.
 *
 * 00 = not in cache
 * 01 = block cached and clean
 * 10 = <unused>
 * 11 = block cached and dirty
 *
 * Future enhancement:
 *
 * We need to cache information about holes, ie. blocks which read as
 * zeroes but are not explicitly stored in the cache.  This
 * information could be set when clients call cache_zero (and defer
 * calling plugin->zero until flush).  The information could also
 * interact with extents, so when plugin->extents returns information
 * that a hole exists we can record this information in the cache and
 * not have to query the plugin a second time (especially useful for
 * VDDK where querying extents is slow, and for qemu which [in 2019]
 * repeatedly requests the same information with REQ_ONE set).
 */
static struct bitmap bm;
/* Per-block bitmap entry values (see bitmap comment above). */
enum bm_entry {
  BLOCK_NOT_CACHED = 0,         /* assumed to be zero by reclaim code */
  BLOCK_CLEAN = 1,
  BLOCK_DIRTY = 3,
};

/* Convert a bitmap entry to a human-readable string for debug
 * messages.  Aborts on an impossible state (10 is unused).
 */
static const char *
state_to_string (enum bm_entry state)
{
  switch (state) {
  case BLOCK_NOT_CACHED: return "not cached";
  case BLOCK_CLEAN: return "clean";
  case BLOCK_DIRTY: return "dirty";
  default: abort ();
  }
}
/* Extra debugging (-D cache.verbose=1).  When non-zero the blk_*
 * functions below emit a debug line for each block operation.
 */
NBDKIT_DLL_PUBLIC int cache_debug_verbose = 0;
112 blk_init (void)
114 const char *tmpdir;
115 size_t len;
116 char *template;
117 struct statvfs statvfs;
119 tmpdir = getenv ("TMPDIR");
120 if (!tmpdir)
121 tmpdir = LARGE_TMPDIR;
123 nbdkit_debug ("cache: temporary directory for cache: %s", tmpdir);
125 len = strlen (tmpdir) + 8;
126 template = alloca (len);
127 snprintf (template, len, "%s/XXXXXX", tmpdir);
129 #ifdef HAVE_MKOSTEMP
130 fd = mkostemp (template, O_CLOEXEC);
131 #else
132 /* Not atomic, but this is only invoked during .load, so the race
133 * won't affect any plugin actions trying to fork
135 fd = mkstemp (template);
136 if (fd >= 0) {
137 fd = set_cloexec (fd);
138 if (fd < 0) {
139 int e = errno;
140 unlink (template);
141 errno = e;
144 #endif
145 if (fd == -1) {
146 nbdkit_error ("mkostemp: %s: %m", tmpdir);
147 return -1;
150 unlink (template);
152 /* Choose the block size.
154 * A 4K block size means that we need 64 MB of memory to store the
155 * bitmaps for a 1 TB underlying image. However to support
156 * hole-punching (for reclaiming) we need the block size to be at
157 * least as large as the filesystem block size.
159 if (fstatvfs (fd, &statvfs) == -1) {
160 nbdkit_error ("fstatvfs: %s: %m", tmpdir);
161 return -1;
163 blksize = MAX (min_block_size, statvfs.f_bsize);
164 nbdkit_debug ("cache: block size: %u", blksize);
166 bitmap_init (&bm, blksize, 2 /* bits per block */);
168 lru_init ();
170 return 0;
173 void
174 blk_free (void)
176 if (fd >= 0)
177 close (fd);
179 bitmap_free (&bm);
181 lru_free ();
/* Size of the virtual disk in bytes.  Because blk_set_size is called
 * before the other blk_* functions this should be set to the true
 * size before we need it.
 */
static uint64_t size = 0;
190 blk_set_size (uint64_t new_size)
192 size = new_size;
194 if (bitmap_resize (&bm, size) == -1)
195 return -1;
197 if (ftruncate (fd, ROUND_UP (size, blksize)) == -1) {
198 nbdkit_error ("ftruncate: %m");
199 return -1;
202 if (lru_set_size (size) == -1)
203 return -1;
205 return 0;
208 static int
209 _blk_read_multiple (nbdkit_next *next,
210 uint64_t blknum, uint64_t nrblocks,
211 uint8_t *block, int *err)
213 off_t offset = blknum * blksize;
214 bool not_cached =
215 bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
216 uint64_t b, runblocks;
218 assert (nrblocks > 0);
220 if (cache_debug_verbose)
221 nbdkit_debug ("cache: blk_read_multiple block %" PRIu64
222 " (offset %" PRIu64 ") is %s",
223 blknum, (uint64_t) offset,
224 not_cached ? "not cached" : "cached");
226 /* Find out how many of the following blocks form a "run" with the
227 * same cached/not-cached state. We can process that many blocks in
228 * one go.
230 for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
231 bool s =
232 bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
233 if (not_cached != s)
234 break;
237 if (not_cached) { /* Read underlying plugin. */
238 unsigned n, tail = 0;
240 assert (blksize * runblocks <= UINT_MAX);
241 n = blksize * runblocks;
243 if (offset + n > size) {
244 tail = offset + n - size;
245 n -= tail;
248 if (next->pread (next, block, n, offset, 0, err) == -1)
249 return -1;
251 /* Normally we're reading whole blocks, but at the very end of the
252 * file we might read a partial block. Deal with that case by
253 * zeroing the tail.
255 memset (block + n, 0, tail);
257 /* If cache-on-read, copy the blocks to the cache. */
258 if (cache_on_read ()) {
259 if (cache_debug_verbose)
260 nbdkit_debug ("cache: cache-on-read block %" PRIu64
261 " (offset %" PRIu64 ")",
262 blknum, (uint64_t) offset);
264 if (full_pwrite (fd, block, blksize * runblocks, offset) == -1) {
265 *err = errno;
266 nbdkit_error ("pwrite: %m");
267 return -1;
269 for (b = 0; b < runblocks; ++b) {
270 bitmap_set_blk (&bm, blknum + b, BLOCK_CLEAN);
271 lru_set_recently_accessed (blknum + b);
275 else { /* Read cache. */
276 if (full_pread (fd, block, blksize * runblocks, offset) == -1) {
277 *err = errno;
278 nbdkit_error ("pread: %m");
279 return -1;
281 for (b = 0; b < runblocks; ++b)
282 lru_set_recently_accessed (blknum + b);
285 /* If all done, return. */
286 if (runblocks == nrblocks)
287 return 0;
289 /* Recurse to read remaining blocks. */
290 return _blk_read_multiple (next,
291 blknum + runblocks,
292 nrblocks - runblocks,
293 block + blksize * runblocks,
294 err);
298 blk_read_multiple (nbdkit_next *next,
299 uint64_t blknum, uint64_t nrblocks,
300 uint8_t *block, int *err)
302 reclaim (fd, &bm);
303 return _blk_read_multiple (next, blknum, nrblocks, block, err);
307 blk_read (nbdkit_next *next,
308 uint64_t blknum, uint8_t *block, int *err)
310 return blk_read_multiple (next, blknum, 1, block, err);
314 blk_cache (nbdkit_next *next,
315 uint64_t blknum, uint8_t *block, int *err)
317 off_t offset = blknum * blksize;
318 enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
320 reclaim (fd, &bm);
322 if (cache_debug_verbose)
323 nbdkit_debug ("cache: blk_cache block %" PRIu64
324 " (offset %" PRIu64 ") is %s",
325 blknum, (uint64_t) offset,
326 state_to_string (state));
328 if (state == BLOCK_NOT_CACHED) {
329 /* Read underlying plugin, copy to cache regardless of cache-on-read. */
330 unsigned n = blksize, tail = 0;
332 if (offset + n > size) {
333 tail = offset + n - size;
334 n -= tail;
337 if (next->pread (next, block, n, offset, 0, err) == -1)
338 return -1;
340 /* Normally we're reading whole blocks, but at the very end of the
341 * file we might read a partial block. Deal with that case by
342 * zeroing the tail.
344 memset (block + n, 0, tail);
346 if (cache_debug_verbose)
347 nbdkit_debug ("cache: cache block %" PRIu64 " (offset %" PRIu64 ")",
348 blknum, (uint64_t) offset);
350 if (full_pwrite (fd, block, blksize, offset) == -1) {
351 *err = errno;
352 nbdkit_error ("pwrite: %m");
353 return -1;
355 bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
356 lru_set_recently_accessed (blknum);
358 else {
359 #if HAVE_POSIX_FADVISE
360 int r = posix_fadvise (fd, offset, blksize, POSIX_FADV_WILLNEED);
361 if (r) {
362 errno = r;
363 nbdkit_error ("posix_fadvise: %m");
364 return -1;
366 #endif
367 lru_set_recently_accessed (blknum);
369 return 0;
373 blk_writethrough (nbdkit_next *next,
374 uint64_t blknum, const uint8_t *block, uint32_t flags,
375 int *err)
377 off_t offset = blknum * blksize;
378 unsigned n = blksize, tail = 0;
380 if (offset + n > size) {
381 tail = offset + n - size;
382 n -= tail;
385 reclaim (fd, &bm);
387 if (cache_debug_verbose)
388 nbdkit_debug ("cache: writethrough block %" PRIu64 " (offset %" PRIu64 ")",
389 blknum, (uint64_t) offset);
391 if (full_pwrite (fd, block, blksize, offset) == -1) {
392 *err = errno;
393 nbdkit_error ("pwrite: %m");
394 return -1;
397 if (next->pwrite (next, block, n, offset, flags, err) == -1)
398 return -1;
400 bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
401 lru_set_recently_accessed (blknum);
403 return 0;
407 blk_write (nbdkit_next *next,
408 uint64_t blknum, const uint8_t *block, uint32_t flags,
409 int *err)
411 off_t offset;
413 if (cache_mode == CACHE_MODE_WRITETHROUGH ||
414 (cache_mode == CACHE_MODE_WRITEBACK && (flags & NBDKIT_FLAG_FUA)))
415 return blk_writethrough (next, blknum, block, flags, err);
417 offset = blknum * blksize;
419 reclaim (fd, &bm);
421 if (cache_debug_verbose)
422 nbdkit_debug ("cache: writeback block %" PRIu64 " (offset %" PRIu64 ")",
423 blknum, (uint64_t) offset);
425 if (full_pwrite (fd, block, blksize, offset) == -1) {
426 *err = errno;
427 nbdkit_error ("pwrite: %m");
428 return -1;
430 bitmap_set_blk (&bm, blknum, BLOCK_DIRTY);
431 lru_set_recently_accessed (blknum);
433 return 0;
437 for_each_dirty_block (block_callback f, void *vp)
439 uint64_t blknum;
440 enum bm_entry state;
442 bitmap_for (&bm, blknum) {
443 state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
444 if (state == BLOCK_DIRTY) {
445 if (f (blknum, vp) == -1)
446 return -1;
450 return 0;