cow, cache: Better mkostemp fallback
[nbdkit/ericb.git] / filters / cache / blk.c
blob272d176efc56004a990b60552c3acfc0a3aa158e
1 /* nbdkit
2 * Copyright (C) 2018-2019 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
33 /* These are the block operations. They always read or write a single
34 * whole block of size ‘blksize’.
37 #include <config.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <stdint.h>
42 #include <stdbool.h>
43 #include <inttypes.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include <fcntl.h>
47 #include <errno.h>
48 #include <sys/statvfs.h>
50 #include <nbdkit-filter.h>
52 #include "bitmap.h"
53 #include "minmax.h"
55 #include "cache.h"
56 #include "blk.h"
57 #include "lru.h"
58 #include "reclaim.h"
/* File descriptor of the temporary file that stores the cached blocks.
 * It holds -1 until blk_init has created the file.
 */
static int fd = -1;
63 /* Bitmap. There are two bits per block which are updated as we read,
64 * write back or write through blocks.
66 * 00 = not in cache
67 * 01 = block cached and clean
68 * 10 = <unused>
69 * 11 = block cached and dirty
71 * Future enhancement:
73 * We need to cache information about holes, ie. blocks which read as
74 * zeroes but are not explicitly stored in the cache. This
75 * information could be set when clients call cache_zero (and defer
76 * calling plugin->zero until flush). The information could also
77 * interact with extents, so when plugin->extents returns information
78 * that a hole exists we can record this information in the cache and
79 * not have to query the plugin a second time (especially useful for
80 * VDDK where querying extents is slow, and for qemu which [in 2019]
81 * repeatedly requests the same information with REQ_ONE set).
83 static struct bitmap bm;
/* Values held in the two-bit bitmap entry of a block. */
enum bm_entry {
  /* Binary 00.  The reclaim code assumes this value is zero. */
  BLOCK_NOT_CACHED = 0,
  /* Binary 01. */
  BLOCK_CLEAN = 1,
  /* Binary 11 (binary 10 is unused). */
  BLOCK_DIRTY = 3,
};
91 int
92 blk_init (void)
94 const char *tmpdir;
95 size_t len;
96 char *template;
97 struct statvfs statvfs;
99 tmpdir = getenv ("TMPDIR");
100 if (!tmpdir)
101 tmpdir = LARGE_TMPDIR;
103 nbdkit_debug ("cache: temporary directory for cache: %s", tmpdir);
105 len = strlen (tmpdir) + 8;
106 template = alloca (len);
107 snprintf (template, len, "%s/XXXXXX", tmpdir);
109 #ifdef HAVE_MKOSTEMP
110 fd = mkostemp (template, O_CLOEXEC);
111 #else
112 /* Not atomic, but this is only invoked during .load, so the race
113 * won't affect any plugin actions trying to fork
115 fd = mkstemp (template);
116 if (fd >= 0) {
117 fd = set_cloexec (fd);
118 if (fd < 0) {
119 int e = errno;
120 unlink (template);
121 errno = e;
124 #endif
125 if (fd == -1) {
126 nbdkit_error ("mkostemp: %s: %m", tmpdir);
127 return -1;
130 unlink (template);
132 /* Choose the block size.
134 * A 4K block size means that we need 64 MB of memory to store the
135 * bitmaps for a 1 TB underlying image. However to support
136 * hole-punching (for reclaiming) we need the block size to be at
137 * least as large as the filesystem block size.
139 if (fstatvfs (fd, &statvfs) == -1) {
140 nbdkit_error ("fstatvfs: %s: %m", tmpdir);
141 return -1;
143 blksize = MAX (4096, statvfs.f_bsize);
144 nbdkit_debug ("cache: block size: %u", blksize);
146 bitmap_init (&bm, blksize, 2 /* bits per block */);
148 lru_init ();
150 return 0;
153 void
154 blk_free (void)
156 if (fd >= 0)
157 close (fd);
159 bitmap_free (&bm);
161 lru_free ();
165 blk_set_size (uint64_t new_size)
167 if (bitmap_resize (&bm, new_size) == -1)
168 return -1;
170 if (ftruncate (fd, new_size) == -1) {
171 nbdkit_error ("ftruncate: %m");
172 return -1;
175 if (lru_set_size (new_size) == -1)
176 return -1;
178 return 0;
182 blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
183 uint64_t blknum, uint8_t *block, int *err)
185 off_t offset = blknum * blksize;
186 enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
188 reclaim (fd, &bm);
190 nbdkit_debug ("cache: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
191 blknum, (uint64_t) offset,
192 state == BLOCK_NOT_CACHED ? "not cached" :
193 state == BLOCK_CLEAN ? "clean" :
194 state == BLOCK_DIRTY ? "dirty" :
195 "unknown");
197 if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
198 if (next_ops->pread (nxdata, block, blksize, offset, 0, err) == -1)
199 return -1;
201 /* If cache-on-read, copy the block to the cache. */
202 if (cache_on_read) {
203 off_t offset = blknum * blksize;
205 nbdkit_debug ("cache: cache-on-read block %" PRIu64
206 " (offset %" PRIu64 ")",
207 blknum, (uint64_t) offset);
209 if (pwrite (fd, block, blksize, offset) == -1) {
210 *err = errno;
211 nbdkit_error ("pwrite: %m");
212 return -1;
214 bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
215 lru_set_recently_accessed (blknum);
217 return 0;
219 else { /* Read cache. */
220 if (pread (fd, block, blksize, offset) == -1) {
221 *err = errno;
222 nbdkit_error ("pread: %m");
223 return -1;
225 lru_set_recently_accessed (blknum);
226 return 0;
231 blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
232 uint64_t blknum, uint8_t *block, int *err)
234 off_t offset = blknum * blksize;
235 enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
237 reclaim (fd, &bm);
239 nbdkit_debug ("cache: blk_cache block %" PRIu64 " (offset %" PRIu64 ") is %s",
240 blknum, (uint64_t) offset,
241 state == BLOCK_NOT_CACHED ? "not cached" :
242 state == BLOCK_CLEAN ? "clean" :
243 state == BLOCK_DIRTY ? "dirty" :
244 "unknown");
246 if (state == BLOCK_NOT_CACHED) {
247 off_t offset = blknum * blksize;
249 /* Read underlying plugin, copy to cache regardless of cache-on-read. */
250 if (next_ops->pread (nxdata, block, blksize, offset, 0, err) == -1)
251 return -1;
253 nbdkit_debug ("cache: cache block %" PRIu64 " (offset %" PRIu64 ")",
254 blknum, (uint64_t) offset);
256 if (pwrite (fd, block, blksize, offset) == -1) {
257 *err = errno;
258 nbdkit_error ("pwrite: %m");
259 return -1;
261 bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
262 lru_set_recently_accessed (blknum);
264 else {
265 #if HAVE_POSIX_FADVISE
266 int r = posix_fadvise (fd, offset, blksize, POSIX_FADV_WILLNEED);
267 if (r) {
268 errno = r;
269 nbdkit_error ("posix_fadvise: %m");
270 return -1;
272 #endif
273 lru_set_recently_accessed (blknum);
275 return 0;
279 blk_writethrough (struct nbdkit_next_ops *next_ops, void *nxdata,
280 uint64_t blknum, const uint8_t *block, uint32_t flags,
281 int *err)
283 off_t offset = blknum * blksize;
285 reclaim (fd, &bm);
287 nbdkit_debug ("cache: writethrough block %" PRIu64 " (offset %" PRIu64 ")",
288 blknum, (uint64_t) offset);
290 if (pwrite (fd, block, blksize, offset) == -1) {
291 *err = errno;
292 nbdkit_error ("pwrite: %m");
293 return -1;
296 if (next_ops->pwrite (nxdata, block, blksize, offset, flags, err) == -1)
297 return -1;
299 bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
300 lru_set_recently_accessed (blknum);
302 return 0;
306 blk_write (struct nbdkit_next_ops *next_ops, void *nxdata,
307 uint64_t blknum, const uint8_t *block, uint32_t flags,
308 int *err)
310 off_t offset;
312 if (cache_mode == CACHE_MODE_WRITETHROUGH ||
313 (cache_mode == CACHE_MODE_WRITEBACK && (flags & NBDKIT_FLAG_FUA)))
314 return blk_writethrough (next_ops, nxdata, blknum, block, flags, err);
316 offset = blknum * blksize;
318 reclaim (fd, &bm);
320 nbdkit_debug ("cache: writeback block %" PRIu64 " (offset %" PRIu64 ")",
321 blknum, (uint64_t) offset);
323 if (pwrite (fd, block, blksize, offset) == -1) {
324 *err = errno;
325 nbdkit_error ("pwrite: %m");
326 return -1;
328 bitmap_set_blk (&bm, blknum, BLOCK_DIRTY);
329 lru_set_recently_accessed (blknum);
331 return 0;
335 for_each_dirty_block (block_callback f, void *vp)
337 uint64_t blknum;
338 enum bm_entry state;
340 bitmap_for (&bm, blknum) {
341 state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
342 if (state == BLOCK_DIRTY) {
343 if (f (blknum, vp) == -1)
344 return -1;
348 return 0;