2 * Copyright (C) 2018-2019 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 /* These are the block operations. They always read or write a single
34 * whole block of size ‘blksize’.
48 #include <sys/statvfs.h>
50 #include <nbdkit-filter.h>
63 /* Bitmap. There are two bits per block which are updated as we read,
64 * write back or write through blocks.
 * 00 = block not cached
 * 01 = block cached and clean
 * 11 = block cached and dirty
73 * We need to cache information about holes, ie. blocks which read as
74 * zeroes but are not explicitly stored in the cache. This
75 * information could be set when clients call cache_zero (and defer
76 * calling plugin->zero until flush). The information could also
77 * interact with extents, so when plugin->extents returns information
78 * that a hole exists we can record this information in the cache and
79 * not have to query the plugin a second time (especially useful for
80 * VDDK where querying extents is slow, and for qemu which [in 2019]
81 * repeatedly requests the same information with REQ_ONE set).
/* Bitmap recording the cache state of each block (2 bits per block;
 * see the encoding comment above).  Protected by the caller's locking
 * discipline — NOTE(review): the lock itself is not visible in this
 * extraction; confirm against the full source.
 */
static struct bitmap bm;
/* Per-block cache state stored in the bitmap.  The numeric values
 * follow the 2-bit encodings described in the bitmap comment above
 * (01 = cached and clean, 11 = cached and dirty).
 *
 * NOTE(review): the enum header and the CLEAN/DIRTY enumerators are
 * missing from this extraction; they are restored here from the
 * documented bit encodings — confirm against the full source.
 */
enum bm_entry {
  BLOCK_NOT_CACHED = 0, /* assumed to be zero by reclaim code */
  BLOCK_CLEAN = 1,      /* 01 = block cached and clean */
  BLOCK_DIRTY = 3,      /* 11 = block cached and dirty */
};
/* Initialize the block layer: create the temporary file backing the
 * cache and choose the cache block size.
 * Returns 0 on success, -1 on error (error already reported via
 * nbdkit_error).
 *
 * NOTE(review): this extraction is missing the function header, the
 * local declarations and several interior lines (preprocessor guards,
 * returns, closing braces).  They are restored minimally below from
 * the visible error-handling pattern — confirm every restored line
 * against the full source.
 */
int
blk_init (void)
{
  const char *tmpdir;
  size_t len;
  char *template;
  struct statvfs statvfs;

  /* Place the cache file under $TMPDIR if set, else the configured
   * large-file directory. */
  tmpdir = getenv ("TMPDIR");
  if (!tmpdir)
    tmpdir = LARGE_TMPDIR;

  nbdkit_debug ("cache: temporary directory for cache: %s", tmpdir);

  len = strlen (tmpdir) + 8;    /* room for "/XXXXXX" + NUL */
  template = alloca (len);
  snprintf (template, len, "%s/XXXXXX", tmpdir);

#ifdef HAVE_MKOSTEMP
  fd = mkostemp (template, O_CLOEXEC);
#else
  /* Not atomic, but this is only invoked during .load, so the race
   * won't affect any plugin actions trying to fork.
   */
  fd = mkstemp (template);
  if (fd >= 0)
    fd = set_cloexec (fd);
#endif
  if (fd == -1) {
    nbdkit_error ("mkostemp: %s: %m", tmpdir);
    return -1;
  }

  /* TODO(review): upstream likely unlinks the temporary file here so
   * it is reclaimed on exit; that line is not visible in this
   * extraction — confirm. */

  /* Choose the block size.
   *
   * A 4K block size means that we need 64 MB of memory to store the
   * bitmaps for a 1 TB underlying image.  However to support
   * hole-punching (for reclaiming) we need the block size to be at
   * least as large as the filesystem block size.
   */
  if (fstatvfs (fd, &statvfs) == -1) {
    nbdkit_error ("fstatvfs: %s: %m", tmpdir);
    return -1;
  }
  blksize = MAX (4096, statvfs.f_bsize);
  nbdkit_debug ("cache: block size: %u", blksize);

  bitmap_init (&bm, blksize, 2 /* bits per block */);

  return 0;
}
/* Resize the cache to cover new_size bytes: grow/shrink the state
 * bitmap, truncate the backing file, and inform the LRU layer.
 * Returns 0 on success, -1 on error (error already reported).
 *
 * NOTE(review): the return statements and closing brace are missing
 * from this extraction and are restored from the visible
 * error-handling pattern — confirm against the full source.
 */
int
blk_set_size (uint64_t new_size)
{
  if (bitmap_resize (&bm, new_size) == -1)
    return -1;

  if (ftruncate (fd, new_size) == -1) {
    nbdkit_error ("ftruncate: %m");
    return -1;
  }

  if (lru_set_size (new_size) == -1)
    return -1;

  return 0;
}
/* Read a single whole block (blksize bytes) number blknum into
 * 'block'.  An uncached block is read from the underlying plugin
 * (and, when cache-on-read is enabled, copied into the cache and
 * marked clean); a cached block is read from the cache file.
 * Returns 0 on success, -1 on error (*err set, error reported).
 *
 * NOTE(review): the cache-on-read condition, the *err assignments and
 * the return/brace lines are missing from this extraction and are
 * restored minimally — confirm against the full source.
 */
int
blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
          uint64_t blknum, uint8_t *block, int *err)
{
  off_t offset = blknum * blksize;
  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);

  nbdkit_debug ("cache: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
                blknum, (uint64_t) offset,
                state == BLOCK_NOT_CACHED ? "not cached" :
                state == BLOCK_CLEAN ? "clean" :
                state == BLOCK_DIRTY ? "dirty" :
                "unknown");

  if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
    if (next_ops->pread (nxdata, block, blksize, offset, 0, err) == -1)
      return -1;

    /* If cache-on-read, copy the block to the cache. */
    if (cache_on_read) {
      off_t offset = blknum * blksize;

      nbdkit_debug ("cache: cache-on-read block %" PRIu64
                    " (offset %" PRIu64 ")",
                    blknum, (uint64_t) offset);

      if (pwrite (fd, block, blksize, offset) == -1) {
        *err = errno;
        nbdkit_error ("pwrite: %m");
        return -1;
      }
      bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
      lru_set_recently_accessed (blknum);
    }
    return 0;
  }
  else { /* Read cache. */
    if (pread (fd, block, blksize, offset) == -1) {
      *err = errno;
      nbdkit_error ("pread: %m");
      return -1;
    }
    lru_set_recently_accessed (blknum);
    return 0;
  }
}
/* Ensure block blknum is present in the cache (NBD_CMD_CACHE-style
 * prefetch).  An uncached block is read from the underlying plugin
 * into 'block' and written to the cache (regardless of the
 * cache-on-read setting); an already-cached block is merely advised
 * into the page cache and touched in the LRU.
 * Returns 0 on success, -1 on error (*err set, error reported).
 *
 * NOTE(review): the *err assignments, the posix_fadvise error test
 * and the return/brace lines are missing from this extraction and
 * are restored minimally — confirm against the full source.
 * posix_fadvise returns an error NUMBER (it does not set errno).
 */
int
blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
           uint64_t blknum, uint8_t *block, int *err)
{
  off_t offset = blknum * blksize;
  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);

  nbdkit_debug ("cache: blk_cache block %" PRIu64 " (offset %" PRIu64 ") is %s",
                blknum, (uint64_t) offset,
                state == BLOCK_NOT_CACHED ? "not cached" :
                state == BLOCK_CLEAN ? "clean" :
                state == BLOCK_DIRTY ? "dirty" :
                "unknown");

  if (state == BLOCK_NOT_CACHED) {
    off_t offset = blknum * blksize;

    /* Read underlying plugin, copy to cache regardless of cache-on-read. */
    if (next_ops->pread (nxdata, block, blksize, offset, 0, err) == -1)
      return -1;

    nbdkit_debug ("cache: cache block %" PRIu64 " (offset %" PRIu64 ")",
                  blknum, (uint64_t) offset);

    if (pwrite (fd, block, blksize, offset) == -1) {
      *err = errno;
      nbdkit_error ("pwrite: %m");
      return -1;
    }
    bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
    lru_set_recently_accessed (blknum);
  }
  else {
#if HAVE_POSIX_FADVISE
    int r = posix_fadvise (fd, offset, blksize, POSIX_FADV_WILLNEED);
    if (r != 0) {
      errno = r;        /* fadvise returns the error, not via errno */
      nbdkit_error ("posix_fadvise: %m");
      return -1;
    }
#endif
    lru_set_recently_accessed (blknum);
  }

  return 0;
}
/* Write a single whole block to both the cache file and the
 * underlying plugin (write-through).  The block is marked clean only
 * after BOTH writes succeed, so a failed plugin write leaves the
 * dirty/uncached state intact.
 * Returns 0 on success, -1 on error (*err set, error reported).
 *
 * NOTE(review): the final parameter (int *err), the *err assignment
 * and the return/brace lines are missing from this extraction; 'err'
 * is clearly used in the visible next_ops->pwrite call, so they are
 * restored minimally — confirm against the full source.
 */
int
blk_writethrough (struct nbdkit_next_ops *next_ops, void *nxdata,
                  uint64_t blknum, const uint8_t *block, uint32_t flags,
                  int *err)
{
  off_t offset = blknum * blksize;

  nbdkit_debug ("cache: writethrough block %" PRIu64 " (offset %" PRIu64 ")",
                blknum, (uint64_t) offset);

  if (pwrite (fd, block, blksize, offset) == -1) {
    *err = errno;
    nbdkit_error ("pwrite: %m");
    return -1;
  }

  if (next_ops->pwrite (nxdata, block, blksize, offset, flags, err) == -1)
    return -1;

  bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
  lru_set_recently_accessed (blknum);
  return 0;
}
/* Write a single whole block.  In writethrough mode — or in writeback
 * mode when the client requested FUA — this delegates to
 * blk_writethrough so the data reaches the plugin before returning.
 * Otherwise the block is written only to the cache file and marked
 * dirty, to be flushed later.
 * Returns 0 on success, -1 on error (*err set, error reported).
 *
 * NOTE(review): the final parameter (int *err), the *err assignment
 * and the return/brace lines are missing from this extraction; 'err'
 * is clearly forwarded in the visible blk_writethrough call, so they
 * are restored minimally — confirm against the full source.
 */
int
blk_write (struct nbdkit_next_ops *next_ops, void *nxdata,
           uint64_t blknum, const uint8_t *block, uint32_t flags,
           int *err)
{
  off_t offset;

  /* FUA in writeback mode must still reach stable storage, so it is
   * treated exactly like writethrough. */
  if (cache_mode == CACHE_MODE_WRITETHROUGH ||
      (cache_mode == CACHE_MODE_WRITEBACK && (flags & NBDKIT_FLAG_FUA)))
    return blk_writethrough (next_ops, nxdata, blknum, block, flags, err);

  offset = blknum * blksize;

  nbdkit_debug ("cache: writeback block %" PRIu64 " (offset %" PRIu64 ")",
                blknum, (uint64_t) offset);

  if (pwrite (fd, block, blksize, offset) == -1) {
    *err = errno;
    nbdkit_error ("pwrite: %m");
    return -1;
  }
  bitmap_set_blk (&bm, blknum, BLOCK_DIRTY);
  lru_set_recently_accessed (blknum);

  return 0;
}
/* Walk the bitmap and invoke f (blknum, vp) for every block currently
 * marked dirty; used by the flush path to write dirty blocks back to
 * the plugin.  Stops early and returns -1 if the callback fails.
 *
 * NOTE(review): the local declarations and the tail of this function
 * (closing braces, final return) are cut off in this extraction and
 * are restored minimally — confirm against the full source.
 */
int
for_each_dirty_block (block_callback f, void *vp)
{
  uint64_t blknum;
  enum bm_entry state;

  bitmap_for (&bm, blknum) {
    state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
    if (state == BLOCK_DIRTY) {
      if (f (blknum, vp) == -1)
        return -1;
    }
  }

  return 0;
}