 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * * Neither the name of Red Hat nor the names of its contributors may be
 * used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/* These are the block operations.  They always read or write a single
 * whole block of size ‘blksize’.
 */
#ifdef HAVE_SYS_STATVFS_H
#include <sys/statvfs.h>
#endif

#include <nbdkit-filter.h>
/* Bitmap.  There are two bits per block which are updated as we read,
 * write back or write through blocks.
 *
 * 00 = block not cached
 * 01 = block cached and clean
 * 11 = block cached and dirty
 *
 * We need to cache information about holes, i.e. blocks which read as
 * zeroes but are not explicitly stored in the cache.  This
 * information could be set when clients call cache_zero (and defer
 * calling plugin->zero until flush).  The information could also
 * interact with extents, so when plugin->extents returns information
 * that a hole exists we can record this information in the cache and
 * not have to query the plugin a second time (especially useful for
 * VDDK where querying extents is slow, and for qemu which [in 2019]
 * repeatedly requests the same information with REQ_ONE set).
 */
static struct bitmap bm;
  BLOCK_NOT_CACHED = 0,         /* assumed to be zero by reclaim code */
state_to_string (enum bm_entry state)
{
  switch (state) {
  case BLOCK_NOT_CACHED: return "not cached";
  case BLOCK_CLEAN: return "clean";
  case BLOCK_DIRTY: return "dirty";
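
/* A minimal sketch (not part of the original filter) showing how the
 * two-bit entries described above can be queried.  It only uses the bm
 * bitmap, bitmap_get_blk and the bm_entry values from this file, and it
 * assumes <stdbool.h> is included; the helper name is hypothetical.
 */
static inline bool
blk_is_dirty_example (uint64_t blknum)
{
  /* Blocks outside the bitmap default to BLOCK_NOT_CACHED (00). */
  return bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_DIRTY;
}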
/* Extra debugging (-D cache.verbose=1). */
NBDKIT_DLL_PUBLIC int cache_debug_verbose = 0;
  struct statvfs statvfs;
  tmpdir = getenv ("TMPDIR");
  if (!tmpdir)
    tmpdir = LARGE_TMPDIR;

  nbdkit_debug ("cache: temporary directory for cache: %s", tmpdir);

  len = strlen (tmpdir) + 8;
  template = alloca (len);
  snprintf (template, len, "%s/XXXXXX", tmpdir);
  fd = mkostemp (template, O_CLOEXEC);
  /* Not atomic, but this is only invoked during .load, so the race
   * won't affect any plugin actions trying to fork.
   */
  fd = mkstemp (template);
  fd = set_cloexec (fd);
    nbdkit_error ("mkostemp: %s: %m", tmpdir);
  /* Choose the block size.
   *
   * A 4K block size means that we need 64 MB of memory to store the
   * bitmaps for a 1 TB underlying image.  However to support
   * hole-punching (for reclaiming) we need the block size to be at
   * least as large as the filesystem block size.
   */
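  /* Worked example of the arithmetic above (illustrative only):
   *   1 TB image / 4 KB blocks = 2^40 / 2^12 = 2^28 = 268,435,456 blocks
   *   2^28 blocks * 2 bits per block = 2^29 bits = 2^26 bytes = 64 MB
   */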
  if (fstatvfs (fd, &statvfs) == -1) {
    nbdkit_error ("fstatvfs: %s: %m", tmpdir);
  blksize = MAX (min_block_size, statvfs.f_bsize);
  nbdkit_debug ("cache: block size: %u", blksize);

  bitmap_init (&bm, blksize, 2 /* bits per block */);
/* Because blk_set_size is called before the other blk_* functions
 * this should be set to the true size before we need it.
 */
static uint64_t size = 0;
blk_set_size (uint64_t new_size)
{
  size = new_size;

  if (bitmap_resize (&bm, size) == -1)
    return -1;

  if (ftruncate (fd, ROUND_UP (size, blksize)) == -1) {
    nbdkit_error ("ftruncate: %m");
    return -1;
  }

  if (lru_set_size (size) == -1)
    return -1;

  return 0;
}
_blk_read_multiple (nbdkit_next *next,
                    uint64_t blknum, uint64_t nrblocks,
                    uint8_t *block, int *err)
  off_t offset = blknum * blksize;
  bool not_cached =
    bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
  uint64_t b, runblocks;

  assert (nrblocks > 0);
  if (cache_debug_verbose)
    nbdkit_debug ("cache: blk_read_multiple block %" PRIu64
                  " (offset %" PRIu64 ") is %s",
                  blknum, (uint64_t) offset,
                  not_cached ? "not cached" : "cached");
  /* Find out how many of the following blocks form a "run" with the
   * same cached/not-cached state.  We can process that many blocks in
   * one operation.
   */
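  /* Illustrative example (not from the original source): suppose four
   * blocks are requested, the first three are cached and the fourth is
   * not.  The loop below stops with runblocks == 3, the three cached
   * blocks are read from the cache file in a single pread, and the
   * recursive call at the end of this function handles the remaining
   * uncached block.
   */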
  for (b = 1, runblocks = 1; b < nrblocks; ++b, ++runblocks) {
    bool c =
      bitmap_get_blk (&bm, blknum + b, BLOCK_NOT_CACHED) == BLOCK_NOT_CACHED;
    if (c != not_cached)
      break;
  }
  if (not_cached) {             /* Read underlying plugin. */
    unsigned n, tail = 0;

    assert (blksize * runblocks <= UINT_MAX);
    n = blksize * runblocks;
    if (offset + n > size) {
      tail = offset + n - size;
      n -= tail;
    }
    if (next->pread (next, block, n, offset, 0, err) == -1)
      return -1;

    /* Normally we're reading whole blocks, but at the very end of the
     * file we might read a partial block.  Deal with that case by
     * zeroing the tail of the buffer.
     */
    memset (block + n, 0, tail);
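    /* Worked example (illustrative, assuming the n -= tail adjustment
     * above): with size = 10000, blksize = 4096, blknum = 2 and
     * runblocks = 1, offset = 8192 and n starts at 4096; offset + n =
     * 12288 > 10000, so tail = 2288 and n becomes 1808.  The plugin
     * supplies the first 1808 bytes and the memset above zeroes the
     * remaining 2288 bytes of the block buffer.
     */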
    /* If cache-on-read, copy the blocks to the cache. */
    if (cache_on_read ()) {
      if (cache_debug_verbose)
        nbdkit_debug ("cache: cache-on-read block %" PRIu64
                      " (offset %" PRIu64 ")",
                      blknum, (uint64_t) offset);
      if (full_pwrite (fd, block, blksize * runblocks, offset) == -1) {
        nbdkit_error ("pwrite: %m");
      for (b = 0; b < runblocks; ++b) {
        bitmap_set_blk (&bm, blknum + b, BLOCK_CLEAN);
        lru_set_recently_accessed (blknum + b);
  else {                        /* Read cache. */
    if (full_pread (fd, block, blksize * runblocks, offset) == -1) {
      nbdkit_error ("pread: %m");
    for (b = 0; b < runblocks; ++b)
      lru_set_recently_accessed (blknum + b);
  /* If all done, return. */
  if (runblocks == nrblocks)
    return 0;

  /* Recurse to read remaining blocks. */
  return _blk_read_multiple (next,
                             blknum + runblocks,
                             nrblocks - runblocks,
                             block + blksize * runblocks,
                             err);
blk_read_multiple (nbdkit_next *next,
                   uint64_t blknum, uint64_t nrblocks,
                   uint8_t *block, int *err)
  return _blk_read_multiple (next, blknum, nrblocks, block, err);
blk_read (nbdkit_next *next,
          uint64_t blknum, uint8_t *block, int *err)
  return blk_read_multiple (next, blknum, 1, block, err);
blk_cache (nbdkit_next *next,
           uint64_t blknum, uint8_t *block, int *err)
  off_t offset = blknum * blksize;
  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
  if (cache_debug_verbose)
    nbdkit_debug ("cache: blk_cache block %" PRIu64
                  " (offset %" PRIu64 ") is %s",
                  blknum, (uint64_t) offset,
                  state_to_string (state));
  if (state == BLOCK_NOT_CACHED) {
    /* Read underlying plugin, copy to cache regardless of cache-on-read. */
    unsigned n = blksize, tail = 0;

    if (offset + n > size) {
      tail = offset + n - size;
      n -= tail;
    }
    if (next->pread (next, block, n, offset, 0, err) == -1)
      return -1;

    /* Normally we're reading whole blocks, but at the very end of the
     * file we might read a partial block.  Deal with that case by
     * zeroing the tail of the buffer.
     */
    memset (block + n, 0, tail);
    if (cache_debug_verbose)
      nbdkit_debug ("cache: cache block %" PRIu64 " (offset %" PRIu64 ")",
                    blknum, (uint64_t) offset);
    if (full_pwrite (fd, block, blksize, offset) == -1) {
      nbdkit_error ("pwrite: %m");
    bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
    lru_set_recently_accessed (blknum);
#if HAVE_POSIX_FADVISE
    int r = posix_fadvise (fd, offset, blksize, POSIX_FADV_WILLNEED);
      nbdkit_error ("posix_fadvise: %m");
    lru_set_recently_accessed (blknum);
blk_writethrough (nbdkit_next *next,
                  uint64_t blknum, const uint8_t *block, uint32_t flags,
                  int *err)
  off_t offset = blknum * blksize;
  unsigned n = blksize, tail = 0;

  if (offset + n > size) {
    tail = offset + n - size;
    n -= tail;
  }
  if (cache_debug_verbose)
    nbdkit_debug ("cache: writethrough block %" PRIu64 " (offset %" PRIu64 ")",
                  blknum, (uint64_t) offset);
  if (full_pwrite (fd, block, blksize, offset) == -1) {
    nbdkit_error ("pwrite: %m");
  if (next->pwrite (next, block, n, offset, flags, err) == -1)
    return -1;
  bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
  lru_set_recently_accessed (blknum);
blk_write (nbdkit_next *next,
           uint64_t blknum, const uint8_t *block, uint32_t flags,
           int *err)
  if (cache_mode == CACHE_MODE_WRITETHROUGH ||
      (cache_mode == CACHE_MODE_WRITEBACK && (flags & NBDKIT_FLAG_FUA)))
    return blk_writethrough (next, blknum, block, flags, err);
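  /* Illustrative summary of the check above (a sketch, not from the
   * original source):
   *
   *   cache_mode     FUA flag   behaviour
   *   writethrough   any        write cache and plugin via blk_writethrough
   *   writeback      set        write cache and plugin via blk_writethrough
   *   writeback      clear      write cache only; block marked dirty below
   *
   * Any other configured mode also falls through to the cache-only,
   * mark-dirty path below.
   */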
  offset = blknum * blksize;
  if (cache_debug_verbose)
    nbdkit_debug ("cache: writeback block %" PRIu64 " (offset %" PRIu64 ")",
                  blknum, (uint64_t) offset);
  if (full_pwrite (fd, block, blksize, offset) == -1) {
    nbdkit_error ("pwrite: %m");
  bitmap_set_blk (&bm, blknum, BLOCK_DIRTY);
  lru_set_recently_accessed (blknum);
for_each_dirty_block (block_callback f, void *vp)
  bitmap_for (&bm, blknum) {
    state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_CACHED);
    if (state == BLOCK_DIRTY) {
      if (f (blknum, vp) == -1)