/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * * Neither the name of Red Hat nor the names of its contributors may be
 * used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <config.h>

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <limits.h>
#include <errno.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/types.h>

#ifdef HAVE_SYS_IOCTL_H
#include <sys/ioctl.h>
#endif

#include <nbdkit-filter.h>

/* Local headers from the nbdkit source tree: cache.h and blk.h declare
 * the cache_mode/cor_mode enums and the blk_* functions used below;
 * cleanup.h supplies CLEANUP_FREE and ACQUIRE_LOCK_FOR_CURRENT_SCOPE;
 * minmax.h and rounding.h supply MIN, MAX and ROUND_UP.
 */
#include "cache.h"
#include "blk.h"
#include "cleanup.h"
#include "isaligned.h"
#include "ispowerof2.h"
#include "minmax.h"
#include "rounding.h"

/* In order to handle parallel requests safely, this lock must be held
 * when calling any blk_* functions.
 */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

unsigned blksize;                  /* actual block size (picked by blk.c) */
unsigned min_block_size = 65536;
enum cache_mode cache_mode = CACHE_MODE_WRITEBACK;
int64_t max_size = -1;
unsigned hi_thresh = 95, lo_thresh = 80; /* percentages of max_size */
enum cor_mode cor_mode = COR_OFF;
const char *cor_path;              /* used when cor_mode == COR_PATH */

static int cache_flush (nbdkit_next *next, void *handle, uint32_t flags,
                        int *err);
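
/* Illustrative invocation (disk.img and the sizes are examples, not
 * values taken from this file):
 *
 *   nbdkit --filter=cache file disk.img \
 *          cache=writeback cache-max-size=1G \
 *          cache-high-threshold=95 cache-low-threshold=80 \
 *          cache-on-read=true
 *
 * The keys are exactly those parsed by cache_config below.
 */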

static int
cache_config (nbdkit_next_config *next, nbdkit_backend *nxdata,
              const char *key, const char *value)
{
  if (strcmp (key, "cache") == 0) {
    if (strcmp (value, "writeback") == 0) {
      cache_mode = CACHE_MODE_WRITEBACK;
      return 0;
    }
    else if (strcmp (value, "writethrough") == 0) {
      cache_mode = CACHE_MODE_WRITETHROUGH;
      return 0;
    }
    else if (strcmp (value, "unsafe") == 0) {
      cache_mode = CACHE_MODE_UNSAFE;
      return 0;
    }
    else {
      nbdkit_error ("invalid cache parameter, should be "
                    "writeback|writethrough|unsafe");
      return -1;
    }
  }
  else if (strcmp (key, "cache-min-block-size") == 0) {
    int64_t r;

    r = nbdkit_parse_size (value);
    if (r == -1)
      return -1;
    if (r < 4096 || !is_power_of_2 (r) || r > UINT_MAX) {
      nbdkit_error ("cache-min-block-size is not a power of 2, "
                    "or is too small or too large");
      return -1;
    }
    min_block_size = r;
    return 0;
  }
#ifdef HAVE_CACHE_RECLAIM
  else if (strcmp (key, "cache-max-size") == 0) {
    int64_t r;

    r = nbdkit_parse_size (value);
    if (r == -1)
      return -1;
    /* We set a lower limit for the cache size just to keep out of
     * trouble.
     */
    if (r < 1024*1024) {
      nbdkit_error ("cache-max-size is too small");
      return -1;
    }
    max_size = r;
    return 0;
  }
  else if (strcmp (key, "cache-high-threshold") == 0) {
    if (nbdkit_parse_unsigned ("cache-high-threshold",
                               value, &hi_thresh) == -1)
      return -1;
    if (hi_thresh == 0) {
      nbdkit_error ("cache-high-threshold must be greater than zero");
      return -1;
    }
    return 0;
  }
  else if (strcmp (key, "cache-low-threshold") == 0) {
    if (nbdkit_parse_unsigned ("cache-low-threshold",
                               value, &lo_thresh) == -1)
      return -1;
    if (lo_thresh == 0) {
      nbdkit_error ("cache-low-threshold must be greater than zero");
      return -1;
    }
    return 0;
  }
#else /* !HAVE_CACHE_RECLAIM */
  else if (strcmp (key, "cache-max-size") == 0 ||
           strcmp (key, "cache-high-threshold") == 0 ||
           strcmp (key, "cache-low-threshold") == 0) {
    nbdkit_error ("this platform does not support cache reclaim");
    return -1;
  }
#endif /* !HAVE_CACHE_RECLAIM */
  else if (strcmp (key, "cache-on-read") == 0) {
    if (value[0] == '/') {
      cor_path = value;
      cor_mode = COR_PATH;
    }
    else {
      int r = nbdkit_parse_bool (value);
      if (r == -1)
        return -1;
      cor_mode = r ? COR_ON : COR_OFF;
    }
    return 0;
  }
  else
    return next (nxdata, key, value);
}

#define cache_config_help_common \
  "cache=MODE                Set cache MODE, one of writeback (default),\n" \
  "                          writethrough, or unsafe.\n" \
  "cache-on-read=BOOL|/PATH  Set to true to cache on reads (default false).\n"
#ifndef HAVE_CACHE_RECLAIM
#define cache_config_help cache_config_help_common
#else
#define cache_config_help cache_config_help_common \
  "cache-max-size=SIZE       Set maximum space used by cache.\n" \
  "cache-high-threshold=PCT  Percentage of max size where reclaim begins.\n" \
  "cache-low-threshold=PCT   Percentage of max size where reclaim ends.\n"
#endif
/* Decide if cache-on-read is currently on or off. */
bool
cache_on_read (void)
{
  switch (cor_mode) {
  case COR_ON:   return true;
  case COR_OFF:  return false;
  case COR_PATH: return access (cor_path, F_OK) == 0;
  default:       abort ();
  }
}

static int
cache_config_complete (nbdkit_next_config_complete *next,
                       nbdkit_backend *nxdata)
{
  /* If cache-max-size was set then check the thresholds. */
  if (max_size != -1) {
    if (lo_thresh >= hi_thresh) {
      nbdkit_error ("cache-low-threshold must be "
                    "less than cache-high-threshold");
      return -1;
    }
  }

  return next (nxdata);
}

static int
cache_get_ready (int thread_model)
{
  if (blk_init () == -1)
    return -1;

  return 0;
}

/* Get the file size, set the cache size. */
static int64_t
cache_get_size (nbdkit_next *next,
                void *handle)
{
  int64_t size;
  int r;

  size = next->get_size (next);
  if (size == -1)
    return -1;

  nbdkit_debug ("cache: underlying file size: %" PRIi64, size);

  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
  r = blk_set_size (size);
  if (r == -1)
    return -1;

  return size;
}

/* Block size constraints. */
static int
cache_block_size (nbdkit_next *next, void *handle,
                  uint32_t *minimum, uint32_t *preferred, uint32_t *maximum)
{
  if (next->block_size (next, minimum, preferred, maximum) == -1)
    return -1;

  if (*minimum == 0) {          /* No constraints set by the plugin. */
    *minimum = 1;
    *preferred = blksize;
    *maximum = 0xffffffff;
  }
  else {
    if (*maximum >= blksize)
      *preferred = MAX (*preferred, blksize);
  }

  return 0;
}

/* Force an early call to cache_get_size because we have to set the
 * backing file size and bitmap size before any other read or write
 * calls.
 */
static int
cache_prepare (nbdkit_next *next,
               void *handle, int readonly)
{
  int64_t r;

  r = cache_get_size (next, handle);
  if (r < 0)
    return -1;
  return 0;
}

/* Override the plugin's .can_cache, because we are caching here instead. */
static int
cache_can_cache (nbdkit_next *next, void *handle)
{
  return NBDKIT_CACHE_NATIVE;
}
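
/* Returning NBDKIT_CACHE_NATIVE above tells nbdkit that clients may
 * issue NBD_CMD_CACHE requests, which are routed to the .cache
 * callback (cache_cache below) to prefetch blocks into this filter's
 * cache rather than being passed through to the plugin.
 */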

/* Override the plugin's .can_fast_zero, because our .zero is not fast. */
static int
cache_can_fast_zero (nbdkit_next *next,
                     void *handle)
{
  /* It is better to advertise support even when we always reject fast
   * zero attempts.
   */
  return 1;
}

/* Override the plugin's .can_flush, if we are cache=unsafe. */
static int
cache_can_flush (nbdkit_next *next,
                 void *handle)
{
  if (cache_mode == CACHE_MODE_UNSAFE)
    return 1;
  return next->can_flush (next);
}

/* Override the plugin's .can_fua, if we are cache=unsafe. */
static int
cache_can_fua (nbdkit_next *next,
               void *handle)
{
  if (cache_mode == CACHE_MODE_UNSAFE)
    return NBDKIT_FUA_NATIVE;
  return next->can_fua (next);
}

/* Override the plugin's .can_multi_conn, if we are not cache=writethrough. */
static int
cache_can_multi_conn (nbdkit_next *next,
                      void *handle)
{
  /* For CACHE_MODE_UNSAFE, we always advertise a no-op flush because
   * our local cache access is consistent between connections, and we
   * don't care about persisting the data to the underlying plugin.
   *
   * For CACHE_MODE_WRITEBACK, things are more subtle: we only write
   * to the plugin during NBD_CMD_FLUSH, at which point that one
   * connection writes back ALL cached blocks regardless of which
   * connection originally wrote them, so a client can be assured that
   * blocks from all connections have reached the plugin's permanent
   * storage with only one connection having to send a flush.
   *
   * But for CACHE_MODE_WRITETHROUGH, we are at the mercy of the
   * plugin; data written by connection A is not guaranteed to be made
   * persistent by a flush from connection B unless the plugin itself
   * supports multi-conn.
   */
  if (cache_mode != CACHE_MODE_WRITETHROUGH)
    return 1;
  return next->can_multi_conn (next);
}
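
/* The data paths below (pread, pwrite, zero) all decompose a request
 * into an unaligned head, an aligned body of whole blocks, and an
 * unaligned tail.  Worked example, assuming blksize = 65536: a read of
 * count = 100000 bytes at offset = 50000 becomes a 15536-byte head
 * copied out of block 0, one full 65536-byte block (block 1), and an
 * 18928-byte tail from block 2.  Only the head and tail need the
 * bounce buffer (and, for writes, a read-modify-write cycle).
 */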

/* Read data. */
static int
cache_pread (nbdkit_next *next,
             void *handle, void *buf, uint32_t count, uint64_t offset,
             uint32_t flags, int *err)
{
  CLEANUP_FREE uint8_t *block = NULL;
  uint64_t blknum, blkoffs, nrblocks;
  int r;

  /* Allocate the bounce buffer only if the request is not aligned. */
  if (!IS_ALIGNED (count | offset, blksize)) {
    block = malloc (blksize);
    if (block == NULL) {
      *err = errno;
      nbdkit_error ("malloc: %m");
      return -1;
    }
  }

  blknum = offset / blksize;  /* block number */
  blkoffs = offset % blksize; /* offset within the block */

  /* Unaligned head */
  if (blkoffs) {
    uint64_t n = MIN (blksize - blkoffs, count);

    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_read (next, blknum, block, err);
    if (r == -1)
      return -1;

    memcpy (buf, &block[blkoffs], n);

    buf += n;
    count -= n;
    offset += n;
    blknum++;
  }

  /* Aligned body */
  nrblocks = count / blksize;
  if (nrblocks > 0) {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_read_multiple (next, blknum, nrblocks, buf, err);
    if (r == -1)
      return -1;

    buf += nrblocks * blksize;
    count -= nrblocks * blksize;
    offset += nrblocks * blksize;
    blknum += nrblocks;
  }

  /* Unaligned tail */
  if (count) {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_read (next, blknum, block, err);
    if (r == -1)
      return -1;

    memcpy (buf, block, count);
  }

  return 0;
}
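
/* Note on FUA handling in cache_pwrite and cache_zero below: if the
 * request carries NBDKIT_FLAG_FUA but the plugin would merely emulate
 * it with a full flush (NBDKIT_FUA_EMULATE), or we are in cache=unsafe
 * mode, it is cheaper to strip the per-write flag and issue at most
 * one flush after the whole request completes (the need_flush flag).
 */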

/* Write data. */
static int
cache_pwrite (nbdkit_next *next,
              void *handle, const void *buf, uint32_t count, uint64_t offset,
              uint32_t flags, int *err)
{
  CLEANUP_FREE uint8_t *block = NULL;
  uint64_t blknum, blkoffs;
  int r;
  bool need_flush = false;

  if (!IS_ALIGNED (count | offset, blksize)) {
    block = malloc (blksize);
    if (block == NULL) {
      *err = errno;
      nbdkit_error ("malloc: %m");
      return -1;
    }
  }

  if ((flags & NBDKIT_FLAG_FUA) &&
      (cache_mode == CACHE_MODE_UNSAFE ||
       next->can_fua (next) == NBDKIT_FUA_EMULATE)) {
    flags &= ~NBDKIT_FLAG_FUA;
    need_flush = true;
  }

  blknum = offset / blksize;  /* block number */
  blkoffs = offset % blksize; /* offset within the block */

  /* Unaligned head */
  if (blkoffs) {
    uint64_t n = MIN (blksize - blkoffs, count);

    /* Do a read-modify-write operation on the current block.
     * Hold the lock over the whole operation.
     */
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_read (next, blknum, block, err);
    if (r != -1) {
      memcpy (&block[blkoffs], buf, n);
      r = blk_write (next, blknum, block, flags, err);
    }
    if (r == -1)
      return -1;

    buf += n;
    count -= n;
    offset += n;
    blknum++;
  }

  /* Aligned body */
  while (count >= blksize) {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_write (next, blknum, buf, flags, err);
    if (r == -1)
      return -1;

    buf += blksize;
    count -= blksize;
    offset += blksize;
    blknum++;
  }

  /* Unaligned tail */
  if (count) {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_read (next, blknum, block, err);
    if (r != -1) {
      memcpy (block, buf, count);
      r = blk_write (next, blknum, block, flags, err);
    }
    if (r == -1)
      return -1;
  }

  /* If we stripped the FUA flag above, we must flush before returning. */
  if (need_flush)
    return cache_flush (next, handle, 0, err);
  return 0;
}

/* Zero data. */
static int
cache_zero (nbdkit_next *next,
            void *handle, uint32_t count, uint64_t offset, uint32_t flags,
            int *err)
{
  CLEANUP_FREE uint8_t *block = NULL;
  uint64_t blknum, blkoffs;
  int r;
  bool need_flush = false;

  /* We are purposefully avoiding next->zero, so a zero request is
   * never faster than plain writes.
   */
  if (flags & NBDKIT_FLAG_FAST_ZERO) {
    *err = ENOTSUP;
    return -1;
  }

  block = malloc (blksize);
  if (block == NULL) {
    *err = errno;
    nbdkit_error ("malloc: %m");
    return -1;
  }

  flags &= ~NBDKIT_FLAG_MAY_TRIM;
  if ((flags & NBDKIT_FLAG_FUA) &&
      (cache_mode == CACHE_MODE_UNSAFE ||
       next->can_fua (next) == NBDKIT_FUA_EMULATE)) {
    flags &= ~NBDKIT_FLAG_FUA;
    need_flush = true;
  }

  blknum = offset / blksize;  /* block number */
  blkoffs = offset % blksize; /* offset within the block */

  /* Unaligned head */
  if (blkoffs) {
    uint64_t n = MIN (blksize - blkoffs, count);

    /* Do a read-modify-write operation on the current block.
     * Hold the lock over the whole operation.
     */
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_read (next, blknum, block, err);
    if (r != -1) {
      memset (&block[blkoffs], 0, n);
      r = blk_write (next, blknum, block, flags, err);
    }
    if (r == -1)
      return -1;

    count -= n;
    offset += n;
    blknum++;
  }

  /* Aligned body */
  if (count >= blksize)
    memset (block, 0, blksize);
  while (count >= blksize) {
    /* Intentional that we do not use next->zero */
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_write (next, blknum, block, flags, err);
    if (r == -1)
      return -1;

    count -= blksize;
    offset += blksize;
    blknum++;
  }

  /* Unaligned tail */
  if (count) {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_read (next, blknum, block, err);
    if (r != -1) {
      memset (block, 0, count);
      r = blk_write (next, blknum, block, flags, err);
    }
    if (r == -1)
      return -1;
  }

  if (need_flush)
    return cache_flush (next, handle, 0, err);
  return 0;
}

/* Flush: Go through all the dirty blocks, flushing them to disk. */
struct flush_data {
  uint8_t *block;               /* bounce buffer */
  unsigned errors;              /* count of errors seen */
  int first_errno;              /* first errno seen */
  nbdkit_next *next;
};

static int flush_dirty_block (uint64_t blknum, void *);
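
/* Error handling strategy for flush: keep going so that as many dirty
 * blocks as possible are written back, count failures in data.errors,
 * and preserve only the first errno seen (later errors land in a
 * throwaway tmp), so the client receives the earliest failure code.
 */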

static int
cache_flush (nbdkit_next *next, void *handle,
             uint32_t flags, int *err)
{
  CLEANUP_FREE uint8_t *block = NULL;
  struct flush_data data =
    { .errors = 0, .first_errno = 0, .next = next };
  int tmp;

  if (cache_mode == CACHE_MODE_UNSAFE)
    return 0;

  /* Allocate the bounce buffer. */
  block = malloc (blksize);
  if (block == NULL) {
    *err = errno;
    nbdkit_error ("malloc: %m");
    return -1;
  }
  data.block = block;

  /* In theory if cache_mode == CACHE_MODE_WRITETHROUGH then there
   * should be no dirty blocks.  However we go through the cache here
   * to be sure.  Also we still need to issue the flush to the
   * underlying storage.
   */
  {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    for_each_dirty_block (flush_dirty_block, &data);
  }

  /* Now issue a flush request to the underlying storage. */
  if (next->flush (next, 0, data.errors ? &tmp : &data.first_errno) == -1)
    data.errors++;

  if (data.errors > 0) {
    *err = data.first_errno;
    return -1;
  }
  return 0;
}

static int
flush_dirty_block (uint64_t blknum, void *datav)
{
  struct flush_data *data = datav;
  int tmp;

  /* Perform a read + writethrough which will read from the
   * cache and write it through to the underlying storage.
   */
  if (blk_read (data->next, blknum, data->block,
                data->errors ? &tmp : &data->first_errno) == -1)
    goto err;
  if (blk_writethrough (data->next, blknum, data->block, 0,
                        data->errors ? &tmp : &data->first_errno) == -1)
    goto err;

  return 0;

 err:
  nbdkit_error ("cache: flush of block %" PRIu64 " failed", blknum);
  data->errors++;
  return 0; /* continue scanning and flushing. */
}
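
/* NBD_CMD_CACHE handler: prefetch a byte range into the cache.  The
 * range is rounded outwards to whole blocks because blk_cache operates
 * on block granularity; "remaining" is held in 64 bits since rounding
 * a 32-bit count outwards can overflow uint32_t.
 */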
static int
cache_cache (nbdkit_next *next,
             void *handle, uint32_t count, uint64_t offset,
             uint32_t flags, int *err)
{
  CLEANUP_FREE uint8_t *block = NULL;
  uint64_t blknum, blkoffs;
  int r;
  uint64_t remaining = count; /* Rounding out could exceed 32 bits */

  block = malloc (blksize);
  if (block == NULL) {
    *err = errno;
    nbdkit_error ("malloc: %m");
    return -1;
  }

  blknum = offset / blksize;  /* block number */
  blkoffs = offset % blksize; /* offset within the block */

  /* Unaligned head */
  remaining += blkoffs;
  offset -= blkoffs;

  /* Unaligned tail */
  remaining = ROUND_UP (remaining, blksize);

  /* Aligned body */
  while (remaining) {
    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
    r = blk_cache (next, blknum, block, err);
    if (r == -1)
      return -1;

    blknum++;
    remaining -= blksize;
  }

  return 0;
}

static struct nbdkit_filter filter = {
  .name              = "cache",
  .longname          = "nbdkit caching filter",
  .unload            = cache_unload,
  .config            = cache_config,
  .config_complete   = cache_config_complete,
  .config_help       = cache_config_help,
  .get_ready         = cache_get_ready,
  .prepare           = cache_prepare,
  .get_size          = cache_get_size,
  .block_size        = cache_block_size,
  .can_cache         = cache_can_cache,
  .can_fast_zero     = cache_can_fast_zero,
  .can_flush         = cache_can_flush,
  .can_fua           = cache_can_fua,
  .can_multi_conn    = cache_can_multi_conn,
  .pread             = cache_pread,
  .pwrite            = cache_pwrite,
  .zero              = cache_zero,
  .flush             = cache_flush,
  .cache             = cache_cache,
};

NBDKIT_REGISTER_FILTER (filter)