4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 #include <nbdkit-filter.h>
51 #include "isaligned.h"
52 #include "ispowerof2.h"
59 /* Read-modify-write requests are serialized through this global lock.
60 * This is only used for unaligned requests which should be
63 static pthread_mutex_t rmw_lock
= PTHREAD_MUTEX_INITIALIZER
;
/* COW block granularity in bytes; configurable via "cow-block-size"
 * (default 64K).  Non-static — presumably shared with the blk layer;
 * confirm against the full source.
 */
65 unsigned blksize
= 65536; /* block size */
/* True when "cow-on-cache" was configured: copy cache (prefetch)
 * requests into the overlay too.  Defaults to false.
 */
67 static bool cow_on_cache
;
69 /* Cache on read ("cow-on-read") mode. */
70 extern enum cor_mode
{
/* Current cow-on-read mode; off unless configured. */
75 enum cor_mode cor_mode
= COR_OFF
;
/* Config callback: handle this filter's own keys (cow-block-size,
 * cow-on-cache, cow-on-read); any other key falls through to the next
 * layer at the end.  Presumably returns 0 on success and -1 after
 * nbdkit_error on bad input (nbdkit convention) — the error-return
 * lines are not visible in this extract; confirm against full source.
 */
85 cow_config (nbdkit_next_config
*next
, nbdkit_backend
*nxdata
,
86 const char *key
, const char *value
)
88 if (strcmp (key
, "cow-block-size") == 0) {
89 int64_t r
= nbdkit_parse_size (value
);
/* Accept only powers of 2 in [4096, UINT_MAX]. */
92 if (r
< 4096 || r
> UINT_MAX
|| !is_power_of_2 (r
)) {
93 nbdkit_error ("cow-block-size is out of range (4096..2G) "
94 "or not a power of 2");
100 else if (strcmp (key
, "cow-on-cache") == 0) {
103 r
= nbdkit_parse_bool (value
);
109 else if (strcmp (key
, "cow-on-read") == 0) {
/* A value starting with '/' selects path-witness mode (COR_PATH);
 * otherwise parse it as a boolean.
 */
110 if (value
[0] == '/') {
115 int r
= nbdkit_parse_bool (value
);
118 cor_mode
= r
? COR_ON
: COR_OFF
;
/* Unrecognized key: delegate to the underlying plugin/filter chain. */
123 return next (nxdata
, key
, value
);
/* Help text nbdkit shows for this filter's parameters. */
127 #define cow_config_help \
128 "cow-block-size=<N> Set COW block size.\n" \
129 "cow-on-cache=<BOOL> Copy cache (prefetch) requests to the overlay.\n" \
130 "cow-on-read=<BOOL>|/PATH Copy read requests to the overlay."
/* get_ready callback: initialize the block/bitmap (blk) layer before
 * any connections are served.  Error path not visible in this extract.
 */
133 cow_get_ready (int thread_model
)
135 if (blk_init () == -1)
/* NOTE(review): fragment of the cow_on_read() predicate — the function
 * header is missing from this extract.  In COR_PATH mode the answer is
 * dynamic: cow-on-read is active exactly while the witness path exists
 * (access(2) with F_OK).
 */
141 /* Decide if cow-on-read is currently on or off. */
146 case COR_ON
: return true;
147 case COR_OFF
: return false;
148 case COR_PATH
: return access (cor_path
, F_OK
) == 0;
/* Open callback: the overlay absorbs all writes, so the underlying
 * plugin is always opened read-only regardless of the client's
 * readonly flag.  No per-connection handle is needed.
 */
154 cow_open (nbdkit_next_open
*next
, nbdkit_context
*nxdata
,
155 int readonly
, const char *exportname
, int is_tls
)
157 /* Always pass readonly=1 to the underlying plugin. */
158 if (next (nxdata
, 1, exportname
) == -1)
161 return NBDKIT_HANDLE_NOT_NEEDED
;
164 /* Get the file size, set the cache size. */
/* Queries the plugin's size and propagates it to the blk layer
 * (blk_set_size) so the overlay bitmap can be sized to match.
 */
166 cow_get_size (nbdkit_next
*next
,
172 size
= next
->get_size (next
);
176 nbdkit_debug ("cow: underlying file size: %" PRIi64
, size
);
178 r
= blk_set_size (size
);
185 /* Block size constraints. */
/* Advertise block-size constraints to the client.  If the plugin set
 * none (*minimum == 0), suggest blksize as preferred; otherwise bump
 * the preferred size up to at least blksize when the plugin's maximum
 * permits it.  The minimum itself is left as the plugin reported.
 */
187 cow_block_size (nbdkit_next
*next
, void *handle
,
188 uint32_t *minimum
, uint32_t *preferred
, uint32_t *maximum
)
190 if (next
->block_size (next
, minimum
, preferred
, maximum
) == -1)
193 if (*minimum
== 0) { /* No constraints set by the plugin. */
195 *preferred
= blksize
;
196 *maximum
= 0xffffffff;
199 if (*maximum
>= blksize
)
200 *preferred
= MAX (*preferred
, blksize
);
206 /* Force an early call to cow_get_size because we have to set the
207 * backing file size and bitmap size before any other read or write
211 cow_prepare (nbdkit_next
*next
,
212 void *handle
, int readonly
)
/* Normalize cow_get_size's result (a size or -1) to 0/-1. */
216 r
= cow_get_size (next
, handle
);
217 return r
>= 0 ? 0 : -1;
/* Capability callbacks.  Bodies are mostly missing from this extract;
 * presumably each overrides the plugin's answer since the overlay
 * supplies write/trim/flush support itself — confirm against the full
 * source.
 */
221 cow_can_write (nbdkit_next
*next
, void *handle
)
227 cow_can_trim (nbdkit_next
*next
, void *handle
)
233 cow_can_extents (nbdkit_next
*next
, void *handle
)
239 cow_can_flush (nbdkit_next
*next
, void *handle
)
/* Advertise native FUA; the write paths deliberately ignore the flag. */
245 cow_can_fua (nbdkit_next
*next
, void *handle
)
247 return NBDKIT_FUA_NATIVE
;
251 cow_can_cache (nbdkit_next
*next
, void *handle
)
253 /* Cache next->can_cache now, so later calls to next->cache
254 * don't fail, even though we override the answer here.
256 int r
= next
->can_cache (next
);
259 return NBDKIT_CACHE_NATIVE
;
263 cow_can_multi_conn (nbdkit_next
*next
,
266 /* Our cache is consistent between connections. */
270 /* Override the plugin's .can_fast_zero, because our .zero is not fast */
272 cow_can_fast_zero (nbdkit_next
*next
,
275 /* It is better to advertise support even when we always reject fast
/* Forward declaration of cow_flush, used before its definition below. */
281 static int cow_flush (nbdkit_next
*next
, void *handle
, uint32_t flags
,
/* Read callback.  Data comes from the overlay where present, otherwise
 * from the plugin (blk_read decides per block).  Shape: optional
 * unaligned head, a run of whole blocks read in one multi-block call,
 * then an optional unaligned tail.  The bounce buffer is allocated only
 * when the request is unaligned.
 */
286 cow_pread (nbdkit_next
*next
,
287 void *handle
, void *buf
, uint32_t count
, uint64_t offset
,
288 uint32_t flags
, int *err
)
290 CLEANUP_FREE
uint8_t *block
= NULL
;
291 uint64_t blknum
, blkoffs
, nrblocks
;
294 if (!IS_ALIGNED (count
| offset
, blksize
)) {
295 block
= malloc (blksize
);
298 nbdkit_error ("malloc: %m");
303 blknum
= offset
/ blksize
; /* block number */
304 blkoffs
= offset
% blksize
; /* offset within the block */
/* Unaligned head: read the whole block, copy out the wanted part. */
308 uint64_t n
= MIN (blksize
- blkoffs
, count
);
311 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
315 memcpy (buf
, &block
[blkoffs
], n
);
/* Aligned middle: whole blocks straight into the caller's buffer. */
324 nrblocks
= count
/ blksize
;
326 r
= blk_read_multiple (next
, blknum
, nrblocks
, buf
, cow_on_read (), err
);
330 buf
+= nrblocks
* blksize
;
331 count
-= nrblocks
* blksize
;
332 offset
+= nrblocks
* blksize
;
/* Unaligned tail: read the block, copy out the leading part. */
339 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
343 memcpy (buf
, block
, count
);
/* Write callback.  All writes land in the overlay (blk_write), never in
 * the plugin.  Unaligned head/tail are handled as read-modify-write
 * cycles serialized by the global rmw_lock; the aligned middle writes
 * whole blocks directly from the caller's buffer, lock-free.
 */
351 cow_pwrite (nbdkit_next
*next
,
352 void *handle
, const void *buf
, uint32_t count
, uint64_t offset
,
353 uint32_t flags
, int *err
)
355 CLEANUP_FREE
uint8_t *block
= NULL
;
356 uint64_t blknum
, blkoffs
;
359 if (!IS_ALIGNED (count
| offset
, blksize
)) {
360 block
= malloc (blksize
);
363 nbdkit_error ("malloc: %m");
368 blknum
= offset
/ blksize
; /* block number */
369 blkoffs
= offset
% blksize
; /* offset within the block */
/* Unaligned head. */
373 uint64_t n
= MIN (blksize
- blkoffs
, count
);
375 /* Do a read-modify-write operation on the current block.
376 * Hold the rmw_lock over the whole operation.
379 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock
);
380 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
382 memcpy (&block
[blkoffs
], buf
, n
);
383 r
= blk_write (blknum
, block
, err
);
/* Aligned middle: full blocks, no RMW and no lock needed. */
395 while (count
>= blksize
) {
396 r
= blk_write (blknum
, buf
, err
);
/* Unaligned tail: another locked read-modify-write. */
409 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock
);
410 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
412 memcpy (block
, buf
, count
);
413 r
= blk_write (blknum
, block
, err
);
419 /* flags & NBDKIT_FLAG_FUA is deliberately ignored. */
/* Zero callback.  Implemented as writes of zero blocks into the
 * overlay, never via next->zero, so it is never faster than a plain
 * write — which is why fast-zero requests are rejected up front.
 * Same head / middle / tail shape as cow_pwrite.
 */
426 cow_zero (nbdkit_next
*next
,
427 void *handle
, uint32_t count
, uint64_t offset
, uint32_t flags
,
430 CLEANUP_FREE
uint8_t *block
= NULL
;
431 uint64_t blknum
, blkoffs
;
434 /* We are purposefully avoiding next->zero, so a zero request is
435 * never faster than plain writes.
437 if (flags
& NBDKIT_FLAG_FAST_ZERO
) {
442 block
= malloc (blksize
);
445 nbdkit_error ("malloc: %m");
449 blknum
= offset
/ blksize
; /* block number */
450 blkoffs
= offset
% blksize
; /* offset within the block */
/* Unaligned head: locked read-modify-write, zeroing the wanted span. */
454 uint64_t n
= MIN (blksize
- blkoffs
, count
);
456 /* Do a read-modify-write operation on the current block.
457 * Hold the rmw_lock over the whole operation.
459 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock
);
460 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
462 memset (&block
[blkoffs
], 0, n
);
463 r
= blk_write (blknum
, block
, err
);
/* Aligned middle: zero the bounce buffer once, then write it for each
 * whole block.
 */
474 if (count
>= blksize
)
475 memset (block
, 0, blksize
);
476 while (count
>= blksize
) {
477 /* XXX There is the possibility of optimizing this: since this loop is
478 * writing a whole, aligned block, we should use FALLOC_FL_ZERO_RANGE.
480 r
= blk_write (blknum
, block
, err
);
/* Unaligned tail: locked read-modify-write. */
491 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock
);
492 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
494 memset (block
, 0, count
);
495 r
= blk_write (blknum
, block
, err
);
501 /* flags & NBDKIT_FLAG_FUA is deliberately ignored. */
/* Trim callback.  Whole aligned blocks are marked trimmed in the
 * overlay (blk_trim); unaligned head/tail are zeroed via locked
 * read-modify-write, same as cow_zero.
 */
508 cow_trim (nbdkit_next
*next
,
509 void *handle
, uint32_t count
, uint64_t offset
, uint32_t flags
,
512 CLEANUP_FREE
uint8_t *block
= NULL
;
513 uint64_t blknum
, blkoffs
;
516 if (!IS_ALIGNED (count
| offset
, blksize
)) {
517 block
= malloc (blksize
);
520 nbdkit_error ("malloc: %m");
525 blknum
= offset
/ blksize
; /* block number */
526 blkoffs
= offset
% blksize
; /* offset within the block */
/* Unaligned head: zero the partial span under the RMW lock. */
530 uint64_t n
= MIN (blksize
- blkoffs
, count
);
532 /* Do a read-modify-write operation on the current block.
533 * Hold the lock over the whole operation.
535 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock
);
536 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
538 memset (&block
[blkoffs
], 0, n
);
539 r
= blk_write (blknum
, block
, err
);
/* Aligned middle: whole blocks can be trimmed outright. */
550 while (count
>= blksize
) {
551 r
= blk_trim (blknum
, err
);
/* Unaligned tail. */
562 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock
);
563 r
= blk_read (next
, blknum
, block
, cow_on_read (), err
);
565 memset (block
, 0, count
);
566 r
= blk_write (blknum
, block
, err
);
572 /* flags & NBDKIT_FLAG_FUA is deliberately ignored. */
/* Flush callback: a no-op.  The overlay is temporary, so there is
 * nothing durable to flush; the flag is deliberately ignored.
 */
578 cow_flush (nbdkit_next
*next
, void *handle
,
579 uint32_t flags
, int *err
)
581 /* Deliberately ignored. */
/* Cache (prefetch) callback.  Chooses a blk_cache mode from the
 * plugin's can_cache answer (cached earlier in cow_can_cache), upgraded
 * to BLK_CACHE_COW when cow-on-cache is set, then walks the request one
 * block at a time.  The byte count is widened to 64 bits before
 * rounding out to block boundaries so the round-up cannot overflow.
 */
586 cow_cache (nbdkit_next
*next
,
587 void *handle
, uint32_t count
, uint64_t offset
,
588 uint32_t flags
, int *err
)
590 CLEANUP_FREE
uint8_t *block
= NULL
;
591 uint64_t blknum
, blkoffs
;
593 uint64_t remaining
= count
; /* Rounding out could exceed 32 bits */
594 enum cache_mode mode
;
596 switch (next
->can_cache (next
)) {
597 case NBDKIT_CACHE_NONE
:
598 mode
= BLK_CACHE_IGNORE
;
600 case NBDKIT_CACHE_EMULATE
:
601 mode
= BLK_CACHE_READ
;
603 case NBDKIT_CACHE_NATIVE
:
604 mode
= BLK_CACHE_PASSTHROUGH
;
607 abort (); /* Guaranteed thanks to early caching */
/* cow-on-cache overrides the mode: copy prefetched data to overlay. */
610 mode
= BLK_CACHE_COW
;
613 block
= malloc (blksize
);
616 nbdkit_error ("malloc: %m");
620 blknum
= offset
/ blksize
; /* block number */
621 blkoffs
= offset
% blksize
; /* offset within the block */
/* Round the request out to whole blocks. */
624 remaining
+= blkoffs
;
628 remaining
= ROUND_UP (remaining
, blksize
);
632 r
= blk_cache (next
, blknum
, block
, mode
, err
);
636 remaining
-= blksize
;
/* Extents (block status) callback.  The request is aligned out to whole
 * blocks, then each block is classified via the overlay bitmap
 * (blk_status): blocks present in the overlay report their own status
 * (trimmed blocks become HOLE|ZERO); blocks not in the overlay are
 * batched into one ranged query to the plugin when it supports extents,
 * and otherwise reported as plain allocated data.
 */
646 cow_extents (nbdkit_next
*next
,
647 void *handle
, uint32_t count32
, uint64_t offset
, uint32_t flags
,
648 struct nbdkit_extents
*extents
, int *err
)
650 const bool can_extents
= next
->can_extents (next
);
651 const bool req_one
= flags
& NBDKIT_FLAG_REQ_ONE
;
652 uint64_t count
= count32
;
656 /* To make this easier, align the requested extents to whole blocks.
657 * Note that count is a 64 bit variable containing at most a 32 bit
658 * value so rounding up is safe here.
660 end
= offset
+ count
;
661 offset
= ROUND_DOWN (offset
, blksize
);
662 end
= ROUND_UP (end
, blksize
);
663 count
= end
- offset
;
664 blknum
= offset
/ blksize
;
666 assert (IS_ALIGNED (offset
, blksize
));
667 assert (IS_ALIGNED (count
, blksize
));
668 assert (count
> 0); /* We must make forward progress. */
671 bool present
, trimmed
;
672 struct nbdkit_extent e
;
674 blk_status (blknum
, &present
, &trimmed
);
676 /* Present in the overlay. */
682 e
.type
= NBDKIT_EXTENT_HOLE
|NBDKIT_EXTENT_ZERO
;
686 if (nbdkit_add_extent (extents
, e
.offset
, e
.length
, e
.type
) == -1) {
696 /* Not present in the overlay, but we can ask the plugin. */
697 else if (can_extents
) {
698 uint64_t range_offset
= offset
;
699 uint32_t range_count
= 0;
703 /* Asking the plugin for a single block of extents is not
704 * efficient for some plugins (eg. VDDK) so ask for as much data
708 /* nbdkit_extents_full cannot read more than a 32 bit range
709 * (range_count), but count is a 64 bit quantity, so don't
710 * overflow range_count here.
712 if (range_count
>= UINT32_MAX
- blksize
+ 1) break;
717 range_count
+= blksize
;
719 if (count
== 0) break;
720 blk_status (blknum
, &present
, &trimmed
);
724 /* Don't ask for extent data beyond the end of the plugin. */
725 size
= next
->get_size (next
);
729 if (range_offset
+ range_count
> size
) {
730 unsigned tail
= range_offset
+ range_count
- size
;
/* Fetch the plugin's extents for the accumulated range and copy them
 * into the caller's list.
 */
734 CLEANUP_EXTENTS_FREE
struct nbdkit_extents
*extents2
=
735 nbdkit_extents_full (next
, range_count
, range_offset
, flags
, err
);
736 if (extents2
== NULL
)
739 for (i
= 0; i
< nbdkit_extents_count (extents2
); ++i
) {
740 e
= nbdkit_get_extent (extents2
, i
);
741 if (nbdkit_add_extent (extents
, e
.offset
, e
.length
, e
.type
) == -1) {
748 /* Otherwise assume the block is non-sparse. */
754 if (nbdkit_add_extent (extents
, e
.offset
, e
.length
, e
.type
) == -1) {
764 /* If the caller only wanted the first extent, and we've managed
765 * to add at least one extent to the list, then we can drop out
766 * now. (Note calling nbdkit_add_extent above does not mean the
767 * extent got added since it might be before the first offset.)
769 if (req_one
&& nbdkit_extents_count (extents
) > 0)
/* Filter descriptor wiring the callbacks above into nbdkit, followed by
 * registration.  Some fields (.name, .pread, .zero, .trim, .flush,
 * .cache, .open) are not visible in this extract — presumably set in
 * the omitted lines; confirm against the full source.
 */
776 static struct nbdkit_filter filter
= {
778 .longname
= "nbdkit copy-on-write (COW) filter",
779 .unload
= cow_unload
,
781 .config
= cow_config
,
782 .config_help
= cow_config_help
,
783 .get_ready
= cow_get_ready
,
784 .prepare
= cow_prepare
,
785 .get_size
= cow_get_size
,
786 .block_size
= cow_block_size
,
787 .can_write
= cow_can_write
,
788 .can_flush
= cow_can_flush
,
789 .can_trim
= cow_can_trim
,
790 .can_extents
= cow_can_extents
,
791 .can_fua
= cow_can_fua
,
792 .can_cache
= cow_can_cache
,
793 .can_fast_zero
= cow_can_fast_zero
,
794 .can_multi_conn
= cow_can_multi_conn
,
796 .pwrite
= cow_pwrite
,
801 .extents
= cow_extents
,
804 NBDKIT_REGISTER_FILTER (filter
)