Update Red Hat Copyright Notices
[nbdkit.git] / filters / cow / cow.c
blob4526bbfcd41958de4731d16087eed6955c35603f
1 /* nbdkit
2 * Copyright Red Hat
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
33 #include <config.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <stdint.h>
38 #include <stdbool.h>
39 #include <inttypes.h>
40 #include <string.h>
41 #include <unistd.h>
42 #include <errno.h>
43 #include <limits.h>
44 #include <assert.h>
46 #include <pthread.h>
48 #include <nbdkit-filter.h>
50 #include "cleanup.h"
51 #include "isaligned.h"
52 #include "ispowerof2.h"
53 #include "minmax.h"
54 #include "rounding.h"
56 #include "cow.h"
57 #include "blk.h"
/* Global lock through which every read-modify-write cycle is
 * serialized.  Only unaligned requests take it, and those should be
 * infrequent.
 */
static pthread_mutex_t rmw_lock = PTHREAD_MUTEX_INITIALIZER;

/* Block size in bytes; overridden by the cow-block-size parameter. */
unsigned blksize = 65536;

/* Value of the cow-on-cache parameter. */
static bool cow_on_cache;

/* Cache on read ("cow-on-read") mode. */
enum cor_mode {
  COR_OFF,                      /* cow-on-read parsed as false (default) */
  COR_ON,                       /* cow-on-read parsed as true */
  COR_PATH,                     /* on only while cor_path exists */
};
extern enum cor_mode cor_mode;
enum cor_mode cor_mode = COR_OFF;

/* Path tested in COR_PATH mode.  Points at the configuration value
 * itself; no copy is made.
 */
const char *cor_path;
/* Unload hook: delegates all cleanup to blk_free. */
static void
cow_unload (void)
{
  blk_free ();
}
84 static int
85 cow_config (nbdkit_next_config *next, nbdkit_backend *nxdata,
86 const char *key, const char *value)
88 if (strcmp (key, "cow-block-size") == 0) {
89 int64_t r = nbdkit_parse_size (value);
90 if (r == -1)
91 return -1;
92 if (r < 4096 || r > UINT_MAX || !is_power_of_2 (r)) {
93 nbdkit_error ("cow-block-size is out of range (4096..2G) "
94 "or not a power of 2");
95 return -1;
97 blksize = r;
98 return 0;
100 else if (strcmp (key, "cow-on-cache") == 0) {
101 int r;
103 r = nbdkit_parse_bool (value);
104 if (r == -1)
105 return -1;
106 cow_on_cache = r;
107 return 0;
109 else if (strcmp (key, "cow-on-read") == 0) {
110 if (value[0] == '/') {
111 cor_path = value;
112 cor_mode = COR_PATH;
114 else {
115 int r = nbdkit_parse_bool (value);
116 if (r == -1)
117 return -1;
118 cor_mode = r ? COR_ON : COR_OFF;
120 return 0;
122 else {
123 return next (nxdata, key, value);
/* Help text for the parameters accepted by cow_config. */
#define cow_config_help \
  "cow-block-size=<N> Set COW block size.\n" \
  "cow-on-cache=<BOOL> Copy cache (prefetch) requests to the overlay.\n" \
  "cow-on-read=<BOOL>|/PATH Copy read requests to the overlay."
/* get_ready hook: runs blk_init and reports its failure as -1,
 * otherwise 0.  The thread model argument is not used.
 */
static int
cow_get_ready (int thread_model)
{
  return blk_init () == -1 ? -1 : 0;
}
141 /* Decide if cow-on-read is currently on or off. */
142 bool
143 cow_on_read (void)
145 switch (cor_mode) {
146 case COR_ON: return true;
147 case COR_OFF: return false;
148 case COR_PATH: return access (cor_path, F_OK) == 0;
149 default: abort ();
153 static void *
154 cow_open (nbdkit_next_open *next, nbdkit_context *nxdata,
155 int readonly, const char *exportname, int is_tls)
157 /* Always pass readonly=1 to the underlying plugin. */
158 if (next (nxdata, 1, exportname) == -1)
159 return NULL;
161 return NBDKIT_HANDLE_NOT_NEEDED;
164 /* Get the file size, set the cache size. */
165 static int64_t
166 cow_get_size (nbdkit_next *next,
167 void *handle)
169 int64_t size;
170 int r;
172 size = next->get_size (next);
173 if (size == -1)
174 return -1;
176 nbdkit_debug ("cow: underlying file size: %" PRIi64, size);
178 r = blk_set_size (size);
179 if (r == -1)
180 return -1;
182 return size;
185 /* Block size constraints. */
186 static int
187 cow_block_size (nbdkit_next *next, void *handle,
188 uint32_t *minimum, uint32_t *preferred, uint32_t *maximum)
190 if (next->block_size (next, minimum, preferred, maximum) == -1)
191 return -1;
193 if (*minimum == 0) { /* No constraints set by the plugin. */
194 *minimum = 1;
195 *preferred = blksize;
196 *maximum = 0xffffffff;
198 else {
199 if (*maximum >= blksize)
200 *preferred = MAX (*preferred, blksize);
203 return 0;
206 /* Force an early call to cow_get_size because we have to set the
207 * backing file size and bitmap size before any other read or write
208 * calls.
210 static int
211 cow_prepare (nbdkit_next *next,
212 void *handle, int readonly)
214 int64_t r;
216 r = cow_get_size (next, handle);
217 return r >= 0 ? 0 : -1;
220 static int
221 cow_can_write (nbdkit_next *next, void *handle)
223 return 1;
226 static int
227 cow_can_trim (nbdkit_next *next, void *handle)
229 return 1;
232 static int
233 cow_can_extents (nbdkit_next *next, void *handle)
235 return 1;
238 static int
239 cow_can_flush (nbdkit_next *next, void *handle)
241 return 1;
244 static int
245 cow_can_fua (nbdkit_next *next, void *handle)
247 return NBDKIT_FUA_NATIVE;
250 static int
251 cow_can_cache (nbdkit_next *next, void *handle)
253 /* Cache next->can_cache now, so later calls to next->cache
254 * don't fail, even though we override the answer here.
256 int r = next->can_cache (next);
257 if (r == -1)
258 return -1;
259 return NBDKIT_CACHE_NATIVE;
262 static int
263 cow_can_multi_conn (nbdkit_next *next,
264 void *handle)
266 /* Our cache is consistent between connections. */
267 return 1;
270 /* Override the plugin's .can_fast_zero, because our .zero is not fast */
271 static int
272 cow_can_fast_zero (nbdkit_next *next,
273 void *handle)
275 /* It is better to advertise support even when we always reject fast
276 * zero attempts.
278 return 1;
281 static int cow_flush (nbdkit_next *next, void *handle, uint32_t flags,
282 int *err);
284 /* Read data. */
285 static int
286 cow_pread (nbdkit_next *next,
287 void *handle, void *buf, uint32_t count, uint64_t offset,
288 uint32_t flags, int *err)
290 CLEANUP_FREE uint8_t *block = NULL;
291 uint64_t blknum, blkoffs, nrblocks;
292 int r;
294 if (!IS_ALIGNED (count | offset, blksize)) {
295 block = malloc (blksize);
296 if (block == NULL) {
297 *err = errno;
298 nbdkit_error ("malloc: %m");
299 return -1;
303 blknum = offset / blksize; /* block number */
304 blkoffs = offset % blksize; /* offset within the block */
306 /* Unaligned head */
307 if (blkoffs) {
308 uint64_t n = MIN (blksize - blkoffs, count);
310 assert (block);
311 r = blk_read (next, blknum, block, cow_on_read (), err);
312 if (r == -1)
313 return -1;
315 memcpy (buf, &block[blkoffs], n);
317 buf += n;
318 count -= n;
319 offset += n;
320 blknum++;
323 /* Aligned body */
324 nrblocks = count / blksize;
325 if (nrblocks > 0) {
326 r = blk_read_multiple (next, blknum, nrblocks, buf, cow_on_read (), err);
327 if (r == -1)
328 return -1;
330 buf += nrblocks * blksize;
331 count -= nrblocks * blksize;
332 offset += nrblocks * blksize;
333 blknum += nrblocks;
336 /* Unaligned tail */
337 if (count) {
338 assert (block);
339 r = blk_read (next, blknum, block, cow_on_read (), err);
340 if (r == -1)
341 return -1;
343 memcpy (buf, block, count);
346 return 0;
349 /* Write data. */
350 static int
351 cow_pwrite (nbdkit_next *next,
352 void *handle, const void *buf, uint32_t count, uint64_t offset,
353 uint32_t flags, int *err)
355 CLEANUP_FREE uint8_t *block = NULL;
356 uint64_t blknum, blkoffs;
357 int r;
359 if (!IS_ALIGNED (count | offset, blksize)) {
360 block = malloc (blksize);
361 if (block == NULL) {
362 *err = errno;
363 nbdkit_error ("malloc: %m");
364 return -1;
368 blknum = offset / blksize; /* block number */
369 blkoffs = offset % blksize; /* offset within the block */
371 /* Unaligned head */
372 if (blkoffs) {
373 uint64_t n = MIN (blksize - blkoffs, count);
375 /* Do a read-modify-write operation on the current block.
376 * Hold the rmw_lock over the whole operation.
378 assert (block);
379 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
380 r = blk_read (next, blknum, block, cow_on_read (), err);
381 if (r != -1) {
382 memcpy (&block[blkoffs], buf, n);
383 r = blk_write (blknum, block, err);
385 if (r == -1)
386 return -1;
388 buf += n;
389 count -= n;
390 offset += n;
391 blknum++;
394 /* Aligned body */
395 while (count >= blksize) {
396 r = blk_write (blknum, buf, err);
397 if (r == -1)
398 return -1;
400 buf += blksize;
401 count -= blksize;
402 offset += blksize;
403 blknum++;
406 /* Unaligned tail */
407 if (count) {
408 assert (block);
409 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
410 r = blk_read (next, blknum, block, cow_on_read (), err);
411 if (r != -1) {
412 memcpy (block, buf, count);
413 r = blk_write (blknum, block, err);
415 if (r == -1)
416 return -1;
419 /* flags & NBDKIT_FLAG_FUA is deliberately ignored. */
421 return 0;
424 /* Zero data. */
425 static int
426 cow_zero (nbdkit_next *next,
427 void *handle, uint32_t count, uint64_t offset, uint32_t flags,
428 int *err)
430 CLEANUP_FREE uint8_t *block = NULL;
431 uint64_t blknum, blkoffs;
432 int r;
434 /* We are purposefully avoiding next->zero, so a zero request is
435 * never faster than plain writes.
437 if (flags & NBDKIT_FLAG_FAST_ZERO) {
438 *err = ENOTSUP;
439 return -1;
442 block = malloc (blksize);
443 if (block == NULL) {
444 *err = errno;
445 nbdkit_error ("malloc: %m");
446 return -1;
449 blknum = offset / blksize; /* block number */
450 blkoffs = offset % blksize; /* offset within the block */
452 /* Unaligned head */
453 if (blkoffs) {
454 uint64_t n = MIN (blksize - blkoffs, count);
456 /* Do a read-modify-write operation on the current block.
457 * Hold the rmw_lock over the whole operation.
459 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
460 r = blk_read (next, blknum, block, cow_on_read (), err);
461 if (r != -1) {
462 memset (&block[blkoffs], 0, n);
463 r = blk_write (blknum, block, err);
465 if (r == -1)
466 return -1;
468 count -= n;
469 offset += n;
470 blknum++;
473 /* Aligned body */
474 if (count >= blksize)
475 memset (block, 0, blksize);
476 while (count >= blksize) {
477 /* XXX There is the possibility of optimizing this: since this loop is
478 * writing a whole, aligned block, we should use FALLOC_FL_ZERO_RANGE.
480 r = blk_write (blknum, block, err);
481 if (r == -1)
482 return -1;
484 count -= blksize;
485 offset += blksize;
486 blknum++;
489 /* Unaligned tail */
490 if (count) {
491 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
492 r = blk_read (next, blknum, block, cow_on_read (), err);
493 if (r != -1) {
494 memset (block, 0, count);
495 r = blk_write (blknum, block, err);
497 if (r == -1)
498 return -1;
501 /* flags & NBDKIT_FLAG_FUA is deliberately ignored. */
503 return 0;
506 /* Trim data. */
507 static int
508 cow_trim (nbdkit_next *next,
509 void *handle, uint32_t count, uint64_t offset, uint32_t flags,
510 int *err)
512 CLEANUP_FREE uint8_t *block = NULL;
513 uint64_t blknum, blkoffs;
514 int r;
516 if (!IS_ALIGNED (count | offset, blksize)) {
517 block = malloc (blksize);
518 if (block == NULL) {
519 *err = errno;
520 nbdkit_error ("malloc: %m");
521 return -1;
525 blknum = offset / blksize; /* block number */
526 blkoffs = offset % blksize; /* offset within the block */
528 /* Unaligned head */
529 if (blkoffs) {
530 uint64_t n = MIN (blksize - blkoffs, count);
532 /* Do a read-modify-write operation on the current block.
533 * Hold the lock over the whole operation.
535 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
536 r = blk_read (next, blknum, block, cow_on_read (), err);
537 if (r != -1) {
538 memset (&block[blkoffs], 0, n);
539 r = blk_write (blknum, block, err);
541 if (r == -1)
542 return -1;
544 count -= n;
545 offset += n;
546 blknum++;
549 /* Aligned body */
550 while (count >= blksize) {
551 r = blk_trim (blknum, err);
552 if (r == -1)
553 return -1;
555 count -= blksize;
556 offset += blksize;
557 blknum++;
560 /* Unaligned tail */
561 if (count) {
562 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
563 r = blk_read (next, blknum, block, cow_on_read (), err);
564 if (r != -1) {
565 memset (block, 0, count);
566 r = blk_write (blknum, block, err);
568 if (r == -1)
569 return -1;
572 /* flags & NBDKIT_FLAG_FUA is deliberately ignored. */
574 return 0;
577 static int
578 cow_flush (nbdkit_next *next, void *handle,
579 uint32_t flags, int *err)
581 /* Deliberately ignored. */
582 return 0;
585 static int
586 cow_cache (nbdkit_next *next,
587 void *handle, uint32_t count, uint64_t offset,
588 uint32_t flags, int *err)
590 CLEANUP_FREE uint8_t *block = NULL;
591 uint64_t blknum, blkoffs;
592 int r;
593 uint64_t remaining = count; /* Rounding out could exceed 32 bits */
594 enum cache_mode mode;
596 switch (next->can_cache (next)) {
597 case NBDKIT_CACHE_NONE:
598 mode = BLK_CACHE_IGNORE;
599 break;
600 case NBDKIT_CACHE_EMULATE:
601 mode = BLK_CACHE_READ;
602 break;
603 case NBDKIT_CACHE_NATIVE:
604 mode = BLK_CACHE_PASSTHROUGH;
605 break;
606 default:
607 abort (); /* Guaranteed thanks to early caching */
609 if (cow_on_cache)
610 mode = BLK_CACHE_COW;
612 assert (!flags);
613 block = malloc (blksize);
614 if (block == NULL) {
615 *err = errno;
616 nbdkit_error ("malloc: %m");
617 return -1;
620 blknum = offset / blksize; /* block number */
621 blkoffs = offset % blksize; /* offset within the block */
623 /* Unaligned head */
624 remaining += blkoffs;
625 offset -= blkoffs;
627 /* Unaligned tail */
628 remaining = ROUND_UP (remaining, blksize);
630 /* Aligned body */
631 while (remaining) {
632 r = blk_cache (next, blknum, block, mode, err);
633 if (r == -1)
634 return -1;
636 remaining -= blksize;
637 offset += blksize;
638 blknum++;
641 return 0;
644 /* Extents. */
645 static int
646 cow_extents (nbdkit_next *next,
647 void *handle, uint32_t count32, uint64_t offset, uint32_t flags,
648 struct nbdkit_extents *extents, int *err)
650 const bool can_extents = next->can_extents (next);
651 const bool req_one = flags & NBDKIT_FLAG_REQ_ONE;
652 uint64_t count = count32;
653 uint64_t end;
654 uint64_t blknum;
656 /* To make this easier, align the requested extents to whole blocks.
657 * Note that count is a 64 bit variable containing at most a 32 bit
658 * value so rounding up is safe here.
660 end = offset + count;
661 offset = ROUND_DOWN (offset, blksize);
662 end = ROUND_UP (end, blksize);
663 count = end - offset;
664 blknum = offset / blksize;
666 assert (IS_ALIGNED (offset, blksize));
667 assert (IS_ALIGNED (count, blksize));
668 assert (count > 0); /* We must make forward progress. */
670 while (count > 0) {
671 bool present, trimmed;
672 struct nbdkit_extent e;
674 blk_status (blknum, &present, &trimmed);
676 /* Present in the overlay. */
677 if (present) {
678 e.offset = offset;
679 e.length = blksize;
681 if (trimmed)
682 e.type = NBDKIT_EXTENT_HOLE|NBDKIT_EXTENT_ZERO;
683 else
684 e.type = 0;
686 if (nbdkit_add_extent (extents, e.offset, e.length, e.type) == -1) {
687 *err = errno;
688 return -1;
691 blknum++;
692 offset += blksize;
693 count -= blksize;
696 /* Not present in the overlay, but we can ask the plugin. */
697 else if (can_extents) {
698 uint64_t range_offset = offset;
699 uint32_t range_count = 0;
700 size_t i;
701 int64_t size;
703 /* Asking the plugin for a single block of extents is not
704 * efficient for some plugins (eg. VDDK) so ask for as much data
705 * as we can.
707 for (;;) {
708 /* nbdkit_extents_full cannot read more than a 32 bit range
709 * (range_count), but count is a 64 bit quantity, so don't
710 * overflow range_count here.
712 if (range_count >= UINT32_MAX - blksize + 1) break;
714 blknum++;
715 offset += blksize;
716 count -= blksize;
717 range_count += blksize;
719 if (count == 0) break;
720 blk_status (blknum, &present, &trimmed);
721 if (present) break;
724 /* Don't ask for extent data beyond the end of the plugin. */
725 size = next->get_size (next);
726 if (size == -1)
727 return -1;
729 if (range_offset + range_count > size) {
730 unsigned tail = range_offset + range_count - size;
731 range_count -= tail;
734 CLEANUP_EXTENTS_FREE struct nbdkit_extents *extents2 =
735 nbdkit_extents_full (next, range_count, range_offset, flags, err);
736 if (extents2 == NULL)
737 return -1;
739 for (i = 0; i < nbdkit_extents_count (extents2); ++i) {
740 e = nbdkit_get_extent (extents2, i);
741 if (nbdkit_add_extent (extents, e.offset, e.length, e.type) == -1) {
742 *err = errno;
743 return -1;
748 /* Otherwise assume the block is non-sparse. */
749 else {
750 e.offset = offset;
751 e.length = blksize;
752 e.type = 0;
754 if (nbdkit_add_extent (extents, e.offset, e.length, e.type) == -1) {
755 *err = errno;
756 return -1;
759 blknum++;
760 offset += blksize;
761 count -= blksize;
764 /* If the caller only wanted the first extent, and we've managed
765 * to add at least one extent to the list, then we can drop out
766 * now. (Note calling nbdkit_add_extent above does not mean the
767 * extent got added since it might be before the first offset.)
769 if (req_one && nbdkit_extents_count (extents) > 0)
770 break;
773 return 0;
776 static struct nbdkit_filter filter = {
777 .name = "cow",
778 .longname = "nbdkit copy-on-write (COW) filter",
779 .unload = cow_unload,
780 .open = cow_open,
781 .config = cow_config,
782 .config_help = cow_config_help,
783 .get_ready = cow_get_ready,
784 .prepare = cow_prepare,
785 .get_size = cow_get_size,
786 .block_size = cow_block_size,
787 .can_write = cow_can_write,
788 .can_flush = cow_can_flush,
789 .can_trim = cow_can_trim,
790 .can_extents = cow_can_extents,
791 .can_fua = cow_can_fua,
792 .can_cache = cow_can_cache,
793 .can_fast_zero = cow_can_fast_zero,
794 .can_multi_conn = cow_can_multi_conn,
795 .pread = cow_pread,
796 .pwrite = cow_pwrite,
797 .zero = cow_zero,
798 .trim = cow_trim,
799 .flush = cow_flush,
800 .cache = cow_cache,
801 .extents = cow_extents,
804 NBDKIT_REGISTER_FILTER (filter)