Update Red Hat Copyright Notices
[nbdkit.git] / filters / cache / cache.c
blobc379f1005754e50edfd7d5bce214532a5b9b33db
1 /* nbdkit
2 * Copyright Red Hat
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
 */
33 #include <config.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <stdint.h>
38 #include <stdbool.h>
39 #include <string.h>
40 #include <inttypes.h>
41 #include <unistd.h>
42 #include <fcntl.h>
43 #include <limits.h>
44 #include <errno.h>
45 #include <assert.h>
46 #include <sys/types.h>
48 #ifdef HAVE_SYS_IOCTL_H
49 #include <sys/ioctl.h>
50 #endif
52 #include <pthread.h>
54 #ifdef HAVE_ALLOCA_H
55 #include <alloca.h>
56 #endif
58 #include <nbdkit-filter.h>
60 #include "cleanup.h"
62 #include "cache.h"
63 #include "blk.h"
64 #include "reclaim.h"
65 #include "isaligned.h"
66 #include "ispowerof2.h"
67 #include "minmax.h"
68 #include "rounding.h"
70 /* In order to handle parallel requests safely, this lock must be held
71 * when calling any blk_* functions.
73 static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
75 unsigned blksize; /* actual block size (picked by blk.c) */
76 unsigned min_block_size = 65536;
77 enum cache_mode cache_mode = CACHE_MODE_WRITEBACK;
78 int64_t max_size = -1;
79 unsigned hi_thresh = 95, lo_thresh = 80;
80 enum cor_mode cor_mode = COR_OFF;
81 const char *cor_path;
83 static int cache_flush (nbdkit_next *next, void *handle, uint32_t flags,
84 int *err);
86 static void
87 cache_unload (void)
89 blk_free ();
92 static int
93 cache_config (nbdkit_next_config *next, nbdkit_backend *nxdata,
94 const char *key, const char *value)
96 if (strcmp (key, "cache") == 0) {
97 if (strcmp (value, "writeback") == 0) {
98 cache_mode = CACHE_MODE_WRITEBACK;
99 return 0;
101 else if (strcmp (value, "writethrough") == 0) {
102 cache_mode = CACHE_MODE_WRITETHROUGH;
103 return 0;
105 else if (strcmp (value, "unsafe") == 0) {
106 cache_mode = CACHE_MODE_UNSAFE;
107 return 0;
109 else {
110 nbdkit_error ("invalid cache parameter, should be "
111 "writeback|writethrough|unsafe");
112 return -1;
115 else if (strcmp (key, "cache-min-block-size") == 0) {
116 int64_t r;
118 r = nbdkit_parse_size (value);
119 if (r == -1)
120 return -1;
121 if (r < 4096 || !is_power_of_2 (r) || r > UINT_MAX) {
122 nbdkit_error ("cache-min-block-size is not a power of 2, or is too small or too large");
123 return -1;
125 min_block_size = r;
126 return 0;
128 #ifdef HAVE_CACHE_RECLAIM
129 else if (strcmp (key, "cache-max-size") == 0) {
130 int64_t r;
132 r = nbdkit_parse_size (value);
133 if (r == -1)
134 return -1;
135 /* We set a lower limit for the cache size just to keep out of
136 * trouble.
138 if (r < 1024*1024) {
139 nbdkit_error ("cache-max-size is too small");
140 return -1;
142 max_size = r;
143 return 0;
145 else if (strcmp (key, "cache-high-threshold") == 0) {
146 if (nbdkit_parse_unsigned ("cache-high-threshold",
147 value, &hi_thresh) == -1)
148 return -1;
149 if (hi_thresh == 0) {
150 nbdkit_error ("cache-high-threshold must be greater than zero");
151 return -1;
153 return 0;
155 else if (strcmp (key, "cache-low-threshold") == 0) {
156 if (nbdkit_parse_unsigned ("cache-low-threshold",
157 value, &lo_thresh) == -1)
158 return -1;
159 if (lo_thresh == 0) {
160 nbdkit_error ("cache-low-threshold must be greater than zero");
161 return -1;
163 return 0;
165 #else /* !HAVE_CACHE_RECLAIM */
166 else if (strcmp (key, "cache-max-size") == 0 ||
167 strcmp (key, "cache-high-threshold") == 0 ||
168 strcmp (key, "cache-low-threshold") == 0) {
169 nbdkit_error ("this platform does not support cache reclaim");
170 return -1;
172 #endif /* !HAVE_CACHE_RECLAIM */
173 else if (strcmp (key, "cache-on-read") == 0) {
174 if (value[0] == '/') {
175 cor_path = value;
176 cor_mode = COR_PATH;
178 else {
179 int r = nbdkit_parse_bool (value);
180 if (r == -1)
181 return -1;
182 cor_mode = r ? COR_ON : COR_OFF;
184 return 0;
186 else {
187 return next (nxdata, key, value);
191 #define cache_config_help_common \
192 "cache=MODE Set cache MODE, one of writeback (default),\n" \
193 " writethrough, or unsafe.\n" \
194 "cache-on-read=BOOL|/PATH Set to true to cache on reads (default false).\n"
195 #ifndef HAVE_CACHE_RECLAIM
196 #define cache_config_help cache_config_help_common
197 #else
198 #define cache_config_help cache_config_help_common \
199 "cache-max-size=SIZE Set maximum space used by cache.\n" \
200 "cache-high-threshold=PCT Percentage of max size where reclaim begins.\n" \
201 "cache-low-threshold=PCT Percentage of max size where reclaim ends.\n"
202 #endif
204 /* Decide if cache-on-read is currently on or off. */
205 bool
206 cache_on_read (void)
208 switch (cor_mode) {
209 case COR_ON: return true;
210 case COR_OFF: return false;
211 case COR_PATH: return access (cor_path, F_OK) == 0;
212 default: abort ();
216 static int
217 cache_config_complete (nbdkit_next_config_complete *next,
218 nbdkit_backend *nxdata)
220 /* If cache-max-size was set then check the thresholds. */
221 if (max_size != -1) {
222 if (lo_thresh >= hi_thresh) {
223 nbdkit_error ("cache-low-threshold must be "
224 "less than cache-high-threshold");
225 return -1;
229 return next (nxdata);
232 static int
233 cache_get_ready (int thread_model)
235 if (blk_init () == -1)
236 return -1;
238 return 0;
241 /* Get the file size, set the cache size. */
242 static int64_t
243 cache_get_size (nbdkit_next *next,
244 void *handle)
246 int64_t size;
247 int r;
249 size = next->get_size (next);
250 if (size == -1)
251 return -1;
253 nbdkit_debug ("cache: underlying file size: %" PRIi64, size);
255 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
256 r = blk_set_size (size);
257 if (r == -1)
258 return -1;
260 return size;
263 /* Block size constraints. */
264 static int
265 cache_block_size (nbdkit_next *next, void *handle,
266 uint32_t *minimum, uint32_t *preferred, uint32_t *maximum)
268 if (next->block_size (next, minimum, preferred, maximum) == -1)
269 return -1;
271 if (*minimum == 0) { /* No constraints set by the plugin. */
272 *minimum = 1;
273 *preferred = blksize;
274 *maximum = 0xffffffff;
276 else {
277 if (*maximum >= blksize)
278 *preferred = MAX (*preferred, blksize);
281 return 0;
284 /* Force an early call to cache_get_size because we have to set the
285 * backing file size and bitmap size before any other read or write
286 * calls.
288 static int
289 cache_prepare (nbdkit_next *next,
290 void *handle, int readonly)
292 int64_t r;
294 r = cache_get_size (next, handle);
295 if (r < 0)
296 return -1;
297 return 0;
300 /* Override the plugin's .can_cache, because we are caching here instead */
301 static int
302 cache_can_cache (nbdkit_next *next, void *handle)
304 return NBDKIT_CACHE_NATIVE;
307 /* Override the plugin's .can_fast_zero, because our .zero is not fast */
308 static int
309 cache_can_fast_zero (nbdkit_next *next,
310 void *handle)
312 /* It is better to advertise support even when we always reject fast
313 * zero attempts.
315 return 1;
318 /* Override the plugin's .can_flush, if we are cache=unsafe */
319 static int
320 cache_can_flush (nbdkit_next *next,
321 void *handle)
323 if (cache_mode == CACHE_MODE_UNSAFE)
324 return 1;
325 return next->can_flush (next);
329 /* Override the plugin's .can_fua, if we are cache=unsafe */
330 static int
331 cache_can_fua (nbdkit_next *next,
332 void *handle)
334 if (cache_mode == CACHE_MODE_UNSAFE)
335 return NBDKIT_FUA_NATIVE;
336 return next->can_fua (next);
339 /* Override the plugin's .can_multi_conn, if we are not cache=writethrough */
340 static int
341 cache_can_multi_conn (nbdkit_next *next,
342 void *handle)
344 /* For CACHE_MODE_UNSAFE, we always advertise a no-op flush because
345 * our local cache access is consistent between connections, and we
346 * don't care about persisting the data to the underlying plugin.
348 * For CACHE_MODE_WRITEBACK, things are more subtle: we only write
349 * to the plugin during NBD_CMD_FLUSH, at which point that one
350 * connection writes back ALL cached blocks regardless of which
351 * connection originally wrote them, so a client can be assured that
352 * blocks from all connections have reached the plugin's permanent
353 * storage with only one connection having to send a flush.
355 * But for CACHE_MODE_WRITETHROUGH, we are at the mercy of the
356 * plugin; data written by connection A is not guaranteed to be made
357 * persistent by a flush from connection B unless the plugin itself
358 * supports multi-conn.
360 if (cache_mode != CACHE_MODE_WRITETHROUGH)
361 return 1;
362 return next->can_multi_conn (next);
365 /* Read data. */
366 static int
367 cache_pread (nbdkit_next *next,
368 void *handle, void *buf, uint32_t count, uint64_t offset,
369 uint32_t flags, int *err)
371 CLEANUP_FREE uint8_t *block = NULL;
372 uint64_t blknum, blkoffs, nrblocks;
373 int r;
375 assert (!flags);
376 if (!IS_ALIGNED (count | offset, blksize)) {
377 block = malloc (blksize);
378 if (block == NULL) {
379 *err = errno;
380 nbdkit_error ("malloc: %m");
381 return -1;
385 blknum = offset / blksize; /* block number */
386 blkoffs = offset % blksize; /* offset within the block */
388 /* Unaligned head */
389 if (blkoffs) {
390 uint64_t n = MIN (blksize - blkoffs, count);
392 assert (block);
393 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
394 r = blk_read (next, blknum, block, err);
395 if (r == -1)
396 return -1;
398 memcpy (buf, &block[blkoffs], n);
400 buf += n;
401 count -= n;
402 offset += n;
403 blknum++;
406 /* Aligned body */
407 nrblocks = count / blksize;
408 if (nrblocks > 0) {
409 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
410 r = blk_read_multiple (next, blknum, nrblocks, buf, err);
411 if (r == -1)
412 return -1;
414 buf += nrblocks * blksize;
415 count -= nrblocks * blksize;
416 offset += nrblocks * blksize;
417 blknum += nrblocks;
420 /* Unaligned tail */
421 if (count) {
422 assert (block);
423 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
424 r = blk_read (next, blknum, block, err);
425 if (r == -1)
426 return -1;
428 memcpy (buf, block, count);
431 return 0;
434 /* Write data. */
435 static int
436 cache_pwrite (nbdkit_next *next,
437 void *handle, const void *buf, uint32_t count, uint64_t offset,
438 uint32_t flags, int *err)
440 CLEANUP_FREE uint8_t *block = NULL;
441 uint64_t blknum, blkoffs;
442 int r;
443 bool need_flush = false;
445 if (!IS_ALIGNED (count | offset, blksize)) {
446 block = malloc (blksize);
447 if (block == NULL) {
448 *err = errno;
449 nbdkit_error ("malloc: %m");
450 return -1;
454 if ((flags & NBDKIT_FLAG_FUA) &&
455 (cache_mode == CACHE_MODE_UNSAFE ||
456 next->can_fua (next) == NBDKIT_FUA_EMULATE)) {
457 flags &= ~NBDKIT_FLAG_FUA;
458 need_flush = true;
461 blknum = offset / blksize; /* block number */
462 blkoffs = offset % blksize; /* offset within the block */
464 /* Unaligned head */
465 if (blkoffs) {
466 uint64_t n = MIN (blksize - blkoffs, count);
468 /* Do a read-modify-write operation on the current block.
469 * Hold the lock over the whole operation.
471 assert (block);
472 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
473 r = blk_read (next, blknum, block, err);
474 if (r != -1) {
475 memcpy (&block[blkoffs], buf, n);
476 r = blk_write (next, blknum, block, flags, err);
478 if (r == -1)
479 return -1;
481 buf += n;
482 count -= n;
483 offset += n;
484 blknum++;
487 /* Aligned body */
488 while (count >= blksize) {
489 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
490 r = blk_write (next, blknum, buf, flags, err);
491 if (r == -1)
492 return -1;
494 buf += blksize;
495 count -= blksize;
496 offset += blksize;
497 blknum++;
500 /* Unaligned tail */
501 if (count) {
502 assert (block);
503 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
504 r = blk_read (next, blknum, block, err);
505 if (r != -1) {
506 memcpy (block, buf, count);
507 r = blk_write (next, blknum, block, flags, err);
509 if (r == -1)
510 return -1;
513 if (need_flush)
514 return cache_flush (next, handle, 0, err);
515 return 0;
518 /* Zero data. */
519 static int
520 cache_zero (nbdkit_next *next,
521 void *handle, uint32_t count, uint64_t offset, uint32_t flags,
522 int *err)
524 CLEANUP_FREE uint8_t *block = NULL;
525 uint64_t blknum, blkoffs;
526 int r;
527 bool need_flush = false;
529 /* We are purposefully avoiding next->zero, so a zero request is
530 * never faster than plain writes.
532 if (flags & NBDKIT_FLAG_FAST_ZERO) {
533 *err = ENOTSUP;
534 return -1;
537 block = malloc (blksize);
538 if (block == NULL) {
539 *err = errno;
540 nbdkit_error ("malloc: %m");
541 return -1;
544 flags &= ~NBDKIT_FLAG_MAY_TRIM;
545 if ((flags & NBDKIT_FLAG_FUA) &&
546 (cache_mode == CACHE_MODE_UNSAFE ||
547 next->can_fua (next) == NBDKIT_FUA_EMULATE)) {
548 flags &= ~NBDKIT_FLAG_FUA;
549 need_flush = true;
552 blknum = offset / blksize; /* block number */
553 blkoffs = offset % blksize; /* offset within the block */
555 /* Unaligned head */
556 if (blkoffs) {
557 uint64_t n = MIN (blksize - blkoffs, count);
559 /* Do a read-modify-write operation on the current block.
560 * Hold the lock over the whole operation.
562 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
563 r = blk_read (next, blknum, block, err);
564 if (r != -1) {
565 memset (&block[blkoffs], 0, n);
566 r = blk_write (next, blknum, block, flags, err);
568 if (r == -1)
569 return -1;
571 count -= n;
572 offset += n;
573 blknum++;
576 /* Aligned body */
577 if (count >= blksize)
578 memset (block, 0, blksize);
579 while (count >=blksize) {
580 /* Intentional that we do not use next->zero */
581 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
582 r = blk_write (next, blknum, block, flags, err);
583 if (r == -1)
584 return -1;
586 count -= blksize;
587 offset += blksize;
588 blknum++;
591 /* Unaligned tail */
592 if (count) {
593 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
594 r = blk_read (next, blknum, block, err);
595 if (r != -1) {
596 memset (block, 0, count);
597 r = blk_write (next, blknum, block, flags, err);
599 if (r == -1)
600 return -1;
603 if (need_flush)
604 return cache_flush (next, handle, 0, err);
605 return 0;
608 /* Flush: Go through all the dirty blocks, flushing them to disk. */
609 struct flush_data {
610 uint8_t *block; /* bounce buffer */
611 unsigned errors; /* count of errors seen */
612 int first_errno; /* first errno seen */
613 nbdkit_next *next;
616 static int flush_dirty_block (uint64_t blknum, void *);
618 static int
619 cache_flush (nbdkit_next *next, void *handle,
620 uint32_t flags, int *err)
622 CLEANUP_FREE uint8_t *block = NULL;
623 struct flush_data data =
624 { .errors = 0, .first_errno = 0, .next = next };
625 int tmp;
627 if (cache_mode == CACHE_MODE_UNSAFE)
628 return 0;
630 assert (!flags);
632 /* Allocate the bounce buffer. */
633 block = malloc (blksize);
634 if (block == NULL) {
635 *err = errno;
636 nbdkit_error ("malloc: %m");
637 return -1;
639 data.block = block;
641 /* In theory if cache_mode == CACHE_MODE_WRITETHROUGH then there
642 * should be no dirty blocks. However we go through the cache here
643 * to be sure. Also we still need to issue the flush to the
644 * underlying storage.
647 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
648 for_each_dirty_block (flush_dirty_block, &data);
651 /* Now issue a flush request to the underlying storage. */
652 if (next->flush (next, 0, data.errors ? &tmp : &data.first_errno) == -1)
653 data.errors++;
655 if (data.errors > 0) {
656 *err = data.first_errno;
657 return -1;
659 return 0;
662 static int
663 flush_dirty_block (uint64_t blknum, void *datav)
665 struct flush_data *data = datav;
666 int tmp;
668 /* Perform a read + writethrough which will read from the
669 * cache and write it through to the underlying storage.
671 if (blk_read (data->next, blknum, data->block,
672 data->errors ? &tmp : &data->first_errno) == -1)
673 goto err;
674 if (blk_writethrough (data->next, blknum, data->block, 0,
675 data->errors ? &tmp : &data->first_errno) == -1)
676 goto err;
678 return 0;
680 err:
681 nbdkit_error ("cache: flush of block %" PRIu64 " failed", blknum);
682 data->errors++;
683 return 0; /* continue scanning and flushing. */
686 /* Cache data. */
687 static int
688 cache_cache (nbdkit_next *next,
689 void *handle, uint32_t count, uint64_t offset,
690 uint32_t flags, int *err)
692 CLEANUP_FREE uint8_t *block = NULL;
693 uint64_t blknum, blkoffs;
694 int r;
695 uint64_t remaining = count; /* Rounding out could exceed 32 bits */
697 assert (!flags);
698 block = malloc (blksize);
699 if (block == NULL) {
700 *err = errno;
701 nbdkit_error ("malloc: %m");
702 return -1;
705 blknum = offset / blksize; /* block number */
706 blkoffs = offset % blksize; /* offset within the block */
708 /* Unaligned head */
709 remaining += blkoffs;
710 offset -= blkoffs;
712 /* Unaligned tail */
713 remaining = ROUND_UP (remaining, blksize);
715 /* Aligned body */
716 while (remaining) {
717 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
718 r = blk_cache (next, blknum, block, err);
719 if (r == -1)
720 return -1;
722 remaining -= blksize;
723 offset += blksize;
724 blknum++;
727 return 0;
730 static struct nbdkit_filter filter = {
731 .name = "cache",
732 .longname = "nbdkit caching filter",
733 .unload = cache_unload,
734 .config = cache_config,
735 .config_complete = cache_config_complete,
736 .config_help = cache_config_help,
737 .get_ready = cache_get_ready,
738 .prepare = cache_prepare,
739 .get_size = cache_get_size,
740 .block_size = cache_block_size,
741 .can_cache = cache_can_cache,
742 .can_fast_zero = cache_can_fast_zero,
743 .can_flush = cache_can_flush,
744 .can_fua = cache_can_fua,
745 .can_multi_conn = cache_can_multi_conn,
746 .pread = cache_pread,
747 .pwrite = cache_pwrite,
748 .zero = cache_zero,
749 .flush = cache_flush,
750 .cache = cache_cache,
753 NBDKIT_REGISTER_FILTER (filter)