Update Red Hat Copyright Notices
[nbdkit.git] / common / allocators / sparse.c
blob0f1e9aa8150f4207a9e1208c6b1980c78ab9b418
1 /* nbdkit
2 * Copyright Red Hat
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
33 #include <config.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <stdbool.h>
38 #include <stdint.h>
39 #include <inttypes.h>
40 #include <string.h>
41 #include <errno.h>
42 #include <assert.h>
44 #include <pthread.h>
46 #include <nbdkit-plugin.h>
48 #include "cleanup.h"
49 #include "iszero.h"
50 #include "vector.h"
52 #include "allocator.h"
53 #include "allocator-internal.h"
55 /* This allocator implements a sparse array of any size up to 2⁶³-1
56 * bytes.
58 * The array reads as zeroes until something is written.
60 * The implementation aims to be reasonably efficient for ordinary
61 * sized disks, while permitting huge (but sparse) disks for testing.
62 * Everything allocated has to be stored in memory. There is no
63 * temporary file backing.
65 * XXX It would be nice to change the locking to use r/w locks here,
66 * as that ought to greatly benefit read-heavy loads using multi-conn.
69 /* Two level directory for the sparse array.
71 * nbdkit supports disk sizes up to 2⁶³-1. The aim of the sparse
72 * array is to support up to 63 bit images for testing, although it
73 * won't necessarily be efficient for that use. However it should
74 * also be efficient for more reasonable sized disks.
76 * Although the CPU implements effectively the same kind of data
77 * structure (page tables) there are some advantages of reimplementing
78 * this:
80 * 1. Support for 32 bit (or even 64 bit since the virtual memory
81 * address space on 64 bit machines is not 63 bits in size).
83 * 2. In Linux, overcommit defaults prevent use of virtual memory as a
84 * sparse array without intrusive system configuration changes.
86 * 3. Could choose a page size which is more appropriate for disk
87 * images, plus some architectures have much larger page sizes than
88 * others making behaviour inconsistent across arches.
90 * To achieve this we use a B-Tree-like structure. The L1 directory
91 * contains an ordered, non-overlapping, non-contiguous list of
92 * (offset, pointer to L2 directory).
94 * Updating the L1 directory requires a linear scan but that operation
95 * should be very rare. Because the L1 directory is stored in order
96 * of offset, we can use an efficient binary search for lookups.
98 * Each L1 directory entry can address up to PAGE_SIZE*L2_SIZE bytes
99 * in the virtual disk image. With the current parameters this is
100 * 128MB, which is enough for a 100MB image to fit into a single L1
101 * directory, or a 10GB image to fit into 80 L1 entries. The page
102 * pointers in the L2 directory can be NULL (meaning no page / all
103 * zeroes).
105 * ┌────────────────────┐
106 * │ L1 directory │ ┌────────────────────┐
107 * │ offset, entry 0 ─────────▶ | L2 directory |
108 * │ offset, entry 1 │ | page 0 ─────────▶ page
109 * │ offset, entry 2 │ │ page 1 ─────────▶ page
110 * │ ... │ │ page 2 ─────────▶ page
111 * └────────────────────┘ │ ... │
112 * │ page L2_SIZE-1 ─────────▶ page
113 * └────────────────────┘
115 #define PAGE_SIZE 32768
116 #define L2_SIZE 4096
118 struct l2_entry {
119 void *page; /* Pointer to page (array of PAGE_SIZE bytes).*/
122 struct l1_entry {
123 uint64_t offset; /* Virtual offset of this entry. */
124 struct l2_entry *l2_dir; /* Pointer to L2 directory (L2_SIZE entries). */
127 DEFINE_VECTOR_TYPE (l1_dir, struct l1_entry);
129 struct sparse_array {
130 struct allocator a; /* Must come first. */
132 /* This lock is highly contended. When hammering the memory plugin
133 * with 8 fio threads, about 30% of the total system time was taken
134 * just waiting for this lock. Fixing this is quite hard.
136 pthread_mutex_t lock;
137 l1_dir l1_dir; /* L1 directory. */
140 /* Free L1 and/or L2 directories. */
141 static void
142 free_l2_dir (struct l2_entry *l2_dir)
144 size_t i;
146 for (i = 0; i < L2_SIZE; ++i)
147 free (l2_dir[i].page);
148 free (l2_dir);
151 static void
152 sparse_array_free (struct allocator *a)
154 struct sparse_array *sa = (struct sparse_array *) a;
155 size_t i;
157 if (sa) {
158 for (i = 0; i < sa->l1_dir.len; ++i)
159 free_l2_dir (sa->l1_dir.ptr[i].l2_dir);
160 free (sa->l1_dir.ptr);
161 pthread_mutex_destroy (&sa->lock);
162 free (sa);
166 static int
167 sparse_array_set_size_hint (struct allocator *a, uint64_t size)
169 /* Ignored. */
170 return 0;
173 /* Comparison function used when searching through the L1 directory. */
174 static int
175 compare_l1_offsets (const void *offsetp, const struct l1_entry *e)
177 const uint64_t offset = *(uint64_t *)offsetp;
179 if (offset < e->offset) return -1;
180 if (offset >= e->offset + PAGE_SIZE*L2_SIZE) return 1;
181 return 0;
184 /* Insert an entry in the L1 directory, keeping it ordered by offset.
185 * This involves an expensive linear scan but should be very rare.
187 static int
188 insert_l1_entry (struct sparse_array *sa, const struct l1_entry *entry)
190 size_t i;
192 for (i = 0; i < sa->l1_dir.len; ++i) {
193 if (entry->offset < sa->l1_dir.ptr[i].offset) {
194 /* Insert new entry before i'th directory entry. */
195 if (l1_dir_insert (&sa->l1_dir, *entry, i) == -1) {
196 nbdkit_error ("realloc: %m");
197 return -1;
199 if (sa->a.debug)
200 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
201 " at l1_dir.ptr[%zu]",
202 __func__, entry->offset, i);
203 return 0;
206 /* This should never happens since each entry in the the L1
207 * directory is supposed to be unique.
209 assert (entry->offset != sa->l1_dir.ptr[i].offset);
212 /* Insert new entry at the end. */
213 if (l1_dir_append (&sa->l1_dir, *entry) == -1) {
214 nbdkit_error ("realloc: %m");
215 return -1;
217 if (sa->a.debug)
218 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
219 " at end of l1_dir", __func__, entry->offset);
220 return 0;
223 /* Look up a virtual offset, returning the address of the offset, the
224 * count of bytes to the end of the page, and a pointer to the L2
225 * directory entry containing the page pointer.
227 * If the create flag is set then a new page and/or directory will be
228 * allocated if necessary. Use this flag when writing.
230 * NULL may be returned normally if the page is not mapped (meaning it
231 * reads as zero). However if the create flag is set and NULL is
232 * returned, this indicates an error.
234 static void *
235 lookup (struct sparse_array *sa, uint64_t offset, bool create,
236 uint64_t *remaining, struct l2_entry **l2_entry)
238 struct l1_entry *entry;
239 struct l2_entry *l2_dir;
240 uint64_t o;
241 void *page;
242 struct l1_entry new_entry;
244 *remaining = PAGE_SIZE - (offset & (PAGE_SIZE-1));
246 again:
247 /* Search the L1 directory. */
248 entry = l1_dir_search (&sa->l1_dir, &offset, compare_l1_offsets);
250 if (sa->a.debug) {
251 if (entry)
252 nbdkit_debug ("%s: search L1 dir: entry found: offset %" PRIu64,
253 __func__, entry->offset);
254 else
255 nbdkit_debug ("%s: search L1 dir: no entry found", __func__);
258 if (entry) {
259 l2_dir = entry->l2_dir;
261 /* Which page in the L2 directory? */
262 o = (offset - entry->offset) / PAGE_SIZE;
263 if (l2_entry)
264 *l2_entry = &l2_dir[o];
265 page = l2_dir[o].page;
266 if (!page && create) {
267 /* No page allocated. Allocate one if creating. */
268 page = calloc (PAGE_SIZE, 1);
269 if (page == NULL) {
270 nbdkit_error ("calloc: %m");
271 return NULL;
273 l2_dir[o].page = page;
275 if (!page)
276 return NULL;
277 else
278 return page + (offset & (PAGE_SIZE-1));
281 /* No L1 directory entry found. */
282 if (!create)
283 return NULL;
285 /* No L1 directory entry, and we're creating, so we need to allocate
286 * a new L1 directory entry and insert it in the L1 directory, and
287 * allocate the L2 directory with NULL page pointers. Then we can
288 * repeat the above search to create the page.
290 new_entry.offset = offset & ~(PAGE_SIZE*L2_SIZE-1);
291 new_entry.l2_dir = calloc (L2_SIZE, sizeof (struct l2_entry));
292 if (new_entry.l2_dir == NULL) {
293 nbdkit_error ("calloc: %m");
294 return NULL;
296 if (insert_l1_entry (sa, &new_entry) == -1) {
297 free (new_entry.l2_dir);
298 return NULL;
300 goto again;
303 static int
304 sparse_array_read (struct allocator *a,
305 void *buf, uint64_t count, uint64_t offset)
307 struct sparse_array *sa = (struct sparse_array *) a;
308 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa->lock);
309 uint64_t n;
310 void *p;
312 while (count > 0) {
313 p = lookup (sa, offset, false, &n, NULL);
314 if (n > count)
315 n = count;
317 if (p == NULL)
318 memset (buf, 0, n);
319 else
320 memcpy (buf, p, n);
322 buf += n;
323 count -= n;
324 offset += n;
327 return 0;
330 static int
331 sparse_array_write (struct allocator *a,
332 const void *buf, uint64_t count, uint64_t offset)
334 struct sparse_array *sa = (struct sparse_array *) a;
335 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa->lock);
336 uint64_t n;
337 void *p;
339 while (count > 0) {
340 p = lookup (sa, offset, true, &n, NULL);
341 if (p == NULL)
342 return -1;
344 if (n > count)
345 n = count;
346 memcpy (p, buf, n);
348 buf += n;
349 count -= n;
350 offset += n;
353 return 0;
356 static int sparse_array_zero (struct allocator *a,
357 uint64_t count, uint64_t offset);
359 static int
360 sparse_array_fill (struct allocator *a, char c,
361 uint64_t count, uint64_t offset)
363 struct sparse_array *sa = (struct sparse_array *) a;
364 uint64_t n;
365 void *p;
367 if (c == 0)
368 return sparse_array_zero (a, count, offset);
370 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa->lock);
372 while (count > 0) {
373 p = lookup (sa, offset, true, &n, NULL);
374 if (p == NULL)
375 return -1;
377 if (n > count)
378 n = count;
379 memset (p, c, n);
381 count -= n;
382 offset += n;
385 return 0;
388 static int
389 sparse_array_zero (struct allocator *a, uint64_t count, uint64_t offset)
391 struct sparse_array *sa = (struct sparse_array *) a;
392 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa->lock);
393 uint64_t n;
394 void *p;
395 struct l2_entry *l2_entry;
397 while (count > 0) {
398 p = lookup (sa, offset, false, &n, &l2_entry);
399 if (n > count)
400 n = count;
402 if (p) {
403 if (n < PAGE_SIZE)
404 memset (p, 0, n);
405 else
406 assert (p == l2_entry->page);
408 /* If the whole page is now zero, free it. */
409 if (n >= PAGE_SIZE || is_zero (l2_entry->page, PAGE_SIZE)) {
410 if (sa->a.debug)
411 nbdkit_debug ("%s: freeing zero page at offset %" PRIu64,
412 __func__, offset);
413 free (l2_entry->page);
414 l2_entry->page = NULL;
418 count -= n;
419 offset += n;
422 return 0;
425 static int
426 sparse_array_blit (struct allocator *a1,
427 struct allocator *a2,
428 uint64_t count,
429 uint64_t offset1, uint64_t offset2)
431 struct sparse_array *sa2 = (struct sparse_array *) a2;
432 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa2->lock);
433 uint64_t n;
434 void *p;
435 struct l2_entry *l2_entry;
437 assert (a1 != a2);
438 assert (strcmp (a2->f->type, "sparse") == 0);
440 while (count > 0) {
441 p = lookup (sa2, offset2, true, &n, &l2_entry);
442 if (p == NULL)
443 return -1;
445 if (n > count)
446 n = count;
448 /* Read the source allocator (a1) directly to p which points into
449 * the right place in sa2.
451 if (a1->f->read (a1, p, n, offset1) == -1)
452 return -1;
454 /* If the whole page is now zero, free it. */
455 if (is_zero (l2_entry->page, PAGE_SIZE)) {
456 if (sa2->a.debug)
457 nbdkit_debug ("%s: freeing zero page at offset %" PRIu64,
458 __func__, offset2);
459 free (l2_entry->page);
460 l2_entry->page = NULL;
463 count -= n;
464 offset1 += n;
465 offset2 += n;
468 return 0;
471 static int
472 sparse_array_extents (struct allocator *a,
473 uint64_t count, uint64_t offset,
474 struct nbdkit_extents *extents)
476 struct sparse_array *sa = (struct sparse_array *) a;
477 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa->lock);
478 uint64_t n;
479 uint32_t type;
480 void *p;
482 while (count > 0) {
483 p = lookup (sa, offset, false, &n, NULL);
485 /* Work out the type of this extent. */
486 if (p == NULL)
487 /* No backing page, so it's a hole. */
488 type = NBDKIT_EXTENT_HOLE | NBDKIT_EXTENT_ZERO;
489 else {
490 if (is_zero (p, n))
491 /* A backing page and it's all zero, it's a zero extent. */
492 type = NBDKIT_EXTENT_ZERO;
493 else
494 /* Normal allocated data. */
495 type = 0;
497 if (nbdkit_add_extent (extents, offset, n, type) == -1)
498 return -1;
500 if (n > count)
501 n = count;
503 count -= n;
504 offset += n;
507 return 0;
510 static struct allocator *
511 sparse_array_create (const void *paramsv)
513 const allocator_parameters *params = paramsv;
514 struct sparse_array *sa;
516 if (params->len > 0) {
517 nbdkit_error ("allocator=sparse does not take extra parameters");
518 return NULL;
521 sa = calloc (1, sizeof *sa);
522 if (sa == NULL) {
523 nbdkit_error ("calloc: %m");
524 return NULL;
526 pthread_mutex_init (&sa->lock, NULL);
528 return (struct allocator *) sa;
531 static struct allocator_functions functions = {
532 .type = "sparse",
533 .create = sparse_array_create,
534 .free = sparse_array_free,
535 .set_size_hint = sparse_array_set_size_hint,
536 .read = sparse_array_read,
537 .write = sparse_array_write,
538 .fill = sparse_array_fill,
539 .zero = sparse_array_zero,
540 .blit = sparse_array_blit,
541 .extents = sparse_array_extents,
544 static void register_sparse_array (void) __attribute__ ((constructor));
546 static void
547 register_sparse_array (void)
549 register_allocator (&functions);