4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 #include <nbdkit-plugin.h>
52 #include "allocator.h"
53 #include "allocator-internal.h"
55 /* This allocator implements a sparse array of any size up to 2⁶³-1
58 * The array reads as zeroes until something is written.
60 * The implementation aims to be reasonably efficient for ordinary
61 * sized disks, while permitting huge (but sparse) disks for testing.
62 * Everything allocated has to be stored in memory. There is no
63 * temporary file backing.
65 * XXX It would be nice to change the locking to use r/w locks here,
66 * as that ought to greatly benefit read-heavy loads using multi-conn.
69 /* Two level directory for the sparse array.
71 * nbdkit supports disk sizes up to 2⁶³-1. The aim of the sparse
72 * array is to support up to 63 bit images for testing, although it
73 * won't necessarily be efficient for that use. However it should
74 * also be efficient for more reasonable sized disks.
76 * Although the CPU implements effectively the same kind of data
77 * structure (page tables) there are some advantages of reimplementing
80 * 1. Support for 32 bit (or even 64 bit since the virtual memory
81 * address space on 64 bit machines is not 63 bits in size).
83 * 2. In Linux, overcommit defaults prevent use of virtual memory as a
84 * sparse array without intrusive system configuration changes.
86 * 3. Could choose a page size which is more appropriate for disk
87 * images, plus some architectures have much larger page sizes than
88 * others making behaviour inconsistent across arches.
90 * To achieve this we use a B-Tree-like structure. The L1 directory
91 * contains an ordered, non-overlapping, non-contiguous list of
92 * (offset, pointer to L2 directory).
94 * Updating the L1 directory requires a linear scan but that operation
95 * should be very rare. Because the L1 directory is stored in order
96 * of offset, we can use an efficient binary search for lookups.
98 * Each L1 directory entry can address up to PAGE_SIZE*L2_SIZE bytes
99 * in the virtual disk image. With the current parameters this is
100 * 128MB, which is enough for a 100MB image to fit into a single L1
101 * directory, or a 10GB image to fit into 80 L1 entries. The page
102 * pointers in the L2 directory can be NULL (meaning no page / all
105 * ┌────────────────────┐
106 * │ L1 directory │ ┌────────────────────┐
107 * │ offset, entry 0 ─────────▶ | L2 directory |
108 * │ offset, entry 1 │ | page 0 ─────────▶ page
109 * │ offset, entry 2 │ │ page 1 ─────────▶ page
110 * │ ... │ │ page 2 ─────────▶ page
111 * └────────────────────┘ │ ... │
112 * │ page L2_SIZE-1 ─────────▶ page
113 * └────────────────────┘
115 #define PAGE_SIZE 32768
119 void *page
; /* Pointer to page (array of PAGE_SIZE bytes).*/
123 uint64_t offset
; /* Virtual offset of this entry. */
124 struct l2_entry
*l2_dir
; /* Pointer to L2 directory (L2_SIZE entries). */
127 DEFINE_VECTOR_TYPE (l1_dir
, struct l1_entry
);
129 struct sparse_array
{
130 struct allocator a
; /* Must come first. */
132 /* This lock is highly contended. When hammering the memory plugin
133 * with 8 fio threads, about 30% of the total system time was taken
134 * just waiting for this lock. Fixing this is quite hard.
136 pthread_mutex_t lock
;
137 l1_dir l1_dir
; /* L1 directory. */
140 /* Free L1 and/or L2 directories. */
142 free_l2_dir (struct l2_entry
*l2_dir
)
146 for (i
= 0; i
< L2_SIZE
; ++i
)
147 free (l2_dir
[i
].page
);
152 sparse_array_free (struct allocator
*a
)
154 struct sparse_array
*sa
= (struct sparse_array
*) a
;
158 for (i
= 0; i
< sa
->l1_dir
.len
; ++i
)
159 free_l2_dir (sa
->l1_dir
.ptr
[i
].l2_dir
);
160 free (sa
->l1_dir
.ptr
);
161 pthread_mutex_destroy (&sa
->lock
);
167 sparse_array_set_size_hint (struct allocator
*a
, uint64_t size
)
173 /* Comparison function used when searching through the L1 directory. */
175 compare_l1_offsets (const void *offsetp
, const struct l1_entry
*e
)
177 const uint64_t offset
= *(uint64_t *)offsetp
;
179 if (offset
< e
->offset
) return -1;
180 if (offset
>= e
->offset
+ PAGE_SIZE
*L2_SIZE
) return 1;
184 /* Insert an entry in the L1 directory, keeping it ordered by offset.
185 * This involves an expensive linear scan but should be very rare.
188 insert_l1_entry (struct sparse_array
*sa
, const struct l1_entry
*entry
)
192 for (i
= 0; i
< sa
->l1_dir
.len
; ++i
) {
193 if (entry
->offset
< sa
->l1_dir
.ptr
[i
].offset
) {
194 /* Insert new entry before i'th directory entry. */
195 if (l1_dir_insert (&sa
->l1_dir
, *entry
, i
) == -1) {
196 nbdkit_error ("realloc: %m");
200 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
201 " at l1_dir.ptr[%zu]",
202 __func__
, entry
->offset
, i
);
206 /* This should never happens since each entry in the the L1
207 * directory is supposed to be unique.
209 assert (entry
->offset
!= sa
->l1_dir
.ptr
[i
].offset
);
212 /* Insert new entry at the end. */
213 if (l1_dir_append (&sa
->l1_dir
, *entry
) == -1) {
214 nbdkit_error ("realloc: %m");
218 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
219 " at end of l1_dir", __func__
, entry
->offset
);
223 /* Look up a virtual offset, returning the address of the offset, the
224 * count of bytes to the end of the page, and a pointer to the L2
225 * directory entry containing the page pointer.
227 * If the create flag is set then a new page and/or directory will be
228 * allocated if necessary. Use this flag when writing.
230 * NULL may be returned normally if the page is not mapped (meaning it
231 * reads as zero). However if the create flag is set and NULL is
232 * returned, this indicates an error.
235 lookup (struct sparse_array
*sa
, uint64_t offset
, bool create
,
236 uint64_t *remaining
, struct l2_entry
**l2_entry
)
238 struct l1_entry
*entry
;
239 struct l2_entry
*l2_dir
;
242 struct l1_entry new_entry
;
244 *remaining
= PAGE_SIZE
- (offset
& (PAGE_SIZE
-1));
247 /* Search the L1 directory. */
248 entry
= l1_dir_search (&sa
->l1_dir
, &offset
, compare_l1_offsets
);
252 nbdkit_debug ("%s: search L1 dir: entry found: offset %" PRIu64
,
253 __func__
, entry
->offset
);
255 nbdkit_debug ("%s: search L1 dir: no entry found", __func__
);
259 l2_dir
= entry
->l2_dir
;
261 /* Which page in the L2 directory? */
262 o
= (offset
- entry
->offset
) / PAGE_SIZE
;
264 *l2_entry
= &l2_dir
[o
];
265 page
= l2_dir
[o
].page
;
266 if (!page
&& create
) {
267 /* No page allocated. Allocate one if creating. */
268 page
= calloc (PAGE_SIZE
, 1);
270 nbdkit_error ("calloc: %m");
273 l2_dir
[o
].page
= page
;
278 return page
+ (offset
& (PAGE_SIZE
-1));
281 /* No L1 directory entry found. */
285 /* No L1 directory entry, and we're creating, so we need to allocate
286 * a new L1 directory entry and insert it in the L1 directory, and
287 * allocate the L2 directory with NULL page pointers. Then we can
288 * repeat the above search to create the page.
290 new_entry
.offset
= offset
& ~(PAGE_SIZE
*L2_SIZE
-1);
291 new_entry
.l2_dir
= calloc (L2_SIZE
, sizeof (struct l2_entry
));
292 if (new_entry
.l2_dir
== NULL
) {
293 nbdkit_error ("calloc: %m");
296 if (insert_l1_entry (sa
, &new_entry
) == -1) {
297 free (new_entry
.l2_dir
);
304 sparse_array_read (struct allocator
*a
,
305 void *buf
, uint64_t count
, uint64_t offset
)
307 struct sparse_array
*sa
= (struct sparse_array
*) a
;
308 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa
->lock
);
313 p
= lookup (sa
, offset
, false, &n
, NULL
);
331 sparse_array_write (struct allocator
*a
,
332 const void *buf
, uint64_t count
, uint64_t offset
)
334 struct sparse_array
*sa
= (struct sparse_array
*) a
;
335 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa
->lock
);
340 p
= lookup (sa
, offset
, true, &n
, NULL
);
356 static int sparse_array_zero (struct allocator
*a
,
357 uint64_t count
, uint64_t offset
);
360 sparse_array_fill (struct allocator
*a
, char c
,
361 uint64_t count
, uint64_t offset
)
363 struct sparse_array
*sa
= (struct sparse_array
*) a
;
368 return sparse_array_zero (a
, count
, offset
);
370 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa
->lock
);
373 p
= lookup (sa
, offset
, true, &n
, NULL
);
389 sparse_array_zero (struct allocator
*a
, uint64_t count
, uint64_t offset
)
391 struct sparse_array
*sa
= (struct sparse_array
*) a
;
392 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa
->lock
);
395 struct l2_entry
*l2_entry
;
398 p
= lookup (sa
, offset
, false, &n
, &l2_entry
);
406 assert (p
== l2_entry
->page
);
408 /* If the whole page is now zero, free it. */
409 if (n
>= PAGE_SIZE
|| is_zero (l2_entry
->page
, PAGE_SIZE
)) {
411 nbdkit_debug ("%s: freeing zero page at offset %" PRIu64
,
413 free (l2_entry
->page
);
414 l2_entry
->page
= NULL
;
426 sparse_array_blit (struct allocator
*a1
,
427 struct allocator
*a2
,
429 uint64_t offset1
, uint64_t offset2
)
431 struct sparse_array
*sa2
= (struct sparse_array
*) a2
;
432 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa2
->lock
);
435 struct l2_entry
*l2_entry
;
438 assert (strcmp (a2
->f
->type
, "sparse") == 0);
441 p
= lookup (sa2
, offset2
, true, &n
, &l2_entry
);
448 /* Read the source allocator (a1) directly to p which points into
449 * the right place in sa2.
451 if (a1
->f
->read (a1
, p
, n
, offset1
) == -1)
454 /* If the whole page is now zero, free it. */
455 if (is_zero (l2_entry
->page
, PAGE_SIZE
)) {
457 nbdkit_debug ("%s: freeing zero page at offset %" PRIu64
,
459 free (l2_entry
->page
);
460 l2_entry
->page
= NULL
;
472 sparse_array_extents (struct allocator
*a
,
473 uint64_t count
, uint64_t offset
,
474 struct nbdkit_extents
*extents
)
476 struct sparse_array
*sa
= (struct sparse_array
*) a
;
477 ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&sa
->lock
);
483 p
= lookup (sa
, offset
, false, &n
, NULL
);
485 /* Work out the type of this extent. */
487 /* No backing page, so it's a hole. */
488 type
= NBDKIT_EXTENT_HOLE
| NBDKIT_EXTENT_ZERO
;
491 /* A backing page and it's all zero, it's a zero extent. */
492 type
= NBDKIT_EXTENT_ZERO
;
494 /* Normal allocated data. */
497 if (nbdkit_add_extent (extents
, offset
, n
, type
) == -1)
510 static struct allocator
*
511 sparse_array_create (const void *paramsv
)
513 const allocator_parameters
*params
= paramsv
;
514 struct sparse_array
*sa
;
516 if (params
->len
> 0) {
517 nbdkit_error ("allocator=sparse does not take extra parameters");
521 sa
= calloc (1, sizeof *sa
);
523 nbdkit_error ("calloc: %m");
526 pthread_mutex_init (&sa
->lock
, NULL
);
528 return (struct allocator
*) sa
;
531 static struct allocator_functions functions
= {
533 .create
= sparse_array_create
,
534 .free
= sparse_array_free
,
535 .set_size_hint
= sparse_array_set_size_hint
,
536 .read
= sparse_array_read
,
537 .write
= sparse_array_write
,
538 .fill
= sparse_array_fill
,
539 .zero
= sparse_array_zero
,
540 .blit
= sparse_array_blit
,
541 .extents
= sparse_array_extents
,
544 static void register_sparse_array (void) __attribute__ ((constructor
));
547 register_sparse_array (void)
549 register_allocator (&functions
);