2 * Copyright (C) 2017-2019 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44 #include <nbdkit-plugin.h>
49 /* Two level directory for the sparse array.
51 * nbdkit supports disk sizes up to 2⁶³-1. The aim of the sparse
52 * array is to support up to 63 bit images for testing, although it
53 * won't necessarily be efficient for that use. However it should
54 * also be efficient for more reasonable sized disks.
56 * Although the CPU implements effectively the same kind of data
57 * structure (page tables) there are some advantages of reimplementing
60 * 1. Support for 32 bit (or even 64 bit since the virtual memory
61 * address space on 64 bit machines is not 63 bits in size).
63 * 2. In Linux, overcommit defaults prevent use of virtual memory as a
64 * sparse array without intrusive system configuration changes.
66 * 3. Could choose a page size which is more appropriate for disk
67 * images, plus some architectures have much larger page sizes than
68 * others making behaviour inconsistent across arches.
70 * To achieve this we use a B-Tree-like structure. The L1 directory
71 * contains an ordered, non-overlapping, non-contiguous list of
72 * (offset, pointer to L2 directory).
74 * Updating the L1 directory requires a linear scan but that operation
75 * should be very rare. Because the L1 directory is stored in order
76 * of offset, we can use an efficient binary search for lookups.
78 * Each L1 directory entry can address up to PAGE_SIZE*L2_SIZE bytes
79 * in the virtual disk image. With the current parameters this is
80 * 128MB, which is enough for a 100MB image to fit into a single L1
81 * directory, or a 10GB image to fit into 80 L1 entries. The page
82 * pointers in the L2 directory can be NULL (meaning no page / all
85 * ┌────────────────────┐
86 * │ L1 directory │ ┌────────────────────┐
87 * │ offset, entry 0 ─────────▶ | L2 directory |
88 * │ offset, entry 1 │ | page 0 ─────────▶ page
89 * │ offset, entry 2 │ │ page 1 ─────────▶ page
90 * │ ... │ │ page 2 ─────────▶ page
91 * └────────────────────┘ │ ... │
92 * │ page L2_SIZE-1 ─────────▶ page
93 * └────────────────────┘
95 #define PAGE_SIZE 32768
99 uint64_t offset
; /* Virtual offset of this entry. */
100 void **l2_dir
; /* Pointer to L2 directory. */
103 struct sparse_array
{
104 struct l1_entry
*l1_dir
; /* L1 directory. */
105 size_t l1_size
; /* Number of entries in L1 directory. */
109 /* Free L1 and/or L2 directories. */
111 free_l2_dir (void **l2_dir
)
115 for (i
= 0; i
< L2_SIZE
; ++i
)
121 free_sparse_array (struct sparse_array
*sa
)
126 for (i
= 0; i
< sa
->l1_size
; ++i
)
127 free_l2_dir (sa
->l1_dir
[i
].l2_dir
);
133 struct sparse_array
*
134 alloc_sparse_array (bool debug
)
136 struct sparse_array
*sa
;
138 sa
= malloc (sizeof *sa
);
147 /* Comparison function used when searching through the L1 directory. */
149 compare_l1_offsets (const void *offsetp
, const void *entryp
)
151 const uint64_t offset
= *(uint64_t *)offsetp
;
152 const struct l1_entry
*e
= entryp
;
154 if (offset
< e
->offset
) return -1;
155 if (offset
>= e
->offset
+ PAGE_SIZE
*L2_SIZE
) return 1;
159 /* Insert an entry in the L1 directory, keeping it ordered by offset.
160 * This involves an expensive linear scan but should be very rare.
163 insert_l1_entry (struct sparse_array
*sa
, const struct l1_entry
*entry
)
166 struct l1_entry
*old_l1_dir
= sa
->l1_dir
;
168 /* The l1_dir always ends up one bigger, so reallocate it first. */
169 sa
->l1_dir
= realloc (sa
->l1_dir
, (sa
->l1_size
+1) * sizeof (struct l1_entry
));
170 if (sa
->l1_dir
== NULL
) {
171 sa
->l1_dir
= old_l1_dir
;
172 nbdkit_error ("realloc");
176 for (i
= 0; i
< sa
->l1_size
; ++i
) {
177 if (entry
->offset
< sa
->l1_dir
[i
].offset
) {
178 /* Insert new entry before i'th directory entry. */
179 memmove (&sa
->l1_dir
[i
+1], &sa
->l1_dir
[i
],
180 (sa
->l1_size
-i
) * sizeof (struct l1_entry
));
181 sa
->l1_dir
[i
] = *entry
;
184 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
186 __func__
, entry
->offset
, i
);
190 /* This should never happens since each entry in the the L1
191 * directory is supposed to be unique.
193 assert (entry
->offset
!= sa
->l1_dir
[i
].offset
);
196 /* Insert new entry at the end. */
197 sa
->l1_dir
[sa
->l1_size
] = *entry
;
200 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
201 " at end of l1_dir", __func__
, entry
->offset
);
205 /* Look up a virtual offset, returning the address of the offset, the
206 * count of bytes to the end of the page, and a pointer to the L2
207 * directory entry containing the page pointer.
209 * If the create flag is set then a new page and/or directory will be
210 * allocated if necessary. Use this flag when writing.
212 * NULL may be returned normally if the page is not mapped (meaning it
213 * reads as zero). However if the create flag is set and NULL is
214 * returned, this indicates an error.
217 lookup (struct sparse_array
*sa
, uint64_t offset
, bool create
,
218 uint32_t *remaining
, void ***l2_page
)
220 struct l1_entry
*entry
;
224 struct l1_entry new_entry
;
226 *remaining
= PAGE_SIZE
- (offset
& (PAGE_SIZE
-1));
229 /* Search the L1 directory. */
230 entry
= bsearch (&offset
, sa
->l1_dir
, sa
->l1_size
, sizeof (struct l1_entry
),
235 nbdkit_debug ("%s: search L1 dir: entry found: offset %" PRIu64
,
236 __func__
, entry
->offset
);
238 nbdkit_debug ("%s: search L1 dir: no entry found", __func__
);
242 l2_dir
= entry
->l2_dir
;
244 /* Which page in the L2 directory? */
245 o
= (offset
- entry
->offset
) / PAGE_SIZE
;
247 *l2_page
= &l2_dir
[o
];
249 if (!page
&& create
) {
250 /* No page allocated. Allocate one if creating. */
251 page
= calloc (PAGE_SIZE
, 1);
253 nbdkit_error ("calloc");
261 return page
+ (offset
& (PAGE_SIZE
-1));
264 /* No L1 directory entry found. */
268 /* No L1 directory entry, and we're creating, so we need to allocate
269 * a new L1 directory entry and insert it in the L1 directory, and
270 * allocate the L2 directory with NULL page pointers. Then we can
271 * repeat the above search to create the page.
273 new_entry
.offset
= offset
& ~(PAGE_SIZE
*L2_SIZE
-1);
274 new_entry
.l2_dir
= calloc (L2_SIZE
, sizeof (void *));
275 if (new_entry
.l2_dir
== NULL
) {
276 nbdkit_error ("calloc");
279 if (insert_l1_entry (sa
, &new_entry
) == -1) {
280 free (new_entry
.l2_dir
);
287 sparse_array_read (struct sparse_array
*sa
,
288 void *buf
, uint32_t count
, uint64_t offset
)
294 p
= lookup (sa
, offset
, false, &n
, NULL
);
310 sparse_array_write (struct sparse_array
*sa
,
311 const void *buf
, uint32_t count
, uint64_t offset
)
317 p
= lookup (sa
, offset
, true, &n
, NULL
);
334 sparse_array_zero (struct sparse_array
*sa
, uint32_t count
, uint64_t offset
)
341 p
= lookup (sa
, offset
, false, &n
, &l2_page
);
349 assert (p
== *l2_page
);
351 /* If the whole page is now zero, free it. */
352 if (n
>= PAGE_SIZE
|| is_zero (*l2_page
, PAGE_SIZE
)) {
354 nbdkit_debug ("%s: freeing zero page at offset %" PRIu64
,
367 sparse_array_extents (struct sparse_array
*sa
,
368 uint32_t count
, uint64_t offset
,
369 struct nbdkit_extents
*extents
)
375 p
= lookup (sa
, offset
, false, &n
, NULL
);
377 /* Work out the type of this extent. */
379 /* No backing page, so it's a hole. */
380 type
= NBDKIT_EXTENT_HOLE
| NBDKIT_EXTENT_ZERO
;
383 /* A backing page and it's all zero, it's a zero extent. */
384 type
= NBDKIT_EXTENT_ZERO
;
386 /* Normal allocated data. */
389 if (nbdkit_add_extent (extents
, offset
, n
, type
) == -1)