python: Use PyObject_CallFunction instead of constructing the tuple.
[nbdkit/ericb.git] / common / sparse / sparse.c
blob0acfa1fe1a197a277d46bd8729c275ec178adba8
1 /* nbdkit
2 * Copyright (C) 2017-2019 Red Hat Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * * Neither the name of Red Hat nor the names of its contributors may be
16 * used to endorse or promote products derived from this software without
17 * specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
26 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
27 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
29 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
33 #include <config.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <stdbool.h>
38 #include <stdint.h>
39 #include <inttypes.h>
40 #include <string.h>
41 #include <errno.h>
42 #include <assert.h>
44 #include <nbdkit-plugin.h>
46 #include "iszero.h"
47 #include "sparse.h"
49 /* Two level directory for the sparse array.
51 * nbdkit supports disk sizes up to 2⁶³-1. The aim of the sparse
52 * array is to support up to 63 bit images for testing, although it
53 * won't necessarily be efficient for that use. However it should
54 * also be efficient for more reasonable sized disks.
56 * Although the CPU implements effectively the same kind of data
57 * structure (page tables) there are some advantages of reimplementing
58 * this:
60 * 1. Support for 32 bit (or even 64 bit since the virtual memory
61 * address space on 64 bit machines is not 63 bits in size).
63 * 2. In Linux, overcommit defaults prevent use of virtual memory as a
64 * sparse array without intrusive system configuration changes.
66 * 3. Could choose a page size which is more appropriate for disk
67 * images, plus some architectures have much larger page sizes than
68 * others making behaviour inconsistent across arches.
70 * To achieve this we use a B-Tree-like structure. The L1 directory
71 * contains an ordered, non-overlapping, non-contiguous list of
72 * (offset, pointer to L2 directory).
74 * Updating the L1 directory requires a linear scan but that operation
75 * should be very rare. Because the L1 directory is stored in order
76 * of offset, we can use an efficient binary search for lookups.
78 * Each L1 directory entry can address up to PAGE_SIZE*L2_SIZE bytes
79 * in the virtual disk image. With the current parameters this is
80 * 128MB, which is enough for a 100MB image to fit into a single L1
81 * directory, or a 10GB image to fit into 80 L1 entries. The page
82 * pointers in the L2 directory can be NULL (meaning no page / all
83 * zeroes).
85 * ┌────────────────────┐
86 * │ L1 directory │ ┌────────────────────┐
87 * │ offset, entry 0 ─────────▶ | L2 directory |
88 * │ offset, entry 1 │ | page 0 ─────────▶ page
89 * │ offset, entry 2 │ │ page 1 ─────────▶ page
90 * │ ... │ │ page 2 ─────────▶ page
91 * └────────────────────┘ │ ... │
92 * │ page L2_SIZE-1 ─────────▶ page
93 * └────────────────────┘
95 #define PAGE_SIZE 32768
96 #define L2_SIZE 4096
98 struct l1_entry {
99 uint64_t offset; /* Virtual offset of this entry. */
100 void **l2_dir; /* Pointer to L2 directory. */
103 struct sparse_array {
104 struct l1_entry *l1_dir; /* L1 directory. */
105 size_t l1_size; /* Number of entries in L1 directory. */
106 bool debug;
109 /* Free L1 and/or L2 directories. */
110 static void
111 free_l2_dir (void **l2_dir)
113 size_t i;
115 for (i = 0; i < L2_SIZE; ++i)
116 free (l2_dir[i]);
117 free (l2_dir);
120 void
121 free_sparse_array (struct sparse_array *sa)
123 size_t i;
125 if (sa) {
126 for (i = 0; i < sa->l1_size; ++i)
127 free_l2_dir (sa->l1_dir[i].l2_dir);
128 free (sa->l1_dir);
129 free (sa);
133 struct sparse_array *
134 alloc_sparse_array (bool debug)
136 struct sparse_array *sa;
138 sa = malloc (sizeof *sa);
139 if (sa == NULL)
140 return NULL;
141 sa->l1_dir = NULL;
142 sa->l1_size = 0;
143 sa->debug = debug;
144 return sa;
147 /* Comparison function used when searching through the L1 directory. */
148 static int
149 compare_l1_offsets (const void *offsetp, const void *entryp)
151 const uint64_t offset = *(uint64_t *)offsetp;
152 const struct l1_entry *e = entryp;
154 if (offset < e->offset) return -1;
155 if (offset >= e->offset + PAGE_SIZE*L2_SIZE) return 1;
156 return 0;
159 /* Insert an entry in the L1 directory, keeping it ordered by offset.
160 * This involves an expensive linear scan but should be very rare.
162 static int
163 insert_l1_entry (struct sparse_array *sa, const struct l1_entry *entry)
165 size_t i;
166 struct l1_entry *old_l1_dir = sa->l1_dir;
168 /* The l1_dir always ends up one bigger, so reallocate it first. */
169 sa->l1_dir = realloc (sa->l1_dir, (sa->l1_size+1) * sizeof (struct l1_entry));
170 if (sa->l1_dir == NULL) {
171 sa->l1_dir = old_l1_dir;
172 nbdkit_error ("realloc");
173 return -1;
176 for (i = 0; i < sa->l1_size; ++i) {
177 if (entry->offset < sa->l1_dir[i].offset) {
178 /* Insert new entry before i'th directory entry. */
179 memmove (&sa->l1_dir[i+1], &sa->l1_dir[i],
180 (sa->l1_size-i) * sizeof (struct l1_entry));
181 sa->l1_dir[i] = *entry;
182 sa->l1_size++;
183 if (sa->debug)
184 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
185 " at l1_dir[%zu]",
186 __func__, entry->offset, i);
187 return 0;
190 /* This should never happens since each entry in the the L1
191 * directory is supposed to be unique.
193 assert (entry->offset != sa->l1_dir[i].offset);
196 /* Insert new entry at the end. */
197 sa->l1_dir[sa->l1_size] = *entry;
198 sa->l1_size++;
199 if (sa->debug)
200 nbdkit_debug ("%s: inserted new L1 entry for %" PRIu64
201 " at end of l1_dir", __func__, entry->offset);
202 return 0;
205 /* Look up a virtual offset, returning the address of the offset, the
206 * count of bytes to the end of the page, and a pointer to the L2
207 * directory entry containing the page pointer.
209 * If the create flag is set then a new page and/or directory will be
210 * allocated if necessary. Use this flag when writing.
212 * NULL may be returned normally if the page is not mapped (meaning it
213 * reads as zero). However if the create flag is set and NULL is
214 * returned, this indicates an error.
216 static void *
217 lookup (struct sparse_array *sa, uint64_t offset, bool create,
218 uint32_t *remaining, void ***l2_page)
220 struct l1_entry *entry;
221 void **l2_dir;
222 uint64_t o;
223 void *page;
224 struct l1_entry new_entry;
226 *remaining = PAGE_SIZE - (offset & (PAGE_SIZE-1));
228 again:
229 /* Search the L1 directory. */
230 entry = bsearch (&offset, sa->l1_dir, sa->l1_size, sizeof (struct l1_entry),
231 compare_l1_offsets);
233 if (sa->debug) {
234 if (entry)
235 nbdkit_debug ("%s: search L1 dir: entry found: offset %" PRIu64,
236 __func__, entry->offset);
237 else
238 nbdkit_debug ("%s: search L1 dir: no entry found", __func__);
241 if (entry) {
242 l2_dir = entry->l2_dir;
244 /* Which page in the L2 directory? */
245 o = (offset - entry->offset) / PAGE_SIZE;
246 if (l2_page)
247 *l2_page = &l2_dir[o];
248 page = l2_dir[o];
249 if (!page && create) {
250 /* No page allocated. Allocate one if creating. */
251 page = calloc (PAGE_SIZE, 1);
252 if (page == NULL) {
253 nbdkit_error ("calloc");
254 return NULL;
256 l2_dir[o] = page;
258 if (!page)
259 return NULL;
260 else
261 return page + (offset & (PAGE_SIZE-1));
264 /* No L1 directory entry found. */
265 if (!create)
266 return NULL;
268 /* No L1 directory entry, and we're creating, so we need to allocate
269 * a new L1 directory entry and insert it in the L1 directory, and
270 * allocate the L2 directory with NULL page pointers. Then we can
271 * repeat the above search to create the page.
273 new_entry.offset = offset & ~(PAGE_SIZE*L2_SIZE-1);
274 new_entry.l2_dir = calloc (L2_SIZE, sizeof (void *));
275 if (new_entry.l2_dir == NULL) {
276 nbdkit_error ("calloc");
277 return NULL;
279 if (insert_l1_entry (sa, &new_entry) == -1) {
280 free (new_entry.l2_dir);
281 return NULL;
283 goto again;
286 void
287 sparse_array_read (struct sparse_array *sa,
288 void *buf, uint32_t count, uint64_t offset)
290 uint32_t n;
291 void *p;
293 while (count > 0) {
294 p = lookup (sa, offset, false, &n, NULL);
295 if (n > count)
296 n = count;
298 if (p == NULL)
299 memset (buf, 0, n);
300 else
301 memcpy (buf, p, n);
303 buf += n;
304 count -= n;
305 offset += n;
310 sparse_array_write (struct sparse_array *sa,
311 const void *buf, uint32_t count, uint64_t offset)
313 uint32_t n;
314 void *p;
316 while (count > 0) {
317 p = lookup (sa, offset, true, &n, NULL);
318 if (p == NULL)
319 return -1;
321 if (n > count)
322 n = count;
323 memcpy (p, buf, n);
325 buf += n;
326 count -= n;
327 offset += n;
330 return 0;
333 void
334 sparse_array_zero (struct sparse_array *sa, uint32_t count, uint64_t offset)
336 uint32_t n;
337 void *p;
338 void **l2_page;
340 while (count > 0) {
341 p = lookup (sa, offset, false, &n, &l2_page);
342 if (n > count)
343 n = count;
345 if (p) {
346 if (n < PAGE_SIZE)
347 memset (p, 0, n);
348 else
349 assert (p == *l2_page);
351 /* If the whole page is now zero, free it. */
352 if (n >= PAGE_SIZE || is_zero (*l2_page, PAGE_SIZE)) {
353 if (sa->debug)
354 nbdkit_debug ("%s: freeing zero page at offset %" PRIu64,
355 __func__, offset);
356 free (*l2_page);
357 *l2_page = NULL;
361 count -= n;
362 offset += n;
367 sparse_array_extents (struct sparse_array *sa,
368 uint32_t count, uint64_t offset,
369 struct nbdkit_extents *extents)
371 uint32_t n, type;
372 void *p;
374 while (count > 0) {
375 p = lookup (sa, offset, false, &n, NULL);
377 /* Work out the type of this extent. */
378 if (p == NULL)
379 /* No backing page, so it's a hole. */
380 type = NBDKIT_EXTENT_HOLE | NBDKIT_EXTENT_ZERO;
381 else {
382 if (is_zero (p, n))
383 /* A backing page and it's all zero, it's a zero extent. */
384 type = NBDKIT_EXTENT_ZERO;
385 else
386 /* Normal allocated data. */
387 type = 0;
389 if (nbdkit_add_extent (extents, offset, n, type) == -1)
390 return -1;
392 if (n > count)
393 n = count;
395 count -= n;
396 offset += n;
399 return 0;