7614 zfs device evacuation/removal
[unleashed.git] / usr / src / uts / common / fs / zfs / vdev_indirect_mapping.c
blobb350ee59fa4bb7c4c5e1e93cfcfe55f2dbe7b184
1 /*
2 * CDDL HEADER START
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
13 * CDDL HEADER END
17 * Copyright (c) 2015 by Delphix. All rights reserved.
20 #include <sys/dmu_tx.h>
21 #include <sys/dsl_pool.h>
22 #include <sys/spa.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/vdev_indirect_mapping.h>
25 #include <sys/zfeature.h>
26 #include <sys/dmu_objset.h>
28 static boolean_t
29 vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
31 ASSERT(vim != NULL);
33 ASSERT(vim->vim_object != 0);
34 ASSERT(vim->vim_objset != NULL);
35 ASSERT(vim->vim_phys != NULL);
36 ASSERT(vim->vim_dbuf != NULL);
38 EQUIV(vim->vim_phys->vimp_num_entries > 0,
39 vim->vim_entries != NULL);
40 if (vim->vim_phys->vimp_num_entries > 0) {
41 vdev_indirect_mapping_entry_phys_t *last_entry =
42 &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
43 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
44 uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);
46 ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
48 if (vim->vim_havecounts) {
49 ASSERT(vim->vim_phys->vimp_counts_object != 0);
52 return (B_TRUE);
55 uint64_t
56 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
58 ASSERT(vdev_indirect_mapping_verify(vim));
60 return (vim->vim_phys->vimp_num_entries);
63 uint64_t
64 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
66 ASSERT(vdev_indirect_mapping_verify(vim));
68 return (vim->vim_phys->vimp_max_offset);
71 uint64_t
72 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
74 ASSERT(vdev_indirect_mapping_verify(vim));
76 return (vim->vim_object);
79 uint64_t
80 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
82 ASSERT(vdev_indirect_mapping_verify(vim));
84 return (vim->vim_phys->vimp_bytes_mapped);
88 * The length (in bytes) of the mapping object array in memory and
89 * (logically) on disk.
91 * Note that unlike most of our accessor functions,
92 * we don't assert that the struct is consistent; therefore it can be
93 * called while there may be concurrent changes, if we don't care about
94 * the value being immediately stale (e.g. from spa_removal_get_stats()).
96 uint64_t
97 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
99 return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
103 * Compare an offset with an indirect mapping entry; there are three
104 * possible scenarios:
106 * 1. The offset is "less than" the mapping entry; meaning the
107 * offset is less than the source offset of the mapping entry. In
108 * this case, there is no overlap between the offset and the
109 * mapping entry and -1 will be returned.
111 * 2. The offset is "greater than" the mapping entry; meaning the
112 * offset is greater than the mapping entry's source offset plus
113 * the entry's size. In this case, there is no overlap between
114 * the offset and the mapping entry and 1 will be returned.
116 * NOTE: If the offset is actually equal to the entry's offset
117 * plus size, this is considered to be "greater" than the entry,
118 * and this case applies (i.e. 1 will be returned). Thus, the
119 * entry's "range" can be considered to be inclusive at its
120 * start, but exclusive at its end: e.g. [src, src + size).
122 * 3. The last case to consider is if the offset actually falls
123 * within the mapping entry's range. If this is the case, the
124 * offset is considered to be "equal to" the mapping entry and
125 * 0 will be returned.
127 * NOTE: If the offset is equal to the entry's source offset,
128 * this case applies and 0 will be returned. If the offset is
129 * equal to the entry's source plus its size, this case does
130 * *not* apply (see "NOTE" above for scenario 2), and 1 will be
131 * returned.
133 static int
134 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
136 const uint64_t const *key = v_key;
137 const vdev_indirect_mapping_entry_phys_t const *array_elem =
138 v_array_elem;
139 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
141 if (*key < src_offset) {
142 return (-1);
143 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
144 return (0);
145 } else {
146 return (1);
151 * Returns the mapping entry for the given offset.
153 * It's possible that the given offset will not be in the mapping table
154 * (i.e. no mapping entries contain this offset), in which case, the
155 * return value value depends on the "next_if_missing" parameter.
157 * If the offset is not found in the table and "next_if_missing" is
158 * B_FALSE, then NULL will always be returned. The behavior is intended
159 * to allow consumers to get the entry corresponding to the offset
160 * parameter, iff the offset overlaps with an entry in the table.
162 * If the offset is not found in the table and "next_if_missing" is
163 * B_TRUE, then the entry nearest to the given offset will be returned,
164 * such that the entry's source offset is greater than the offset
165 * passed in (i.e. the "next" mapping entry in the table is returned, if
166 * the offset is missing from the table). If there are no entries whose
167 * source offset is greater than the passed in offset, NULL is returned.
169 static vdev_indirect_mapping_entry_phys_t *
170 vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
171 uint64_t offset, boolean_t next_if_missing)
173 ASSERT(vdev_indirect_mapping_verify(vim));
174 ASSERT(vim->vim_phys->vimp_num_entries > 0);
176 vdev_indirect_mapping_entry_phys_t *entry = NULL;
178 uint64_t last = vim->vim_phys->vimp_num_entries - 1;
179 uint64_t base = 0;
182 * We don't define these inside of the while loop because we use
183 * their value in the case that offset isn't in the mapping.
185 uint64_t mid;
186 int result;
188 while (last >= base) {
189 mid = base + ((last - base) >> 1);
191 result = dva_mapping_overlap_compare(&offset,
192 &vim->vim_entries[mid]);
194 if (result == 0) {
195 entry = &vim->vim_entries[mid];
196 break;
197 } else if (result < 0) {
198 last = mid - 1;
199 } else {
200 base = mid + 1;
204 if (entry == NULL && next_if_missing) {
205 ASSERT3U(base, ==, last + 1);
206 ASSERT(mid == base || mid == last);
207 ASSERT3S(result, !=, 0);
210 * The offset we're looking for isn't actually contained
211 * in the mapping table, thus we need to return the
212 * closest mapping entry that is greater than the
213 * offset. We reuse the result of the last comparison,
214 * comparing the mapping entry at index "mid" and the
215 * offset. The offset is guaranteed to lie between
216 * indices one less than "mid", and one greater than
217 * "mid"; we just need to determine if offset is greater
218 * than, or less than the mapping entry contained at
219 * index "mid".
222 uint64_t index;
223 if (result < 0)
224 index = mid;
225 else
226 index = mid + 1;
228 ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
230 if (index == vim->vim_phys->vimp_num_entries) {
232 * If "index" is past the end of the entries
233 * array, then not only is the offset not in the
234 * mapping table, but it's actually greater than
235 * all entries in the table. In this case, we
236 * can't return a mapping entry greater than the
237 * offset (since none exist), so we return NULL.
240 ASSERT3S(dva_mapping_overlap_compare(&offset,
241 &vim->vim_entries[index - 1]), >, 0);
243 return (NULL);
244 } else {
246 * Just to be safe, we verify the offset falls
247 * in between the mapping entries at index and
248 * one less than index. Since we know the offset
249 * doesn't overlap an entry, and we're supposed
250 * to return the entry just greater than the
251 * offset, both of the following tests must be
252 * true.
254 ASSERT3S(dva_mapping_overlap_compare(&offset,
255 &vim->vim_entries[index]), <, 0);
256 IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
257 &vim->vim_entries[index - 1]) > 0);
259 return (&vim->vim_entries[index]);
261 } else {
262 return (entry);
266 vdev_indirect_mapping_entry_phys_t *
267 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
268 uint64_t offset)
270 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
271 B_FALSE));
274 vdev_indirect_mapping_entry_phys_t *
275 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
276 uint64_t offset)
278 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
279 B_TRUE));
283 void
284 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
286 ASSERT(vdev_indirect_mapping_verify(vim));
288 if (vim->vim_phys->vimp_num_entries > 0) {
289 uint64_t map_size = vdev_indirect_mapping_size(vim);
290 kmem_free(vim->vim_entries, map_size);
291 vim->vim_entries = NULL;
294 dmu_buf_rele(vim->vim_dbuf, vim);
296 vim->vim_objset = NULL;
297 vim->vim_object = 0;
298 vim->vim_dbuf = NULL;
299 vim->vim_phys = NULL;
301 kmem_free(vim, sizeof (*vim));
304 uint64_t
305 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
307 uint64_t object;
308 ASSERT(dmu_tx_is_syncing(tx));
309 uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
311 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
312 bonus_size = sizeof (vdev_indirect_mapping_phys_t);
315 object = dmu_object_alloc(os,
316 DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
317 DMU_OTN_UINT64_METADATA, bonus_size,
318 tx);
320 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
321 dmu_buf_t *dbuf;
322 vdev_indirect_mapping_phys_t *vimp;
324 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
325 dmu_buf_will_dirty(dbuf, tx);
326 vimp = dbuf->db_data;
327 vimp->vimp_counts_object = dmu_object_alloc(os,
328 DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
329 DMU_OT_NONE, 0, tx);
330 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
331 dmu_buf_rele(dbuf, FTAG);
334 return (object);
338 vdev_indirect_mapping_t *
339 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
341 vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
342 dmu_object_info_t doi;
343 VERIFY0(dmu_object_info(os, mapping_object, &doi));
345 vim->vim_objset = os;
346 vim->vim_object = mapping_object;
348 VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
349 &vim->vim_dbuf));
350 vim->vim_phys = vim->vim_dbuf->db_data;
352 vim->vim_havecounts =
353 (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
355 if (vim->vim_phys->vimp_num_entries > 0) {
356 uint64_t map_size = vdev_indirect_mapping_size(vim);
357 vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
358 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
359 vim->vim_entries, DMU_READ_PREFETCH));
362 ASSERT(vdev_indirect_mapping_verify(vim));
364 return (vim);
367 void
368 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
370 vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
371 if (vim->vim_havecounts) {
372 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
373 tx));
374 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
376 vdev_indirect_mapping_close(vim);
378 VERIFY0(dmu_object_free(os, object, tx));
/*
 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
 * mapping object. Also remove the entries from the list and free them.
 * This also implicitly extends the max_offset of the mapping (to the end
 * of the last entry).
 *
 * Must be called from syncing context; the list must be non-empty and
 * sorted by ascending source offset (the per-entry ASSERT below checks
 * each entry starts at or after the current vimp_max_offset).
 */
void
vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
    list_t *list, dmu_tx_t *tx)
{
	vdev_indirect_mapping_entry_phys_t *mapbuf;
	uint64_t old_size;
	uint32_t *countbuf = NULL;
	vdev_indirect_mapping_entry_phys_t *old_entries;
	uint64_t old_count;
	uint64_t entries_written = 0;

	ASSERT(vdev_indirect_mapping_verify(vim));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
	ASSERT(!list_is_empty(list));

	/* Remember the pre-append in-core state so we can extend it below. */
	old_size = vdev_indirect_mapping_size(vim);
	old_entries = vim->vim_entries;
	old_count = vim->vim_phys->vimp_num_entries;

	dmu_buf_will_dirty(vim->vim_dbuf, tx);

	/* Staging buffers: one DMU block's worth of entries at a time. */
	mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
	if (vim->vim_havecounts) {
		countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
		ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
		    SPA_FEATURE_OBSOLETE_COUNTS));
	}
	while (!list_is_empty(list)) {
		uint64_t i;
		/*
		 * Write entries from the list to the
		 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
		 */
		for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
			vdev_indirect_mapping_entry_t *entry =
			    list_remove_head(list);
			if (entry == NULL)
				break;

			uint64_t size =
			    DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
			uint64_t src_offset =
			    DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);

			/*
			 * We shouldn't be adding an entry which is fully
			 * obsolete.
			 */
			ASSERT3U(entry->vime_obsolete_count, <, size);
			IMPLY(entry->vime_obsolete_count != 0,
			    vim->vim_havecounts);

			mapbuf[i] = entry->vime_mapping;
			if (vim->vim_havecounts)
				countbuf[i] = entry->vime_obsolete_count;

			/* Extend the mapping's accounting to cover entry. */
			vim->vim_phys->vimp_bytes_mapped += size;
			ASSERT3U(src_offset, >=,
			    vim->vim_phys->vimp_max_offset);
			vim->vim_phys->vimp_max_offset = src_offset + size;

			entries_written++;

			/* The list entry's payload was copied; free it. */
			kmem_free(entry, sizeof (*entry));
		}
		/*
		 * Append this batch at the current end of the object; the
		 * write offset is derived from vimp_num_entries, which is
		 * only advanced after BOTH writes below have been issued.
		 */
		dmu_write(vim->vim_objset, vim->vim_object,
		    vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
		    i * sizeof (*mapbuf),
		    mapbuf, tx);
		if (vim->vim_havecounts) {
			dmu_write(vim->vim_objset,
			    vim->vim_phys->vimp_counts_object,
			    vim->vim_phys->vimp_num_entries *
			    sizeof (*countbuf),
			    i * sizeof (*countbuf), countbuf, tx);
		}
		vim->vim_phys->vimp_num_entries += i;
	}
	zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
	if (vim->vim_havecounts)
		zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);

	/*
	 * Update the entry array to reflect the new entries. First, copy
	 * over any old entries then read back the new entries we just wrote.
	 */
	uint64_t new_size = vdev_indirect_mapping_size(vim);
	ASSERT3U(new_size, >, old_size);
	ASSERT3U(new_size - old_size, ==,
	    entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
	vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
	if (old_size > 0) {
		bcopy(old_entries, vim->vim_entries, old_size);
		kmem_free(old_entries, old_size);
	}
	VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
	    new_size - old_size, &vim->vim_entries[old_count],
	    DMU_READ_PREFETCH));

	zfs_dbgmsg("txg %llu: wrote %llu entries to "
	    "indirect mapping obj %llu; max offset=0x%llx",
	    (u_longlong_t)dmu_tx_get_txg(tx),
	    (u_longlong_t)entries_written,
	    (u_longlong_t)vim->vim_object,
	    (u_longlong_t)vim->vim_phys->vimp_max_offset);
}
496 * Increment the relevant counts for the specified offset and length.
497 * The counts array must be obtained from
498 * vdev_indirect_mapping_load_obsolete_counts().
500 void
501 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
502 uint64_t offset, uint64_t length, uint32_t *counts)
504 vdev_indirect_mapping_entry_phys_t *mapping;
505 uint64_t index;
507 mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
509 ASSERT(length > 0);
510 ASSERT3P(mapping, !=, NULL);
512 index = mapping - vim->vim_entries;
514 while (length > 0) {
515 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
517 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
518 uint64_t inner_offset = offset -
519 DVA_MAPPING_GET_SRC_OFFSET(mapping);
520 VERIFY3U(inner_offset, <, size);
521 uint64_t inner_size = MIN(length, size - inner_offset);
523 VERIFY3U(counts[index] + inner_size, <=, size);
524 counts[index] += inner_size;
526 offset += inner_size;
527 length -= inner_size;
528 mapping++;
529 index++;
/*
 * Closure passed through space_map_iterate() to
 * load_obsolete_sm_callback(): the mapping being updated and the counts
 * array (from vdev_indirect_mapping_load_obsolete_counts()) to
 * increment.
 */
typedef struct load_obsolete_space_map_arg {
	vdev_indirect_mapping_t	*losma_vim;
	uint32_t		*losma_counts;
} load_obsolete_space_map_arg_t;
538 static int
539 load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size,
540 void *arg)
542 load_obsolete_space_map_arg_t *losma = arg;
543 ASSERT3S(type, ==, SM_ALLOC);
545 vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
546 offset, size, losma->losma_counts);
548 return (0);
552 * Modify the counts (increment them) based on the spacemap.
554 void
555 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
556 uint32_t *counts, space_map_t *obsolete_space_sm)
558 load_obsolete_space_map_arg_t losma;
559 losma.losma_counts = counts;
560 losma.losma_vim = vim;
561 VERIFY0(space_map_iterate(obsolete_space_sm,
562 load_obsolete_sm_callback, &losma));
566 * Read the obsolete counts from disk, returning them in an array.
568 uint32_t *
569 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
571 ASSERT(vdev_indirect_mapping_verify(vim));
573 uint64_t counts_size =
574 vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
575 uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
576 if (vim->vim_havecounts) {
577 VERIFY0(dmu_read(vim->vim_objset,
578 vim->vim_phys->vimp_counts_object,
579 0, counts_size,
580 counts, DMU_READ_PREFETCH));
581 } else {
582 bzero(counts, counts_size);
584 return (counts);
587 extern void
588 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
589 uint32_t *counts)
591 ASSERT(vdev_indirect_mapping_verify(vim));
593 kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));