9166 zfs storage pool checkpoint
[unleashed.git] / usr / src / uts / common / fs / zfs / vdev_indirect.c
blob988e21687165b02844efcd0272c93500ffa0d92c
1 /*
2 * CDDL HEADER START
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
13 * CDDL HEADER END
17 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
20 #include <sys/zfs_context.h>
21 #include <sys/spa.h>
22 #include <sys/spa_impl.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/fs/zfs.h>
25 #include <sys/zio.h>
26 #include <sys/metaslab.h>
27 #include <sys/refcount.h>
28 #include <sys/dmu.h>
29 #include <sys/vdev_indirect_mapping.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dsl_synctask.h>
32 #include <sys/zap.h>
33 #include <sys/abd.h>
34 #include <sys/zthr.h>
37 * An indirect vdev corresponds to a vdev that has been removed. Since
38 * we cannot rewrite block pointers of snapshots, etc., we keep a
39 * mapping from old location on the removed device to the new location
40 * on another device in the pool and use this mapping whenever we need
41 * to access the DVA. Unfortunately, this mapping did not respect
42 * logical block boundaries when it was first created, and so a DVA on
43 * this indirect vdev may be "split" into multiple sections that each
44 * map to a different location. As a consequence, not all DVAs can be
45 * translated to an equivalent new DVA. Instead we must provide a
46 * "vdev_remap" operation that executes a callback on each contiguous
47 * segment of the new location. This function is used in multiple ways:
49 * - reads and repair writes to this device use the callback to create
50 * a child io for each mapped segment.
52 * - frees and claims to this device use the callback to free or claim
53 * each mapped segment. (Note that we don't actually need to claim
54 * log blocks on indirect vdevs, because we don't allocate to
55 * removing vdevs. However, zdb uses zio_claim() for its leak
56 * detection.)
60 * "Big theory statement" for how we mark blocks obsolete.
62 * When a block on an indirect vdev is freed or remapped, a section of
63 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
64 * keep track of how much of each mapping entry is obsolete. When
65 * an entry becomes completely obsolete, we can remove it, thus reducing
66 * the memory used by the mapping. The complete picture of obsolescence
67 * is given by the following data structures, described below:
68 * - the entry-specific obsolete count
69 * - the vdev-specific obsolete spacemap
70 * - the pool-specific obsolete bpobj
72 * == On disk data structures used ==
74 * We track the obsolete space for the pool using several objects. Each
75 * of these objects is created on demand and freed when no longer
76 * needed, and is assumed to be empty if it does not exist.
77 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
79 * - Each vic_mapping_object (associated with an indirect vdev) can
80 * have a vimp_counts_object. This is an array of uint32_t's
81 * with the same number of entries as the vic_mapping_object. When
82 * the mapping is condensed, entries from the vic_obsolete_sm_object
83 * (see below) are folded into the counts. Therefore, each
84 * obsolete_counts entry tells us the number of bytes in the
85 * corresponding mapping entry that were not referenced when the
86 * mapping was last condensed.
88 * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
89 * This is a space map containing an alloc entry for every DVA that
90 * has been obsoleted since the last time this indirect vdev was
91 * condensed. We use this object in order to improve performance
92 * when marking a DVA as obsolete. Instead of modifying an arbitrary
93 * offset of the vimp_counts_object, we only need to append an entry
94 * to the end of this object. When a DVA becomes obsolete, it is
95 * added to the obsolete space map. This happens when the DVA is
96 * freed, remapped and not referenced by a snapshot, or the last
97 * snapshot referencing it is destroyed.
99 * - Each dataset can have a ds_remap_deadlist object. This is a
100 * deadlist object containing all blocks that were remapped in this
101 * dataset but referenced in a previous snapshot. Blocks can *only*
102 * appear on this list if they were remapped (dsl_dataset_block_remapped);
103 * blocks that were killed in a head dataset are put on the normal
104 * ds_deadlist and marked obsolete when they are freed.
106 * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
107 * in the pool that need to be marked obsolete. When a snapshot is
108 * destroyed, we move some of the ds_remap_deadlist to the obsolete
109 * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
110 * asynchronously process the obsolete bpobj, moving its entries to
111 * the specific vdevs' obsolete space maps.
113 * == Summary of how we mark blocks as obsolete ==
115 * - When freeing a block: if any DVA is on an indirect vdev, append to
116 * vic_obsolete_sm_object.
117 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
118 * references; otherwise append to vic_obsolete_sm_object).
119 * - When freeing a snapshot: move parts of ds_remap_deadlist to
120 * dp_obsolete_bpobj (same algorithm as ds_deadlist).
121 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
122 * individual vdev's vic_obsolete_sm_object.
126 * "Big theory statement" for how we condense indirect vdevs.
128 * Condensing an indirect vdev's mapping is the process of determining
129 * the precise counts of obsolete space for each mapping entry (by
130 * integrating the obsolete spacemap into the obsolete counts) and
131 * writing out a new mapping that contains only referenced entries.
133 * We condense a vdev when we expect the mapping to shrink (see
134 * vdev_indirect_should_condense()), but only perform one condense at a
135 * time to limit the memory usage. In addition, we use a separate
136 * open-context thread (spa_condense_indirect_thread) to incrementally
137 * create the new mapping object in a way that minimizes the impact on
138 * the rest of the system.
140 * == Generating a new mapping ==
142 * To generate a new mapping, we follow these steps:
144 * 1. Save the old obsolete space map and create a new mapping object
145 * (see spa_condense_indirect_start_sync()). This initializes the
146 * spa_condensing_indirect_phys with the "previous obsolete space map",
147 * which is now read only. Newly obsolete DVAs will be added to a
148 * new (initially empty) obsolete space map, and will not be
149 * considered as part of this condense operation.
151 * 2. Construct in memory the precise counts of obsolete space for each
152 * mapping entry, by incorporating the obsolete space map into the
153 * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
155 * 3. Iterate through each mapping entry, writing to the new mapping any
156 * entries that are not completely obsolete (i.e. which don't have
157 * obsolete count == mapping length). (See
158 * spa_condense_indirect_generate_new_mapping().)
160 * 4. Destroy the old mapping object and switch over to the new one
161 * (spa_condense_indirect_complete_sync).
163 * == Restarting from failure ==
165 * To restart the condense when we import/open the pool, we must start
166 * at the 2nd step above: reconstruct the precise counts in memory,
167 * based on the space map + counts. Then in the 3rd step, we start
168 * iterating where we left off: at vimp_max_offset of the new mapping
169 * object.
172 boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
175 * Condense if at least this percent of the bytes in the mapping is
176 * obsolete. With the default of 25%, the amount of space mapped
177 * will be reduced to 1% of its original size after at most 16
178 * condenses. Higher values will condense less often (causing less
179 * i/o); lower values will reduce the mapping size more quickly.
181 int zfs_indirect_condense_obsolete_pct = 25;
184 * Condense if the obsolete space map takes up more than this amount of
185 * space on disk (logically). This limits the amount of disk space
186 * consumed by the obsolete space map; the default of 1GB is small enough
187 * that we typically don't mind "wasting" it.
189 uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
192 * Don't bother condensing if the mapping uses less than this amount of
193 * memory. The default of 128KB is considered a "trivial" amount of
194 * memory and not worth reducing.
196 uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
199 * This is used by the test suite so that it can ensure that certain
200 * actions happen while in the middle of a condense (which might otherwise
201 * complete too quickly). If used to reduce the performance impact of
202 * condensing in production, a maximum value of 1 should be sufficient.
204 int zfs_condense_indirect_commit_entry_delay_ticks = 0;
207 * Mark the given offset and size as being obsolete.
209 void
210 vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
212 spa_t *spa = vd->vdev_spa;
214 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
215 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
216 ASSERT(size > 0);
217 VERIFY(vdev_indirect_mapping_entry_for_offset(
218 vd->vdev_indirect_mapping, offset) != NULL);
220 if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
221 mutex_enter(&vd->vdev_obsolete_lock);
222 range_tree_add(vd->vdev_obsolete_segments, offset, size);
223 mutex_exit(&vd->vdev_obsolete_lock);
224 vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
229 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
230 * wrapper is provided because the DMU does not know about vdev_t's and
231 * cannot directly call vdev_indirect_mark_obsolete.
233 void
234 spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
235 uint64_t size, dmu_tx_t *tx)
237 vdev_t *vd = vdev_lookup_top(spa, vdev_id);
238 ASSERT(dmu_tx_is_syncing(tx));
240 /* The DMU can only remap indirect vdevs. */
241 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
242 vdev_indirect_mark_obsolete(vd, offset, size);
245 static spa_condensing_indirect_t *
246 spa_condensing_indirect_create(spa_t *spa)
248 spa_condensing_indirect_phys_t *scip =
249 &spa->spa_condensing_indirect_phys;
250 spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
251 objset_t *mos = spa->spa_meta_objset;
253 for (int i = 0; i < TXG_SIZE; i++) {
254 list_create(&sci->sci_new_mapping_entries[i],
255 sizeof (vdev_indirect_mapping_entry_t),
256 offsetof(vdev_indirect_mapping_entry_t, vime_node));
259 sci->sci_new_mapping =
260 vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
262 return (sci);
265 static void
266 spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
268 for (int i = 0; i < TXG_SIZE; i++)
269 list_destroy(&sci->sci_new_mapping_entries[i]);
271 if (sci->sci_new_mapping != NULL)
272 vdev_indirect_mapping_close(sci->sci_new_mapping);
274 kmem_free(sci, sizeof (*sci));
277 boolean_t
278 vdev_indirect_should_condense(vdev_t *vd)
280 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
281 spa_t *spa = vd->vdev_spa;
283 ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
285 if (!zfs_condense_indirect_vdevs_enable)
286 return (B_FALSE);
289 * We can only condense one indirect vdev at a time.
291 if (spa->spa_condensing_indirect != NULL)
292 return (B_FALSE);
294 if (spa_shutting_down(spa))
295 return (B_FALSE);
298 * The mapping object size must not change while we are
299 * condensing, so we can only condense indirect vdevs
300 * (not vdevs that are still in the middle of being removed).
302 if (vd->vdev_ops != &vdev_indirect_ops)
303 return (B_FALSE);
306 * If nothing new has been marked obsolete, there is no
307 * point in condensing.
309 if (vd->vdev_obsolete_sm == NULL) {
310 ASSERT0(vdev_obsolete_sm_object(vd));
311 return (B_FALSE);
314 ASSERT(vd->vdev_obsolete_sm != NULL);
316 ASSERT3U(vdev_obsolete_sm_object(vd), ==,
317 space_map_object(vd->vdev_obsolete_sm));
319 uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
320 uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
321 uint64_t mapping_size = vdev_indirect_mapping_size(vim);
322 uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
324 ASSERT3U(bytes_obsolete, <=, bytes_mapped);
327 * If a high percentage of the bytes that are mapped have become
328 * obsolete, condense (unless the mapping is already small enough).
329 * This has a good chance of reducing the amount of memory used
330 * by the mapping.
332 if (bytes_obsolete * 100 / bytes_mapped >=
333 zfs_indirect_condense_obsolete_pct &&
334 mapping_size > zfs_condense_min_mapping_bytes) {
335 zfs_dbgmsg("should condense vdev %llu because obsolete "
336 "spacemap covers %d%% of %lluMB mapping",
337 (u_longlong_t)vd->vdev_id,
338 (int)(bytes_obsolete * 100 / bytes_mapped),
339 (u_longlong_t)bytes_mapped / 1024 / 1024);
340 return (B_TRUE);
344 * If the obsolete space map takes up too much space on disk,
345 * condense in order to free up this disk space.
347 if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
348 zfs_dbgmsg("should condense vdev %llu because obsolete sm "
349 "length %lluMB >= max size %lluMB",
350 (u_longlong_t)vd->vdev_id,
351 (u_longlong_t)obsolete_sm_size / 1024 / 1024,
352 (u_longlong_t)zfs_condense_max_obsolete_bytes /
353 1024 / 1024);
354 return (B_TRUE);
357 return (B_FALSE);
361 * This sync task completes (finishes) a condense, deleting the old
362 * mapping and replacing it with the new one.
364 static void
365 spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
367 spa_condensing_indirect_t *sci = arg;
368 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
369 spa_condensing_indirect_phys_t *scip =
370 &spa->spa_condensing_indirect_phys;
371 vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
372 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
373 objset_t *mos = spa->spa_meta_objset;
374 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
375 uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
376 uint64_t new_count =
377 vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
379 ASSERT(dmu_tx_is_syncing(tx));
380 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
381 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
382 for (int i = 0; i < TXG_SIZE; i++) {
383 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
385 ASSERT(vic->vic_mapping_object != 0);
386 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
387 ASSERT(scip->scip_next_mapping_object != 0);
388 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
391 * Reset vdev_indirect_mapping to refer to the new object.
393 rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
394 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
395 vd->vdev_indirect_mapping = sci->sci_new_mapping;
396 rw_exit(&vd->vdev_indirect_rwlock);
398 sci->sci_new_mapping = NULL;
399 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
400 vic->vic_mapping_object = scip->scip_next_mapping_object;
401 scip->scip_next_mapping_object = 0;
403 space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
404 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
405 scip->scip_prev_obsolete_sm_object = 0;
407 scip->scip_vdev = 0;
409 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
410 DMU_POOL_CONDENSING_INDIRECT, tx));
411 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
412 spa->spa_condensing_indirect = NULL;
414 zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
415 "new mapping object %llu has %llu entries "
416 "(was %llu entries)",
417 vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
418 new_count, old_count);
420 vdev_config_dirty(spa->spa_root_vdev);
424 * This sync task appends entries to the new mapping object.
426 static void
427 spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
429 spa_condensing_indirect_t *sci = arg;
430 uint64_t txg = dmu_tx_get_txg(tx);
431 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
433 ASSERT(dmu_tx_is_syncing(tx));
434 ASSERT3P(sci, ==, spa->spa_condensing_indirect);
436 vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
437 &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
438 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
442 * Open-context function to add one entry to the new mapping. The new
443 * entry will be remembered and written from syncing context.
445 static void
446 spa_condense_indirect_commit_entry(spa_t *spa,
447 vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
449 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
451 ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
453 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
454 dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
455 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
456 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
459 * If we are the first entry committed this txg, kick off the sync
460 * task to write to the MOS on our behalf.
462 if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
463 dsl_sync_task_nowait(dmu_tx_pool(tx),
464 spa_condense_indirect_commit_sync, sci,
465 0, ZFS_SPACE_CHECK_NONE, tx);
468 vdev_indirect_mapping_entry_t *vime =
469 kmem_alloc(sizeof (*vime), KM_SLEEP);
470 vime->vime_mapping = *vimep;
471 vime->vime_obsolete_count = count;
472 list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
474 dmu_tx_commit(tx);
477 static void
478 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
479 uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
481 spa_t *spa = vd->vdev_spa;
482 uint64_t mapi = start_index;
483 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
484 uint64_t old_num_entries =
485 vdev_indirect_mapping_num_entries(old_mapping);
487 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
488 ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
490 zfs_dbgmsg("starting condense of vdev %llu from index %llu",
491 (u_longlong_t)vd->vdev_id,
492 (u_longlong_t)mapi);
494 while (mapi < old_num_entries) {
496 if (zthr_iscancelled(zthr)) {
497 zfs_dbgmsg("pausing condense of vdev %llu "
498 "at index %llu", (u_longlong_t)vd->vdev_id,
499 (u_longlong_t)mapi);
500 break;
503 vdev_indirect_mapping_entry_phys_t *entry =
504 &old_mapping->vim_entries[mapi];
505 uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
506 ASSERT3U(obsolete_counts[mapi], <=, entry_size);
507 if (obsolete_counts[mapi] < entry_size) {
508 spa_condense_indirect_commit_entry(spa, entry,
509 obsolete_counts[mapi]);
512 * This delay may be requested for testing, debugging,
513 * or performance reasons.
515 delay(zfs_condense_indirect_commit_entry_delay_ticks);
518 mapi++;
522 /* ARGSUSED */
523 static boolean_t
524 spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
526 spa_t *spa = arg;
528 return (spa->spa_condensing_indirect != NULL);
531 /* ARGSUSED */
532 static int
533 spa_condense_indirect_thread(void *arg, zthr_t *zthr)
535 spa_t *spa = arg;
536 vdev_t *vd;
538 ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
539 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
540 vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
541 ASSERT3P(vd, !=, NULL);
542 spa_config_exit(spa, SCL_VDEV, FTAG);
544 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
545 spa_condensing_indirect_phys_t *scip =
546 &spa->spa_condensing_indirect_phys;
547 uint32_t *counts;
548 uint64_t start_index;
549 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
550 space_map_t *prev_obsolete_sm = NULL;
552 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
553 ASSERT(scip->scip_next_mapping_object != 0);
554 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
555 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
557 for (int i = 0; i < TXG_SIZE; i++) {
559 * The list must start out empty in order for the
560 * _commit_sync() sync task to be properly registered
561 * on the first call to _commit_entry(); so it's wise
562 * to double check and ensure we actually are starting
563 * with empty lists.
565 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
568 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
569 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
570 space_map_update(prev_obsolete_sm);
571 counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
572 if (prev_obsolete_sm != NULL) {
573 vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
574 counts, prev_obsolete_sm);
576 space_map_close(prev_obsolete_sm);
579 * Generate new mapping. Determine what index to continue from
580 * based on the max offset that we've already written in the
581 * new mapping.
583 uint64_t max_offset =
584 vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
585 if (max_offset == 0) {
586 /* We haven't written anything to the new mapping yet. */
587 start_index = 0;
588 } else {
590 * Pick up from where we left off. _entry_for_offset()
591 * returns a pointer into the vim_entries array. If
592 * max_offset is greater than any of the mappings
593 * contained in the table NULL will be returned and
594 * that indicates we've exhausted our iteration of the
595 * old_mapping.
598 vdev_indirect_mapping_entry_phys_t *entry =
599 vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
600 max_offset);
602 if (entry == NULL) {
604 * We've already written the whole new mapping.
605 * This special value will cause us to skip the
606 * generate_new_mapping step and just do the sync
607 * task to complete the condense.
609 start_index = UINT64_MAX;
610 } else {
611 start_index = entry - old_mapping->vim_entries;
612 ASSERT3U(start_index, <,
613 vdev_indirect_mapping_num_entries(old_mapping));
617 spa_condense_indirect_generate_new_mapping(vd, counts,
618 start_index, zthr);
620 vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
623 * If the zthr has received a cancellation signal while running
624 * in generate_new_mapping() or at any point after that, then bail
625 * early. We don't want to complete the condense if the spa is
626 * shutting down.
628 if (zthr_iscancelled(zthr))
629 return (0);
631 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
632 spa_condense_indirect_complete_sync, sci, 0,
633 ZFS_SPACE_CHECK_EXTRA_RESERVED));
635 return (0);
639 * Sync task to begin the condensing process.
641 void
642 spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
644 spa_t *spa = vd->vdev_spa;
645 spa_condensing_indirect_phys_t *scip =
646 &spa->spa_condensing_indirect_phys;
648 ASSERT0(scip->scip_next_mapping_object);
649 ASSERT0(scip->scip_prev_obsolete_sm_object);
650 ASSERT0(scip->scip_vdev);
651 ASSERT(dmu_tx_is_syncing(tx));
652 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
653 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
654 ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
656 uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
657 ASSERT(obsolete_sm_obj != 0);
659 scip->scip_vdev = vd->vdev_id;
660 scip->scip_next_mapping_object =
661 vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
663 scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
666 * We don't need to allocate a new space map object, since
667 * vdev_indirect_sync_obsolete will allocate one when needed.
669 space_map_close(vd->vdev_obsolete_sm);
670 vd->vdev_obsolete_sm = NULL;
671 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
672 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
674 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
675 DMU_POOL_DIRECTORY_OBJECT,
676 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
677 sizeof (*scip) / sizeof (uint64_t), scip, tx));
679 ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
680 spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
682 zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
683 "posm=%llu nm=%llu",
684 vd->vdev_id, dmu_tx_get_txg(tx),
685 (u_longlong_t)scip->scip_prev_obsolete_sm_object,
686 (u_longlong_t)scip->scip_next_mapping_object);
688 zthr_wakeup(spa->spa_condense_zthr);
692 * Sync to the given vdev's obsolete space map any segments that are no longer
693 * referenced as of the given txg.
695 * If the obsolete space map doesn't exist yet, create and open it.
697 void
698 vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
700 spa_t *spa = vd->vdev_spa;
701 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
703 ASSERT3U(vic->vic_mapping_object, !=, 0);
704 ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
705 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
706 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
708 if (vdev_obsolete_sm_object(vd) == 0) {
709 uint64_t obsolete_sm_object =
710 space_map_alloc(spa->spa_meta_objset,
711 vdev_standard_sm_blksz, tx);
713 ASSERT(vd->vdev_top_zap != 0);
714 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
715 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
716 sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
717 ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);
719 spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
720 VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
721 spa->spa_meta_objset, obsolete_sm_object,
722 0, vd->vdev_asize, 0));
723 space_map_update(vd->vdev_obsolete_sm);
726 ASSERT(vd->vdev_obsolete_sm != NULL);
727 ASSERT3U(vdev_obsolete_sm_object(vd), ==,
728 space_map_object(vd->vdev_obsolete_sm));
730 space_map_write(vd->vdev_obsolete_sm,
731 vd->vdev_obsolete_segments, SM_ALLOC, tx);
732 space_map_update(vd->vdev_obsolete_sm);
733 range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
737 spa_condense_init(spa_t *spa)
739 int error = zap_lookup(spa->spa_meta_objset,
740 DMU_POOL_DIRECTORY_OBJECT,
741 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
742 sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
743 &spa->spa_condensing_indirect_phys);
744 if (error == 0) {
745 if (spa_writeable(spa)) {
746 spa->spa_condensing_indirect =
747 spa_condensing_indirect_create(spa);
749 return (0);
750 } else if (error == ENOENT) {
751 return (0);
752 } else {
753 return (error);
757 void
758 spa_condense_fini(spa_t *spa)
760 if (spa->spa_condensing_indirect != NULL) {
761 spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
762 spa->spa_condensing_indirect = NULL;
766 void
767 spa_start_indirect_condensing_thread(spa_t *spa)
769 ASSERT3P(spa->spa_condense_zthr, ==, NULL);
770 spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
771 spa_condense_indirect_thread, spa);
775 * Gets the obsolete spacemap object from the vdev's ZAP.
776 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
777 * exist yet.
780 vdev_obsolete_sm_object(vdev_t *vd)
782 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
783 if (vd->vdev_top_zap == 0) {
784 return (0);
787 uint64_t sm_obj = 0;
788 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
789 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);
791 ASSERT(err == 0 || err == ENOENT);
793 return (sm_obj);
796 boolean_t
797 vdev_obsolete_counts_are_precise(vdev_t *vd)
799 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
800 if (vd->vdev_top_zap == 0) {
801 return (B_FALSE);
804 uint64_t val = 0;
805 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
806 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
808 ASSERT(err == 0 || err == ENOENT);
810 return (val != 0);
813 /* ARGSUSED */
814 static void
815 vdev_indirect_close(vdev_t *vd)
819 /* ARGSUSED */
820 static void
821 vdev_indirect_io_done(zio_t *zio)
825 /* ARGSUSED */
826 static int
827 vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
828 uint64_t *ashift)
830 *psize = *max_psize = vd->vdev_asize +
831 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
832 *ashift = vd->vdev_ashift;
833 return (0);
836 typedef struct remap_segment {
837 vdev_t *rs_vd;
838 uint64_t rs_offset;
839 uint64_t rs_asize;
840 uint64_t rs_split_offset;
841 list_node_t rs_node;
842 } remap_segment_t;
844 remap_segment_t *
845 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
847 remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
848 rs->rs_vd = vd;
849 rs->rs_offset = offset;
850 rs->rs_asize = asize;
851 rs->rs_split_offset = split_offset;
852 return (rs);
856 * Given an indirect vdev and an extent on that vdev, it duplicates the
857 * physical entries of the indirect mapping that correspond to the extent
858 * to a new array and returns a pointer to it. In addition, copied_entries
859 * is populated with the number of mapping entries that were duplicated.
861 * Note that the function assumes that the caller holds vdev_indirect_rwlock.
862 * This ensures that the mapping won't change due to condensing as we
863 * copy over its contents.
865 * Finally, since we are doing an allocation, it is up to the caller to
866 * free the array allocated in this function.
868 vdev_indirect_mapping_entry_phys_t *
869 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
870 uint64_t asize, uint64_t *copied_entries)
872 vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
873 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
874 uint64_t entries = 0;
876 ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
878 vdev_indirect_mapping_entry_phys_t *first_mapping =
879 vdev_indirect_mapping_entry_for_offset(vim, offset);
880 ASSERT3P(first_mapping, !=, NULL);
882 vdev_indirect_mapping_entry_phys_t *m = first_mapping;
883 while (asize > 0) {
884 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
886 ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
887 ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
889 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
890 uint64_t inner_size = MIN(asize, size - inner_offset);
892 offset += inner_size;
893 asize -= inner_size;
894 entries++;
895 m++;
898 size_t copy_length = entries * sizeof (*first_mapping);
899 duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
900 bcopy(first_mapping, duplicate_mappings, copy_length);
901 *copied_entries = entries;
903 return (duplicate_mappings);
/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback. On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs. For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack. This way we can call the callback for each of the new sections
 * created by a single section of the indirect device. Note though, that in
 * this scenario the callbacks in each split block won't occur in-order in
 * terms of offset, so callers should not make any assumptions about that.
 *
 * For callbacks that don't handle split blocks and immediately return when
 * they encounter them (as is the case for remap_blkptr_cb), the caller can
 * assume that its callback will be applied from the first indirect vdev
 * encountered to the last one and then the concrete vdev, in that order.
 */
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
	list_t stack;
	spa_t *spa = vd->vdev_spa;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	/*
	 * Iterative depth-first walk: start with the caller's extent and
	 * keep popping segments until every path has reached a concrete
	 * vdev (only segments that remap to another indirect vdev are
	 * pushed back onto the stack below).
	 */
	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	    rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
		ASSERT(rs->rs_asize > 0);

		/*
		 * Note: As this function can be called from open context
		 * (e.g. zio_read()), we need the following rwlock to
		 * prevent the mapping from being changed by condensing.
		 *
		 * So we grab the lock and we make a copy of the entries
		 * that are relevant to the extent that we are working on.
		 * Once that is done, we drop the lock and iterate over
		 * our copy of the mapping. Once we are done with the
		 * remap segment and we free it, we also free our copy
		 * of the indirect mapping entries that are relevant to it.
		 *
		 * This way we don't need to wait until the function is
		 * finished with a segment, to condense it. In addition, we
		 * don't need a recursive rwlock for the case that a call to
		 * vdev_indirect_remap() needs to call itself (through the
		 * codepath of its callback) for the same vdev in the middle
		 * of its execution.
		 */
		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
		ASSERT3P(vim, !=, NULL);

		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
		    rs->rs_offset, rs->rs_asize, &num_entries);
		ASSERT3P(mapping, !=, NULL);
		ASSERT3U(num_entries, >, 0);
		rw_exit(&v->vdev_indirect_rwlock);

		for (uint64_t i = 0; i < num_entries; i++) {
			/*
			 * Note: the vdev_indirect_mapping can not change
			 * while we are running. It only changes while the
			 * removal is in progress, and then only from syncing
			 * context. While a removal is in progress, this
			 * function is only called for frees, which also only
			 * happen from syncing context.
			 */
			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];

			ASSERT3P(m, !=, NULL);
			ASSERT3U(rs->rs_asize, >, 0);

			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);

			/* Our segment must lie within this mapping entry. */
			ASSERT3U(rs->rs_offset, >=,
			    DVA_MAPPING_GET_SRC_OFFSET(m));
			ASSERT3U(rs->rs_offset, <,
			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
			ASSERT3U(dst_vdev, !=, v->vdev_id);

			/* Clip the entry to the part of our segment in it. */
			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(m);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);

			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
			ASSERT3P(dst_v, !=, NULL);

			if (dst_v->vdev_ops == &vdev_indirect_ops) {
				/*
				 * Destination is itself indirect: defer this
				 * piece so it gets remapped another level
				 * (the callback below is still invoked on
				 * the intermediate indirect vdev).
				 */
				list_insert_head(&stack,
				    rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset));
			}

			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
				/*
				 * Note: This clause exists solely for
				 * testing purposes. We use it to ensure that
				 * split blocks work and that the callbacks
				 * using them yield the same result if issued
				 * in reverse order.
				 */
				uint64_t inner_half = inner_size / 2;

				/* Deliberately issue the second half first. */
				func(rs->rs_split_offset + inner_half, dst_v,
				    dst_offset + inner_offset + inner_half,
				    inner_half, arg);

				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_half, arg);
			} else {
				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_size, arg);
			}

			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
		}
		/* The copied entries must cover the whole segment. */
		VERIFY0(rs->rs_asize);

		kmem_free(mapping, num_entries * sizeof (*mapping));
		kmem_free(rs, sizeof (remap_segment_t));
	}
	list_destroy(&stack);
}
1056 static void
1057 vdev_indirect_child_io_done(zio_t *zio)
1059 zio_t *pio = zio->io_private;
1061 mutex_enter(&pio->io_lock);
1062 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
1063 mutex_exit(&pio->io_lock);
1065 abd_put(zio->io_abd);
1068 static void
1069 vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
1070 uint64_t size, void *arg)
1072 zio_t *zio = arg;
1074 ASSERT3P(vd, !=, NULL);
1076 if (vd->vdev_ops == &vdev_indirect_ops)
1077 return;
1079 zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
1080 abd_get_offset(zio->io_abd, split_offset),
1081 size, zio->io_type, zio->io_priority,
1082 0, vdev_indirect_child_io_done, zio));
/*
 * I/O start routine for an indirect vdev: fan the request out into one
 * child zio per contiguous mapped segment (see vdev_indirect_io_start_cb),
 * then let the parent continue down the pipeline; it completes once all
 * children have, aggregating their errors in vdev_indirect_child_io_done().
 */
static void
vdev_indirect_io_start(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	if (zio->io_type != ZIO_TYPE_READ) {
		/*
		 * The only writes that should reach a removed (indirect)
		 * vdev are repair writes: self-healing or deliberately
		 * induced damage for testing.
		 */
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
		ASSERT((zio->io_flags &
		    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
	}

	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
	    vdev_indirect_io_start_cb, zio);

	zio_execute(zio);
}
/*
 * Vdev operations vector for indirect (removed) vdevs. Positional
 * initializer — slot order must match the vdev_ops_t declaration in
 * vdev_impl.h.
 */
vdev_ops_t vdev_indirect_ops = {
	vdev_indirect_open,
	vdev_indirect_close,
	vdev_default_asize,
	vdev_indirect_io_start,
	vdev_indirect_io_done,
	NULL,	/* presumably vdev_op_state_change — confirm vs. vdev_impl.h */
	NULL,	/* presumably vdev_op_hold — confirm vs. vdev_impl.h */
	NULL,	/* presumably vdev_op_rele — confirm vs. vdev_impl.h */
	vdev_indirect_remap,
	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
	B_FALSE			/* leaf vdev */
};