4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version 1.0 of the CDDL.
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
17 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
20 #include <sys/zfs_context.h>
22 #include <sys/spa_impl.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/fs/zfs.h>
26 #include <sys/metaslab.h>
27 #include <sys/refcount.h>
29 #include <sys/vdev_indirect_mapping.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dsl_synctask.h>
37 * An indirect vdev corresponds to a vdev that has been removed. Since
38 * we cannot rewrite block pointers of snapshots, etc., we keep a
39 * mapping from old location on the removed device to the new location
40 * on another device in the pool and use this mapping whenever we need
41 * to access the DVA. Unfortunately, this mapping did not respect
42 * logical block boundaries when it was first created, and so a DVA on
43 * this indirect vdev may be "split" into multiple sections that each
44 * map to a different location. As a consequence, not all DVAs can be
45 * translated to an equivalent new DVA. Instead we must provide a
46 * "vdev_remap" operation that executes a callback on each contiguous
47 * segment of the new location. This function is used in multiple ways:
49 * - reads and repair writes to this device use the callback to create
50 * a child io for each mapped segment.
52 * - frees and claims to this device use the callback to free or claim
53 * each mapped segment. (Note that we don't actually need to claim
54 * log blocks on indirect vdevs, because we don't allocate to
55 * removing vdevs. However, zdb uses zio_claim() for its leak detection.)
60 * "Big theory statement" for how we mark blocks obsolete.
62 * When a block on an indirect vdev is freed or remapped, a section of
63 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
64 * keep track of how much of each mapping entry is obsolete. When
65 * an entry becomes completely obsolete, we can remove it, thus reducing
66 * the memory used by the mapping. The complete picture of obsolescence
67 * is given by the following data structures, described below:
68 * - the entry-specific obsolete count
69 * - the vdev-specific obsolete spacemap
70 * - the pool-specific obsolete bpobj
72 * == On disk data structures used ==
74 * We track the obsolete space for the pool using several objects. Each
75 * of these objects is created on demand and freed when no longer
76 * needed, and is assumed to be empty if it does not exist.
77 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
79 * - Each vic_mapping_object (associated with an indirect vdev) can
80 * have a vimp_counts_object. This is an array of uint32_t's
81 * with the same number of entries as the vic_mapping_object. When
82 * the mapping is condensed, entries from the vic_obsolete_sm_object
83 * (see below) are folded into the counts. Therefore, each
84 * obsolete_counts entry tells us the number of bytes in the
85 * corresponding mapping entry that were not referenced when the
86 * mapping was last condensed.
88 * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
89 * This is a space map containing an alloc entry for every DVA that
90 * has been obsoleted since the last time this indirect vdev was
91 * condensed. We use this object in order to improve performance
92 * when marking a DVA as obsolete. Instead of modifying an arbitrary
93 * offset of the vimp_counts_object, we only need to append an entry
94 * to the end of this object. When a DVA becomes obsolete, it is
95 * added to the obsolete space map. This happens when the DVA is
96 * freed, remapped and not referenced by a snapshot, or the last
97 * snapshot referencing it is destroyed.
99 * - Each dataset can have a ds_remap_deadlist object. This is a
100 * deadlist object containing all blocks that were remapped in this
101 * dataset but referenced in a previous snapshot. Blocks can *only*
102 * appear on this list if they were remapped (dsl_dataset_block_remapped);
103 * blocks that were killed in a head dataset are put on the normal
104 * ds_deadlist and marked obsolete when they are freed.
106 * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
107 * in the pool that need to be marked obsolete. When a snapshot is
108 * destroyed, we move some of the ds_remap_deadlist to the obsolete
109 * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
110 * asynchronously process the obsolete bpobj, moving its entries to
111 * the specific vdevs' obsolete space maps.
113 * == Summary of how we mark blocks as obsolete ==
115 * - When freeing a block: if any DVA is on an indirect vdev, append to
116 * vic_obsolete_sm_object.
117 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
118 * references; otherwise append to vic_obsolete_sm_object).
119 * - When freeing a snapshot: move parts of ds_remap_deadlist to
120 * dp_obsolete_bpobj (same algorithm as ds_deadlist).
121 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
122 * individual vdev's vic_obsolete_sm_object.
126 * "Big theory statement" for how we condense indirect vdevs.
128 * Condensing an indirect vdev's mapping is the process of determining
129 * the precise counts of obsolete space for each mapping entry (by
130 * integrating the obsolete spacemap into the obsolete counts) and
131 * writing out a new mapping that contains only referenced entries.
133 * We condense a vdev when we expect the mapping to shrink (see
134 * vdev_indirect_should_condense()), but only perform one condense at a
135 * time to limit the memory usage. In addition, we use a separate
136 * open-context thread (spa_condense_indirect_thread) to incrementally
137 * create the new mapping object in a way that minimizes the impact on
138 * the rest of the system.
140 * == Generating a new mapping ==
142 * To generate a new mapping, we follow these steps:
144 * 1. Save the old obsolete space map and create a new mapping object
145 * (see spa_condense_indirect_start_sync()). This initializes the
146 * spa_condensing_indirect_phys with the "previous obsolete space map",
147 * which is now read only. Newly obsolete DVAs will be added to a
148 * new (initially empty) obsolete space map, and will not be
149 * considered as part of this condense operation.
151 * 2. Construct in memory the precise counts of obsolete space for each
152 * mapping entry, by incorporating the obsolete space map into the
153 * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
155 * 3. Iterate through each mapping entry, writing to the new mapping any
156 * entries that are not completely obsolete (i.e. which don't have
157 * obsolete count == mapping length). (See
158 * spa_condense_indirect_generate_new_mapping().)
160 * 4. Destroy the old mapping object and switch over to the new one
161 * (spa_condense_indirect_complete_sync).
163 * == Restarting from failure ==
165 * To restart the condense when we import/open the pool, we must start
166 * at the 2nd step above: reconstruct the precise counts in memory,
167 * based on the space map + counts. Then in the 3rd step, we start
168 * iterating where we left off: at vimp_max_offset of the new mapping object.
/*
 * Tunables that control condensing of indirect vdev mappings.
 *
 * zfs_condense_indirect_vdevs_enable (default B_TRUE) is tested first by
 * vdev_indirect_should_condense().
 * NOTE(review): the statement executed when it is false is not visible in
 * this excerpt; presumably condensing is declined -- confirm.
 *
 * The remaining tunables are described by the text preceding each of them.
 */
172 boolean_t zfs_condense_indirect_vdevs_enable
= B_TRUE
;
175 * Condense if at least this percent of the bytes in the mapping is
176 * obsolete. With the default of 25%, the amount of space mapped
177 * will be reduced to 1% of its original size after at most 16
178 * condenses. Higher values will condense less often (causing less
179 * i/o); lower values will reduce the mapping size more quickly.
181 int zfs_indirect_condense_obsolete_pct
= 25;
184 * Condense if the obsolete space map takes up more than this amount of
185 * space on disk (logically). This limits the amount of disk space
186 * consumed by the obsolete space map; the default of 1GB is small enough
187 * that we typically don't mind "wasting" it.
189 uint64_t zfs_condense_max_obsolete_bytes
= 1024 * 1024 * 1024;
192 * Don't bother condensing if the mapping uses less than this amount of
193 * memory. The default of 128KB is considered a "trivial" amount of
194 * memory and not worth reducing.
196 uint64_t zfs_condense_min_mapping_bytes
= 128 * 1024;
199 * This is used by the test suite so that it can ensure that certain
200 * actions happen while in the middle of a condense (which might otherwise
201 * complete too quickly). If used to reduce the performance impact of
202 * condensing in production, a maximum value of 1 should be sufficient.
204 int zfs_condense_indirect_commit_entry_delay_ticks
= 0;
207 * Mark the given offset and size as being obsolete.
/*
 * vd:     vdev that owns the range.  Asserted to have a nonzero
 *         vic_mapping_object and to be either removing or an indirect vdev.
 * offset: start of the obsolete range on vd.  VERIFY-checked to fall within
 *         an entry of vd->vdev_indirect_mapping.
 * size:   length of the obsolete range.
 *
 * The range is recorded only when SPA_FEATURE_OBSOLETE_COUNTS is enabled:
 * it is added to vd->vdev_obsolete_segments while holding
 * vd->vdev_obsolete_lock, and the vdev is then dirtied for the txg returned
 * by spa_syncing_txg().
 */
210 vdev_indirect_mark_obsolete(vdev_t
*vd
, uint64_t offset
, uint64_t size
)
212 spa_t
*spa
= vd
->vdev_spa
;
214 ASSERT3U(vd
->vdev_indirect_config
.vic_mapping_object
, !=, 0);
215 ASSERT(vd
->vdev_removing
|| vd
->vdev_ops
== &vdev_indirect_ops
);
217 VERIFY(vdev_indirect_mapping_entry_for_offset(
218 vd
->vdev_indirect_mapping
, offset
) != NULL
);
220 if (spa_feature_is_enabled(spa
, SPA_FEATURE_OBSOLETE_COUNTS
)) {
221 mutex_enter(&vd
->vdev_obsolete_lock
);
222 range_tree_add(vd
->vdev_obsolete_segments
, offset
, size
);
223 mutex_exit(&vd
->vdev_obsolete_lock
);
224 vdev_dirty(vd
, 0, NULL
, spa_syncing_txg(spa
));
229 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
230 * wrapper is provided because the DMU does not know about vdev_t's and
231 * cannot directly call vdev_indirect_mark_obsolete.
/*
 * spa:     pool containing the vdev.
 * vdev_id: id of the top-level vdev, resolved with vdev_lookup_top().
 * offset:  start of the obsolete range on that vdev.
 * size:    length of the obsolete range.
 * tx:      must be a syncing-context transaction; in the visible code it is
 *          used only for that assertion.
 *
 * Asserts that the vdev found is an indirect vdev and then forwards
 * (offset, size) to vdev_indirect_mark_obsolete().
 */
234 spa_vdev_indirect_mark_obsolete(spa_t
*spa
, uint64_t vdev_id
, uint64_t offset
,
235 uint64_t size
, dmu_tx_t
*tx
)
237 vdev_t
*vd
= vdev_lookup_top(spa
, vdev_id
);
238 ASSERT(dmu_tx_is_syncing(tx
));
240 /* The DMU can only remap indirect vdevs. */
241 ASSERT3P(vd
->vdev_ops
, ==, &vdev_indirect_ops
);
242 vdev_indirect_mark_obsolete(vd
, offset
, size
);
/*
 * Build the in-core condensing state for the pool.
 *
 * Allocates a zeroed spa_condensing_indirect_t with KM_SLEEP, creates one
 * list of vdev_indirect_mapping_entry_t (linked through vime_node) for each
 * of the TXG_SIZE slots of sci_new_mapping_entries, and opens the mapping
 * object named by spa_condensing_indirect_phys.scip_next_mapping_object in
 * the MOS as sci_new_mapping.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the new structure is returned -- confirm.
 */
245 static spa_condensing_indirect_t
*
246 spa_condensing_indirect_create(spa_t
*spa
)
248 spa_condensing_indirect_phys_t
*scip
=
249 &spa
->spa_condensing_indirect_phys
;
250 spa_condensing_indirect_t
*sci
= kmem_zalloc(sizeof (*sci
), KM_SLEEP
);
251 objset_t
*mos
= spa
->spa_meta_objset
;
253 for (int i
= 0; i
< TXG_SIZE
; i
++) {
254 list_create(&sci
->sci_new_mapping_entries
[i
],
255 sizeof (vdev_indirect_mapping_entry_t
),
256 offsetof(vdev_indirect_mapping_entry_t
, vime_node
));
259 sci
->sci_new_mapping
=
260 vdev_indirect_mapping_open(mos
, scip
->scip_next_mapping_object
);
/*
 * Tear down the in-core condensing state.
 *
 * Destroys each of the TXG_SIZE sci_new_mapping_entries lists, closes
 * sci_new_mapping if it is still set (it is NULL once the new mapping has
 * been handed over to the vdev by spa_condense_indirect_complete_sync()),
 * and frees the structure itself.
 */
266 spa_condensing_indirect_destroy(spa_condensing_indirect_t
*sci
)
268 for (int i
= 0; i
< TXG_SIZE
; i
++)
269 list_destroy(&sci
->sci_new_mapping_entries
[i
]);
271 if (sci
->sci_new_mapping
!= NULL
)
272 vdev_indirect_mapping_close(sci
->sci_new_mapping
);
274 kmem_free(sci
, sizeof (*sci
));
/*
 * Decide whether the indirect mapping of vd should be condensed.  Must be
 * called from pool sync context (asserted).
 *
 * Conditions tested, in order:
 *   - zfs_condense_indirect_vdevs_enable is false
 *   - another condense is in progress (spa_condensing_indirect != NULL)
 *   - the pool is shutting down
 *   - vd is not an indirect vdev
 *   - vd has no open obsolete space map
 * NOTE(review): the statements taken on these conditions and all return
 * statements are not visible in this excerpt; presumably each one declines
 * to condense -- confirm.
 *
 * Two triggers are then evaluated, each logging with zfs_dbgmsg():
 *   1. bytes_obsolete * 100 / bytes_mapped is at least
 *      zfs_indirect_condense_obsolete_pct, and the in-memory mapping size
 *      exceeds zfs_condense_min_mapping_bytes.
 *   2. The obsolete space map length is at least
 *      zfs_condense_max_obsolete_bytes.
 *
 * NOTE(review): trigger 1 divides by bytes_mapped; this assumes the mapping
 * always maps a nonzero number of bytes -- TODO confirm.
 */
278 vdev_indirect_should_condense(vdev_t
*vd
)
280 vdev_indirect_mapping_t
*vim
= vd
->vdev_indirect_mapping
;
281 spa_t
*spa
= vd
->vdev_spa
;
283 ASSERT(dsl_pool_sync_context(spa
->spa_dsl_pool
));
285 if (!zfs_condense_indirect_vdevs_enable
)
289 * We can only condense one indirect vdev at a time.
291 if (spa
->spa_condensing_indirect
!= NULL
)
294 if (spa_shutting_down(spa
))
298 * The mapping object size must not change while we are
299 * condensing, so we can only condense indirect vdevs
300 * (not vdevs that are still in the middle of being removed).
302 if (vd
->vdev_ops
!= &vdev_indirect_ops
)
306 * If nothing new has been marked obsolete, there is no
307 * point in condensing.
309 if (vd
->vdev_obsolete_sm
== NULL
) {
310 ASSERT0(vdev_obsolete_sm_object(vd
));
314 ASSERT(vd
->vdev_obsolete_sm
!= NULL
);
316 ASSERT3U(vdev_obsolete_sm_object(vd
), ==,
317 space_map_object(vd
->vdev_obsolete_sm
));
319 uint64_t bytes_mapped
= vdev_indirect_mapping_bytes_mapped(vim
);
320 uint64_t bytes_obsolete
= space_map_allocated(vd
->vdev_obsolete_sm
);
321 uint64_t mapping_size
= vdev_indirect_mapping_size(vim
);
322 uint64_t obsolete_sm_size
= space_map_length(vd
->vdev_obsolete_sm
);
324 ASSERT3U(bytes_obsolete
, <=, bytes_mapped
);
327 * If a high percentage of the bytes that are mapped have become
328 * obsolete, condense (unless the mapping is already small enough).
329 * This has a good chance of reducing the amount of memory used
332 if (bytes_obsolete
* 100 / bytes_mapped
>=
333 zfs_indirect_condense_obsolete_pct
&&
334 mapping_size
> zfs_condense_min_mapping_bytes
) {
335 zfs_dbgmsg("should condense vdev %llu because obsolete "
336 "spacemap covers %d%% of %lluMB mapping",
337 (u_longlong_t
)vd
->vdev_id
,
338 (int)(bytes_obsolete
* 100 / bytes_mapped
),
339 (u_longlong_t
)bytes_mapped
/ 1024 / 1024);
344 * If the obsolete space map takes up too much space on disk,
345 * condense in order to free up this disk space.
347 if (obsolete_sm_size
>= zfs_condense_max_obsolete_bytes
) {
348 zfs_dbgmsg("should condense vdev %llu because obsolete sm "
349 "length %lluMB >= max size %lluMB",
350 (u_longlong_t
)vd
->vdev_id
,
351 (u_longlong_t
)obsolete_sm_size
/ 1024 / 1024,
352 (u_longlong_t
)zfs_condense_max_obsolete_bytes
/
361 * This sync task completes (finishes) a condense, deleting the old
362 * mapping and replacing it with the new one.
/*
 * arg: the pool's spa_condensing_indirect_t (asserted to equal
 *      spa->spa_condensing_indirect).
 * tx:  syncing-context transaction (asserted).
 *
 * Steps performed by the visible code:
 *   1. Assert that every per-txg list of pending new entries is empty and
 *      that the on-disk condensing state (scip) names this vdev, a next
 *      mapping object and a previous obsolete space map object.
 *   2. Under vdev_indirect_rwlock held as writer, close the old in-core
 *      mapping and point vd->vdev_indirect_mapping at sci_new_mapping.
 *      sci_new_mapping is then cleared so that destroying sci does not
 *      close the mapping now owned by the vdev.
 *   3. Free the old mapping object, switch vic_mapping_object to the new
 *      object, and zero scip_next_mapping_object.
 *   4. Free the previous obsolete space map object, decrement
 *      SPA_FEATURE_OBSOLETE_COUNTS, and zero scip_prev_obsolete_sm_object.
 *   5. Remove DMU_POOL_CONDENSING_INDIRECT from the pool directory, destroy
 *      the in-core state, log the result, and dirty the root vdev config.
 *
 * NOTE(review): the declaration of new_count is not visible in this
 * excerpt; it appears to be initialized from
 * vdev_indirect_mapping_num_entries(sci->sci_new_mapping) -- confirm.
 * NOTE(review): the final zfs_dbgmsg() passes its arguments to %llu without
 * the (u_longlong_t) casts that other zfs_dbgmsg() calls in this file use;
 * confirm the argument types match the format.
 */
365 spa_condense_indirect_complete_sync(void *arg
, dmu_tx_t
*tx
)
367 spa_condensing_indirect_t
*sci
= arg
;
368 spa_t
*spa
= dmu_tx_pool(tx
)->dp_spa
;
369 spa_condensing_indirect_phys_t
*scip
=
370 &spa
->spa_condensing_indirect_phys
;
371 vdev_t
*vd
= vdev_lookup_top(spa
, scip
->scip_vdev
);
372 vdev_indirect_config_t
*vic
= &vd
->vdev_indirect_config
;
373 objset_t
*mos
= spa
->spa_meta_objset
;
374 vdev_indirect_mapping_t
*old_mapping
= vd
->vdev_indirect_mapping
;
375 uint64_t old_count
= vdev_indirect_mapping_num_entries(old_mapping
);
377 vdev_indirect_mapping_num_entries(sci
->sci_new_mapping
);
379 ASSERT(dmu_tx_is_syncing(tx
));
380 ASSERT3P(vd
->vdev_ops
, ==, &vdev_indirect_ops
);
381 ASSERT3P(sci
, ==, spa
->spa_condensing_indirect
);
382 for (int i
= 0; i
< TXG_SIZE
; i
++) {
383 ASSERT(list_is_empty(&sci
->sci_new_mapping_entries
[i
]));
385 ASSERT(vic
->vic_mapping_object
!= 0);
386 ASSERT3U(vd
->vdev_id
, ==, scip
->scip_vdev
);
387 ASSERT(scip
->scip_next_mapping_object
!= 0);
388 ASSERT(scip
->scip_prev_obsolete_sm_object
!= 0);
391 * Reset vdev_indirect_mapping to refer to the new object.
393 rw_enter(&vd
->vdev_indirect_rwlock
, RW_WRITER
);
394 vdev_indirect_mapping_close(vd
->vdev_indirect_mapping
);
395 vd
->vdev_indirect_mapping
= sci
->sci_new_mapping
;
396 rw_exit(&vd
->vdev_indirect_rwlock
);
398 sci
->sci_new_mapping
= NULL
;
399 vdev_indirect_mapping_free(mos
, vic
->vic_mapping_object
, tx
);
400 vic
->vic_mapping_object
= scip
->scip_next_mapping_object
;
401 scip
->scip_next_mapping_object
= 0;
403 space_map_free_obj(mos
, scip
->scip_prev_obsolete_sm_object
, tx
);
404 spa_feature_decr(spa
, SPA_FEATURE_OBSOLETE_COUNTS
, tx
);
405 scip
->scip_prev_obsolete_sm_object
= 0;
409 VERIFY0(zap_remove(mos
, DMU_POOL_DIRECTORY_OBJECT
,
410 DMU_POOL_CONDENSING_INDIRECT
, tx
));
411 spa_condensing_indirect_destroy(spa
->spa_condensing_indirect
);
412 spa
->spa_condensing_indirect
= NULL
;
414 zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
415 "new mapping object %llu has %llu entries "
416 "(was %llu entries)",
417 vd
->vdev_id
, dmu_tx_get_txg(tx
), vic
->vic_mapping_object
,
418 new_count
, old_count
);
420 vdev_config_dirty(spa
->spa_root_vdev
);
424 * This sync task appends entries to the new mapping object.
/*
 * arg: the pool's spa_condensing_indirect_t (asserted to equal
 *      spa->spa_condensing_indirect).
 * tx:  syncing-context transaction (asserted).
 *
 * Hands the entries queued for this txg, i.e. the list at
 * sci_new_mapping_entries[txg & TXG_MASK], to
 * vdev_indirect_mapping_add_entries() so they are appended to
 * sci_new_mapping.  The list is asserted to be empty afterwards.
 */
427 spa_condense_indirect_commit_sync(void *arg
, dmu_tx_t
*tx
)
429 spa_condensing_indirect_t
*sci
= arg
;
430 uint64_t txg
= dmu_tx_get_txg(tx
);
431 spa_t
*spa
= dmu_tx_pool(tx
)->dp_spa
;
433 ASSERT(dmu_tx_is_syncing(tx
));
434 ASSERT3P(sci
, ==, spa
->spa_condensing_indirect
);
436 vdev_indirect_mapping_add_entries(sci
->sci_new_mapping
,
437 &sci
->sci_new_mapping_entries
[txg
& TXG_MASK
], tx
);
438 ASSERT(list_is_empty(&sci
->sci_new_mapping_entries
[txg
& TXG_MASK
]));
442 * Open-context function to add one entry to the new mapping. The new
443 * entry will be remembered and written from syncing context.
/*
 * spa:   pool being condensed; spa->spa_condensing_indirect is used as the
 *        in-core state.
 * vimep: mapping entry to carry over into the new mapping (copied by value).
 * count: obsolete byte count for the entry.  Asserted to be strictly less
 *        than the entry's allocated size, i.e. the entry is not completely
 *        obsolete.
 *
 * Creates a transaction against the MOS directory, reserves space for one
 * entry plus its count, and assigns it with TXG_WAIT.  If the list for the
 * assigned txg is still empty, spa_condense_indirect_commit_sync() is
 * scheduled as a no-wait sync task for that txg.  A
 * vdev_indirect_mapping_entry_t is then allocated with KM_SLEEP, filled in,
 * and appended to that txg's list.
 *
 * NOTE(review): no dmu_tx_commit() call is visible in this excerpt;
 * presumably the tx is committed after the insert -- confirm.
 */
446 spa_condense_indirect_commit_entry(spa_t
*spa
,
447 vdev_indirect_mapping_entry_phys_t
*vimep
, uint32_t count
)
449 spa_condensing_indirect_t
*sci
= spa
->spa_condensing_indirect
;
451 ASSERT3U(count
, <, DVA_GET_ASIZE(&vimep
->vimep_dst
));
453 dmu_tx_t
*tx
= dmu_tx_create_dd(spa_get_dsl(spa
)->dp_mos_dir
);
454 dmu_tx_hold_space(tx
, sizeof (*vimep
) + sizeof (count
));
455 VERIFY0(dmu_tx_assign(tx
, TXG_WAIT
));
456 int txgoff
= dmu_tx_get_txg(tx
) & TXG_MASK
;
459 * If we are the first entry committed this txg, kick off the sync
460 * task to write to the MOS on our behalf.
462 if (list_is_empty(&sci
->sci_new_mapping_entries
[txgoff
])) {
463 dsl_sync_task_nowait(dmu_tx_pool(tx
),
464 spa_condense_indirect_commit_sync
, sci
,
465 0, ZFS_SPACE_CHECK_NONE
, tx
);
468 vdev_indirect_mapping_entry_t
*vime
=
469 kmem_alloc(sizeof (*vime
), KM_SLEEP
);
470 vime
->vime_mapping
= *vimep
;
471 vime
->vime_obsolete_count
= count
;
472 list_insert_tail(&sci
->sci_new_mapping_entries
[txgoff
], vime
);
/*
 * vd:              indirect vdev being condensed; asserted to be the vdev
 *                  named by spa_condensing_indirect_phys.scip_vdev.
 * obsolete_counts: per-entry obsolete byte counts, indexed like
 *                  old_mapping->vim_entries.
 * start_index:     index in the old mapping at which to start or resume.
 * zthr:            thread handle, polled with zthr_iscancelled().
 *
 * Walks the old mapping while the index is below its entry count.  On each
 * iteration a cancellation check is made first and a "pausing" message is
 * logged if it fires.  An entry whose obsolete count is smaller than its
 * allocated size is passed to spa_condense_indirect_commit_entry().
 * delay() is called with zfs_condense_indirect_commit_entry_delay_ticks.
 *
 * NOTE(review): the loop increment, the statement that leaves the loop on
 * cancellation, and the closing braces are not visible in this excerpt, so
 * the exact scope of the delay() call cannot be confirmed from here.
 */
478 spa_condense_indirect_generate_new_mapping(vdev_t
*vd
,
479 uint32_t *obsolete_counts
, uint64_t start_index
, zthr_t
*zthr
)
481 spa_t
*spa
= vd
->vdev_spa
;
482 uint64_t mapi
= start_index
;
483 vdev_indirect_mapping_t
*old_mapping
= vd
->vdev_indirect_mapping
;
484 uint64_t old_num_entries
=
485 vdev_indirect_mapping_num_entries(old_mapping
);
487 ASSERT3P(vd
->vdev_ops
, ==, &vdev_indirect_ops
);
488 ASSERT3U(vd
->vdev_id
, ==, spa
->spa_condensing_indirect_phys
.scip_vdev
);
490 zfs_dbgmsg("starting condense of vdev %llu from index %llu",
491 (u_longlong_t
)vd
->vdev_id
,
494 while (mapi
< old_num_entries
) {
496 if (zthr_iscancelled(zthr
)) {
497 zfs_dbgmsg("pausing condense of vdev %llu "
498 "at index %llu", (u_longlong_t
)vd
->vdev_id
,
503 vdev_indirect_mapping_entry_phys_t
*entry
=
504 &old_mapping
->vim_entries
[mapi
];
505 uint64_t entry_size
= DVA_GET_ASIZE(&entry
->vimep_dst
);
506 ASSERT3U(obsolete_counts
[mapi
], <=, entry_size
);
507 if (obsolete_counts
[mapi
] < entry_size
) {
508 spa_condense_indirect_commit_entry(spa
, entry
,
509 obsolete_counts
[mapi
]);
512 * This delay may be requested for testing, debugging,
513 * or performance reasons.
515 delay(zfs_condense_indirect_commit_entry_delay_ticks
);
/*
 * Check callback registered with zthr_create() alongside
 * spa_condense_indirect_thread().  Returns nonzero exactly when a condense
 * is pending, i.e. spa->spa_condensing_indirect is non-NULL.
 *
 * NOTE(review): the line deriving spa from arg is not visible in this
 * excerpt; zthr_create() is passed the spa as the argument, so arg is
 * presumably the spa_t -- confirm.
 */
524 spa_condense_indirect_thread_check(void *arg
, zthr_t
*zthr
)
528 return (spa
->spa_condensing_indirect
!= NULL
);
/*
 * Work callback of the condense zthr.
 *
 * Visible steps:
 *   1. Assert that a condense is pending and look up the vdev named by
 *      scip_vdev while holding SCL_VDEV as reader.
 *   2. Assert that all per-txg pending-entry lists start out empty.
 *   3. Open the previous obsolete space map (scip_prev_obsolete_sm_object),
 *      load the obsolete counts of the old mapping, fold the space map into
 *      those counts, and close the space map.
 *   4. Choose start_index from the max offset already written to the new
 *      mapping.  A max offset of 0 means nothing has been written yet.
 *      Otherwise the index comes from
 *      vdev_indirect_mapping_entry_for_offset_or_next() on the old mapping,
 *      and UINT64_MAX marks "whole new mapping already written".
 *   5. Call spa_condense_indirect_generate_new_mapping(), then free the
 *      counts.
 *   6. Check zthr_iscancelled(); otherwise run
 *      spa_condense_indirect_complete_sync() as a sync task with
 *      ZFS_SPACE_CHECK_EXTRA_RESERVED.
 *
 * NOTE(review): the declarations of spa, vd and counts, the branch
 * conditions in step 4, and the statement taken on cancellation are not
 * visible in this excerpt.
 * NOTE(review): prev_obsolete_sm is tested against NULL only after
 * VERIFY0(space_map_open()) and space_map_update(prev_obsolete_sm) have
 * already used it; the test looks redundant -- confirm.
 */
533 spa_condense_indirect_thread(void *arg
, zthr_t
*zthr
)
538 ASSERT3P(spa
->spa_condensing_indirect
, !=, NULL
);
539 spa_config_enter(spa
, SCL_VDEV
, FTAG
, RW_READER
);
540 vd
= vdev_lookup_top(spa
, spa
->spa_condensing_indirect_phys
.scip_vdev
);
541 ASSERT3P(vd
, !=, NULL
);
542 spa_config_exit(spa
, SCL_VDEV
, FTAG
);
544 spa_condensing_indirect_t
*sci
= spa
->spa_condensing_indirect
;
545 spa_condensing_indirect_phys_t
*scip
=
546 &spa
->spa_condensing_indirect_phys
;
548 uint64_t start_index
;
549 vdev_indirect_mapping_t
*old_mapping
= vd
->vdev_indirect_mapping
;
550 space_map_t
*prev_obsolete_sm
= NULL
;
552 ASSERT3U(vd
->vdev_id
, ==, scip
->scip_vdev
);
553 ASSERT(scip
->scip_next_mapping_object
!= 0);
554 ASSERT(scip
->scip_prev_obsolete_sm_object
!= 0);
555 ASSERT3P(vd
->vdev_ops
, ==, &vdev_indirect_ops
);
557 for (int i
= 0; i
< TXG_SIZE
; i
++) {
559 * The list must start out empty in order for the
560 * _commit_sync() sync task to be properly registered
561 * on the first call to _commit_entry(); so it's wise
562 * to double check and ensure we actually are starting
565 ASSERT(list_is_empty(&sci
->sci_new_mapping_entries
[i
]));
568 VERIFY0(space_map_open(&prev_obsolete_sm
, spa
->spa_meta_objset
,
569 scip
->scip_prev_obsolete_sm_object
, 0, vd
->vdev_asize
, 0));
570 space_map_update(prev_obsolete_sm
);
571 counts
= vdev_indirect_mapping_load_obsolete_counts(old_mapping
);
572 if (prev_obsolete_sm
!= NULL
) {
573 vdev_indirect_mapping_load_obsolete_spacemap(old_mapping
,
574 counts
, prev_obsolete_sm
);
576 space_map_close(prev_obsolete_sm
);
579 * Generate new mapping. Determine what index to continue from
580 * based on the max offset that we've already written in the
583 uint64_t max_offset
=
584 vdev_indirect_mapping_max_offset(sci
->sci_new_mapping
);
585 if (max_offset
== 0) {
586 /* We haven't written anything to the new mapping yet. */
590 * Pick up from where we left off. _entry_for_offset()
591 * returns a pointer into the vim_entries array. If
592 * max_offset is greater than any of the mappings
593 * contained in the table NULL will be returned and
594 * that indicates we've exhausted our iteration of the
598 vdev_indirect_mapping_entry_phys_t
*entry
=
599 vdev_indirect_mapping_entry_for_offset_or_next(old_mapping
,
604 * We've already written the whole new mapping.
605 * This special value will cause us to skip the
606 * generate_new_mapping step and just do the sync
607 * task to complete the condense.
609 start_index
= UINT64_MAX
;
611 start_index
= entry
- old_mapping
->vim_entries
;
612 ASSERT3U(start_index
, <,
613 vdev_indirect_mapping_num_entries(old_mapping
));
617 spa_condense_indirect_generate_new_mapping(vd
, counts
,
620 vdev_indirect_mapping_free_obsolete_counts(old_mapping
, counts
);
623 * If the zthr has received a cancellation signal while running
624 * in generate_new_mapping() or at any point after that, then bail
625 * early. We don't want to complete the condense if the spa is
628 if (zthr_iscancelled(zthr
))
631 VERIFY0(dsl_sync_task(spa_name(spa
), NULL
,
632 spa_condense_indirect_complete_sync
, sci
, 0,
633 ZFS_SPACE_CHECK_EXTRA_RESERVED
));
639 * Sync task to begin the condensing process.
/*
 * vd: indirect vdev whose mapping is to be condensed.
 * tx: syncing-context transaction (asserted).
 *
 * Preconditions asserted: the on-disk condensing state is all zero, vd is
 * an indirect vdev, SPA_FEATURE_OBSOLETE_COUNTS is active, the mapping has
 * at least one entry, and vd has an obsolete space map object.
 *
 * Steps:
 *   1. Record vd's id, allocate the next mapping object in the MOS, and
 *      save the current obsolete space map object as
 *      scip_prev_obsolete_sm_object.
 *   2. Close the in-core obsolete space map and remove
 *      VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM from the vdev's top-level ZAP.
 *   3. Store the condensing state in the pool directory under
 *      DMU_POOL_CONDENSING_INDIRECT, as an array of uint64_t.
 *   4. Create the in-core state, log, and wake the condense zthr.
 *
 * NOTE(review): the first two zfs_dbgmsg() arguments (vd->vdev_id and
 * dmu_tx_get_txg(tx)) are passed without the (u_longlong_t) cast applied to
 * the last two; confirm they match the format string, part of which is not
 * visible in this excerpt.
 */
642 spa_condense_indirect_start_sync(vdev_t
*vd
, dmu_tx_t
*tx
)
644 spa_t
*spa
= vd
->vdev_spa
;
645 spa_condensing_indirect_phys_t
*scip
=
646 &spa
->spa_condensing_indirect_phys
;
648 ASSERT0(scip
->scip_next_mapping_object
);
649 ASSERT0(scip
->scip_prev_obsolete_sm_object
);
650 ASSERT0(scip
->scip_vdev
);
651 ASSERT(dmu_tx_is_syncing(tx
));
652 ASSERT3P(vd
->vdev_ops
, ==, &vdev_indirect_ops
);
653 ASSERT(spa_feature_is_active(spa
, SPA_FEATURE_OBSOLETE_COUNTS
));
654 ASSERT(vdev_indirect_mapping_num_entries(vd
->vdev_indirect_mapping
));
656 uint64_t obsolete_sm_obj
= vdev_obsolete_sm_object(vd
);
657 ASSERT(obsolete_sm_obj
!= 0);
659 scip
->scip_vdev
= vd
->vdev_id
;
660 scip
->scip_next_mapping_object
=
661 vdev_indirect_mapping_alloc(spa
->spa_meta_objset
, tx
);
663 scip
->scip_prev_obsolete_sm_object
= obsolete_sm_obj
;
666 * We don't need to allocate a new space map object, since
667 * vdev_indirect_sync_obsolete will allocate one when needed.
669 space_map_close(vd
->vdev_obsolete_sm
);
670 vd
->vdev_obsolete_sm
= NULL
;
671 VERIFY0(zap_remove(spa
->spa_meta_objset
, vd
->vdev_top_zap
,
672 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM
, tx
));
674 VERIFY0(zap_add(spa
->spa_dsl_pool
->dp_meta_objset
,
675 DMU_POOL_DIRECTORY_OBJECT
,
676 DMU_POOL_CONDENSING_INDIRECT
, sizeof (uint64_t),
677 sizeof (*scip
) / sizeof (uint64_t), scip
, tx
));
679 ASSERT3P(spa
->spa_condensing_indirect
, ==, NULL
);
680 spa
->spa_condensing_indirect
= spa_condensing_indirect_create(spa
);
682 zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
684 vd
->vdev_id
, dmu_tx_get_txg(tx
),
685 (u_longlong_t
)scip
->scip_prev_obsolete_sm_object
,
686 (u_longlong_t
)scip
->scip_next_mapping_object
);
688 zthr_wakeup(spa
->spa_condense_zthr
);
692 * Sync to the given vdev's obsolete space map any segments that are no longer
693 * referenced as of the given txg.
695 * If the obsolete space map doesn't exist yet, create and open it.
/*
 * vd: removing or indirect vdev (asserted) with a nonzero mapping object
 *     and a non-empty vdev_obsolete_segments range tree.
 * tx: transaction used for all on-disk updates.
 *
 * If vd has no obsolete space map object yet, one is allocated in the MOS
 * with block size vdev_standard_sm_blksz and recorded in vd's top-level ZAP
 * under VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM.  SPA_FEATURE_OBSOLETE_COUNTS is
 * incremented and the space map is opened as vd->vdev_obsolete_sm.
 * NOTE(review): closing braces are not visible in this excerpt; presumably
 * the increment and the open both sit inside the creation branch -- confirm.
 *
 * The pending segments are then written to the space map as SM_ALLOC
 * records, the space map is updated, and the range tree is vacated.
 */
698 vdev_indirect_sync_obsolete(vdev_t
*vd
, dmu_tx_t
*tx
)
700 spa_t
*spa
= vd
->vdev_spa
;
701 vdev_indirect_config_t
*vic
= &vd
->vdev_indirect_config
;
703 ASSERT3U(vic
->vic_mapping_object
, !=, 0);
704 ASSERT(range_tree_space(vd
->vdev_obsolete_segments
) > 0);
705 ASSERT(vd
->vdev_removing
|| vd
->vdev_ops
== &vdev_indirect_ops
);
706 ASSERT(spa_feature_is_enabled(spa
, SPA_FEATURE_OBSOLETE_COUNTS
));
708 if (vdev_obsolete_sm_object(vd
) == 0) {
709 uint64_t obsolete_sm_object
=
710 space_map_alloc(spa
->spa_meta_objset
,
711 vdev_standard_sm_blksz
, tx
);
713 ASSERT(vd
->vdev_top_zap
!= 0);
714 VERIFY0(zap_add(vd
->vdev_spa
->spa_meta_objset
, vd
->vdev_top_zap
,
715 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM
,
716 sizeof (obsolete_sm_object
), 1, &obsolete_sm_object
, tx
));
717 ASSERT3U(vdev_obsolete_sm_object(vd
), !=, 0);
719 spa_feature_incr(spa
, SPA_FEATURE_OBSOLETE_COUNTS
, tx
);
720 VERIFY0(space_map_open(&vd
->vdev_obsolete_sm
,
721 spa
->spa_meta_objset
, obsolete_sm_object
,
722 0, vd
->vdev_asize
, 0));
723 space_map_update(vd
->vdev_obsolete_sm
);
726 ASSERT(vd
->vdev_obsolete_sm
!= NULL
);
727 ASSERT3U(vdev_obsolete_sm_object(vd
), ==,
728 space_map_object(vd
->vdev_obsolete_sm
));
730 space_map_write(vd
->vdev_obsolete_sm
,
731 vd
->vdev_obsolete_segments
, SM_ALLOC
, tx
);
732 space_map_update(vd
->vdev_obsolete_sm
);
733 range_tree_vacate(vd
->vdev_obsolete_segments
, NULL
, NULL
);
/*
 * Load any persisted condensing state for the pool.
 *
 * Looks up DMU_POOL_CONDENSING_INDIRECT in the pool directory and reads it,
 * as an array of uint64_t, into spa->spa_condensing_indirect_phys.  When
 * the pool is writeable, the in-core state is created with
 * spa_condensing_indirect_create().
 *
 * NOTE(review): the condition guarding the writeable check, the body of the
 * ENOENT branch, and the return statement are not visible in this excerpt;
 * presumably the in-core state is only created when the lookup succeeded
 * and ENOENT is treated as "no condense in progress" -- confirm.
 */
737 spa_condense_init(spa_t
*spa
)
739 int error
= zap_lookup(spa
->spa_meta_objset
,
740 DMU_POOL_DIRECTORY_OBJECT
,
741 DMU_POOL_CONDENSING_INDIRECT
, sizeof (uint64_t),
742 sizeof (spa
->spa_condensing_indirect_phys
) / sizeof (uint64_t),
743 &spa
->spa_condensing_indirect_phys
);
745 if (spa_writeable(spa
)) {
746 spa
->spa_condensing_indirect
=
747 spa_condensing_indirect_create(spa
);
750 } else if (error
== ENOENT
) {
/*
 * Release the pool's in-core condensing state, if any: destroys
 * spa->spa_condensing_indirect and resets the pointer to NULL.  Does
 * nothing when no state exists.
 */
758 spa_condense_fini(spa_t
*spa
)
760 if (spa
->spa_condensing_indirect
!= NULL
) {
761 spa_condensing_indirect_destroy(spa
->spa_condensing_indirect
);
762 spa
->spa_condensing_indirect
= NULL
;
/*
 * Create the pool's condense zthr.  Asserts that none exists yet, then
 * stores in spa->spa_condense_zthr a zthr built from
 * spa_condense_indirect_thread_check() (check callback) and
 * spa_condense_indirect_thread() (work callback), with the spa as their
 * argument.
 */
767 spa_start_indirect_condensing_thread(spa_t
*spa
)
769 ASSERT3P(spa
->spa_condense_zthr
, ==, NULL
);
770 spa
->spa_condense_zthr
= zthr_create(spa_condense_indirect_thread_check
,
771 spa_condense_indirect_thread
, spa
);
775 * Gets the obsolete spacemap object from the vdev's ZAP.
776 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't exist.
/*
 * vd: vdev whose top-level ZAP is consulted.
 *
 * Asserts that the caller does not hold any SCL_ALL config lock as writer.
 * A vdev_top_zap of 0 is special-cased before the lookup.  Otherwise
 * VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM is looked up in the top-level ZAP into
 * sm_obj, and the lookup result is asserted to be either 0 or ENOENT.
 *
 * NOTE(review): the declaration of sm_obj and the return statements are not
 * visible in this excerpt; presumably sm_obj starts at 0 and is returned --
 * confirm.
 */
780 vdev_obsolete_sm_object(vdev_t
*vd
)
782 ASSERT0(spa_config_held(vd
->vdev_spa
, SCL_ALL
, RW_WRITER
));
783 if (vd
->vdev_top_zap
== 0) {
788 int err
= zap_lookup(vd
->vdev_spa
->spa_meta_objset
, vd
->vdev_top_zap
,
789 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM
, sizeof (sm_obj
), 1, &sm_obj
);
791 ASSERT(err
== 0 || err
== ENOENT
);
/*
 * vd: vdev whose top-level ZAP is consulted.
 *
 * Asserts that the caller does not hold any SCL_ALL config lock as writer.
 * A vdev_top_zap of 0 is special-cased before the lookup.  Otherwise
 * VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE is looked up in the top-level
 * ZAP into val, and the lookup result is asserted to be either 0 or ENOENT.
 *
 * NOTE(review): the declaration of val and the return statements are not
 * visible in this excerpt; presumably the function reports whether val is
 * nonzero -- confirm.
 */
797 vdev_obsolete_counts_are_precise(vdev_t
*vd
)
799 ASSERT0(spa_config_held(vd
->vdev_spa
, SCL_ALL
, RW_WRITER
));
800 if (vd
->vdev_top_zap
== 0) {
805 int err
= zap_lookup(vd
->vdev_spa
->spa_meta_objset
, vd
->vdev_top_zap
,
806 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE
, sizeof (val
), 1, &val
);
808 ASSERT(err
== 0 || err
== ENOENT
);
815 vdev_indirect_close(vdev_t
*vd
)
821 vdev_indirect_io_done(zio_t
*zio
)
/*
 * vd:        the indirect vdev.
 * psize:     out; set to vd->vdev_asize plus the label space
 *            (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE).
 * max_psize: out; set to the same value as *psize.
 * ashift:    out; set to vd->vdev_ashift.
 *
 * NOTE(review): the remainder of the parameter list and the return
 * statement are not visible in this excerpt.
 */
827 vdev_indirect_open(vdev_t
*vd
, uint64_t *psize
, uint64_t *max_psize
,
830 *psize
= *max_psize
= vd
->vdev_asize
+
831 VDEV_LABEL_START_SIZE
+ VDEV_LABEL_END_SIZE
;
832 *ashift
= vd
->vdev_ashift
;
/*
 * One pending segment on the work stack used by vdev_indirect_remap().
 *
 * Only rs_split_offset is visible in this excerpt.  Other code in this file
 * also uses rs_vd, rs_offset, rs_asize and rs_node (the list linkage) on
 * remap_segment_t.
 * NOTE(review): the declarations of those members and the typedef name are
 * not visible here -- confirm against the full definition.
 */
836 typedef struct remap_segment
{
840 uint64_t rs_split_offset
;
/*
 * Allocate a remap_segment_t with KM_SLEEP and fill in rs_offset, rs_asize
 * and rs_split_offset from the arguments.
 *
 * NOTE(review): the assignment of vd to the segment and the return
 * statement are not visible in this excerpt; callers read rs->rs_vd and use
 * the returned pointer, so presumably both exist -- confirm.
 */
845 rs_alloc(vdev_t
*vd
, uint64_t offset
, uint64_t asize
, uint64_t split_offset
)
847 remap_segment_t
*rs
= kmem_alloc(sizeof (remap_segment_t
), KM_SLEEP
);
849 rs
->rs_offset
= offset
;
850 rs
->rs_asize
= asize
;
851 rs
->rs_split_offset
= split_offset
;
856 * Given an indirect vdev and an extent on that vdev, it duplicates the
857 * physical entries of the indirect mapping that correspond to the extent
858 * to a new array and returns a pointer to it. In addition, copied_entries
859 * is populated with the number of mapping entries that were duplicated.
861 * Note that the function assumes that the caller holds vdev_indirect_rwlock.
862 * This ensures that the mapping won't change due to condensing as we
863 * copy over its contents.
865 * Finally, since we are doing an allocation, it is up to the caller to
866 * free the array allocated in this function.
/*
 * vd:             indirect vdev whose mapping is consulted; the caller must
 *                 hold vd->vdev_indirect_rwlock as reader (asserted).
 * offset:         start of the extent on vd; asserted to resolve to a
 *                 mapping entry.
 * asize:          length of the extent.
 * copied_entries: out; number of entries in the returned array.
 *
 * Returns a kmem_alloc(KM_SLEEP) array of `entries` consecutive mapping
 * entries copied from the entry containing offset.  The allocation is
 * entries * sizeof (vdev_indirect_mapping_entry_phys_t) bytes, so the
 * caller must free it with that same size.
 *
 * NOTE(review): the loop that advances through the entries and counts them
 * is not visible in this excerpt.  Only the per-entry arithmetic survives:
 * the part of the entry at or after offset, capped at asize, is consumed
 * and offset is advanced by it.
 */
868 vdev_indirect_mapping_entry_phys_t
*
869 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t
*vd
, uint64_t offset
,
870 uint64_t asize
, uint64_t *copied_entries
)
872 vdev_indirect_mapping_entry_phys_t
*duplicate_mappings
= NULL
;
873 vdev_indirect_mapping_t
*vim
= vd
->vdev_indirect_mapping
;
874 uint64_t entries
= 0;
876 ASSERT(RW_READ_HELD(&vd
->vdev_indirect_rwlock
));
878 vdev_indirect_mapping_entry_phys_t
*first_mapping
=
879 vdev_indirect_mapping_entry_for_offset(vim
, offset
);
880 ASSERT3P(first_mapping
, !=, NULL
);
882 vdev_indirect_mapping_entry_phys_t
*m
= first_mapping
;
884 uint64_t size
= DVA_GET_ASIZE(&m
->vimep_dst
);
886 ASSERT3U(offset
, >=, DVA_MAPPING_GET_SRC_OFFSET(m
));
887 ASSERT3U(offset
, <, DVA_MAPPING_GET_SRC_OFFSET(m
) + size
);
889 uint64_t inner_offset
= offset
- DVA_MAPPING_GET_SRC_OFFSET(m
);
890 uint64_t inner_size
= MIN(asize
, size
- inner_offset
);
892 offset
+= inner_size
;
898 size_t copy_length
= entries
* sizeof (*first_mapping
);
899 duplicate_mappings
= kmem_alloc(copy_length
, KM_SLEEP
);
900 bcopy(first_mapping
, duplicate_mappings
, copy_length
);
901 *copied_entries
= entries
;
903 return (duplicate_mappings
);
907 * Goes through the relevant indirect mappings until it hits a concrete vdev
908 * and issues the callback. On the way to the concrete vdev, if any other
909 * indirect vdevs are encountered, then the callback will also be called on
910 * each of those indirect vdevs. For example, if the segment is mapped to
911 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
912 * mapped to segment B on concrete vdev 2, then the callback will be called on
913 * both vdev 1 and vdev 2.
915 * While the callback passed to vdev_indirect_remap() is called on every vdev
916 * the function encounters, certain callbacks only care about concrete vdevs.
917 * These types of callbacks should return immediately and explicitly when they
918 * are called on an indirect vdev.
920 * Because there is a possibility that a DVA section in the indirect device
921 * has been split into multiple sections in our mapping, we keep track
922 * of the relevant contiguous segments of the new location (remap_segment_t)
923 * in a stack. This way we can call the callback for each of the new sections
924 * created by a single section of the indirect device. Note though, that in
925 * this scenario the callbacks in each split block won't occur in-order in
926 * terms of offset, so callers should not make any assumptions about that.
928 * For callbacks that don't handle split blocks and immediately return when
929 * they encounter them (as is the case for remap_blkptr_cb), the caller can
930 * assume that its callback will be applied from the first indirect vdev
931 * encountered to the last one and then the concrete vdev, in that order.
934 vdev_indirect_remap(vdev_t
*vd
, uint64_t offset
, uint64_t asize
,
935 void (*func
)(uint64_t, vdev_t
*, uint64_t, uint64_t, void *), void *arg
)
938 spa_t
*spa
= vd
->vdev_spa
;
940 list_create(&stack
, sizeof (remap_segment_t
),
941 offsetof(remap_segment_t
, rs_node
));
943 for (remap_segment_t
*rs
= rs_alloc(vd
, offset
, asize
, 0);
944 rs
!= NULL
; rs
= list_remove_head(&stack
)) {
945 vdev_t
*v
= rs
->rs_vd
;
946 uint64_t num_entries
= 0;
948 ASSERT(spa_config_held(spa
, SCL_ALL
, RW_READER
) != 0);
949 ASSERT(rs
->rs_asize
> 0);
952 * Note: As this function can be called from open context
953 * (e.g. zio_read()), we need the following rwlock to
954 * prevent the mapping from being changed by condensing.
956 * So we grab the lock and we make a copy of the entries
957 * that are relevant to the extent that we are working on.
958 * Once that is done, we drop the lock and iterate over
959 * our copy of the mapping. Once we are done with
960 * the remap segment and we free it, we also free our copy
961 * of the indirect mapping entries that are relevant to it.
963 * This way we don't need to wait until the function is
964 * finished with a segment, to condense it. In addition, we
965 * don't need a recursive rwlock for the case that a call to
966 * vdev_indirect_remap() needs to call itself (through the
967 * codepath of its callback) for the same vdev in the middle
970 rw_enter(&v
->vdev_indirect_rwlock
, RW_READER
);
971 vdev_indirect_mapping_t
*vim
= v
->vdev_indirect_mapping
;
972 ASSERT3P(vim
, !=, NULL
);
974 vdev_indirect_mapping_entry_phys_t
*mapping
=
975 vdev_indirect_mapping_duplicate_adjacent_entries(v
,
976 rs
->rs_offset
, rs
->rs_asize
, &num_entries
);
977 ASSERT3P(mapping
, !=, NULL
);
978 ASSERT3U(num_entries
, >, 0);
979 rw_exit(&v
->vdev_indirect_rwlock
);
981 for (uint64_t i
= 0; i
< num_entries
; i
++) {
983 * Note: the vdev_indirect_mapping can not change
984 * while we are running. It only changes while the
985 * removal is in progress, and then only from syncing
986 * context. While a removal is in progress, this
987 * function is only called for frees, which also only
988 * happen from syncing context.
990 vdev_indirect_mapping_entry_phys_t
*m
= &mapping
[i
];
992 ASSERT3P(m
, !=, NULL
);
993 ASSERT3U(rs
->rs_asize
, >, 0);
995 uint64_t size
= DVA_GET_ASIZE(&m
->vimep_dst
);
996 uint64_t dst_offset
= DVA_GET_OFFSET(&m
->vimep_dst
);
997 uint64_t dst_vdev
= DVA_GET_VDEV(&m
->vimep_dst
);
999 ASSERT3U(rs
->rs_offset
, >=,
1000 DVA_MAPPING_GET_SRC_OFFSET(m
));
1001 ASSERT3U(rs
->rs_offset
, <,
1002 DVA_MAPPING_GET_SRC_OFFSET(m
) + size
);
1003 ASSERT3U(dst_vdev
, !=, v
->vdev_id
);
1005 uint64_t inner_offset
= rs
->rs_offset
-
1006 DVA_MAPPING_GET_SRC_OFFSET(m
);
1007 uint64_t inner_size
=
1008 MIN(rs
->rs_asize
, size
- inner_offset
);
1010 vdev_t
*dst_v
= vdev_lookup_top(spa
, dst_vdev
);
1011 ASSERT3P(dst_v
, !=, NULL
);
1013 if (dst_v
->vdev_ops
== &vdev_indirect_ops
) {
1014 list_insert_head(&stack
,
1015 rs_alloc(dst_v
, dst_offset
+ inner_offset
,
1016 inner_size
, rs
->rs_split_offset
));
1020 if ((zfs_flags
& ZFS_DEBUG_INDIRECT_REMAP
) &&
1021 IS_P2ALIGNED(inner_size
, 2 * SPA_MINBLOCKSIZE
)) {
1023 * Note: This clause exists only solely for
1024 * testing purposes. We use it to ensure that
1025 * split blocks work and that the callbacks
1026 * using them yield the same result if issued
1029 uint64_t inner_half
= inner_size
/ 2;
1031 func(rs
->rs_split_offset
+ inner_half
, dst_v
,
1032 dst_offset
+ inner_offset
+ inner_half
,
1035 func(rs
->rs_split_offset
, dst_v
,
1036 dst_offset
+ inner_offset
,
1039 func(rs
->rs_split_offset
, dst_v
,
1040 dst_offset
+ inner_offset
,
1044 rs
->rs_offset
+= inner_size
;
1045 rs
->rs_asize
-= inner_size
;
1046 rs
->rs_split_offset
+= inner_size
;
1048 VERIFY0(rs
->rs_asize
);
1050 kmem_free(mapping
, num_entries
* sizeof (*mapping
));
1051 kmem_free(rs
, sizeof (remap_segment_t
));
1053 list_destroy(&stack
);
1057 vdev_indirect_child_io_done(zio_t
*zio
)
1059 zio_t
*pio
= zio
->io_private
;
1061 mutex_enter(&pio
->io_lock
);
1062 pio
->io_error
= zio_worst_error(pio
->io_error
, zio
->io_error
);
1063 mutex_exit(&pio
->io_lock
);
1065 abd_put(zio
->io_abd
);
1069 vdev_indirect_io_start_cb(uint64_t split_offset
, vdev_t
*vd
, uint64_t offset
,
1070 uint64_t size
, void *arg
)
1074 ASSERT3P(vd
, !=, NULL
);
1076 if (vd
->vdev_ops
== &vdev_indirect_ops
)
1079 zio_nowait(zio_vdev_child_io(zio
, NULL
, vd
, offset
,
1080 abd_get_offset(zio
->io_abd
, split_offset
),
1081 size
, zio
->io_type
, zio
->io_priority
,
1082 0, vdev_indirect_child_io_done
, zio
));
1086 vdev_indirect_io_start(zio_t
*zio
)
1088 spa_t
*spa
= zio
->io_spa
;
1090 ASSERT(spa_config_held(spa
, SCL_ALL
, RW_READER
) != 0);
1091 if (zio
->io_type
!= ZIO_TYPE_READ
) {
1092 ASSERT3U(zio
->io_type
, ==, ZIO_TYPE_WRITE
);
1093 ASSERT((zio
->io_flags
&
1094 (ZIO_FLAG_SELF_HEAL
| ZIO_FLAG_INDUCE_DAMAGE
)) != 0);
1097 vdev_indirect_remap(zio
->io_vd
, zio
->io_offset
, zio
->io_size
,
1098 vdev_indirect_io_start_cb
, zio
);
1103 vdev_ops_t vdev_indirect_ops
= {
1105 vdev_indirect_close
,
1107 vdev_indirect_io_start
,
1108 vdev_indirect_io_done
,
1112 vdev_indirect_remap
,
1113 VDEV_TYPE_INDIRECT
, /* name of this vdev type */
1114 B_FALSE
/* leaf vdev */