/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/abd.h>
#include <sys/zthr.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed.  Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA.  Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location.  As a consequence, not all DVAs can be
 * translated to an equivalent new DVA.  Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location.  This function is used in multiple ways:
 *
 *  - reads and repair writes to this device use the callback to create
 *    a child io for each mapped segment.
 *
 *  - frees and claims to this device use the callback to free or claim
 *    each mapped segment.  (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */
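
/*
 * To make the callback contract concrete, here is a sketch of a minimal
 * vdev_remap callback.  The signature matches the "func" argument of
 * vdev_indirect_remap() declared later in this file; the callback body and
 * the name count_segment_cb are purely illustrative and not part of ZFS.
 *
 *	static void
 *	count_segment_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
 *	    uint64_t size, void *arg)
 *	{
 *		uint64_t *nsegs = arg;
 *
 *		// Called once per contiguous segment of the new location.
 *		// "vd" may itself be indirect; callers that only care about
 *		// concrete vdevs must check for that and return early.
 *		if (vd->vdev_ops == &vdev_indirect_ops)
 *			return;
 *		(*nsegs)++;
 *	}
 *
 * A caller would then pass count_segment_cb (with &nsegs as the callback
 * argument) to vdev_indirect_remap() to count the concrete segments that
 * back a DVA on this vdev.
 */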

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 * (A brief worked example follows this list.)
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
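 *
 * As a purely illustrative example of how these objects fit together
 * (all of the sizes here are made up): suppose one mapping entry says
 * that 100MB at offset 0 of removed vdev 3 now lives on vdev 5.  If 25MB
 * of that range is freed, a 25MB alloc entry is appended to vdev 3's
 * vic_obsolete_sm_object.  At the next condense that 25MB is folded into
 * the entry's slot in the vimp_counts_object; only once the count reaches
 * the full 100MB can the entry be omitted from the new mapping.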
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts.  Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */
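
/*
 * A simplified sketch of steps 2 and 3 above.  The helpers load_counts(),
 * fold_spacemap_into_counts(), append_entry() and num_entries() are
 * hypothetical stand-ins used only for illustration; the real work is done
 * by vdev_indirect_mapping_load_obsolete_{counts,spacemap}() and
 * spa_condense_indirect_generate_new_mapping() below.
 *
 *	// Step 2: build precise per-entry obsolete byte counts by folding
 *	// the (now read-only) obsolete space map into the stored counts.
 *	uint32_t *counts = load_counts(old_mapping);
 *	fold_spacemap_into_counts(prev_obsolete_sm, old_mapping, counts);
 *
 *	// Step 3: copy forward only entries that still have live bytes.
 *	for (uint64_t i = start_index; i < num_entries(old_mapping); i++) {
 *		vdev_indirect_mapping_entry_phys_t *e =
 *		    &old_mapping->vim_entries[i];
 *		if (counts[i] < DVA_GET_ASIZE(&e->vimep_dst))
 *			append_entry(new_mapping, e, counts[i]);
 *	}
 */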

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete.  With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses.  Higher values will condense less often (causing less
 * i/o); lower values will reduce the mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;
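
/*
 * To spell out the arithmetic behind the "1% after at most 16 condenses"
 * claim above: a condense that fires at exactly the 25% threshold keeps at
 * most 75% of the previously mapped bytes, and 0.75^16 is roughly 0.01, so
 * about 1% of the original mapping remains.
 */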

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically).  This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory.  The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly).  If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ticks = 0;

/*
 * Mark the given offset and size as being obsolete in the given txg.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	ASSERT3U(spa_syncing_txg(spa), ==, txg);
	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, txg);
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx.  This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

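/*
 * Decide whether it is worth condensing vd's mapping.  As a worked example
 * of the two triggers checked below, using the default tunables and made-up
 * numbers: a vdev with 1000MB mapped of which 300MB is obsolete is at 30%,
 * which meets zfs_indirect_condense_obsolete_pct (25), so it is condensed
 * provided the in-core mapping exceeds zfs_condense_min_mapping_bytes
 * (128KB).  Independently, an obsolete space map whose on-disk length has
 * reached zfs_condense_max_obsolete_bytes (1GB) also triggers a condense.
 */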
boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping.  The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}
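
/*
 * To make the per-txg bookkeeping above concrete: with TXG_SIZE at its
 * usual value of 4 (TXG_MASK of 3), an entry whose tx is assigned to txg
 * 102 is appended to sci_new_mapping_entries[102 & 3], i.e. slot 2, and
 * the commit_sync task registered for that same txg drains exactly that
 * slot, so entries from different txgs never mix.
 */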

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries) {

		if (zthr_iscancelled(zthr)) {
			zfs_dbgmsg("pausing condense of vdev %llu "
			    "at index %llu", (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)mapi);
			break;
		}

		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			delay(zfs_condense_indirect_commit_entry_delay_ticks);
		}

		mapi++;
	}
}

static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;

	return (spa->spa_condensing_indirect != NULL);
}

static int
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_t *vd;

	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	space_map_update(prev_obsolete_sm);
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping.  Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. _entry_for_offset()
		 * returns a pointer into the vim_entries array. If
		 * max_offset is greater than any of the mappings
		 * contained in the table NULL will be returned and
		 * that indicates we've exhausted our iteration of the
		 * old_mapping.
		 */
		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts,
	    start_index, zthr);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * If the zthr has received a cancellation signal while running
	 * in generate_new_mapping() or at any point after that, then bail
	 * early. We don't want to complete the condense if the spa is
	 * shutting down.
	 */
	if (zthr_iscancelled(zthr))
		return (0);

	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
	    spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_NONE));

	return (0);
}

/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;

	ASSERT0(scip->scip_next_mapping_object);
	ASSERT0(scip->scip_prev_obsolete_sm_object);
	ASSERT0(scip->scip_vdev);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	ASSERT(obsolete_sm_obj != 0);

	scip->scip_vdev = vd->vdev_id;
	scip->scip_next_mapping_object =
	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

	/*
	 * We don't need to allocate a new space map object, since
	 * vdev_indirect_sync_obsolete will allocate one when needed.
	 */
	space_map_close(vd->vdev_obsolete_sm);
	vd->vdev_obsolete_sm = NULL;
	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (*scip) / sizeof (uint64_t), scip, tx));

	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	    "posm=%llu nm=%llu",
	    vd->vdev_id, dmu_tx_get_txg(tx),
	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	    (u_longlong_t)scip->scip_next_mapping_object);

	zthr_wakeup(spa->spa_condense_zthr);
}

/*
 * Sync to the given vdev's obsolete space map any segments that are no longer
 * referenced as of the given txg.
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	if (vdev_obsolete_sm_object(vd) == 0) {
		uint64_t obsolete_sm_object =
		    space_map_alloc(spa->spa_meta_objset, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
		space_map_update(vd->vdev_obsolete_sm);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
	space_map_update(vd->vdev_obsolete_sm);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

void
spa_start_indirect_condensing_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
	spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check,
	    spa_condense_indirect_thread, spa);
}
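
/*
 * A brief, informal note on the zthr usage above: the check function tells
 * the zthr framework whether there is work to do (here, whether a condense
 * is in progress), the thread function does the work and is expected to
 * poll zthr_iscancelled() so it can stop promptly when the pool is
 * exported or destroyed, and zthr_wakeup() (called from
 * spa_condense_indirect_start_sync()) nudges the thread when new work
 * appears.  The thread can be safely interrupted and restarted because all
 * of its persistent state lives in spa_condensing_indirect_phys.
 */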

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 */
uint64_t
vdev_obsolete_sm_object(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (0);
	}

	uint64_t sm_obj = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

	ASSERT(err == 0 || err == ENOENT);

	return (sm_obj);
}

boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (B_FALSE);
	}

	uint64_t val = 0;
	int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

	ASSERT(err == 0 || err == ENOENT);

	return (val != 0);
}

static void
vdev_indirect_close(vdev_t *vd)
{
}

static void
vdev_indirect_io_done(zio_t *zio)
{
}

static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	*psize = *max_psize = vd->vdev_asize +
	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*ashift = vd->vdev_ashift;
	return (0);
}

typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

static remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
	rs->rs_vd = vd;
	rs->rs_offset = offset;
	rs->rs_asize = asize;
	rs->rs_split_offset = split_offset;
	return (rs);
}

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback.  On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs.  For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack.  This way we can call the callback for each of the new sections
 * created by a single section of the indirect device.  Note, though, that in
 * this scenario the callbacks in each split block won't occur in order in
 * terms of offset, so callers should not make any assumptions about that.
 *
 * For callbacks that don't handle split blocks and immediately return when
 * they encounter them (as is the case for remap_blkptr_cb), the caller can
 * assume that its callback will be applied from the first indirect vdev
 * encountered to the last one and then the concrete vdev, in that order.
 */
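
/*
 * As an illustration of the traversal (the vdev ids and sizes below are
 * invented for this example): a 24K segment on indirect vdev 1 might be
 * mapped as 16K on indirect vdev 2 followed by 8K on concrete vdev 3.  The
 * 16K piece is pushed onto the stack and later resolved through vdev 2's
 * own mapping, while the callback still runs for vdev 2 itself (with
 * split_offset 0) and for vdev 3 (with split_offset 16K).  The split_offset
 * argument is what lets a read callback slice the original zio's abd at the
 * right place for each piece, as vdev_indirect_io_start_cb() does below.
 */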
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
	list_t stack;
	spa_t *spa = vd->vdev_spa;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	    rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;

		/*
		 * Note: this can be called from open context
		 * (eg. zio_read()), so we need the rwlock to prevent
		 * the mapping from being changed by condensing.
		 */
		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
		ASSERT3P(vim, !=, NULL);

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
		ASSERT(rs->rs_asize > 0);

		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
		ASSERT3P(mapping, !=, NULL);

		while (rs->rs_asize > 0) {
			/*
			 * Note: the vdev_indirect_mapping can not change
			 * while we are running.  It only changes while the
			 * removal is in progress, and then only from syncing
			 * context.  While a removal is in progress, this
			 * function is only called for frees, which also only
			 * happen from syncing context.
			 */
			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
			uint64_t dst_offset =
			    DVA_GET_OFFSET(&mapping->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);

			ASSERT3U(rs->rs_offset, >=,
			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
			ASSERT3U(rs->rs_offset, <,
			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
			ASSERT3U(dst_vdev, !=, v->vdev_id);

			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);

			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
			ASSERT3P(dst_v, !=, NULL);

			if (dst_v->vdev_ops == &vdev_indirect_ops) {
				list_insert_head(&stack,
				    rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset));
			}

			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
				/*
				 * Note: This clause exists solely for
				 * testing purposes.  We use it to ensure that
				 * split blocks work and that the callbacks
				 * using them yield the same result if issued
				 * in reverse order.
				 */
				uint64_t inner_half = inner_size / 2;

				func(rs->rs_split_offset + inner_half, dst_v,
				    dst_offset + inner_offset + inner_half,
				    inner_half, arg);

				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_half, arg);
			} else {
				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_size, arg);
			}

			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
			mapping++;
		}

		rw_exit(&v->vdev_indirect_rwlock);
		kmem_free(rs, sizeof (remap_segment_t));
	}
	list_destroy(&stack);
}

static void
vdev_indirect_child_io_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	mutex_enter(&pio->io_lock);
	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
	mutex_exit(&pio->io_lock);

	abd_put(zio->io_abd);
}

static void
vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	zio_t *zio = arg;

	ASSERT3P(vd, !=, NULL);

	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
	    abd_get_offset(zio->io_abd, split_offset),
	    size, zio->io_type, zio->io_priority,
	    0, vdev_indirect_child_io_done, zio));
}

static void
vdev_indirect_io_start(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	if (zio->io_type != ZIO_TYPE_READ) {
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
		ASSERT((zio->io_flags &
		    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
	}

	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
	    vdev_indirect_io_start_cb, zio);

	zio_execute(zio);
}

vdev_ops_t vdev_indirect_ops = {
	vdev_indirect_open,
	vdev_indirect_close,
	vdev_default_asize,
	vdev_indirect_io_start,
	vdev_indirect_io_done,
	NULL			/* vdev_op_state_change */,
	NULL			/* vdev_op_hold */,
	NULL			/* vdev_op_rele */,
	vdev_indirect_remap,
	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
	B_FALSE			/* leaf vdev */
};